## Modeling

In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support

import re
import string

df_train = pd.read_csv("train_set.csv", index_col='Original Index')
df_valid = pd.read_csv("valid_set.csv", index_col='Original Index')
df_test = pd.read_csv("test_set.csv", index_col='Original Index')

X_train = df_train[[col for col in df_train.columns if col != 'POI']]
y_train = df_train['POI']

X_valid = df_valid[[col for col in df_valid.columns if col != 'POI']]
y_valid = df_valid['POI']

X_test = df_test[[col for col in df_test.columns if col != 'POI']]
y_test = df_test['POI']

print(f"Training: {X_train.shape[0]}, Validation: {X_valid.shape[0]}, Test: {X_test.shape[0]}")

import nltk
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# get NLTK's stopwords
nltk.download('stopwords')
stopwords = set(nltk.corpus.stopwords.words("english"))

# POS tagging
from nltk.tag import pos_tag
nltk.download('averaged_perceptron_tagger')

# lemmatization
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
 
lemmatizer = WordNetLemmatizer()

def pos_normalizer(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV   
    return wordnet.NOUN
  
# stemming
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

# n-gram generator
from nltk import ngrams

In [34]:
# lemmatize, unigram
def preprocess_1(s):
    # tokenize
    tokens = [token for token in word_tokenize(s)]
    
    # remove numbers and apostrophes and convert to lowercase
    tokens = [token.lower() for token in tokens if len(re.findall("[“”'’`\d]+", token)) == 0]

    # assign POS
    pos_tags = pos_tag(tokens)

    # lemmatize
    lemmas = [lemmatizer.lemmatize(token, pos=pos_normalizer(pos)) for token, pos in pos_tags]

    # remove stopwords and punctuation
    tokens = [token for token in lemmas if (token not in stopwords and token not in string.punctuation)]

    # n-gram
    n = 1
    feats = [" ".join(gram) for gram in list(ngrams(tokens, n))]
    
    return feats

# stem, unigram
def preprocess_2(s):
    # tokenize
    tokens = [token for token in word_tokenize(s)]
    
    # remove numbers and apostrophes and convert to lowercase
    tokens = [token.lower() for token in tokens if len(re.findall("[“”'’`\d]+", token)) == 0]

    # assign POS
    pos_tags = pos_tag(tokens)
 
    # stem
    stems = [stemmer.stem(word) for word in tokens]

    # remove stopwords and punctuation
    tokens = [token for token in stems if (token not in stopwords and token not in string.punctuation)]

    # n-gram
    n = 1
    feats = [" ".join(gram) for gram in list(ngrams(tokens, n))]
    
    return feats

# lemmatize, bigram
def preprocess_3(s):
    # tokenize
    tokens = [token for token in word_tokenize(s)]
    
    # remove numbers and apostrophes and convert to lowercase
    tokens = [token.lower() for token in tokens if len(re.findall("[“”'’`\d]+", token)) == 0]

    # assign POS
    pos_tags = pos_tag(tokens)

    # lemmatize
    lemmas = [lemmatizer.lemmatize(token, pos=pos_normalizer(pos)) for token, pos in pos_tags]

    # remove stopwords and punctuation
    tokens = [token for token in lemmas if (token not in stopwords and token not in string.punctuation)]

    # n-gram
    n = 2
    feats = [" ".join(gram) for gram in list(ngrams(tokens, n))]
    
    return feats

# stem, bigram
def preprocess_4(s):
    # tokenize
    tokens = [token for token in word_tokenize(s)]
    
    # remove numbers and apostrophes and convert to lowercase
    tokens = [token.lower() for token in tokens if len(re.findall("[“”'’`\d]+", token)) == 0]

    # assign POS
    pos_tags = pos_tag(tokens)
 
    # stem
    stems = [stemmer.stem(word) for word in tokens]

    # remove stopwords and punctuation
    tokens = [token for token in stems if (token not in stopwords and token not in string.punctuation)]

    # n-gram
    n = 2
    feats = [" ".join(gram) for gram in list(ngrams(tokens, n))]
    
    return feats

# # lemmatize, unigram, no stopword removal
# def preprocess_5(s):
#     # tokenize
#     tokens = [token for token in word_tokenize(s)]
    
#     # remove numbers and apostrophes and convert to lowercase
#     tokens = [token.lower() for token in tokens if len(re.findall("[“”'’`\d]+", token)) == 0]

#     # assign POS
#     pos_tags = pos_tag(tokens)

#     # lemmatize
#     lemmas = [lemmatizer.lemmatize(token, pos=pos_normalizer(pos)) for token, pos in pos_tags]

#     # remove punctuation
#     tokens = [token for token in lemmas if (token not in string.punctuation)]

#     # n-gram
#     n = 1
#     feats = [" ".join(gram) for gram in list(ngrams(tokens, n))]
    
#     return feats

# # stem, unigram, no stopword removal
# def preprocess_6(s):
#     # tokenize
#     tokens = [token for token in word_tokenize(s)]
    
#     # remove numbers and apostrophes and convert to lowercase
#     tokens = [token.lower() for token in tokens if len(re.findall("[“”'’`\d]+", token)) == 0]

#     # assign POS
#     pos_tags = pos_tag(tokens)
 
#     # stem
#     stems = [stemmer.stem(word) for word in tokens]

#     # remove punctuation
#     tokens = [token for token in stems if (token not in string.punctuation)]

#     # n-gram
#     n = 1
#     feats = [" ".join(gram) for gram in list(ngrams(tokens, n))]
    
#     return feats

# # lemmatize, bigram, no stopword removal
# def preprocess_7(s):
#     # tokenize
#     tokens = [token for token in word_tokenize(s)]
    
#     # remove numbers and apostrophes and convert to lowercase
#     tokens = [token.lower() for token in tokens if len(re.findall("[“”'’`\d]+", token)) == 0]

#     # assign POS
#     pos_tags = pos_tag(tokens)

#     # lemmatize
#     lemmas = [lemmatizer.lemmatize(token, pos=pos_normalizer(pos)) for token, pos in pos_tags]

#     # remove punctuation
#     tokens = [token for token in lemmas if (token not in string.punctuation)]

#     # n-gram
#     n = 2
#     feats = [" ".join(gram) for gram in list(ngrams(tokens, n))]
    
#     return feats

# # stem, bigram, no stopword removal
# def preprocess_8(s):
#     # tokenize
#     tokens = [token for token in word_tokenize(s)]
    
#     # remove numbers and apostrophes and convert to lowercase
#     tokens = [token.lower() for token in tokens if len(re.findall("[“”'’`\d]+", token)) == 0]

#     # assign POS
#     pos_tags = pos_tag(tokens)
 
#     # stem
#     stems = [stemmer.stem(word) for word in tokens]

#     # remove punctuation
#     tokens = [token for token in stems if (token not in string.punctuation)]

#     # n-gram
#     n = 2
#     feats = [" ".join(gram) for gram in list(ngrams(tokens, n))]
    
#     return feats

In [36]:
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.linear_model import LogisticRegression

In [37]:
def train_model(X_train, y_train, preprocess_func=preprocess_1, model_type="naive", hyperparam=1):
    
    # preprocess the training set
    X_processed = [preprocess_func(x) for x in X_train]
    
    # initialize vectorizer
    vectorizer = TfidfVectorizer(analyzer=(lambda x: x))
        
    # fit vectorizer
    X = vectorizer.fit_transform(X_processed)
    
    # initialize the classifier
    if model_type == "naive":
        # clf = MultinomialNB(alpha=hyperparam)
        clf = ComplementNB(alpha=hyperparam)
    elif model_type == "svm":
        clf = SVC(C=hyperparam)
    elif model_type == "logistic":
        clf = LogisticRegression(C=hyperparam)
    
    # fit the model
    clf.fit(X, y_train)
    
    return clf, vectorizer

def get_preds(X, clf, vectorizer, preprocess_func):
    X_processed = [preprocess_func(x) for x in X]
    X_vectorized = vectorizer.transform(X_processed)
    return clf.predict(X_vectorized)

def compute_f1(X, y_true, clf, vectorizer, preprocess_func):
    y_pred = get_preds(X, clf, vectorizer, preprocess_func)
    return precision_recall_fscore_support(y_true, y_pred)[2]

def compute_acc(X, y_true, clf, vectorizer, preprocess_func):
    X_processed = [preprocess_func(x) for x in X]
    X_vectorized = vectorizer.transform(X_processed)
    return clf.score(X_vectorized, y_true)

In [38]:
email_col = 'Classify Email'

preprocessors = [preprocess_1, preprocess_2, preprocess_3, preprocess_4]
model_types = ["naive", "logistic"]
hyperparams = [0.5, 1, 10]
# model_types = ["logistic"]
# hyperparams = [10, 20, 50, 100]

preprocessing_dict = {
    preprocess_1 : "Lemmatize + Unigram",
    preprocess_2 : "Stem + Unigram",
    preprocess_3 : "Lemmatize + Bigram",
    preprocess_4 : "Stem + Bigram"
}

best_model = {
    "preprocessor": preprocess_1,
    "model": "naive",
    "hyperparam": 0.5,
    "valid_acc": 0
}

for preprocessor in preprocessors:
    print(f"Preprocessor: {preprocessing_dict[preprocessor]}")
    for model_type in model_types:
        print(f"Model: {model_type}")
        for hyperparam in hyperparams:
            if model_type == "naive":
                print(f"alpha = {hyperparam}")
            else:
                print(f"C = {hyperparam}")
                
            clf, vectorizer = train_model(X_train[email_col], y_train, preprocess_func=preprocessor, model_type=model_type, hyperparam=hyperparam)
            
            valid_f1 = compute_f1(X_valid[email_col], y_valid, clf, vectorizer, preprocessor)
            valid_acc = compute_acc(X_valid[email_col], y_valid, clf, vectorizer, preprocessor)

            print(f"\tValidation f1: {np.round(valid_f1[1], 3)*100}") 
            print(f"\tValidation accuracy: {np.round(valid_acc, 3)*100}") 
            
            # save model, if it's the best
            if valid_acc > best_model['valid_acc']:
                best_model["preprocessor"] = preprocessor
                best_model["model"] = model_type
                best_model["hyperparam"] = hyperparam
                best_model["valid_acc"] = valid_acc
        print()
    print()

Preprocessor: Lemmatize + Unigram
Model: naive
alpha = 0.5
	Validation f1: 37.3
	Validation accuracy: 91.7
alpha = 1
	Validation f1: 17.5
	Validation accuracy: 91.4
alpha = 10
	Validation f1: 1.0999999999999999
	Validation accuracy: 91.10000000000001

Model: logistic
C = 0.5
	Validation f1: 39.0
	Validation accuracy: 93.60000000000001
C = 1
	Validation f1: 55.1
	Validation accuracy: 94.69999999999999
C = 10


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


	Validation f1: 74.7
	Validation accuracy: 96.39999999999999


Preprocessor: Stem + Unigram
Model: naive
alpha = 0.5
	Validation f1: 40.6
	Validation accuracy: 91.7
alpha = 1
	Validation f1: 17.1
	Validation accuracy: 91.2
alpha = 10
	Validation f1: 0.0
	Validation accuracy: 91.10000000000001

Model: logistic
C = 0.5
	Validation f1: 40.699999999999996
	Validation accuracy: 93.60000000000001
C = 1
	Validation f1: 55.800000000000004
	Validation accuracy: 94.69999999999999
C = 10


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


	Validation f1: 74.9
	Validation accuracy: 96.39999999999999


Preprocessor: Lemmatize + Bigram
Model: naive
alpha = 0.5
	Validation f1: 47.0
	Validation accuracy: 91.9
alpha = 1
	Validation f1: 43.5
	Validation accuracy: 91.9
alpha = 10
	Validation f1: 41.9
	Validation accuracy: 92.0

Model: logistic
C = 0.5
	Validation f1: 43.9
	Validation accuracy: 94.0
C = 1
	Validation f1: 53.6
	Validation accuracy: 94.5
C = 10


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


	Validation f1: 67.7
	Validation accuracy: 95.7


Preprocessor: Stem + Bigram
Model: naive
alpha = 0.5
	Validation f1: 44.3
	Validation accuracy: 91.60000000000001
alpha = 1
	Validation f1: 38.7
	Validation accuracy: 91.5
alpha = 10
	Validation f1: 40.2
	Validation accuracy: 92.0

Model: logistic
C = 0.5
	Validation f1: 40.2
	Validation accuracy: 93.7
C = 1
	Validation f1: 50.9
	Validation accuracy: 94.39999999999999
C = 10
	Validation f1: 67.7
	Validation accuracy: 95.7




In [39]:
print("Best model:")
print(f"\tPreprocessing: {preprocessing_dict[best_model['preprocessor']]}")
print(f"\tModel: {best_model['model']}")
if best_model['model'] == 'naive':
    print(f"\tHyperparameter: alpha = {best_model['hyperparam']}")
else:
    print(f"\tHyperparameter: C = {best_model['hyperparam']}")

clf, vectorizer = train_model(X_train[email_col], y_train, preprocess_func=best_model["preprocessor"], model_type=best_model["model"], hyperparam=best_model["hyperparam"])
test_f1 = compute_f1(X_test[email_col], y_test, clf, vectorizer, best_model['preprocessor'])
test_acc = compute_acc(X_test[email_col], y_test, clf, vectorizer, best_model['preprocessor'])
print(f"Final test f1: {test_f1[1]}")
print(f"Final test accuracy: {test_acc}")

Best model:
	Preprocessing: Lemmatize + Unigram
	Model: logistic
	Hyperparameter: C = 10


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Final test f1: 0.7111111111111111
Final test accuracy: 0.9607250755287009


In [40]:
# preprocess_func = best_model["preprocessor"]
# model_type = best_model["model"]
# hyperparam = best_model["hyperparam"]

# # preprocess the training set
# X_processed = [preprocess_func(x) for x in X_train[email_col]]

# # initialize vectorizer
# vectorizer = TfidfVectorizer(analyzer=(lambda x: x))

# # fit vectorizer
# X = vectorizer.fit_transform(X_processed)

# # initialize the classifier
# if model_type == "naive":
#     # clf = MultinomialNB(alpha=hyperparam)
#     clf = ComplementNB(alpha=hyperparam)
# elif model_type == "svm":
#     clf = SVC(C=hyperparam)
# elif model_type == "logistic":
#     clf = LogisticRegression(C=hyperparam)

# # fit the model
# clf.fit(X, y_train)

# # evaluate on the validation set
# X_valid_processed = [preprocess_func(x) for x in X_valid[email_col]]
# X_v = vectorizer.transform(X_valid_processed)

# # evaluate on the test set
# X_test_processed = [preprocess_func(x) for x in X_test[email_col]]
# X_t = vectorizer.transform(X_test_processed)

# y_pred = clf.predict(X_t)

In [41]:
y_pred = get_preds(X_test[email_col], clf, vectorizer, best_model['preprocessor'])

In [42]:
inv_vocab = {vectorizer.vocabulary_[key]:key for key in vectorizer.vocabulary_}

In [43]:
coefs = pd.DataFrame(clf.coef_.reshape((-1,)), columns=["Parameter"])

In [44]:
coefs['Parameter Positive'] = coefs['Parameter'].apply(lambda x: 0 if x < 0 else 1)
coefs["|Parameter|"] = np.abs(coefs['Parameter'])

coefs['Word'] = [inv_vocab[key] for key in coefs.index]

In [48]:
coefs.sort_values('|Parameter|', ascending=False).head(20)

Unnamed: 0,Parameter,Parameter Positive,|Parameter|,Word
9647,17.380396,1,17.380396,regard
3612,8.750445,1,8.750445,dont
52,6.352372,1,6.352372,.....
11817,6.118327,1,6.118327,thxs
5682,6.086401,1,6.086401,however
8787,-6.073085,0,6.073085,pm
4382,5.804222,1,5.804222,expense
3589,5.524953,1,5.524953,doesnt
3935,-5.498056,0,5.498056,email
4060,5.437156,1,5.437156,ensure


In [49]:
prec, rec, f1, supp = precision_recall_fscore_support(y_test, y_pred)

In [50]:
print(f"Precision:\n\tNon-POI = {np.round(prec[0], 4)}\n\tPOI = {np.round(prec[1], 4)}")
print(f"Recall:\n\tNon-POI = {np.round(rec[0], 4)}\n\tPOI = {np.round(rec[1], 4)}")
print(f"F1:\n\tNon-POI = {np.round(f1[0], 4)}\n\tPOI = {np.round(f1[1], 4)}")
print(f"Support:\n\tNon-POI = {np.round(supp[0], 4)}\n\tPOI = {np.round(supp[1], 4)}")

Precision:
	Non-POI = 0.9642
	POI = 0.8989
Recall:
	Non-POI = 0.9941
	POI = 0.5882
F1:
	Non-POI = 0.9789
	POI = 0.7111
Support:
	Non-POI = 3038
	POI = 272


In [75]:
# Lots of false negatives -> we're missing a lot of positives

NameError: name 'df' is not defined

In [65]:
# poi_idx = y_test
# exec_idx = (X_test['Exec 300']) & (~y_test)
# norm_idx = (~X_test['Exec 300']) & (~y_test)

# y_pred_poi = get_preds(X_test.loc[poi_idx, email_col], clf, vectorizer, best_model['preprocessor'])
# y_pred_exec = get_preds(X_test.loc[exec_idx, email_col], clf, vectorizer, best_model['preprocessor'])
# y_pred_norm = get_preds(X_test.loc[norm_idx, email_col], clf, vectorizer, best_model['preprocessor'])

In [69]:
# y_true_poi = y_test[poi_idx]
# y_true_exec = y_test[exec_idx]
# y_true_norm = y_test[norm_idx]

In [72]:
# k = {
#     "POI": (y_true_poi, y_pred_poi), 
#     "Exec": (y_true_exec, y_pred_exec), 
#     "Normal": (y_true_norm, y_pred_norm) 
# }

In [74]:
# for type_ in k:
#     print(type_)
#     y_t, y_p = k[type_]
#     prec, rec, f1, supp = precision_recall_fscore_support(y_t, y_p)
    
#     print(f"Precision:\n\tNon-POI = {np.round(prec[0], 4)}\n\tPOI = {np.round(prec[1], 4)}")
#     print(f"Recall:\n\tNon-POI = {np.round(rec[0], 4)}\n\tPOI = {np.round(rec[1], 4)}")
#     print(f"F1:\n\tNon-POI = {np.round(f1[0], 4)}\n\tPOI = {np.round(f1[1], 4)}")
#     print(f"Support:\n\tNon-POI = {np.round(supp[0], 4)}\n\tPOI = {np.round(supp[1], 4)}")
#     print()

POI
Precision:
	Non-POI = 0.0
	POI = 1.0
Recall:
	Non-POI = 0.0
	POI = 0.5882
F1:
	Non-POI = 0.0
	POI = 0.7407
Support:
	Non-POI = 0
	POI = 272

Exec
Precision:
	Non-POI = 1.0
	POI = 0.0
Recall:
	Non-POI = 0.989
	POI = 0.0
F1:
	Non-POI = 0.9945
	POI = 0.0
Support:
	Non-POI = 363
	POI = 0

Normal
Precision:
	Non-POI = 1.0
	POI = 0.0
Recall:
	Non-POI = 0.9948
	POI = 0.0
F1:
	Non-POI = 0.9974
	POI = 0.0
Support:
	Non-POI = 2675
	POI = 0



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
