In [1]:
#!pip install -U scikit-learn
#!pip install pandas

In [1]:
import os
import glob
import numpy as np

def read_data(directory,file_extension,class_label):
    '''
    Recursively read the text files under a root directory whose names end
    with a given extension, and tag each one with a class label.

    Parameters
    ----------
    directory : str
        Root folder to search. Sub-folders are searched at any depth.
    file_extension : str
        Suffix a file name must end with, e.g. ".ham.txt".
    class_label : str
        Label stored alongside every file that is read.

    Returns
    -------
    data : list of dict
        One {'text': ..., 'label': class_label} dictionary per file; newlines
        and tabs in the text are replaced by single spaces.
    index : list of str
        The path of each file, in the same order as `data`.
    '''
    data = []
    index = []
    # '**' together with recursive=True matches any number of sub-folders.
    # glob returns paths in an OS-dependent order, so sort them to make the
    # result (and any seeded shuffle applied later) reproducible.
    for filename in sorted(glob.glob(directory+'/**/*'+file_extension, recursive=True)):
        # Undecodable bytes are dropped rather than aborting the whole load.
        # The with-block closes the file, so no explicit close() is needed.
        with open(filename, 'r', encoding='utf-8', errors='ignore') as email:
            email_content = email.read().replace('\n', ' ').replace('\t', ' ')
        data.append({'text': email_content, 'label': class_label})
        index.append(filename)

    return data,index

In [2]:
import pandas as pd
from sklearn.utils import shuffle

'''
Read spam and ham emails, convert to pandas,
concatenate and random shuffle
--Enron dataset is downloaded from http://nlp.cs.aueb.gr/software_and_datasets/Enron-Spam/index.html (pre-processed form)
'''

# Build the dataset path with os.path.join so the notebook is not tied to the
# Windows path separator.
Enron_dataset_dir = os.path.join(os.getcwd(), "Enron_dataset")
Ham_data,index_hamdata = read_data(Enron_dataset_dir,".ham.txt","Ham")           #List of ham emails and their paths
Spam_data,index_spamdata = read_data(Enron_dataset_dir,".spam.txt","Spam")       #List of spam emails and their paths

# Fail here with a clear message instead of producing an empty frame that only
# breaks several cells further down.
if not Ham_data or not Spam_data:
    raise FileNotFoundError(
        "No ham and/or spam emails found under %r" % Enron_dataset_dir)

# One frame indexed by file path. DataFrame.append was removed in pandas 2.0;
# pd.concat is the supported way to stack the two frames.
data = pd.concat([
    pd.DataFrame(Ham_data,index=index_hamdata),
    pd.DataFrame(Spam_data,index=index_spamdata),
])
# Shuffle with a fixed seed so ham and spam rows are interleaved reproducibly.
data = shuffle(data,random_state = 456987)
# Show up to 2000 characters per cell so the email text is not cut short.
pd.set_option("display.max_colwidth",2000)
data

Unnamed: 0,label,text
C:\Users\Infernal\Msc Data Science\Text Analytics\Demos\Lab 2\Enron_dataset\enron4\spam\1306.2004-05-29.GP.spam.txt,Spam,Subject: re : are you still online ? click here to be removed
C:\Users\Infernal\Msc Data Science\Text Analytics\Demos\Lab 2\Enron_dataset\enron4\spam\0347.2004-02-05.GP.spam.txt,Spam,"Subject: hi if you are paying more than 3 . 6 % on your mortgage , we can save you money ! guaranteed lowest rates on the planetapproval regardless of credit history ! start saving todayshow me the lowest rates to stop receiving offers here"
C:\Users\Infernal\Msc Data Science\Text Analytics\Demos\Lab 2\Enron_dataset\enron6\spam\2646.2005-01-11.BG.spam.txt,Spam,"Subject: urgent hi , i hereby wish to inform you that i am interested to purchase your , pci cardteac cd - w 54 e cd - r / rw burner , sony sdt - 5000 dds 2 ( 4 / 8 gb ) dat drive , sony 15 cl dds dat cleaning tape . what is your best offer ? are you the real owner ? what is the condition ? payment will be by money order . dont worried yourself about the pick up . i will take care of that when payment is done . if its still available for sale , provide the details below with which the payment will be sent to . . . offeri look forward to read from you soonest . cheers _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ do you yahoo ! ? tired of spam ? yahoo ! mail has the best spam protection around http : / / mail . yahoo . com"
C:\Users\Infernal\Msc Data Science\Text Analytics\Demos\Lab 2\Enron_dataset\enron2\ham\3439.2000-11-27.kaminski.ham.txt,Ham,"Subject: transmission roundtable meeting the meeting will be held on december 8 , 2000 from 11 : 30 am to 1 : 00 pm in conference room eb 19 cl . box lunches will be served . your choices are listed below : salads : roasted chicken cobb salad , grilled chicken caesar salad , classic chef salad sandwiches : turkey , roast beef , ham , chicken salad , tuna salad or club sandwich . served on homemade white or wheat bread please email your lunch choice to me by monday , december 4 , 2000 . thanks and regards , anita dupont"
C:\Users\Infernal\Msc Data Science\Text Analytics\Demos\Lab 2\Enron_dataset\enron3\ham\4567.2001-12-13.kitchen.ham.txt,Ham,"Subject: ask jeeves louise , for the record , ask jeeves was a better investment than enron ! - dan"
C:\Users\Infernal\Msc Data Science\Text Analytics\Demos\Lab 2\Enron_dataset\enron6\ham\3534.2001-05-02.lokay.ham.txt,Ham,"Subject: fw : capacity at rio puerco fyi - rio puerco receipt point capacity increase ! - - - - - original message - - - - - from : "" whittaker , barbara "" @ enron [ mailto : imceanotes - + 22 whittaker + 2 c + 20 barbara + 22 + 20 + 3 cbwhitta + 40 pnm + 2 ecom + 3 e + 40 enron @ enron . com ] sent : wednesday , may 02 , 2001 11 : 36 am to : lorraine lindberg ( e - mail ) cc : fenton , mark ; cross , jimmy ; thompson , chuck ( gsc ) subject : capacity at rio puerco lorraine : pnm has installed its redonda # 2 compressor to boost the capacity at the transwestern / pnm rio puerco interconnect . please revise the operational capacity indicated on transwestern ' s ebb for deliveries from pnm into transwestern from 80 , 000 mmbtus to 110 , 000 mmbtus . thank you for your assistance . barbara whittaker"
C:\Users\Infernal\Msc Data Science\Text Analytics\Demos\Lab 2\Enron_dataset\enron2\ham\0779.2000-03-19.kaminski.ham.txt,Ham,"Subject: re : presentation will do - - thank you very much . dawn from : dawn scovill , event coordinator designs event consulting dawn @ perfectmeeting . com - - - - - original message - - - - - from : vince j kaminski to : cc : ; vince j kaminski ; sent : friday , march 17 , 2000 5 : 38 pm subject : re : presentation > > > david , > > i am leaving for vacation this weekend and i haven ' t received the copy of your > presentation yet . > the window during which i could make changes to my presentation is closing very > fast . let ' s > do the following : i shall keep my presentation as is ( this means that i shall > use the copy of my > presentation i sent to dawn scovill and you this week ) . if there is an overlap > between our presentations , so be it . > > dawn , please use the copy of my presentation i sent you earlier this week . > > vince > > > - - - - - - - - - - - - - - - - - - - - - - forwarded by vince j kaminski / hou / ect on 03 / 17 / 2000 04 : 33 > pm - - - - - - - - - - - - - - - - - - - - - - - - - - - > > > vince j kaminski > 03 / 16 / 2000 08 : 02 am > > to : "" dawn scovill "" @ enron > cc : sobotkad @ kochind . com , vince j kaminski / hou / ect @ ect > subject : re : presentation ( document link : vince j kaminski ) > > dawn , > > i met david sobotka from koch this morning and we talked about coordinating our > presentations . > this means there will be changes intended to avoid overlaps . sorry for that . the > portions of my presentation > will survive ( those about valuation paradigms ) and i shall add a few more pages > on accounting treatment of weather derivatives > plus more specific examples . david will cover primarily market evolution + plus > examples of some > standard structures , and we shall both give more interesting examples of > specific deals executed by our companies . 
> > i shall send you an updated version of my part next week . let me know what the > deadline is . > > vince > > > > "" dawn scovill "" on 03 / 14 / 2000 07 ..."
C:\Users\Infernal\Msc Data Science\Text Analytics\Demos\Lab 2\Enron_dataset\enron6\spam\2574.2005-01-07.BG.spam.txt,Spam,Subject: lonely girls who are home for new year 100 % free adult personals you need to get laid ! ! ! ! ! ! ! ! ! ! ! we can help . http : / / dating . viewing . at real profiles of people in your area that just want sex ! 100 % free ! ! ! ! ! http : / / dating . viewing . at - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - remove your self
C:\Users\Infernal\Msc Data Science\Text Analytics\Demos\Lab 2\Enron_dataset\enron6\ham\0381.2000-07-14.lokay.ham.txt,Ham,Subject: anadarko and union pacific resources merged company will be named anadarko petroleum shareholders of anadarko petroleum and union pacific resources voted to approve a $ 5 . 4 billion merger of the two companies yesterday . the merger calls for upr shareholders to receive 0 . 455 shares of anadarko common stock for each of their upr shares . anadarko shareholders also voted to increase the size of the company ' s board to 15 members from nine . the combined company will be named anadarko petroleum .
C:\Users\Infernal\Msc Data Science\Text Analytics\Demos\Lab 2\Enron_dataset\enron5\spam\0498.2002-07-16.SA_and_HP.spam.txt,Spam,"Subject: ~ ~ 80 to 95 % below wholesale 28534 buy products 70 to 80 % below wholesale new , and in quantities you need a single unit , a pallet , or a truckload 1 . do you consider yourself financially secure ? 2 . are you still looking for a very real business that can provide you and your family with the lifestyle you desire and deserve ? or , just a second income in your spare time ? 3 . would you invest $ 66 . 50 ( a 25 % discount off of our regular price of $ 99 . 95 during this limited time promotion . ) in a business that could make you financially secure ? if so , read on : this is not a get rich quick scheme , but , it is a way for you to get into a business with a minimal investment and , may be your first step towards a rewarding first , or second income . for the longest time , only those , who were able to make large investments were able to take advantage of this type of a business . we have made it possible for everyone to do it , and at a price everyone can afford . corporate america has conditioned us to believe that security comes from employment . yet layoffs are hitting an all time high , major corporations are failing and we hope to never become the victims of this downsizing , career burn out , illness or injury . there was a time , when finding another job was not a problem , but today , the frightening reality for a lot of people is that the plastic in their wallets determines the quality of their lives . the hard facts show that our economy has moved from the industrial age into the information , service and retail age . no longer can you depend on the corporation to provide your family with the security you seek . if you are tired of living from paycheck to paycheck , and are willing to work a few hours per week , than this may be for you . 
please read further : we will show you how you can buy , new , not out of date , products for pennies on the wholesale dollar . we will not just send a list , or catalog of where you can buy , but ac..."


In [None]:
# Only the last expression of a cell is displayed, so the shape has to be
# printed explicitly; otherwise its value is silently discarded.
print("shape:", data.shape)
# Number of emails per class.
data['label'].value_counts()

In [None]:
'''
Split dataset into train (70%) & test (30%)
Encode the string labels as integers (Ham -> 0, Spam -> 1)
'''

import itertools
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

# 70% train / 30% test, stratified on the label column so both splits keep
# the same Ham/Spam proportions. The seed is fixed for reproducibility.
train, test = train_test_split(data, test_size = 0.3,random_state=1596, stratify=data['label'])

# Plain Python lists of email strings, as expected by the text vectorizer.
X_train = train['text'].tolist()
X_test = test['text'].tolist()

# LabelEncoder maps the sorted class names to integers (Ham -> 0, Spam -> 1).
# It is fitted on the training labels only and then reused for the test labels.
le = preprocessing.LabelEncoder()
y_train = le.fit_transform(train['label'].tolist())
y_test = le.transform(test['label'].tolist())

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pprint

# Unigram and bigram tf-idf features.
# max_features keeps the 5000 n-grams with the highest term frequency across
# the training corpus (the ranking is by raw frequency, not by summed tf-idf).
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features = 5000,sublinear_tf=True, stop_words='english', lowercase=True)

# Rows are emails, columns are terms. Every email keeps its row: the result is
# a sparse matrix, so an email containing none of the 5000 terms is simply an
# all-zero row that stores no values.
# The vocabulary and idf weights are learned from the training split only.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old name on pre-1.0 releases.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
# Convert to plain str so the printout does not show numpy scalar reprs.
pprint.pprint([str(term) for term in feature_names])

In [22]:
from sklearn.feature_selection import mutual_info_classif, SelectKBest

# Keep the 4500 tf-idf columns that share the most mutual information with
# the class label. The selector is fitted on the training split only.
# NOTE: the filtered matrices are the lower-case x_*_tfidf names; the
# upper-case X_*_tfidf names still hold the unfiltered matrices.
KBest = SelectKBest(score_func=mutual_info_classif, k=4500)
KBest.fit(X_train_tfidf, y_train)
x_train_tfidf, x_test_tfidf = KBest.transform(X_train_tfidf), KBest.transform(X_test_tfidf)

In [23]:
#Baseline classifier
from sklearn.dummy import DummyClassifier
from sklearn.metrics import f1_score

# Majority-class baseline: it ignores the features and always predicts the
# most frequent training label, so every later model should beat its F1.
base = DummyClassifier(strategy='most_frequent').fit(x_train_tfidf, y_train)

predictions = base.predict(x_train_tfidf)
score = f1_score(y_true=y_train, y_pred=predictions)
print("train f1-score:",score)

predictions_test = base.predict(x_test_tfidf)
score = f1_score(y_true=y_test, y_pred=predictions_test)
print("test f1-score:",score, end="\n\n")
print("test data confusion matrix")

# Rows are the true labels, columns the predicted labels.
y_true, y_pred = pd.Series(y_test, name='True'), pd.Series(predictions_test, name='Predicted')
pd.crosstab(y_true, y_pred)
# The train and test F1 scores are almost identical because the baseline
# predicts a single class and both splits have the same class proportions.

train f1-score: 0.674882793857556
test f1-score: 0.6748329621380846

test data confusion matrix


Predicted,1
True,Unnamed: 1_level_1
0,4964
1,5151


In [24]:
#Naive-Bayes classifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score

# Multinomial Naive Bayes trained on the SelectKBest-filtered tf-idf features.
clf = MultinomialNB().fit(x_train_tfidf, y_train)

predictions = clf.predict(x_train_tfidf)
score = f1_score(y_true=y_train, y_pred=predictions)
print("train f1-score:",score)

predictions_test = clf.predict(x_test_tfidf)
score = f1_score(y_true=y_test, y_pred=predictions_test)
print("test f1-score:",score, end="\n\n")
print("test data confusion matrix")

# Rows are the true labels, columns the predicted labels.
y_true, y_pred = pd.Series(y_test, name='True'), pd.Series(predictions_test, name='Predicted')
pd.crosstab(y_true, y_pred)

train f1-score: 0.9835631184531943
test f1-score: 0.9832046332046332

test data confusion matrix


Predicted,0,1
True,Unnamed: 1_level_1,Unnamed: 2_level_1
0,4848,116
1,58,5093


In [25]:
#Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

# Logistic regression (liblinear solver) trained on the SelectKBest-filtered
# tf-idf features.
clf = LogisticRegression(solver="liblinear").fit(x_train_tfidf, y_train)

predictions = clf.predict(x_train_tfidf)
score = f1_score(y_true=y_train, y_pred=predictions)
print("train f1-score:",score)

predictions_test = clf.predict(x_test_tfidf)
score = f1_score(y_true=y_test, y_pred=predictions_test)
print("test f1-score:",score, end="\n\n")
print("test data confusion matrix")

# Rows are the true labels, columns the predicted labels.
y_true, y_pred = pd.Series(y_test, name='True'), pd.Series(predictions_test, name='Predicted')
pd.crosstab(y_true, y_pred)

train f1-score: 0.9915667631252584
test f1-score: 0.9873880812554154

test data confusion matrix


Predicted,0,1
True,Unnamed: 1_level_1,Unnamed: 2_level_1
0,4856,108
1,23,5128


In [26]:
# Reduce dimensionality using svd
from sklearn.decomposition import TruncatedSVD

# Project the SelectKBest-filtered tf-idf features onto 100 SVD components.
# random_state is fixed so repeated runs use the same seed.
svd = TruncatedSVD(n_components=100, random_state=4321)
# The projection is fitted on the training split only ...
x_train_svd = svd.fit_transform(x_train_tfidf)
# ... and then applied unchanged to the test split.
x_test_svd = svd.transform(x_test_tfidf)

In [27]:
from sklearn import svm
from sklearn.metrics import f1_score


# Linear-kernel SVM trained on the SelectKBest-filtered tf-idf features
# (not on the SVD projection).
clf = svm.SVC(kernel='linear').fit(x_train_tfidf, y_train)

predictions = clf.predict(x_train_tfidf)
score = f1_score(y_true=y_train, y_pred=predictions)
print("train f1-score:",score)

predictions_test = clf.predict(x_test_tfidf)
score = f1_score(y_true=y_test, y_pred=predictions_test)
print("test f1-score:",score, end="\n\n")
print("test data confusion matrix")

# Rows are the true labels, columns the predicted labels.
y_true, y_pred = pd.Series(y_test, name='True'), pd.Series(predictions_test, name='Predicted')
pd.crosstab(y_true, y_pred)

train f1-score: 0.9952319747916579
test f1-score: 0.9900434992750121

test data confusion matrix


Predicted,0,1
True,Unnamed: 1_level_1,Unnamed: 2_level_1
0,4891,73
1,30,5121


In [28]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score

# 5-nearest-neighbours classifier trained on the 100-component SVD projection.
clf = KNeighborsClassifier(n_neighbors=5).fit(x_train_svd, y_train)

predictions = clf.predict(x_train_svd)
score = f1_score(y_true=y_train, y_pred=predictions)
print("train f1-score:",score)

predictions_test = clf.predict(x_test_svd)
score = f1_score(y_true=y_test, y_pred=predictions_test)
print("test f1-score:",score, end="\n\n")
print("test data confusion matrix")

# Rows are the true labels, columns the predicted labels.
y_true, y_pred = pd.Series(y_test, name='True'), pd.Series(predictions_test, name='Predicted')
pd.crosstab(y_true, y_pred)

train f1-score: 0.9836160052692244
test f1-score: 0.9774839513270097

test data confusion matrix


Predicted,0,1
True,Unnamed: 1_level_1,Unnamed: 2_level_1
0,4779,185
1,50,5101


In [None]:
#MLP Classifier with 2 hidden layers
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import f1_score

# Multi-layer perceptron with two hidden layers of 200 units each, trained
# with Adam and early stopping; the seed is fixed.
# NOTE(review): this cell uses the upper-case X_*_tfidf matrices (all 5000
# tf-idf columns), not the SelectKBest-filtered x_*_tfidf used by the
# classifiers above - confirm that this is intentional.
clf = MLPClassifier(
    solver='adam',
    hidden_layer_sizes=(200,200),
    early_stopping=True,
    random_state=123456,
).fit(X_train_tfidf, y_train)

predictions = clf.predict(X_train_tfidf)
score = f1_score(y_true=y_train, y_pred=predictions)
print("train f1-score:",score)

predictions_test = clf.predict(X_test_tfidf)
score = f1_score(y_true=y_test, y_pred=predictions_test)
print("test f1-score:",score, end="\n\n")
print("test data confusion matrix")

# Rows are the true labels, columns the predicted labels.
y_true, y_pred = pd.Series(y_test, name='True'), pd.Series(predictions_test, name='Predicted')
pd.crosstab(y_true, y_pred)

In [None]:
#Hyper-parameter tuning via grid search
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score


# Grid of logistic-regression settings: 2 solvers x 4 values of C.
hyper_parameter_grid = [{'solver': ['newton-cg','liblinear'],'C': [1, 10, 100, 1000]}]
# 5-fold cross-validated search on the training split, scored by F1.
# NOTE(review): the search runs on the upper-case X_train_tfidf (all 5000
# tf-idf columns), not on the SelectKBest-filtered x_train_tfidf - confirm.
clf = GridSearchCV(LogisticRegression(), hyper_parameter_grid, cv=5,scoring='f1',n_jobs=1)
clf.fit(X_train_tfidf, y_train)

print("Best parameters set found on development set:")
print()
print(clf.best_params_)
print()
print("Grid scores on development set:")
print()
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    # The reported spread is a symmetric band of two standard deviations
    # around the mean, so it is labelled "+/-" rather than "+".
    print("%0.3f (+/-%0.03f) for %r"% (mean, std * 2, params))
    print()

print()
print("Test data classification report:")
print()
# clf.predict delegates to the best estimator found by the search.
y_true, y_pred = y_test, clf.predict(X_test_tfidf)
print(classification_report(y_true, y_pred))
score = f1_score(y_true, y_pred)
print("test f1-score:",score)

In [None]:
#Learning curves with cross-validation
import numpy as np
import matplotlib.pyplot as plt
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit

def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Draw the F1 learning curve of an estimator on a new matplotlib figure.

    Parameters
    ----------
    estimator : the model passed on to sklearn's learning_curve.
    title : str, figure title.
    X, y : training features and labels passed on to learning_curve.
    ylim : optional (ymin, ymax) pair for the y axis.
    cv : cross-validation strategy passed on to learning_curve.
    n_jobs : number of parallel jobs passed on to learning_curve.
    train_sizes : training-set sizes to evaluate, passed on to learning_curve.

    Returns
    -------
    The matplotlib.pyplot module, so the caller can call show() on it.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("F1-score")

    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs,scoring='f1', train_sizes=train_sizes)

    # One (mean, std, colour, legend label) entry per curve; the statistics
    # are taken across the cross-validation folds (axis 1).
    curves = [
        (np.mean(fold_scores, axis=1), np.std(fold_scores, axis=1), colour, caption)
        for fold_scores, colour, caption in (
            (train_scores, "b", "Training score"),
            (test_scores, "orange", "Cross-validation score"),
        )
    ]

    plt.grid()
    # Shaded one-standard-deviation bands first, then the mean lines on top.
    for centre, spread, colour, _ in curves:
        plt.fill_between(train_sizes, centre - spread, centre + spread,
                         alpha=0.1, color=colour)
    for centre, _, colour, caption in curves:
        plt.plot(train_sizes, centre, 'o-', color=colour, label=caption)

    plt.legend(loc="lower right")
    return plt


# Learning curves are computed on the training split only (upper-case
# X_train_tfidf, i.e. all 5000 tf-idf columns).
X, y = X_train_tfidf,y_train

# One figure per model; each uses 5 random 80/20 splits with a fixed seed.
for title, estimator in (
    ("Learning Curves (Naive Bayes)", MultinomialNB()),
    ("Learning Curves (Logistic Regression)", LogisticRegression()),
):
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
    plot_learning_curve(estimator, title, X, y, ylim=(0.7, 1.01), cv=cv, n_jobs=2)
    plt.show()

In [None]:
#Precision-Recall curves
import numpy as np
from sklearn.metrics import precision_recall_curve
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import auc
import matplotlib.pyplot as plt
from sklearn.naive_bayes import MultinomialNB


# Silence numpy floating-point warnings triggered by extreme estimated
# probabilities. This changes numpy's error handling for the whole session.
np.seterr(all='ignore')
estimators = {
    'Logistic Regression': LogisticRegression(solver='liblinear'),
    'Naive Bayes ': MultinomialNB(),
}

# One precision-recall figure per model, fitted on the full training split
# and evaluated on the held-out test split.
for name, estimator in estimators.items():

    model = estimator.fit(X_train_tfidf, y_train)
    pred = model.predict_proba(X_test_tfidf)
    # Column 1 holds the estimated probability of class 1.
    precision, recall, thresholds = precision_recall_curve(y_test, pred[:, 1])
    # Area under the precision-recall curve, shown in the figure title.
    area = auc(recall, precision)

    plt.plot(recall, precision, label='Precision-Recall curve')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.0])
    plt.title('Precision-Recall %s: AUC=%0.2f' % (name,area))
    plt.legend(loc="lower left")
    plt.show()