In [None]:
from pandas import DataFrame
from pandas import Series
from sklearn.utils import shuffle
import pandas as pd
import os
import math
from nltk import ngrams
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split

#Data are taken from http://nlp.cs.aueb.gr/software_and_datasets/Enron-Spam/index.html (pre-processed form)

HAM_DIRECTORY = "/home/manolis/Desktop/Enron/ham"
SPAM_DIRECTORY = "/home/manolis/Desktop/Enron/spam"

def read_files(directory,class_label):
    """Read every regular file in `directory` into a labelled record.

    Parameters
    ----------
    directory : str
        Folder holding one e-mail per file.
    class_label : str
        Label stored under the 'label' key of every record.

    Returns
    -------
    (data, index) : tuple of two lists
        `data` holds one ``{'text': ..., 'label': class_label}`` dict per
        file; `index` holds the matching file names, in the same order.
    """
    data = []
    index = []
    # os.listdir returns names in arbitrary order; sorting keeps the row
    # order (and therefore any seeded shuffle/split done later) reproducible.
    for filename in sorted(os.listdir(directory)):
        path = os.path.join(directory, filename)
        if not os.path.isfile(path):
            # open() would fail on sub-directories, so skip them
            continue
        # Undecodable bytes are dropped (errors='ignore'); the with-block
        # closes the file, no explicit close() is needed.
        with open(path, 'r', encoding='utf-8', errors='ignore') as email:
            lines = [line.strip('\n').strip('\t') for line in email]
        # Join with a space so the last token of one line does not fuse with
        # the first token of the next line when the text is split() later.
        data.append({'text': " ".join(lines), 'label': class_label})
        index.append(filename)
    return data,index

def count_ngrams(ngrams):
    """Tally the occurrences of every item of the iterable `ngrams`.

    Returns a `Counter` mapping each distinct item to how many times it
    was yielded.
    """
    tally = Counter()
    # Counter.update with an iterable adds one per element seen
    tally.update(ngrams)
    return tally

# Load both classes; read_files returns the records and the file names,
# which are used as the row index of each frame.
Ham_data,index_hamdata = read_files(HAM_DIRECTORY,"Ham")
Spam_data,index_spamdata = read_files(SPAM_DIRECTORY,"Spam")
# Stack ham on top of spam. pd.concat replaces DataFrame.append, which was
# removed in pandas 2.0.
data_frame = pd.concat([
    DataFrame(Ham_data,index =index_hamdata),
    DataFrame(Spam_data,index =index_spamdata),
])
# Shuffle the rows (fixed seed) so the two classes are interleaved
data_frame = shuffle(data_frame,random_state = 456987)

# 80% of the rows go to train, the remainder to test (fixed seed)
train, test = train_test_split(data_frame, train_size = 0.8,random_state=4542)


# An n-gram is kept in the vocabulary only if it occurs more than this many
# times across the training texts.
MIN_NGRAM_COUNT = 100

# The text column is selected by name: `.ix` was removed in pandas 1.0, and
# the position of 'text' among the columns depends on how pandas orders the
# record keys.
unigrams = (ngram for sent in train['text'] for ngram in ngrams(sent.split(),1))
unigrams_final = {k: v for k, v in count_ngrams(unigrams).items() if v > MIN_NGRAM_COUNT}


bigrams = (ngram for sent in train['text'] for ngram in ngrams(sent.split(),2))
bigrams_final = {k: v for k, v in count_ngrams(bigrams).items() if v > MIN_NGRAM_COUNT}


# TF-IDF features for the train and test splits.
#
# Every vocabulary n-gram becomes one column named by its tokens joined with
# a single space ("word" for a unigram, "word1 word2" for a bigram). Tokens
# come from str.split() and so never contain whitespace, which keeps unigram
# and bigram names distinct. Using one string key everywhere matters: the
# n-grams produced by ngrams(..., 1) are 1-tuples, so looking them up under
# a bare-string key never matches and leaves the unigram columns at zero.

def _feature_name(gram):
    """Return the column name of an n-gram tuple: its tokens joined by a space."""
    return " ".join(gram)


def _term_counts(text, vocabulary):
    """Count the unigrams and bigrams of `text` that are keys of `vocabulary`.

    Returns a Counter mapping feature name -> occurrences in this document.
    """
    tokens = text.split()
    counts = Counter()
    for n in (1, 2):
        for gram in ngrams(tokens, n):
            name = _feature_name(gram)
            if name in vocabulary:
                counts[name] += 1
    return counts


def _tfidf_frame(frame, idf):
    """Build the TF-IDF vectors and the labelled feature frame of `frame`.

    `frame` must have 'text' and 'label' columns; `idf` maps feature name ->
    IDF weight. Returns ``(vectors, tfidf)`` where `vectors` maps each feature
    name to an array with one value per row of `frame`, and `tfidf` is the
    DataFrame of those vectors (index 0..n-1) plus a categorical 'label'
    column.
    """
    vectors = {name: np.zeros(len(frame)) for name in idf}
    # enumerate yields the row position; `frame` itself is indexed by file
    # name, so rows are addressed by position rather than by label.
    for row, text in enumerate(frame['text']):
        for name, term_freq in _term_counts(text, idf).items():
            vectors[name][row] = term_freq * idf[name]
    tfidf = DataFrame(vectors, index=range(len(frame)))
    # Labels are attached positionally (same row order as `frame`).
    # NOTE(review): a vocabulary unigram spelled "label" would be overwritten
    # by this column.
    tfidf['label'] = frame['label'].astype(str).values
    tfidf['label'] = tfidf['label'].astype('category')
    return vectors, tfidf


# Document frequency of every vocabulary term over the training texts.
# Bigrams are inserted first, then unigrams, which fixes the column order.
IDF = {_feature_name(gram): 0 for gram in list(bigrams_final) + list(unigrams_final)}

for text in train['text']:
    # the Counter's keys are the distinct in-vocabulary terms of this document
    for name in _term_counts(text, IDF):
        IDF[name] += 1

# Smoothed IDF: log((1 + N) / (1 + df)). Since df <= N the weight is never
# negative, even for a term present in every document; negative feature
# values are rejected by MultinomialNB.
n_train_docs = len(train)
IDF = {name: math.log((1 + n_train_docs) / (1 + doc_freq))
       for name, doc_freq in IDF.items()}

# The same vocabulary and the same (train-derived) IDF weights are used for
# both splits, so both frames have identical feature columns.
tfIDFvector_train, tfidf_dataframe = _tfidf_frame(train, IDF)
tfIDFvector_test, tfidf_dataframe_test = _tfidf_frame(test, IDF)




# Split each TF-IDF frame into the feature matrix (every column except
# 'label') and the label vector. `.loc` replaces `.ix`, which was removed in
# pandas 1.0.
train_data = tfidf_dataframe.loc[:, tfidf_dataframe.columns != 'label']
train_labels = Series(tfidf_dataframe['label'])
test_data = tfidf_dataframe_test.loc[:, tfidf_dataframe_test.columns != 'label']
test_labels = Series(tfidf_dataframe_test['label'])

# Check that the train and test sets have the same columns in the same order
cols = train_data.columns.tolist()
cols2 = test_data.columns.tolist()
# Fail loudly on a mismatch instead of only displaying False
assert cols == cols2, "train and test feature columns differ"
cols ==cols2

In [None]:
#Baseline classifier
from sklearn.dummy import DummyClassifier
from sklearn.metrics import f1_score

# Majority-class baseline: always predicts the most frequent training label.
base = DummyClassifier(strategy='most_frequent').fit(train_data, train_labels)

# Predict both splits first, then report the F1 of the 'Spam' class for each.
predictions = base.predict(train_data)
predictions_test = base.predict(test_data)

score = f1_score(y_true=train_labels, y_pred=predictions, pos_label='Spam')
print("train score:",score)
score = f1_score(y_true=test_labels, y_pred=predictions_test, pos_label='Spam')
print("test score:",score)

print()
print("test data confusion matrix")
# Rows: true labels; columns: predicted labels
pd.crosstab(
    test_labels,
    predictions_test,
    rownames=['True'],
    colnames=['Predicted'],
)

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score

# Multinomial Naive Bayes trained on the full TF-IDF feature matrix.
clf = MultinomialNB().fit(train_data, train_labels)

# Training-set F1 for the 'Spam' class
predictions = clf.predict(train_data)
score = f1_score(y_true=train_labels, y_pred=predictions, pos_label='Spam')
print("train score:",score)

# Held-out F1 for the 'Spam' class
predictions_test = clf.predict(test_data)
score = f1_score(y_true=test_labels, y_pred=predictions_test, pos_label='Spam')
print("test score:",score)

print()
print("test data confusion matrix")
# Cross-tabulate true labels (rows) against predictions (columns)
pd.crosstab(
    test_labels,
    predictions_test,
    rownames=['True'],
    colnames=['Predicted'],
)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

# Logistic regression (library defaults) on the full TF-IDF feature matrix.
lr = LogisticRegression().fit(train_data, train_labels)

predictions = lr.predict(train_data)
predictions_test = lr.predict(test_data)

# F1 of the 'Spam' class on the training split, then on the test split
score = f1_score(y_true=train_labels, y_pred=predictions, pos_label='Spam')
print("train score:",score)
score = f1_score(y_true=test_labels, y_pred=predictions_test, pos_label='Spam')
print("test score:",score)

print()
print("test data confusion matrix")
# True labels down the rows, predicted labels across the columns
pd.crosstab(
    test_labels,
    predictions_test,
    rownames=['True'],
    colnames=['Predicted'],
)


In [None]:
#Reduce dimensionality in order to use svm and k-nn algorithms
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

# Feature count before selection
print(train_data.shape)
# Fit a tree ensemble and let SelectFromModel keep the features it ranks as
# important. The forest is randomized, so the seed is fixed to make the
# selected feature set (and every score computed on it) reproducible.
clf = ExtraTreesClassifier(random_state=4542)
rdim = clf.fit(train_data, train_labels)
# prefit=True: reuse the already-fitted ensemble instead of refitting it
model = SelectFromModel(rdim, prefit=True)
# Apply the same column selection to both splits
train_data_new = model.transform(train_data)
test_data_new = model.transform(test_data)
# Test-set shape before and after the reduction
print(test_data.shape)
print(test_data_new.shape)


In [None]:
from sklearn import svm

# Linear-kernel SVM trained on the reduced feature set.
clf_svm = svm.SVC(kernel='linear').fit(train_data_new, train_labels)

# Training-set F1 for the 'Spam' class
predictions = clf_svm.predict(train_data_new)
score = f1_score(y_true=train_labels, y_pred=predictions, pos_label='Spam')
print("train score:",score)

# Held-out F1 for the 'Spam' class
predictions_test = clf_svm.predict(test_data_new)
score = f1_score(y_true=test_labels, y_pred=predictions_test, pos_label='Spam')
print("test score:",score)

print()
print("test data confusion matrix")
# Rows: true labels; columns: predicted labels
pd.crosstab(
    test_labels,
    predictions_test,
    rownames=['True'],
    colnames=['Predicted'],
)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# 5-nearest-neighbours classifier trained on the reduced feature set.
neigh = KNeighborsClassifier(n_neighbors=5).fit(train_data_new,train_labels)

# Predictions for both splits
predictions = neigh.predict(train_data_new)
predictions_test = neigh.predict(test_data_new)

# F1 of the 'Spam' class on each split
score = f1_score(y_true=train_labels, y_pred=predictions, pos_label='Spam')
print("train score:",score)
score = f1_score(y_true=test_labels, y_pred=predictions_test, pos_label='Spam')
print("test score:",score)

print()
print("test data confusion matrix")
# True labels down the rows, predicted labels across the columns
pd.crosstab(
    test_labels,
    predictions_test,
    rownames=['True'],
    colnames=['Predicted'],
)


# train score: 0.918331914894
# test score: 0.888389314996

# test data confusion matrix

# Out[36]:
# Predicted 	Ham 	Spam
# True 		
# Ham 	2538 	755
# Spam 	89 	3359

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit

def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """Draw training and cross-validation score curves on a new figure.

    `estimator`, `X`, `y`, `cv`, `n_jobs` and `train_sizes` are forwarded
    unchanged to `learning_curve`. `title` is the figure title and `ylim`,
    when given, is unpacked into `plt.ylim`. Each curve is the mean score
    across the cross-validation splits, surrounded by a band of one standard
    deviation on either side. Returns the `plt` module so the caller can
    show or save the figure.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Accuracy")

    sizes, fit_scores, cv_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)

    # (mean, std, colour, legend label) per curve; axis 1 runs over the splits
    curves = [
        (np.mean(fit_scores, axis=1), np.std(fit_scores, axis=1),
         "r", "Training score"),
        (np.mean(cv_scores, axis=1), np.std(cv_scores, axis=1),
         "g", "Cross-validation score"),
    ]

    plt.grid()
    # Shaded +/- one-standard-deviation bands are drawn first ...
    for mean, std, colour, _ in curves:
        plt.fill_between(sizes, mean - std, mean + std, alpha=0.1,
                         color=colour)
    # ... then the mean curves on top of them
    for mean, _, colour, caption in curves:
        plt.plot(sizes, mean, 'o-', color=colour, label=caption)

    plt.legend(loc="best")
    return plt


# Plain arrays for scikit-learn. `as_matrix()` was removed in pandas 1.0;
# np.asarray turns the categorical label Series into an ordinary array.
X, y = train_data.values, np.asarray(train_labels)

# Both curves use the same splitter settings: 5 random 80/20 splits, seed 0
title = "Learning Curves (Naive Bayes)"
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
estimator = MultinomialNB()
plot_learning_curve(estimator, title, X, y, ylim=(0.7, 1.01), cv=cv, n_jobs=2)

title = "Learning Curves (Logistic Regression)"
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
estimator = LogisticRegression()
plot_learning_curve(estimator, title, X, y, (0.7, 1.01), cv=cv, n_jobs=2)

plt.show()

In [None]:
import numpy as np
from sklearn.preprocessing import label_binarize
from sklearn.metrics import precision_recall_curve
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import auc
import matplotlib.pyplot as plt
from sklearn.naive_bayes import MultinomialNB


# Disable warnings for large float numbers of estimated probabilities
np.seterr(all='ignore')

# Binary ground truth: 'Ham' -> 0, 'Spam' -> 1. `as_matrix()` was removed in
# pandas 1.0, so the labels are converted with np.asarray; label_binarize
# returns a column vector for two classes, which ravel() flattens to 1-D.
y_test= label_binarize(np.asarray(test_labels), classes=['Ham','Spam']).ravel()

estimators = {'Logistic Regression':LogisticRegression(), 'Naive Bayes ':MultinomialNB()}

# One precision-recall figure per estimator, trained on the reduced features
for (name,estimator) in estimators.items():

    model =estimator
    model.fit(train_data_new,train_labels)
    pred = model.predict_proba(test_data_new)
    # Column 1 of predict_proba is scored against the binarized 'Spam' class
    precision, recall, thresholds = precision_recall_curve(y_test, pred[:,1])
    area = auc(recall, precision)

    plt.plot(recall, precision, label='Precision-Recall curve')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.0])
    plt.title('Precision-Recall %s: AUC=%0.2f' % (name,area))
    plt.legend(loc="lower left")
    plt.show()