In [1]:
import os
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from nltk import word_tokenize, pos_tag
import pyphen
from sklearn.svm import SVC



# Reading and Preprocessing Data

emails = []
labels = []

#Read all the emails in the ten folders
for root, dirs, files in os.walk('lingspam_public/bare'):
    for file in files:
        if file != '.DS_Store':
            with open(os.path.join(root,file), 'r') as f:
                emails.append(f.read())
                #Save the labels (spam/not spam, or 0/1) of each email to a list
                if 'spm' in file:
                    labels.append(0)
                else:
                    labels.append(1)
                    
#Split the emails and labels into 80% training and 20% testing
emails_train, emails_test, labels_train, labels_test = train_test_split(emails, labels, test_size = 0.2, random_state = 0)

#Fit and transform the training emails and transform the testing emails using a CountVectorizer
v = CountVectorizer()
transformed_emails_train = v.fit_transform(emails_train)
transformed_emails_test = v.transform(emails_test)

# Scikit-Learn Classifiers

#Multinomial Naive Bayes
print('Multinomial Naive Bayes\n')

mnb_clf = MultinomialNB(alpha=1)
mnb_clf.fit(transformed_emails_train, labels_train)
mnb_predictions = mnb_clf.predict(transformed_emails_test)

print('precision: ', metrics.precision_score(labels_test, mnb_predictions))
print('recall: ', metrics.recall_score(labels_test, mnb_predictions))
print('f-score: ', metrics.f1_score(labels_test, mnb_predictions),'\n')

#K Neighbors Classifier
print('K Neighbors Classifier\n')

kn_clf = KNeighborsClassifier()
kn_clf.fit(transformed_emails_train, labels_train)
kn_predictions = kn_clf.predict(transformed_emails_test)

print('precision: ', metrics.precision_score(labels_test, kn_predictions))
print('recall: ', metrics.recall_score(labels_test, kn_predictions))
print('f-score: ', metrics.f1_score(labels_test, kn_predictions),'\n')

#Random Forest Classifier
print('Random Forest Classifier\n')

rf_clf = RandomForestClassifier(random_state=0)
rf_clf.fit(transformed_emails_train, labels_train)
rf_predictions = rf_clf.predict(transformed_emails_test)

print('precision: ', metrics.precision_score(labels_test, rf_predictions))
print('recall: ', metrics.recall_score(labels_test, rf_predictions))
print('f-score: ', metrics.f1_score(labels_test, rf_predictions),'\n')

# Classifying using Readability Features

#A list for every feature, where every element is the feature value of a given email

#The number of sentences in an email

f1 = [len(email.split('.')) for email in emails]

#The number of verbs in an email

f2 = []

for email in emails:
    tagged_email = pos_tag(word_tokenize(email))
    count_verb = 0
    for word_tag_tuple in tagged_email:
        if word_tag_tuple[1] == 'VB':
            count_verb+=1
    f2.append(count_verb)

#The number of words containing both numeric and alphabetical characters

def contains_num_alpha(word):
    return any(char.isdigit() for char in word) and any(char.isalpha() for char in word)

f3 = []

for email in emails:
    count_contains_num_alpha = 0
    for word in word_tokenize(email):
        if contains_num_alpha(word):
            count_contains_num_alpha+=1
    f3.append(count_contains_num_alpha)

#The number of words in an email that are found in the spam list

with open('spam-term-list', 'r') as f:
    spam_term_list = f.read().split('\n')

f4 = []

for email in emails:
    count_spam = 0
    for word in word_tokenize(email):
        if word in spam_term_list:
            count_spam+=1
    f4.append(count_spam)

#init pyphen dic
    
dic = pyphen.Pyphen(lang='en_GB')

#The number of words in an email that have more than 3 syllables

f5 = []

for email in emails:
    more_than_three_sylls_count = 0
    for word in word_tokenize(email):
        if len(dic.inserted(word).split('-')) > 3:
            more_than_three_sylls_count+=1
    f5.append(more_than_three_sylls_count)
    
#The average number of syllables of words in an email

f6 = []

for email in emails:
    all_sylls = 0
    for word in word_tokenize(email):
        all_sylls+=len(dic.inserted(word).split('-'))
    f6.append(all_sylls/len(word_tokenize(email)))
    
#A feature matrix (list of lists), where every row corresponds to an email,
# and every column corresponds to a feature value of this email
    
feat_matrix = [[f1[i], f2[i], f3[i], f4[i], f5[i], f6[i]] for i in range(len(emails))]

#Feeding the feature matrix and the labels to the classifier

new_labels = []
for label in labels:
    new_labels.append(label)

emails_train_new, emails_test_new, labels_train_new, labels_test_new = train_test_split(feat_matrix, new_labels, test_size = 0.2, random_state = 0)

clf = SVC()

clf.fit(emails_train_new, labels_train_new)

new_predictions = clf.predict(emails_test_new)

print('Classifying using Readability Features\n')

print('precision: ', metrics.precision_score(labels_test_new, new_predictions))
print('recall: ', metrics.recall_score(labels_test_new, new_predictions))
print('f-score: ', metrics.f1_score(labels_test_new, new_predictions),'\n')


Multinomial Naive Bayes

precision:  0.997885835095
recall:  0.995780590717
f-score:  0.996832101373 

K Neighbors Classifier

precision:  0.969325153374
recall:  1.0
f-score:  0.984423676012 

Random Forest Classifier

precision:  0.977272727273
recall:  0.997890295359
f-score:  0.987473903967 

Classifying using Readability Features

precision:  0.826315789474
recall:  0.993670886076
f-score:  0.902298850575 

