## Load Required Packages

In [1]:
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk.corpus import stopwords

import pandas as pd
import numpy as np

from os import listdir
from os.path import isfile, join
import re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics


# Module-level NLTK lemmatizer and Porter stemmer instances, created once
# here so later cells can reuse them instead of rebuilding them per call.
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer() 

The data used here is from [Apache SpamAssassin](https://spamassassin.apache.org/old/publiccorpus/) and contains a few thousand emails, each classified as "spam" or "ham".

The uncompressed data is a bit large for github, so we've preprocessed the data and stored it in a single CSV file [here](https://raw.githubusercontent.com/plb2018/data620/master/Week10/spam_n_ham.csv) to make life easier.  The compressed data has been archived [here](https://github.com/plb2018/data620/tree/master/Week10/raw_data) for reproducibility.

In [2]:

# Sub-folders of the raw SpamAssassin corpus; the trailing slash is kept
# because the folder name is stored verbatim in the 'source' column.
RAW_DATA_FOLDERS = ('easy_ham/', 'spam/', 'easy_ham_2/', 'hard_ham/', 'spam_2/')
# Folders whose messages are labelled as spam (isSpam == 1).
SPAM_FOLDERS = ('spam/', 'spam_2/')
# Location of the preprocessed, pipe-separated CSV.
DATA_URL = 'https://raw.githubusercontent.com/plb2018/data620/master/Week10/spam_n_ham.csv'


def build_corpus(raw_dir, folders=RAW_DATA_FOLDERS):
    """Assemble the raw e-mail files under ``raw_dir`` into one DataFrame.

    This is the preprocessing step that produced ``spam_n_ham.csv``. It is
    defined but deliberately not called here, to save on run-time; the
    notebook loads the preprocessed CSV instead.

    Parameters
    ----------
    raw_dir : str
        Directory that contains the corpus sub-folders.
    folders : iterable of str
        Sub-folder names to read. Each name is stored as-is in 'source'.

    Returns
    -------
    pandas.DataFrame
        Columns 'body' (file content read as raw bytes), 'source' (folder
        name) and 'isSpam' (1 for folders in SPAM_FOLDERS, otherwise 0).
    """
    records = []
    for folder in folders:
        folder_path = join(raw_dir, folder)
        for file_name in listdir(folder_path):
            file_path = join(folder_path, file_name)
            if not isfile(file_path):
                continue
            # Context manager guarantees the handle is closed even on error.
            with open(file_path, 'rb') as handle:
                body = handle.read()
            records.append({'body': body,
                            'source': folder,
                            'isSpam': int(folder in SPAM_FOLDERS)})
    # Build the frame once from all records instead of appending row by row.
    return pd.DataFrame(records, columns=['body', 'source', 'isSpam'])


# To regenerate the CSV from a local copy of the raw corpus:
# build_corpus('raw_data/').to_csv("spam_n_ham.csv", sep='|')

#load the preprocessed data from github

df = pd.read_csv(DATA_URL, sep='|', index_col=0)


In [3]:
# Show (rows, columns) of the loaded frame. The bare `df` expression that
# used to precede this line was never displayed, so it has been dropped.
df.shape


(9348, 3)

In [4]:
 def cleanData(data, stop_words=None):
     """Normalise one raw message into a space-separated string of tokens.

     Steps, in order: convert to ``str`` and lower-case, drop the literal
     ``{html}`` marker, strip ``<...>`` tags, strip anything starting with
     ``http`` up to the next whitespace, strip digit runs, split into
     word-character tokens, then discard tokens of two characters or fewer
     and tokens found in ``stop_words``.

     Parameters
     ----------
     data : object
         The raw message; it is passed through ``str()`` first, so
         non-string values are cleaned via their string form.
     stop_words : container of str, optional
         Tokens to discard (anything supporting ``in``). Defaults to NLTK's
         English stop-word list, loaded once and cached on the function.

     Returns
     -------
     str
         The surviving tokens joined by single spaces.

     Notes
     -----
     Earlier versions also stemmed and lemmatised the tokens but discarded
     both results and returned the unstemmed tokens. That unused work has
     been removed; the returned value is unchanged.
     """
     if stop_words is None:
         # Reading the NLTK corpus for every token was the dominant cost;
         # build the set once and reuse it across calls.
         stop_words = getattr(cleanData, "_english_stop_words", None)
         if stop_words is None:
             stop_words = frozenset(stopwords.words('english'))
             cleanData._english_stop_words = stop_words

     text = str(data).lower()
     text = text.replace('{html}', "")
     text = re.sub('<.*?>', '', text)      # strip markup tags (non-greedy)
     text = re.sub(r'http\S+', '', text)   # strip URLs
     text = re.sub('[0-9]+', '', text)     # strip digit runs
     # Same tokens as nltk's RegexpTokenizer(r'\w+'), which calls findall.
     tokens = re.findall(r'\w+', text)
     kept = [w for w in tokens if len(w) > 2 and w not in stop_words]
     return " ".join(kept)



In [5]:
# Replace each raw message body with its cleaned, tokenised form.
df['body'] = df['body'].map(cleanData)

In [6]:
# Preview the first rows of the frame after the 'body' column was cleaned.
df.head(20)

Unnamed: 0,body,source,isSpam
0,exmh workers admin redhat com thu aug nreturn ...,easy_ham/,0
1,steve_burt cursor system com thu aug nreturn p...,easy_ham/,0
2,timc ubh com thu aug nreturn path ndelivered z...,easy_ham/,0
3,irregulars admin thu aug nreturn path ndeliver...,easy_ham/,0
4,stewart smith thu aug nreturn path ndelivered ...,easy_ham/,0
5,martin srv ems thu aug nreturn path ndelivered...,easy_ham/,0
6,martin srv ems thu aug nreturn path ndelivered...,easy_ham/,0
7,stewart smith thu aug nreturn path ndelivered ...,easy_ham/,0
8,martin srv ems thu aug nreturn path ndelivered...,easy_ham/,0
9,exmh workers admin redhat com thu aug nreturn ...,easy_ham/,0


In [17]:
# Bag-of-words features: unigram counts over the cleaned message bodies.
c = CountVectorizer(lowercase=True, stop_words='english', ngram_range=(1, 1))
counts = c.fit_transform(df['body'])

# NOTE(review): test_size=0.9 holds out 90% of the rows for testing, so the
# model is trained on only 10% of the data - confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    counts,
    df['isSpam'],
    test_size=0.9,
    random_state=42,
)

# Fit multinomial Naive Bayes on the training split and score the held-out rows.
clf = MultinomialNB()
clf.fit(X_train, y_train)
predicted = clf.predict(X_test)

print("MultinomialNB Accuracy:", metrics.accuracy_score(y_test, predicted))

MultinomialNB Accuracy: 0.8538150701212265


In [19]:
# Same inputs, test_size and random_state as the split in the count-vectoriser
# cell above, so this reproduces that split exactly.
X_train, X_test, y_train, y_test = train_test_split(
    counts,
    df['isSpam'],
    test_size=0.9,
    random_state=42,
)


In [20]:
# Fit multinomial Naive Bayes on the current training split, predict the
# held-out rows and report plain accuracy.
clf = MultinomialNB()
clf.fit(X_train, y_train)
predicted = clf.predict(X_test)

print("MultinomialNB Accuracy:", metrics.accuracy_score(y_test, predicted))

MultinomialNB Accuracy: 0.8538150701212265


In [12]:

# TF-IDF features over the cleaned message bodies (default vectoriser settings).
tf = TfidfVectorizer()
text_tf = tf.fit_transform(df['body'])

# NOTE(review): this split uses test_size=0.3 / random_state=123, whereas the
# count-based model above uses test_size=0.9 / random_state=42, so the two
# accuracy figures are not measured on the same held-out rows.
X_train, X_test, y_train, y_test = train_test_split(
    text_tf,
    df['isSpam'],
    test_size=0.3,
    random_state=123,
)

clf = MultinomialNB()
clf.fit(X_train, y_train)
predicted = clf.predict(X_test)
print("MultinomialNB Accuracy:", metrics.accuracy_score(y_test, predicted))

MultinomialNB Accuracy: 0.8684491978609625
