# Initial data parsing

In [1]:
from pandas import read_csv, DataFrame, concat
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
import snowballstemmer
from stop_words import get_stop_words
import collections

stop_word = get_stop_words('en')
df = read_csv("./spam_ham_dataset.csv")
new_df = df

df.drop(['Unnamed: 0', 'label'],axis=1,inplace=True)

print(df.head(5)["text"][0])

Subject: enron methanol ; meter # : 988291
this is a follow up to the note i gave you on monday , 4 / 3 / 00 { preliminary
flow data provided by daren } .
please override pop ' s daily volume { presently zero } to reflect daily
activity you can obtain from gas control .
this change is needed asap for economics purposes .


In [2]:
print(df.head(10))

                                                text  label_num
0  Subject: enron methanol ; meter # : 988291\r\n...          0
1  Subject: hpl nom for january 9 , 2001\r\n( see...          0
2  Subject: neon retreat\r\nho ho ho , we ' re ar...          0
3  Subject: photoshop , windows , office . cheap ...          1
4  Subject: re : indian springs\r\nthis deal is t...          0
5  Subject: ehronline web address change\r\nthis ...          0
6  Subject: spring savings certificate - take 30 ...          0
7  Subject: looking for medication ? we ` re the ...          1
8  Subject: noms / actual flow for 2 / 26\r\nwe a...          0
9  Subject: nominations for oct . 21 - 23 , 2000\...          0


# Process data

- Remove "Subject:"
- Stem words (turn all similar words into the same word)
- Remove all common stop words

In [3]:
stemmer: snowballstemmer.EnglishStemmer = snowballstemmer.stemmer('english')
def preprocess(text, stem=False):
    text = "\n".join(text.split("\n")[1:])
    tokens = []
    for token in text.split():
        if token not in stop_word:
            if stem:
                tokens.append(stemmer.stemWord(token))
            else:
                tokens.append(token)
    return " ".join(tokens)
   

for index, text in enumerate(df["text"]):
    data = preprocess(text)
    df["text"][index] = data

print(df.head(5)["text"])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["text"][index] = data


0    follow note gave monday , 4 / 3 / 00 { prelimi...
1    ( see attached file : hplnol 09 . xls ) - hpln...
2    ho ho ho , ' re around wonderful time year - -...
3    abasements darer prudently fortuitous undergon...
4    deal book teco pvr revenue . understanding tec...
Name: text, dtype: object


# Create dictionary of 3000 most common words

In [4]:
def make_Dictionary1(emails):
    all_words = []       
    for mail in emails:    
        words = mail.split()
        all_words += words
    
    dictionary = collections.Counter(all_words)

    for item in dictionary.copy():
        if not item.isalpha() or len(item) == 0:
            del dictionary[item]
            
    dictionary = dictionary.most_common(3000)

    return dictionary

word_dict = make_Dictionary1(df["text"])

# Turn letters into numbers

In [5]:
import numpy as np

def extract_features(email_text_list): 
    features_matrix = np.zeros((len(email_text_list),3000))
    docID = 0
    for text in email_text_list:
        words = text.split()
        for word in words:
            wordID = 0
            for i,d in enumerate(word_dict.copy()):
                if d[0] == word:
                    wordID = i
                    features_matrix[docID,wordID] = words.count(word)
        docID = docID + 1
    return features_matrix

- Create training dataframe
- Split training and testing data
- Split text and prediction columns
- Sort rows in dataframe by label number

In [6]:
feature_cols = ["text"]
predicted_cols = ["label_num"]

x = df[feature_cols]
y = df[predicted_cols]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.33, random_state=42)

train_df = [x_train, y_train]
train_df_rows: DataFrame = concat(train_df, axis=1)
sorted_train_df = train_df_rows.sort_values(by="label_num")

train_features = extract_features(sorted_train_df["text"])

In [7]:
model = GaussianNB()
model.fit(train_features, sorted_train_df["label_num"])

GaussianNB()

# Do the same with the testing data

In [8]:
test_df = [x_test, y_test]
test_df_rows = concat(test_df, axis=1)
sorted_test_df = test_df_rows.sort_values(by="label_num")
test_features = extract_features(sorted_test_df["text"])

# Predict testing and training data

In [9]:
test_result = model.predict(test_features)
train_result = model.predict(train_features)

# Get accuracy of algorithm

In [10]:
from sklearn import metrics
train_confusion_matrix = metrics.confusion_matrix(sorted_train_df["label_num"], train_result)
test_confusion_matrix = metrics.confusion_matrix(sorted_test_df["label_num"], test_result)

print("Training confusion matrix:", train_confusion_matrix)
print("Testing confusion matrix:", test_confusion_matrix)

print("Accuracy:", metrics.accuracy_score(sorted_test_df["label_num"], test_result))

Training confusion matrix: [[2280  146]
 [   0 1038]]
Testing confusion matrix: [[1174   72]
 [  19  442]]
Accuracy: 0.9466900995899239


# Save model to file

In [11]:
from joblib import dump

dump(model, "trained-model.sav")

['trained-model.sav']