In [None]:
import matplotlib.pyplot as plt
import calplot
import calmap
import csv
import sklearn
import pickle
from wordcloud import WordCloud
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV,train_test_split,StratifiedKFold,cross_val_score,learning_curve
from collections import Counter
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import os
from nltk.corpus import words 
from IPython.display import HTML, display
import email_read_util
from datasketch import MinHash, MinHashLSH

<h2>Overview of the data and data source.</h2> 

This dataset contains an excerpt of Spam and Ham E-Mails from the Enron Corpus. 

Enron was an American company that collapsed due to financial fraud and was subsequently investigated by law enforcement agencies. During these investigations, E-Mail servers were seized and the E-Mails were later published. 

Today this corpus is special, as it is one of the few datasets that actually contain 'real' E-Mails ready for analysis. 

The entire Corpus is around 600000 E-Mails, for this project a smaller subset is being used for Spam detection. 

<h2>Goals of the project</h2>

The goal of the project is to analyze different Spam filters. Specifically: 

1. Find the best Spam filter
2. Determine which effect the Subject line has on Spam recognition. 
3. Determine the best Dataset for Spam detection out of a number of datasets, e.g. train on 5 datasets and test on the 6th dataset

<h2>Preprocessing and Cleaning</h2>

In [None]:
# Relative path to the Enron spam/ham CSV that is loaded into enron_df below.
data_file = './enron_spam_data.csv'

In [None]:
# Load the raw CSV; every later cleaning step operates on this frame.
enron_df = pd.read_csv(data_file)

In [None]:
# Column labels of the loaded frame (for a DataFrame, .keys() and .columns are the same index).
header_names = enron_df.columns

In [None]:
# Show which columns the CSV provides.
print(header_names)

In [None]:
# Preview the first rows of the freshly loaded, still uncleaned data.
enron_df.head()

In [None]:
# Summary statistics of the raw frame.
enron_df.describe()

In [None]:
# Absolute number of e-mails per label in the 'Spam/Ham' column.
enron_label = enron_df['Spam/Ham'].value_counts()
print(enron_label)

In [None]:
# Same class counts, expressed as a fraction of all rows.
enron_df['Spam/Ham'].value_counts(normalize=True)

Noticing the NaN in the first message shown above, I decided to analyze how complete the dataset is. 

In [None]:
# Count the missing values of all four columns in one pass, then expose each
# count under the individual name that later cells read.
nan_per_column = enron_df[['Message ID', 'Subject', 'Message', 'Spam/Ham']].isna().sum()
messageID_NaN = nan_per_column['Message ID']
subject_NaN = nan_per_column['Subject']
message_NaN = nan_per_column['Message']
spam_NaN = nan_per_column['Spam/Ham']
for column_name, nan_count in nan_per_column.items():
    print(f"Number of NaN values in '{column_name}' column: {nan_count}")

In [None]:
# Upper bound on the rows lost by dropping NaNs: a row missing BOTH subject and
# message is counted twice here, so the real number of affected rows can be lower.
max_nan = subject_NaN + message_NaN
# Kept as a fraction in [0, 1] so that any later arithmetic on the variable is
# unaffected; only the printed value is rendered as a percentage to match its label.
percentage_nan = max_nan / len(enron_df)
print('Total Maximum Lost E-Mails:', max_nan)
print('Percentage of Total E-Mails: {:.1%}'.format(percentage_nan))

As we can see, a maximum of 660 E-Mails would be lost if we dropped all NaN values. While this is only 1.9% of all E-Mails, we risk eliminating Spam messages and tainting the analysis. Therefore I analyze how many of the 'subjectless' E-Mails are Spam.

In [None]:
# Rows labelled spam, reused for both the subject and the body check.
_spam_rows = enron_df["Spam/Ham"] == 'spam'
# Summing a boolean mask counts its True entries, i.e. the rows that are both
# missing the field and labelled spam.
na_and_spam_sub = int((enron_df["Subject"].isna() & _spam_rows).sum())
na_and_spam_mes = int((enron_df["Message"].isna() & _spam_rows).sum())
print('Subject = NaN && is Spam:', na_and_spam_sub)
print('Message = NaN && is spam:', na_and_spam_mes)

Interestingly enough, all of the messages that have no subject are classified as Spam.
The same is true for most of the messages that have no body.

In [None]:
# List the rows whose Subject is missing so they can be inspected by eye.
print(enron_df[enron_df['Subject'].isna()])

Taking a quick peek at these messages, they indeed appear to be Spam. 

Let us now count the number of messages that have neither a subject line nor a body: 

In [None]:
# Number of rows in which Subject and Message are both missing.
empty_message = int(enron_df[["Subject", "Message"]].isna().all(axis=1).sum())

In [None]:
# Show how many e-mails have neither subject nor body.
print(empty_message)

In [None]:
# List the rows that have neither subject nor body.
print(enron_df[(enron_df["Subject"].isna()) & (enron_df["Message"].isna())])

Indeed there are 51 messages that contain neither Subject nor Message. We will leave those in the dataframe, as they are nevertheless Spam. While they are probably not harmful (except perhaps as a DoS against the server), they can be annoying for the end user. 

As can be seen above, the dataset has a Date column. While it would surely be interesting to correlate the impending financial collapse of the company with the number of E-Mails sent in that time frame, the date is not necessary for detecting Spam. 

Potentially for later graphics it would be interesting to see, if Spam is sent on the weekend or not.

In any case, for now we can safely ignore the Date column: 

In [None]:
# The Date column is not used for spam detection, so it is removed.
# errors='ignore' keeps this cell re-runnable: the frame is modified in place,
# so a second execution would otherwise raise KeyError for the missing column.
enron_df.drop(columns='Date', inplace=True, errors='ignore')

In [None]:
# Rows whose Message repeats an earlier row's Message, sorted so identical
# bodies end up next to each other.
df2 = enron_df.loc[enron_df['Message'].duplicated()].sort_values(by='Message')

In [None]:
# Inspect the duplicated-message rows collected above.
print(df2)

Obviously if we consider the nature of Spam messages, it becomes clear that some of the received messages are Duplicates. I have decided to remove all of these duplicates, as we do not want to have any bias in the following analysis. 

In [None]:
# Summary statistics before de-duplication, for comparison with the cell after it.
enron_df.describe()

In [None]:
# Keep only the first occurrence of every distinct message body (in place).
# NOTE(review): pandas treats missing values as equal to each other when
# de-duplicating, so all rows whose 'Message' is NaN collapse into a single row
# here - confirm this matches the earlier decision to keep the empty e-mails.
enron_df.drop_duplicates(subset='Message', keep='first', inplace=True)

In [None]:
# Preview the frame after duplicate message bodies were removed.
enron_df.head()

In [None]:
# Summary statistics after de-duplication.
enron_df.describe()

In [None]:
# Number of rows whose Subject repeats an earlier row's Subject.
# (This rebinds df2, which previously held a DataFrame, to a plain integer.)
df2 = int(enron_df['Subject'].duplicated().sum())

In [None]:
# Show the number of duplicated subjects counted above.
print(df2)

As we can see, there are also 6205 duplicated subjects. This surprised me, as it represents a significant part of all the messages. Nevertheless I decided not to remove those messages, as we have already removed the messages with duplicated bodies.

As Python is case-sensitive, I have decided to transform all words into lowercase. Although it appears the dataset is already completely in lowercase, I do this for safety reasons.

In [None]:
# Lower-case both text columns so that later token comparisons ignore case.
for text_column in ('Message', 'Subject'):
    enron_df[text_column] = enron_df[text_column].str.lower()

In [None]:
# Preview the frame after lower-casing.
enron_df.head()

In [None]:
# Remove every character that is neither a word character nor whitespace.
# The pattern is a raw string: in a normal string literal "\w" and "\s" are
# invalid escape sequences, which Python reports with a warning and plans to
# turn into an error. The regex itself is unchanged.
enron_df["Message"] = enron_df['Message'].str.replace(r'[^\w\s]', '', regex=True)
enron_df["Subject"] = enron_df['Subject'].str.replace(r'[^\w\s]', '', regex=True)

In [None]:
# Preview the frame after punctuation was stripped.
enron_df.head()

In [None]:
# English stop-word list used by the filtering cell below.
stop = stopwords.words('english')
# Replace missing text with an empty string so that the string operations in
# the next cell can run on every row.
for text_column in ('Message', 'Subject'):
    enron_df[text_column] = enron_df[text_column].fillna('')

In [None]:
# Hash-based lookup: `stop` is a list, so testing every token against it
# scanned the whole list each time. The set holds the same words.
_stop_lookup = frozenset(stop)


def _remove_stopwords(text):
    """Return `text` without the whitespace-separated tokens found in the stop list.

    The remaining tokens are re-joined with single spaces; a text consisting
    only of stop words (or an empty text) yields the empty string.
    """
    return ' '.join(token for token in text.split() if token not in _stop_lookup)


# NOTE(review): punctuation has already been stripped from the text at this
# point, so stop-list entries that contain an apostrophe can no longer match -
# confirm whether the stop list should be normalized the same way.
for text_column in ('Message', 'Subject'):
    enron_df[text_column] = enron_df[text_column].apply(_remove_stopwords)

In [None]:
# Preview the frame after stop-word removal.
enron_df.head()

As we can see, removing punctuation and stopwords works fine. Sadly, for this step the NaN values had to be replaced with empty strings. **TODO:** potentially fix this later.

In the next step we divide the dataset into training data and testing data

Having thought about how to handle the subject line of the message, I decided to view the subject line simply as a "Headline", and therefore decided to unify message and subject into one column. In a sense, this also solves our NaN problem, as it is now ensured that every analyzed E-Mail has a unified message we can look at. 

In [None]:
# Treat the subject as a headline: prepend it to the body, separated by a space.
enron_df['unified'] = enron_df['Subject'] + ' ' + enron_df['Message']
# Separate frame that keeps the combined text but not the two source columns;
# enron_df itself retains all of its columns.
enron_df_unified = enron_df.drop(columns=['Subject', 'Message'])

In [None]:
# Preview the full frame, now including the 'unified' column.
enron_df.head()

In [None]:
# Preview the reduced frame without the separate Subject and Message columns.
enron_df_unified.head()

In [None]:
# Sanity check: rows whose label is missing or consists of a single space.
_label_column = enron_df_unified['Spam/Ham']
na_rows = enron_df_unified[_label_column.isna() | _label_column.eq(' ')]
print(na_rows)

As a last step, I decided to randomly rearrange the order of rows, so there is no bias: 

In [None]:
# Shuffle all rows. The fixed seed makes the resulting order reproducible on a
# fresh run, which matters because later cells split this frame into train and
# test parts purely by row position.
enron_df_unified = enron_df_unified.sample(frac=1, random_state=1)

At this point, our dataframe is ready to be analyzed. We can begin with the training using the enron_df_unified df. 

<h2>Naive Bayes Filter</h2>

We will start with the Naive Bayes Filter. In this stage we start to analyze firstly our unified version of the subject and the E-Mail body. 

Later we take a look at how well this Bayes Filter works when we analyze the Subject Line and the E-Mail body separately.

Lastly I use a different way to implement the Bayes Filter that I found online: the description can be found here: https://www.kdnuggets.com/2020/07/spam-filter-python-naive-bayes-scratch.html

In [None]:
# Fraction of the data used for training; the remainder is used for testing.
training_ratio = 0.7

<h3>Unified Subject Line and E-Mail Body</h3>

In [None]:
# Naive Bayes on the combined subject + body text.
X_unified = enron_df_unified['unified']
y_unified = enron_df_unified['Spam/Ham']
# The positional range is split alongside the data so that idx_train / idx_test
# record which row positions went into each part.
X_train, X_test, y_train, y_test, idx_train, idx_test = \
    train_test_split(X_unified, y_unified, range(len(y_unified)), 
    train_size=training_ratio, random_state=1)

# Bag-of-words counts; the vocabulary is learned from the training part only.
vectorizer = CountVectorizer()
X_train_vector = vectorizer.fit_transform(X_train)
X_test_vector = vectorizer.transform(X_test)

mnb = MultinomialNB()
mnb.fit(X_train_vector, y_train)
y_pred = mnb.predict(X_test_vector)

# True and predicted label side by side, re-indexed from 0 so that a row number
# can be used positionally against X_test.
results_df = pd.concat([y_test.reset_index(drop=True), pd.Series(y_pred)], axis=1, keys=['true', 'predicted'])

# The report orders classes by sorted label ('ham' before 'spam') and applies
# target_names in that order, so ['Spam', 'Ham'] swapped the two row captions.
# Passing labels explicitly ties each display name to its class.
print(classification_report(y_test, y_pred, labels=['ham', 'spam'], target_names=['Ham', 'Spam']))
print('Classification accuracy {:.1%}'.format(accuracy_score(y_test, y_pred)))

In [None]:
# Print every misclassified test e-mail and count them. results_df is indexed
# 0..n-1, so the enumeration position addresses the matching row of X_test.
wrong_predictions = 0
for position, (true_label, predicted_label) in enumerate(zip(results_df['true'], results_df['predicted'])):
    if true_label == predicted_label:
        continue
    print(f"Predicted: {predicted_label}, True: {true_label},\nText: {X_test.iloc[position]}\n")
    wrong_predictions += 1

In [None]:
# Total number of misclassified test e-mails from the loop above.
print(f"Number of wrong predictions: {wrong_predictions}")

<h1>#########NOTE: NEED TO ANALYZE THIS NOW</h1>

<h3>Analyzing only the subject line</h3>

Let us now try to analyze what happens if we only observe the Subject headers. How does this affect the Spam detection accuracy?

My suspicion is that this will lower the recognition rate of Spam significantly. I even suspect it will make the Spam Filter completely worthless, with prediction rates possibly approaching 60% or less.

In [None]:
# How many subjects are unusable as classifier input?
subject_NaN = enron_df['Subject'].isna().sum()
# Missing subjects were filled with '' and stop-word removal re-joins tokens
# without surrounding spaces, so an empty subject is '' - comparing against a
# single space could never match. Stripping first also catches whitespace-only
# values.
subject_empty = (enron_df['Subject'].str.strip() == '').sum()

print(subject_NaN)
print(subject_empty)

In [None]:
# Naive Bayes on the subject line only.
X_subject = enron_df['Subject']
y_subject = enron_df['Spam/Ham']
# The positional range is derived from this cell's own labels rather than from
# y_unified, so the cell does not depend on the unified experiment.
X_train, X_test, y_train, y_test, idx_train, idx_test = \
    train_test_split(X_subject, y_subject, range(len(y_subject)), 
    train_size=training_ratio, random_state=1)

# Bag-of-words counts; the vocabulary is learned from the training part only.
vectorizer = CountVectorizer()
X_train_vector = vectorizer.fit_transform(X_train)
X_test_vector = vectorizer.transform(X_test)

mnb = MultinomialNB()
mnb.fit(X_train_vector, y_train)
y_pred = mnb.predict(X_test_vector)

# True and predicted label side by side, re-indexed from 0 so that a row number
# can be used positionally against X_test.
results_df = pd.concat([y_test.reset_index(drop=True), pd.Series(y_pred)], axis=1, keys=['true', 'predicted'])

# The report orders classes by sorted label ('ham' before 'spam') and applies
# target_names in that order, so ['Spam', 'Ham'] swapped the two row captions.
# Passing labels explicitly ties each display name to its class.
print(classification_report(y_test, y_pred, labels=['ham', 'spam'], target_names=['Ham', 'Spam']))
print('Classification accuracy {:.1%}'.format(accuracy_score(y_test, y_pred)))

As we can see, even when only analyzing the subject line it can still reach a correct prediction rate of 90.3%, which is not great but still much better than expected. Still quite useless though and it confirms the suspicion that only using the Subject Line to analyze Spam is definitely not enough.

In [None]:
# Print every misclassified test subject and count them. results_df is indexed
# 0..n-1, so the enumeration position addresses the matching row of X_test.
wrong_predictions = 0
for position, (true_label, predicted_label) in enumerate(zip(results_df['true'], results_df['predicted'])):
    if true_label == predicted_label:
        continue
    print(f"Predicted: {predicted_label}, True: {true_label},\nText: {X_test.iloc[position]}\n")
    wrong_predictions += 1

In [None]:
# Total number of misclassified test subjects from the loop above.
print(f"Number of wrong predictions: {wrong_predictions}")

<h3>Analyzing only the E-Mail Body</h3>

Now lets try the same with only the message body and see how much effect the Subject has on the Spam detection. 

In [None]:
# Naive Bayes on the message body only.
X_body = enron_df['Message']
y_body = enron_df['Spam/Ham']
# The positional range is derived from this cell's own labels rather than from
# y_unified, so the cell does not depend on the unified experiment.
X_train, X_test, y_train, y_test, idx_train, idx_test = \
    train_test_split(X_body, y_body, range(len(y_body)), 
    train_size=training_ratio, random_state=1)

# Bag-of-words counts; the vocabulary is learned from the training part only.
vectorizer = CountVectorizer()
X_train_vector = vectorizer.fit_transform(X_train)
X_test_vector = vectorizer.transform(X_test)

mnb = MultinomialNB()
mnb.fit(X_train_vector, y_train)
y_pred = mnb.predict(X_test_vector)

# True and predicted label side by side, re-indexed from 0 so that a row number
# can be used positionally against X_test.
results_df = pd.concat([y_test.reset_index(drop=True), pd.Series(y_pred)], axis=1, keys=['true', 'predicted'])

# The report orders classes by sorted label ('ham' before 'spam') and applies
# target_names in that order, so ['Spam', 'Ham'] swapped the two row captions.
# Passing labels explicitly ties each display name to its class.
print(classification_report(y_test, y_pred, labels=['ham', 'spam'], target_names=['Ham', 'Spam']))
print('Classification accuracy {:.1%}'.format(accuracy_score(y_test, y_pred)))

It reaches 98.4%! This is quite good, but still less than the unification of the body and the Subject. Therefore it makes sense to use the first method, which unifies both Subject Line and E-Mail body. 

For future analysis and representation, only the first analysis type will be considered

<h2>Blacklisting (TODO: need to randomize order of files)</h2>

In [None]:
# Build a token blacklist: stems that occur in spam training e-mails but in no
# ham training e-mail.

# Positional split of enron_df_unified: the first training_ratio share of the
# rows is used to build the blacklist, the rest to evaluate it.
X_train_bl = enron_df_unified[:int(len(enron_df_unified)*training_ratio)]
X_test_bl = enron_df_unified[int(len(enron_df_unified)*training_ratio):]
x_train_Spam_bl = enron_df_unified['Spam/Ham']
stemmer = nltk.PorterStemmer()
# NOTE(review): this rebinds the name `stopwords`, which the import cell bound
# to nltk's stop-word corpus reader. The name is kept because the evaluation
# cell further down reads this set, but re-running the earlier
# `stopwords.words('english')` cell after this one will fail.
stopwords = set(nltk.corpus.stopwords.words('english'))
spam_words = set()
ham_words = set()

for text, label in zip(X_train_bl['unified'], X_train_bl['Spam/Ham']):
    tokens = nltk.word_tokenize(text)
    stems = [stemmer.stem(w) for w in tokens if w not in stopwords]
    if label == 'ham':
        ham_words.update(stems)
    elif label == 'spam':
        spam_words.update(stems)
    # Rows carrying any other label contribute to neither vocabulary.

blacklist = spam_words - ham_words
# The context manager closes the file handle deterministically; the bare
# open(...) passed to pickle.dump was never closed explicitly.
with open('blacklist.pkl', 'wb') as blacklist_file:
    pickle.dump(blacklist, blacklist_file)

print(len(spam_words))
print(len(ham_words))
print('Blacklist of {} tokens successfully built/loaded'.format(len(blacklist)))

In [None]:
# Blacklisted stems that are also entries of NLTK's `words` corpus.
word_set = set(words.words())
word_set & blacklist

In [None]:
# Evaluate the blacklist on the held-out rows. An e-mail is flagged as spam as
# soon as one of its stems is blacklisted; e-mails that yield no stems at all
# are skipped and do not enter any of the four counters.
fp = tp = fn = tn = 0

for text, label in zip(X_test_bl['unified'], X_test_bl['Spam/Ham']):
    tokens = nltk.word_tokenize(text)
    stems_set = {stemmer.stem(w) for w in tokens if w not in stopwords}
    if not stems_set:
        continue
    flagged = not stems_set.isdisjoint(blacklist)
    is_ham = label == 'ham'
    if flagged and is_ham:
        fp += 1  # ham e-mail containing a blacklisted stem
    elif flagged:
        tp += 1
    elif is_ham:
        tn += 1
    else:
        fn += 1  # non-ham e-mail without any blacklisted stem

In [None]:
# Confusion matrix laid out as [[tn, fp], [fn, tp]] and rendered as a bare
# HTML table, one <tr> per matrix row.
conf_matrix = [[tn, fp],
               [fn, tp]]
_row_html = ['<td>{}</td>'.format('</td><td>'.join(map(str, matrix_row)))
             for matrix_row in conf_matrix]
display(HTML('<table><tr>{}</tr></table>'.format('</tr><tr>'.join(_row_html))))

In [None]:
# Same matrix, with every cell expressed as a share of all evaluated e-mails.
count = tn + tp + fn + fp
percent_matrix = [["{:.1%}".format(cell / count) for cell in matrix_row]
                  for matrix_row in ((tn, fp), (fn, tp))]
_percent_row_html = ['<td>{}</td>'.format('</td><td>'.join(map(str, matrix_row)))
                     for matrix_row in percent_matrix]
display(HTML('<table><tr>{}</tr></table>'.format('</tr><tr>'.join(_percent_row_html))))

In [None]:
# Share of evaluated e-mails the blacklist classified correctly.
print("Classification accuracy: {:.1%}".format((tp + tn) / count))

<h2>LSH</h2>

In [None]:
# Texts of all e-mails, plus a positional train/test split of the unified frame
# at the training_ratio boundary.
filelist = enron_df_unified['unified']
_lsh_split_at = int(len(enron_df_unified)*training_ratio)
X_train_lsh = enron_df_unified.iloc[:_lsh_split_at]
X_test_lsh = enron_df_unified.iloc[_lsh_split_at:]

In [None]:
# Number of rows in the LSH training part.
print(len(X_train_lsh))

In [None]:
# Training rows labelled as spam.
spam_files = X_train_lsh.loc[X_train_lsh['Spam/Ham'].eq('spam')]

In [None]:
# Number of spam rows in the LSH training part.
print(len(spam_files))

In [None]:
# MinHash LSH index from datasketch.
# NOTE(review): presumably `threshold` is the similarity cut-off for candidate
# matches and `num_perm` the MinHash signature size - confirm against the
# datasketch documentation.
lsh = MinHashLSH(threshold=0.5, num_perm=128)