## Import libraries

In [26]:
# import libraries
import pandas as pd
import numpy as np
import email
import os
import re
import email
from email.parser import Parser, BytesParser
from email.message import EmailMessage
from email.header import decode_header
import chardet
import pkg_resources
from platform import python_version
import sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import spacy
spacy.load('en')

<spacy.lang.en.English at 0x7fbfe68e85d0>

### 1. List the files inside the data directory (excluding files which have a '.')

In [27]:
data_dirs = [path for path in os.listdir('Data/') if path.find('.')==-1]

### 2. Create a loop that goes through all directories and creates a dataframe which stores file paths and respective labels (spam/ham)

In [28]:
# initialize empty dict
file_label_dict = dict()

# initialize counter
i = 1

for directory in data_dirs:
    
    # get all file names inside the directory
    file_paths = [os.path.abspath(os.path.join('Data', directory ,f)) for f in os.listdir(os.path.join('Data', directory))]
    
    if directory.find('spam') != -1:
            for file in file_paths:
                file_label_dict.update({file : 'spam'})
                
    elif directory.find('ham') != -1:
            for file in file_paths:
                file_label_dict.update({file : 'ham'})
    
    print(f'{i}. Processing folder {directory} : {len(file_paths)} files')
    
    # update counter
    i = i + 1

    
# create dataframe
df_mails = pd.DataFrame(data = list(file_label_dict.items()), columns = ['file_path', 'label'])

1. Processing folder spam : 501 files
2. Processing folder hard_ham : 250 files
3. Processing folder spam_2 2 : 1397 files
4. Processing folder easy_ham : 2551 files
5. Processing folder spam 2 : 501 files
6. Processing folder hard_ham 2 : 251 files
7. Processing folder easy_ham 2 : 2501 files


In [29]:
# check shape and sample rows
print(df_mails.shape)
df_mails.head()

(7952, 2)


Unnamed: 0,file_path,label
0,/Users/nikhilarora/Desktop/Data Science/Projec...,spam
1,/Users/nikhilarora/Desktop/Data Science/Projec...,spam
2,/Users/nikhilarora/Desktop/Data Science/Projec...,spam
3,/Users/nikhilarora/Desktop/Data Science/Projec...,spam
4,/Users/nikhilarora/Desktop/Data Science/Projec...,spam


In [30]:
# check distribution of spam and ham
df_mails['label'].value_counts() / df_mails.shape[0]

ham     0.698315
spam    0.301685
Name: label, dtype: float64

### 3. Read all email files and check if they are multipart

In [31]:
def is_email_multipart(file_path, parser):
    
    '''
    Function to check whether an email is multipart
    
    Inputs:
    file_path: String, absoulte file path that has the email contents
    parser:  BytesParser object
    
    Output: String, 'True' if email is multipart else 'False'
    
    '''
    
    
    # initialize is_multipart string as blank
    is_multipart = ''
    
    with open(file_path, mode = 'rb') as f:
        
        # read file content
        file_content = f.read()
        
        # parse file content
        email_obj = parser.parsebytes(file_content)
        
        # check if email is multipart
        is_multipart = str(email_obj.is_multipart())
        
    
    # return is_mulipart
    return is_multipart

In [32]:
# create empty list
is_multipart_list = []

# create email Parser
bytes_parser = BytesParser()

In [33]:
df_mails['is_multipart'] = df_mails['file_path'].apply(lambda f: is_email_multipart(f, bytes_parser))

In [34]:
# check shape and sample rows
print(df_mails.shape)
df_mails.head()

(7952, 3)


Unnamed: 0,file_path,label,is_multipart
0,/Users/nikhilarora/Desktop/Data Science/Projec...,spam,True
1,/Users/nikhilarora/Desktop/Data Science/Projec...,spam,True
2,/Users/nikhilarora/Desktop/Data Science/Projec...,spam,False
3,/Users/nikhilarora/Desktop/Data Science/Projec...,spam,True
4,/Users/nikhilarora/Desktop/Data Science/Projec...,spam,False


In [35]:
# check distribution
df_mails['is_multipart'].value_counts()

False    7264
True      688
Name: is_multipart, dtype: int64

In [36]:
# keep only single part mails (multipart mails to be tackled in the next version)
df_mails = df_mails[df_mails['is_multipart'] == 'False'].copy()
df_mails.reset_index(drop = True, inplace = True)
print(df_mails.shape)

(7264, 3)


In [37]:
# check label distribution
df_mails['label'].value_counts() / df_mails.shape[0]

ham     0.72522
spam    0.27478
Name: label, dtype: float64

In [38]:
# divide between X and y
X = df_mails[['file_path']]
y = [1 if a == 'spam' else 0 for a in df_mails['label'].str.lower()]

In [39]:
X.shape, len(y), sum(y)/len(y)

((7264, 1), 7264, 0.2747797356828194)

### 4. Create train-test split

In [40]:
# import train test split function
from sklearn.model_selection import train_test_split

In [41]:
# perform the train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [42]:
# check shapes of X_train and X_test
X_train.shape, X_test.shape

((5811, 1), (1453, 1))

In [43]:
# check label distributions in y_train and y_test (should follow the original data's distributions)
print('Label distribution of train data \n', sum(y_train) / len(y_train), '\n')
print('Label distribution of test data \n', sum(y_test) / len(y_test))

Label distribution of train data 
 0.2743073481328515 

Label distribution of test data 
 0.276668960770819


### 4. Create features from email files

In [44]:
# import BaseEstimator and TransformerMixin
from sklearn.base import BaseEstimator, TransformerMixin

#### 4.1 Create custom transformer for creating basic features

In [45]:
class BasicEmailFeatures(BaseEstimator, TransformerMixin):
    
    '''
    Custom transformer to create basic features from an email file

    '''
    
    
    def __init__(self):
        return None
        
    def fit(self, X, y = None):
        return self
    
    def transform(self, X, y = None):
        
        sender_address_list = []
        sender_name_list = []
        sender_business_list = []
        sender_domain_list = []
        sender_domain_type_list = []
        
        subject_list = []
        subject_len_list = []
        
        payload_list = []
        payload_len_list = []
        payload_has_url_list = []
        
        content_type_list = []
        
        bytes_parser = BytesParser()
        nlp = spacy.load('en')
        
        # go through each email to create features
        for file_path in X['file_path']:
            with open(file_path, mode = 'rb') as f:
                
                # read file
                email_file = f.read()
                
                
                ## parse file
                email_obj = bytes_parser.parsebytes(email_file)
                
                
                ## get the sender email address
                sender = email_obj['From']
                
                if sender != None:
                    sender_add, encoding = decode_header(sender)[0]

                    if isinstance(sender_add, bytes):
                        sender_add = sender_add.decode('ascii', 'ignore')
                    
                    sender_add = sender_add.lower()
                    
                    
                doc = nlp(sender_add)
                
                for token in doc:
                    if token.like_email:
                        sender_add = token.text
                
                
                ## get sender name
                sender_name = ''
                sender_name = sender_add.split('@')[0]
                
                
                ## get sender domain
                sender_domain = ''
                sender_domain = sender_add.split('@')[-1]
                
                
                 ## get sender business
                sender_business = ''
                sender_business = sender_domain.split('.')[0]
                
                
                ## get sender website type (.com, .net etc.)
                sender_domain_type = ''
                sender_domain_type = sender_domain.split('.')[-1]
                
                
                ## get the mail subject
                subject = ''
                header = email_obj["Subject"]
                
                if header != None:
                    subject, encoding = decode_header(header)[0]

                    if isinstance(subject, bytes):
                        subject = subject.decode('ascii', 'ignore')
                        
                        
                ## get subject length
                subject_len = len(subject)
                
    
                ## get the mail body
                payload = email_obj.get_payload()
            
            
                ## get payload length
                payload_len = len(payload)
                
                
                ## get flag whether payload has url
                payload_has_url = 0
                
                doc_payload = nlp(payload)
                
                for token in doc_payload:
                    if token.like_url:
                        payload_has_url = 1
                
                
                ## get content type
                content_type = email_obj.get_content_type()
                
                ## add featrues to respective lists
                sender_address_list.append(sender_add)
                sender_name_list.append(sender_name)
                sender_business_list.append(sender_business)
                sender_domain_list.append(sender_domain)
                sender_domain_type_list.append(sender_domain_type)
                
                subject_list.append(subject)
                subject_len_list.append(subject_len)
                
                payload_list.append(payload)
                payload_len_list.append(payload_len)
                payload_has_url_list.append(payload_has_url)
                
                content_type_list.append(content_type)
          
        
        ## add columns to X
        X = pd.DataFrame(data = {'file_path' : X['file_path'].tolist(),
                                 'sender_add' : sender_address_list,
                                 'sender_name' : sender_name_list,
                                 'sender_business' : sender_business_list,
                                 'sender_domain' : sender_domain_list,
                                 'sender_domain_type' : sender_domain_type_list,
                                 'content_type' : content_type_list,
                                 'subject' : subject_list,
                                 'subject_len' : subject_len_list,
                                 'payload' : payload_list,
                                 'payload_len' : payload_len_list,
                                 'payload_has_url' : payload_has_url_list})
        
        ## return new X
        return X

#### 4.2 Create column transformer to parallelize transformers for below:
1. Create TF-IDF based DTM for email subject text
2. Create TF-IDF based DTM for email body text
3. One hot encoding of sender_business, sender_domain_type, content_type, payload_has_url
4. Standard Scaler for subject_len, payload_len

In [46]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [47]:
feature_pipeline = ColumnTransformer([
    ('dtm_subject', TfidfVectorizer(stop_words = 'english', min_df = 0.01, decode_error = 'ignore'), 'subject'),
    ('dtm_payload', TfidfVectorizer(stop_words = 'english', min_df = 0.01, decode_error = 'ignore'), 'payload'),
    ('one_hot_encoder', OneHotEncoder(handle_unknown = 'ignore'), ['sender_business', 'sender_domain_type' , 'content_type', 'payload_has_url']),
    ('standard_scaler', StandardScaler(), ['subject_len', 'payload_len'])
],
    sparse_threshold = 0,
    remainder = 'drop',
    verbose = True,
    n_jobs = -1)

#### 4.3 Create full pipeline

In [48]:
from sklearn.pipeline import Pipeline

In [49]:
full_pipeline = Pipeline([
    ('basic_features', BasicEmailFeatures()),
    ('feature_pipeline', feature_pipeline)
], verbose = True)

### 5. Create feature train and data set

#### 5.1 Fit on train data and transform it

In [50]:
X_train_updated = full_pipeline.fit_transform(X_train)

[Pipeline] .... (step 1 of 2) Processing basic_features, total= 6.8min
[Pipeline] .. (step 2 of 2) Processing feature_pipeline, total=   4.2s


In [51]:
X_train_updated.shape

(5811, 3291)

In [52]:
X_train_updated

array([[ 0.        ,  0.        ,  0.        , ...,  1.        ,
        -1.31273151, -0.32185419],
       [ 0.        ,  0.        ,  0.        , ...,  1.        ,
         0.14987621, -0.38042689],
       [ 0.        ,  0.        ,  0.        , ...,  1.        ,
         0.77670809,  3.87373082],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  1.        ,
        -0.11130374,  0.10913994],
       [ 0.        ,  0.        ,  0.        , ...,  1.        ,
        -0.94707958, -0.37391881],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.30658418, -0.23100674]])

In [53]:
X_train_updated_copy = X_train_updated.copy()

#### 5.2 Transform the test data

In [54]:
X_test_updated = full_pipeline.transform(X_test)

In [55]:
X_test_updated.shape

(1453, 3291)

In [56]:
X_test_updated

array([[ 0.        ,  0.        ,  0.        , ...,  1.        ,
         0.62000012,  0.42245739],
       [ 0.        ,  0.55495388,  0.        , ...,  1.        ,
        -0.26801171, -0.23406155],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.51552814, -0.3820207 ],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  1.        ,
        -0.58142765, -0.25252324],
       [ 0.        ,  0.        ,  0.        , ...,  1.        ,
        -0.73813562, -0.369403  ],
       [ 0.55379217,  0.46208509,  0.        , ...,  1.        ,
        -0.16353973, -0.31069748]])

In [57]:
X_test_updated_copy = X_test_updated.copy()

### 5. Train the model

In [58]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, classification_report

#### 5.1 Random Forest

In [59]:
from sklearn.ensemble import RandomForestClassifier

In [60]:
# initialize param grid
param_grid_rf = [
    {'n_estimators' : [10, 100, 500],
     'max_features' : ['auto', 'log2'],
     'max_depth' : [1, 3, 5, 10, 50, None]}
]

In [61]:
# initialize the rf model
rf_model = RandomForestClassifier()

In [62]:
# create the grid search object
grid_search_rf = GridSearchCV(rf_model, 
                              param_grid = param_grid_rf,
                              cv = 5,
                              n_jobs = -1,
                              verbose = 1)

In [63]:
# train the model
grid_search_rf.fit(X_train_updated, y_train)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


GridSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid=[{'max_depth': [1, 3, 5, 10, 50, None],
                          'max_features': ['auto', 'log2'],
                          'n_estimators': [10, 100, 500]}],
             verbose=1)

In [64]:
# see best params
grid_search_rf.best_params_

{'max_depth': None, 'max_features': 'auto', 'n_estimators': 500}

In [65]:
# keep the best model
rf_best_model = grid_search_rf.best_estimator_

In [66]:
# predictions on train data
X_train_pred = rf_best_model.predict(X_train_updated)

In [67]:
print(confusion_matrix(y_train, X_train_pred), '\n')
print('Accuracy: ', accuracy_score(y_train, X_train_pred))
print('F1 Score: ', f1_score(y_train, X_train_pred), '\n')
print(classification_report(y_train, X_train_pred))

[[4217    0]
 [   0 1594]] 

Accuracy:  1.0
F1 Score:  1.0 

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4217
           1       1.00      1.00      1.00      1594

    accuracy                           1.00      5811
   macro avg       1.00      1.00      1.00      5811
weighted avg       1.00      1.00      1.00      5811



In [68]:
# predictions on test data
X_test_pred = rf_best_model.predict(X_test_updated)

In [69]:
print(confusion_matrix(y_test, X_test_pred), '\n')
print('Accuracy: ', accuracy_score(y_test, X_test_pred))
print('F1 Score: ', f1_score(y_test, X_test_pred), '\n')
print(classification_report(y_test, X_test_pred))

[[1049    2]
 [  11  391]] 

Accuracy:  0.9910529938059188
F1 Score:  0.9836477987421383 

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1051
           1       0.99      0.97      0.98       402

    accuracy                           0.99      1453
   macro avg       0.99      0.99      0.99      1453
weighted avg       0.99      0.99      0.99      1453



In [70]:
# feature importances
sorted(zip(feature_pipeline.get_feature_names_out(), rf_best_model.feature_importances_), reverse = True, 
       key = lambda t: t[1])

[('dtm_payload__remove', 0.027276111838227646),
 ('one_hot_encoder__content_type_text/plain', 0.019963001286792282),
 ('dtm_payload__href', 0.017814498424312965),
 ('one_hot_encoder__content_type_text/html', 0.016418872013303623),
 ('dtm_payload__br', 0.01585339495470493),
 ('dtm_payload__body', 0.015605290567294861),
 ('dtm_payload__font', 0.015393676076202423),
 ('dtm_payload__size', 0.013915470451611825),
 ('dtm_payload__removed', 0.012895679065519857),
 ('dtm_payload__receive', 0.01252314768180452),
 ('dtm_payload__2002', 0.010549637121237842),
 ('dtm_payload__color', 0.01020747120826327),
 ('dtm_payload__visit', 0.009459817137816406),
 ('dtm_payload__free', 0.00900832632140696),
 ('dtm_payload__click', 0.007857580958432248),
 ('dtm_payload__reply', 0.007391566728174382),
 ('dtm_payload__money', 0.007307929512535718),
 ('dtm_payload__information', 0.0068737555378197),
 ('one_hot_encoder__sender_business_yahoo', 0.0067762461958032535),
 ('dtm_payload__align', 0.006633900617182617),


[ColumnTransformer] ... (2 of 4) Processing dtm_payload, total=   1.6s
[ColumnTransformer] ... (1 of 4) Processing dtm_subject, total=   0.1s
[ColumnTransformer]  (4 of 4) Processing standard_scaler, total=   0.0s
[ColumnTransformer]  (3 of 4) Processing one_hot_encoder, total=   0.0s
