In [2]:
import pandas as pd
import scipy as sp
from bs4 import BeautifulSoup
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from collections import Counter
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

### Reading ham and spam files

In [5]:
spam_filenames=os.listdir(os.path.join("spam"))
ham_filenames=os.listdir(os.path.join("easy_ham"))


### Email parsing

In [6]:
import email
import email.policy
def load_email(directory,filename):
    with open(os.path.join(directory, filename), "rb") as f:
     return email.parser.BytesParser(policy=email.policy.default).parse(f)
    
    

In [7]:
ham_emails = [load_email("easy_ham", filename=name) for name in ham_filenames]
spam_emails = [load_email("spam", filename=name) for name in spam_filenames]

### splitting data

In [29]:
X=np.array(ham_emails+spam_emails)
Y=np.array([0]*len(ham_emails)+[1]*len(spam_emails))
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,random_state=40)


### Data Preprocessing

#### Convert all emails to plain text

In [33]:

def html_text(html):
    soup = BeautifulSoup(html,"html")
    return soup.get_text()
    
def email_to_text(email):
    # Enter code  #
    
    return str(email.get_payload())


### Finding most common words

In [57]:
def clean_words(wordlist):
    newlist=[]
    for word in wordlist:
        if (word.isalpha() and len(word)>2):
            newlist.append(word)
            
   
    return newlist

count=0
word_list=[]
for i in X_train:
    mail=email_to_text(i)
    if mail is not None:
        words=None
        words=mail.lower().split()
        final_word=clean_words(words)
        for w in final_word:
            word_list.append(w)

most_common_words= [word for word, word_count in Counter(word_list).most_common(500)]
most_common = [item for item in Counter(word_list).most_common(500)]
print(most_common_words)

        

['the', 'and', 'that', 'for', 'you', 'this', 'with', 'have', 'are', 'not', 'from', 'your', 'but', 'was', 'will', 'can', 'all', 'they', 'has', 'our', 'just', 'more', 'about', 'one', 'get', 'their', 'would', 'list', 'what', 'which', 'out', 'new', 'like', 'there', 'who', 'people', 'any', 'use', 'only', 'some', 'when', 'other', 'been', 'email', 'than', 'mailing', 'his', 'also', 'had', 'how', 'its', 'make', 'then', 'into', 'these', 'because', 'were', 'over', 'most', 'free', 'even', 'could', 'time', 'should', 'now', 'think', 'them', 'first', 'those', 'using', 'many', 'where', 'want', 'see', 'know', 'may', 'after', 'way', 'much', 'here', 'same', 'very', 'message', 'need', 'work', 'send', 'please', 'being', 'good', 'find', 'take', 'through', 'does', 'such', 'still', 'sep', 'money', 'before', 'world', 'united', 'business', 'own', 'linux', 'two', 'every', 'government', 'object', 'mail', 'states', 'said', 'really', 'something', 'must', 'since', 'internet', 'years', 'best', 'while', 'help', 'web',

### Transformed training set


In [58]:
X_transform=[]
for email in X_train[:1500]:    
    mail=email_to_text(email)
    X_word=[]
    if mail is not None:
        words=None
        words=mail.lower().split()
        for j in most_common_words:
            num=words.count(j)
            X_word.append(num)
    
    X_transform.append(X_word) 

from scipy import sparse
sparse.csr_matrix(X_transform)

<1500x500 sparse matrix of type '<class 'numpy.int64'>'
	with 53752 stored elements in Compressed Sparse Row format>

In [59]:
## Bonus Task #
# Fine tune the hyperparameters #
lg=RandomForestClassifier()
lg.fit(X_transform,Y_train[:1500])
score = cross_val_score(lg, X_transform, Y_train[:1500], cv=10, verbose=3)
score.mean()

[CV]  ................................................................
[CV] ....................... , score=0.9403973509933775, total=   0.1s
[CV]  ................................................................
[CV] ....................... , score=0.9271523178807947, total=   0.1s
[CV]  ................................................................
[CV] ....................... , score=0.9139072847682119, total=   0.1s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.1s remaining:    0.0s


[CV] ..................................... , score=0.96, total=   0.1s
[CV]  ................................................................
[CV] ....................... , score=0.9333333333333333, total=   0.1s
[CV]  ................................................................
[CV] ....................... , score=0.9333333333333333, total=   0.1s
[CV]  ................................................................
[CV] ....................... , score=0.9466666666666667, total=   0.1s
[CV]  ................................................................
[CV] ....................... , score=0.9328859060402684, total=   0.1s
[CV]  ................................................................
[CV] ........................ , score=0.912751677852349, total=   0.1s
[CV]  ................................................................
[CV] ....................... , score=0.9261744966442953, total=   0.1s


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.7s finished


0.932660236751263