In [15]:
import pandas as pd
import scipy as sp
from bs4 import BeautifulSoup
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from collections import Counter
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import *

[nltk_data] Downloading package stopwords to /home/ashwin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Reading ham and spam files

In [3]:
spam_filenames=os.listdir(os.path.join("spam"))
ham_filenames=os.listdir(os.path.join("easy_ham"))


### Email parsing

In [4]:
import email
import email.policy
def load_email(directory,filename):
    with open(os.path.join(directory, filename), "rb") as f:
     return email.parser.BytesParser(policy=email.policy.default).parse(f)
    
    

In [5]:
ham_emails = [load_email("easy_ham", filename=name) for name in ham_filenames]
spam_emails = [load_email("spam", filename=name) for name in spam_filenames]

### splitting data

In [6]:
X=np.array(ham_emails+spam_emails)
Y=np.array([0]*len(ham_emails)+[1]*len(spam_emails))
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,random_state=40)


### Data Preprocessing

#### Convert all emails to plain text

In [7]:

def html_text(html):
    soup = BeautifulSoup(html,"html")
    return soup.get_text()
    
def email_to_text(email):
    # Enter code  #
    
    return str(email.get_payload())


### Finding most common words

In [32]:
def clean_words(wordlist):
    newlist=[]
    stemmer=PorterStemmer()
    stop_words=set(stopwords.words('english'))
    for word in wordlist:
        if (word.isalpha() and word not in stop_words and len(word)>2):
            #can also implement stemming by stemmer.stem(word)
            newlist.append(word)
            
   
    return newlist

count=0
word_list=[]
for i in X_train:
    mail=email_to_text(i)
    if mail is not None:
        words=None
        words=mail.lower().split()
        final_word=clean_words(words)
        for w in final_word:
            word_list.append(w)

most_common_words= [word for word, word_count in Counter(word_list).most_common(500)]
most_common = [item for item in Counter(word_list).most_common(500)]
print(most_common_words)

        

['one', 'get', 'would', 'list', 'new', 'like', 'people', 'use', 'email', 'mailing', 'also', 'make', 'free', 'even', 'could', 'time', 'think', 'first', 'using', 'many', 'want', 'see', 'know', 'may', 'way', 'much', 'message', 'need', 'work', 'send', 'please', 'good', 'find', 'take', 'still', 'sep', 'money', 'world', 'united', 'business', 'linux', 'two', 'every', 'government', 'object', 'mail', 'states', 'said', 'really', 'something', 'must', 'since', 'internet', 'years', 'best', 'help', 'web', 'software', 'made', 'last', 'got', 'used', 'right', 'line', 'next', 'spam', 'without', 'old', 'information', 'might', 'found', 'file', 'change', 'name', 'another', 'address', 'going', 'back', 'better', 'data', 'well', 'different', 'system', 'messages', 'sure', 'set', 'say', 'never', 'within', 'give', 'xml', 'technology', 'report', 'look', 'run', 'sponsored', 'security', 'september', 'actually', 'order', 'number', 'put', 'come', 'million', 'great', 'keep', 'home', 'company', 'network', 'long', 'prob

### Transformed training set


In [33]:
X_transform=[]
for email in X_train[:1500]:    
    mail=email_to_text(email)
    X_word=[]
    if mail is not None:
        words=None
        words=mail.lower().split()
        for j in most_common_words:
            num=words.count(j)
            X_word.append(num)
    
    X_transform.append(X_word) 

from scipy import sparse
sparse.csr_matrix(X_transform)

<1500x500 sparse matrix of type '<class 'numpy.int64'>'
	with 33011 stored elements in Compressed Sparse Row format>

In [34]:
## Bonus Task #
# Fine tune the hyperparameters #
lg=RandomForestClassifier()
lg.fit(X_transform,Y_train[:1500])
score = cross_val_score(lg, X_transform, Y_train[:1500], cv=10, verbose=0)
score.mean()

0.9333225476687852