# Uczenie maszynowe w Python - Zaliczenie
## E-mail spam classifier

In [7]:
# imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.tokenize import RegexpTokenizer
from textblob import Word
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tomek\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## 1. Analysis of the dataset

In [9]:
#get the dataset
df = pd.read_csv("spam_ham_dataset.csv")

# label the unnamed column with ids
df.rename(columns={'Unnamed: 0': 'id'}, inplace = True)

#show 5 rows
df.head()

Unnamed: 0,id,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [10]:
# assign information about data to variables

# amount of emails in the dataset
emailAmount = df.shape[0]

# Series with the amount of words in each email and their total count
wordsInEmails = (df['text'].apply(lambda x: len(str(x).split(" "))))
wordCount = np.sum(wordsInEmails)

# average amount of words in each email
avgWords = round(np.divide(wordCount, emailAmount), 2)

# amount of spam and ham emails 
hamAndSpamAmount= df.groupby(['label']).count()

hamAmount = hamAndSpamAmount.iloc[0, 0]
spamAmount = hamAndSpamAmount.iloc[1, 0]

In [11]:
# print out the information about the dataset

print(f'The dataset contains {emailAmount} emails, {hamAmount} of which are labeled as non-spam' + 
      f' and {spamAmount} as spam. Between the emails, theres a total of {wordCount} words,' +
      f' with an average of ~{avgWords} words per email.')

The dataset contains 5171 emails, 3672 of which are labeled as non-spam and 1499 as spam. Between the emails, theres a total of 1083244 words, with an average of ~209.48 words per email.


In [12]:
# clean the contents of each email

def clean_contents(string, reg = RegexpTokenizer(r'[a-z]+')):
    string = string.lower()
    tokens = reg.tokenize(string)
    return " ".join(tokens)

# apply the function to each row of the dataframe
df['text_clean'] = df['text'].apply(lambda string: clean_contents(string))

# show the first rows cleaned
df.head()

Unnamed: 0,id,label,text,label_num,text_clean
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0,subject enron methanol meter this is a follow ...
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0,subject hpl nom for january see attached file ...
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0,subject neon retreat ho ho ho we re around to ...
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1,subject photoshop windows office cheap main tr...
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0,subject re indian springs this deal is to book...


In [13]:
# get rid of the stopwords

stop = stopwords.words('english')
df['text_clean'] = df['text_clean'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))

# show the first rows without the stopwords
df.head()

Unnamed: 0,id,label,text,label_num,text_clean
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0,subject enron methanol meter follow note gave ...
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0,subject hpl nom january see attached file hpln...
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0,subject neon retreat ho ho ho around wonderful...
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1,subject photoshop windows office cheap main tr...
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0,subject indian springs deal book teco pvr reve...


In [14]:
# lemmatize the cleaned email contents

df['text_clean'] = df['text_clean'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))

# show the first rows after lemmatizing
df['text_clean'].head(10)

0    subject enron methanol meter follow note gave ...
1    subject hpl nom january see attached file hpln...
2    subject neon retreat ho ho ho around wonderful...
3    subject photoshop window office cheap main tre...
4    subject indian spring deal book teco pvr reven...
5    subject ehronline web address change message i...
6    subject spring saving certificate take save us...
7    subject looking medication best source difficu...
8    subject noms actual flow agree forwarded melis...
9    subject nomination oct see attached file hplnl...
Name: text_clean, dtype: object

In [21]:
# define the most frequent words in the emails

frequentWords = pd.Series(' '.join(df['text_clean']).split()).value_counts()
frequentWords = frequentWords[frequentWords > 1000]

# remove them from the email contents

frequentWords = list(frequentWords.index)
df['text_clean'] = df['text_clean'].apply(lambda x: " ".join(x for x in x.split() if x not in frequentWords))

# show the first rows after removing the most frequent words 
df['text_clean'].head()

0    methanol follow note gave monday preliminary f...
1                       nom january file hplnol hplnol
2    neon retreat ho ho ho around wonderful year ne...
3    photoshop window office cheap main trending ab...
4    indian spring book teco pvr revenue understand...
Name: text_clean, dtype: object