<a href="https://colab.research.google.com/github/nidhin-koshy/ML_AI_IISc/blob/master/42_email_spam_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Email Spam Classification

In this module we will attempt to classify emails as a spam email or not. We will use the email dataset available at: http://nlp.cs.aueb.gr/software_and_datasets/Enron-Spam/index.html

In [0]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
#import nltk
#nltk.download('punkt')

In [2]:
folder_list = ['enron1','enron2','enron3','enron4','enron5','enron6'] # The data are present in 6 folders. We will look at just the first two as this notebook takes a lot of memory and crashes if we try to use all the 6 files.
for folder in folder_list:
  
  os.system("wget -N 'http://nlp.cs.aueb.gr/software_and_datasets/Enron-Spam/preprocessed/" +folder + ".tar.gz'")
  os.system("tar -xzf "+folder+".tar.gz")
! ls -l

total 16980
drwx------ 4 1006  513    4096 May 15  2006 enron1
-rw-r--r-- 1 root root 1802573 Sep 14  2013 enron1.tar.gz
drwx------ 4 1006  513    4096 May 15  2006 enron2
-rw-r--r-- 1 root root 2905627 Sep 14  2013 enron2.tar.gz
drwx------ 4 1006  513    4096 May 15  2006 enron3
-rw-r--r-- 1 root root 4569634 Sep 14  2013 enron3.tar.gz
drwx------ 4 1006  513    4096 May 15  2006 enron4
-rw-r--r-- 1 root root 2533019 Sep 14  2013 enron4.tar.gz
drwx------ 4 1006  513    4096 May 15  2006 enron5
-rw-r--r-- 1 root root 2396886 Sep 14  2013 enron5.tar.gz
drwx------ 4 1006  513    4096 May 15  2006 enron6
-rw-r--r-- 1 root root 3137204 Sep 14  2013 enron6.tar.gz
drwxr-xr-x 1 root root    4096 Jul 30 16:17 sample_data


In [3]:
! cat enron1/Summary.txt

Legitimate
----------
- Owner: farmer-d
- Total number: 3672 emails
- Date of first email: 1999-12-10
- Date of last email: 2002-01-11
- Similars deletion: No
- Encoding: No


Spam
----
- Owner: GP
- Total number: 1500 emails
- Date of first email: 2003-12-18
- Date of last email: 2005-09-06
- Similars deletion: No
- Encoding: No

Spam:Legitimate rate = 1:3
Total number of emails (legitimate + spam): 5975


In [0]:

token_ham_list=[]
document_ham_list =[]
token_spam_list =[]
document_spam_list=[]

for folder in folder_list:
  

  directory='enron1/ham' # emails which are non-spam
  for filename in os.listdir(directory):
      if filename.endswith(".txt"):
          filename_complete = os.path.join(directory, filename) #full filename with directory path
          #print(os.path.join(directory, filename))
          file_content = open(filename_complete).read() # open the file and read the contents
          #token_ham_list.append(nltk.word_tokenize(file_content))
          document_ham_list.append(file_content) # append the file contents as an element to a list.
          continue
      else:
          continue



  directory='enron1/spam' # emails which are spam
  for filename in os.listdir(directory):
      if filename.endswith(".txt"):
          filename_complete = os.path.join(directory, filename)
          #print(os.path.join(directory, filename))
          file_content = open(filename_complete,encoding='latin-1').read()
          #token_spam_list.append(nltk.word_tokenize(file_content))
          document_spam_list.append(file_content)
          continue
      else:
          continue       

In [0]:
from sklearn.feature_extraction.text import CountVectorizer
document_list = np.concatenate((document_ham_list,document_spam_list),axis=0) # concatenate the non-spam and spam contents
#print(document_list[:10])
#print(document_list[-10:])
vectorizer = CountVectorizer(max_features=1000) # Initialise the word count extractor and restrict the number of features to 1000
word_count_array = vectorizer.fit_transform(document_list) # vectorizes the text content as word count vector
word_count_array = word_count_array.toarray() #convert the vectorized data as an array. This will be a very very sparse matrix

In [6]:
print("features = ", vectorizer.get_feature_names()) # prints the word dictionary
print("length of feature vector = ", len(vectorizer.get_feature_names())) # prints the length of the word dictionary
print("word_count_array = ", word_count_array[:10,:50])
print("word_count_array_shape = ",word_count_array.shape ) # prints the shape of the word count matrix

features =  ['00', '000', '01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '100', '11', '12', '120', '13', '14', '15', '16', '161', '17', '18', '19', '1999', '20', '200', '2000', '2001', '2003', '2004', '21', '22', '23', '24', '25', '26', '27', '28', '281', '29', '30', '300', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '42', '44', '45', '47', '48', '49', '50', '500', '52', '53', '54', '55', '56', '58', '59', '60', '63', '6353', '69', '70', '713', '75', '80', '800', '830', '85', '90', '95', '98', '99', 'able', 'about', 'above', 'access', 'account', 'accounting', 'act', 'action', 'activity', 'acton', 'actual', 'actuals', 'add', 'added', 'addition', 'additional', 'address', 'adobe', 'advice', 'advise', 'aep', 'after', 'afternoon', 'again', 'agree', 'agreement', 'aimee', 'al', 'albrecht', 'align', 'all', 'allen', 'allocated', 'allocation', 'already', 'also', 'always', 'am', 'america', 'ami', 'amount', 'an', 'and', 'anita', 'another', 'any', 'anyone', 'anything', 

In [7]:
ham_len = len(document_ham_list) # number of non-spam emails
spam_len=len(document_spam_list) # number of spam emails
print("ham_len = ", ham_len, ". spam_len = ",spam_len)

ham_len =  22032 . spam_len =  9000


In [8]:
labels = np.concatenate((np.zeros((ham_len,1)),np.ones((spam_len,1))),axis=0) # generate the label vector

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

plt.figure(figsize=(10,10))
x_train, x_test, y_train, y_test = train_test_split(word_count_array, labels, test_size=0.4,random_state=42) # randomly split the data into training and test samples. 0.4 fraction is set aside for testing and 0.6 is used for training
log_reg = LogisticRegression() #DecisionTreeClassifier(max_depth=4)
log_reg.fit(x_train,y_train)
#tree.plot_tree(log_reg.fit(x_train,y_train),feature_names=vectorizer.get_feature_names(),class_names=['non-spam','spam'])

print("Classification score = ", log_reg.score(x_test,y_test)) #Works for all classifiers

y_predict = log_reg.predict(x_test)
confu_matrix = confusion_matrix(y_test,y_predict) # obtain the confusion matrix
print("Confusion Matrix = \n", confu_matrix)


  y = column_or_1d(y, warn=True)


Classification score =  0.9964553290904696
Confusion Matrix = 
 [[8810   36]
 [   8 3559]]


<Figure size 720x720 with 0 Axes>