<a href="https://colab.research.google.com/github/nikagrawal90/Spam_Filter/blob/master/Spam_Filter.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [125]:
import pandas as pd
import numpy as np

In [147]:
# importing dataframe
df = pd.read_csv('/content/drive/My Drive/spam_ham_dataset.csv')

In [148]:
#Drop unnecessary data.
df.drop(['Unnamed: 0'],axis=1,inplace=True)

#Remove any duplicates (if any)
df.drop_duplicates(inplace=True)

#Replace empty strings with nan values so that we can remove them later
df.replace("", np.nan, inplace=True)

#Drop NAN values as they are of no use to us
df.dropna()


Unnamed: 0,label,text,label_num
0,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,spam,"Subject: photoshop , windows , office . cheap ...",1
4,ham,Subject: re : indian springs\r\nthis deal is t...,0
...,...,...,...
5165,ham,"Subject: fw : crosstex energy , driscoll ranch...",0
5166,ham,Subject: put the 10 on the ft\r\nthe transport...,0
5167,ham,Subject: 3 / 4 / 2000 and following noms\r\nhp...,0
5169,ham,Subject: industrial worksheets for august 2000...,0


In [128]:
import nltk
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [161]:
#preprocessing

#import regex
import re

#Every mail start with Subject: thus remove it, also replace any non alphanumeric charcter with a empty string and convert the resulting string to lowercase
df['text'] = df['text'].apply(lambda x: x[8:]).apply(lambda x: (re.sub('(?:[^A-Za-z0-9\s])+','', x)).lower())

#import lemmaizer
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

#lemmatize the words for saving memory
df['text'] = df['text'].apply(lambda x: lemmatizer.lemmatize(x))

#import stop words
from nltk.corpus import stopwords
stop = stopwords.words('english')

#Join the list of text words which are not in stopwords on a whitespace
df['text'] = df['text'].apply(lambda x: ' '.join(word for word in x.split() if word not in stop))
df

Unnamed: 0,label,text,label_num
0,ham,meter 988291 follow note gave monday 4 3 00 pr...,0
1,ham,uary 9 2001 see attached file hplnol 09 xls hp...,0
2,ham,ho ho around wonderful time year neon leaders ...,0
3,spam,dows office cheap main trending abasements dar...,1
4,ham,ings deal book teco pvr revenue understanding ...,0
...,...,...,...
5165,ham,nergy driscoll ranch 1 3 meter nos 9858 9868 p...,0
5166,ham,ft transport volumes decreased 25000 10000 100...,0
5167,ham,following noms hpl take extra 15 mmcf weekend ...,0
5169,ham,sheets august 2000 activity attached worksheet...,0


In [162]:
from sklearn.feature_extraction.text import CountVectorizer

#Assign a CountVectorizer object to vec
vec = CountVectorizer()

#Transform the training and test data using vec object
X = vec.fit_transform(df['text'])

from sklearn.model_selection import train_test_split
# Split the training and test set test size=0.20,default is 0.25 and random state = 0 , so that everytime we run the code it results in similar output
X_train, X_test, y_train, y_test = train_test_split(X, df['label_num'], random_state=0, test_size=0.2)


In [163]:
from sklearn.model_selection import GridSearchCV
# Set candidates for best parameter
parameters_candidate = {'alpha':[0.001,0.01,0.1,1,10,100]}

#The classifier algorithm we are using is Logistic Regression based on linear model
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()

parameters_candidate = {'alpha':[0.001,0.1,1,10,100]}

#Use GridSearchCV to find the best candidate for parameter
#Default Stratified kfold validation which ensures that splitting is done by maintaing the proportion of each class in y
grid = GridSearchCV(estimator=clf, param_grid=parameters_candidate)

#fit the grid on training set
grid.fit(X_train, y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=MultinomialNB(alpha=1.0, class_prior=None,
                                     fit_prior=True),
             iid='deprecated', n_jobs=None,
             param_grid={'alpha': [0.001, 0.1, 1, 10, 100]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [166]:
#Now we are ready to test our model
predictions = grid.predict(X_test)
print(grid.best_params_)
print(grid.score(X_test,y_test))

#Lets print Confusion matrix for test set
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,predictions)

{'alpha': 1}
0.974974974974975


array([[710,  12],
       [ 13, 264]])

In [133]:
#Here we can see that for MultinomailNB we get best parameter tuning for alpha=1.0
#The accuracy we got is nearly 97.4%

In [165]:
# Lets test our model with cross validator
from sklearn.model_selection import cross_val_score
#List of accuracy for every split inside cross validator
accuracy_list =  cross_val_score(estimator = clf, X = X_train, y = y_train, cv = 5)

#Mean accuracy for our model
mean_accuracy = accuracy_list.mean()
print(mean_accuracy)

0.9797202643655446


In [135]:
#Here we can see that the mean accuracy score with 10 splits in cross_validation is nearly 98% which is quite good