<a href="https://colab.research.google.com/github/nikagrawal90/Spam_Filter/blob/master/Spam_Filter.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the raw spam/ham e-mail dataset from the CSV file next to the notebook
DATASET_PATH = 'spam_ham_dataset.csv'
df = pd.read_csv(DATASET_PATH)

In [None]:
# Drop the unnamed column that carries no useful data.
# errors='ignore' keeps the cell re-runnable once the column is already gone.
df.drop(['Unnamed: 0'], axis=1, inplace=True, errors='ignore')

# Remove duplicate rows (if any)
df.drop_duplicates(inplace=True)

# Replace empty strings with NaN so that the dropna below removes them as well
df.replace("", np.nan, inplace=True)

# Drop rows with NaN values as they are of no use to us.
# dropna() returns a new frame by default, so a bare call only displays the
# result and leaves the NaN rows in df; inplace=True actually removes them.
df.dropna(inplace=True)

# Display the cleaned frame
df


Unnamed: 0,label,text,label_num
0,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,spam,"Subject: photoshop , windows , office . cheap ...",1
4,ham,Subject: re : indian springs\r\nthis deal is t...,0
...,...,...,...
5165,ham,"Subject: fw : crosstex energy , driscoll ranch...",0
5166,ham,Subject: put the 10 on the ft\r\nthe transport...,0
5167,ham,Subject: 3 / 4 / 2000 and following noms\r\nhp...,0
5169,ham,Subject: industrial worksheets for august 2000...,0


In [None]:
# Fetch the NLTK corpora used by the preprocessing cell:
# 'wordnet' backs WordNetLemmatizer and 'stopwords' backs stopwords.words('english').
import nltk
nltk.download('wordnet')
# Last expression of the cell: its return value is what the notebook displays.
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
#preprocessing

#import regex
import re

#import lemmatizer and stop words
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

stop = stopwords.words('english')
# A set gives constant-time membership tests; `stop` is a list, which would be
# scanned once for every word of every mail.
stop_words = set(stop)


def preprocess_text(text):
    """Normalise one raw e-mail into a space-separated string of lemmatized words.

    Steps, in order:
      1. strip the leading "Subject:" marker when present,
      2. delete every character that is not a letter, digit or whitespace,
      3. lowercase,
      4. drop English stop words,
      5. lemmatize each remaining word.

    Parameters
    ----------
    text : str
        Raw e-mail text.

    Returns
    -------
    str
        The cleaned words joined on a single space.
    """
    # Remove the marker only when it is really there. A blind text[8:] slice
    # would eat real content on mails without the prefix, and again every
    # time this cell is re-run on already-cleaned text.
    text = re.sub(r'^Subject:', '', text)

    # Raw string: '\s' inside a normal string literal is an invalid escape sequence.
    text = re.sub(r'[^A-Za-z0-9\s]+', '', text).lower()

    # lemmatize() expects a single word, so it has to be called per token;
    # handing it the whole document returns the document essentially unchanged.
    # Stop words are filtered first so they are matched in their original form.
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    )


df['text'] = df['text'].apply(preprocess_text)
df

Unnamed: 0,label,text,label_num
0,ham,enron methanol meter 988291 follow note gave m...,0
1,ham,hpl nom january 9 2001 see attached file hplno...,0
2,ham,neon retreat ho ho ho around wonderful time ye...,0
3,spam,photoshop windows office cheap main trending a...,1
4,ham,indian springs deal book teco pvr revenue unde...,0
...,...,...,...
5165,ham,fw crosstex energy driscoll ranch 1 3 meter no...,0
5166,ham,put 10 ft transport volumes decreased 25000 10...,0
5167,ham,3 4 2000 following noms hpl take extra 15 mmcf...,0
5169,ham,industrial worksheets august 2000 activity att...,0


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

# Split the cleaned texts BEFORE vectorising so that the vocabulary is learned
# from the training mails only and nothing from the test set leaks into the features.
# test_size=0.20 (default is 0.25) and random_state=0 so that every run yields the same split.
text_train, text_test, y_train, y_test = train_test_split(
    df['text'], df['label_num'], random_state=0, test_size=0.2
)

#Assign a CountVectorizer object to vec
vec = CountVectorizer()

# Learn the vocabulary on the training texts and turn them into a count matrix
X_train = vec.fit_transform(text_train)

# Encode the test texts with the already fitted vocabulary (no re-fitting)
X_test = vec.transform(text_test)

# Full corpus encoded with the training vocabulary, kept so that `X` stays available
X = vec.transform(df['text'])


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Candidate values of C that the grid search will try
parameters_candidate = dict(C=[0.001, 0.1, 1, 10])

# Classifier: Logistic Regression, a linear model
clf = LogisticRegression()

# GridSearchCV picks the best candidate by cross-validation.
# cv is left at its default, which uses stratified k-fold so each split keeps
# the proportion of every class in y.
grid = GridSearchCV(clf, parameters_candidate)

# Fit the grid search on the training set
grid.fit(X_train, y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [0.001, 0.1, 1, 10]}, pre_dispatch='2*n_jobs',
             refit=True, return_train_score=False, scoring=None, verbose=0)

In [None]:
from sklearn.metrics import confusion_matrix

# Evaluate the tuned model on the held-out test set
predictions = grid.predict(X_test)

# Best parameter found by the grid search, then the score on the test set
best_parameters = grid.best_params_
print(best_parameters)
test_score = grid.score(X_test, y_test)
print(test_score)

# Confusion matrix for the test set
confusion_matrix(y_true=y_test, y_pred=predictions)

{'C': 1}
0.9819819819819819


array([[708,  14],
       [  4, 273]])

In [None]:
#Here we can see that for LogisticRegression the grid search selects C=1 as the best parameter
#The accuracy we got on the test set is nearly 98%

In [None]:
from sklearn.model_selection import cross_val_score

# Cross-validate the classifier on the training set with 5 splits;
# the result holds one accuracy value per split
accuracy_list = cross_val_score(clf, X_train, y_train, cv=5)

# Average the per-split accuracies and report the mean
mean_accuracy = accuracy_list.mean()
print(mean_accuracy)

0.9767149412956672


In [None]:
#Here we can see that the mean accuracy score with 5 splits in cross_validation is nearly 97.7% which is quite good !