Importing the Dependencies

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

Data Collection & Pre-Processing

In [2]:
# loading the data from csv file to a pandas Dataframe
raw_mail_data = pd.read_csv('spam_ham_dataset.csv')

In [4]:
raw_mail_data.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [7]:
# raw_mail_data = raw_mail_data.drop(

Index(['Unnamed: 0', 'label', 'text', 'label_num'], dtype='object')

In [17]:
# replace the null values with a null string
mail_data = raw_mail_data.where((pd.notnull(raw_mail_data)),'')

In [18]:
# printing the first 5 rows of the dataframe
mail_data.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [19]:
# checking the number of rows and columns in the dataframe
mail_data.shape

(5171, 4)

Label Encoding

In [20]:
# label spam mail as 0;  ham mail as 1;

mail_data.loc[mail_data['label'] == 'spam', 'label',] = 0
mail_data.loc[mail_data['label'] == 'ham', 'label',] = 1

# mail_data['label'] = mail_data['label'].map({'spam':0, 'ham':1})
mail_data.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,1,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,1,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,1,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,0,"Subject: photoshop , windows , office . cheap ...",1
4,2030,1,Subject: re : indian springs\r\nthis deal is t...,0


In [22]:
# separating the data as texts and label

X = mail_data['text']

y = mail_data['label']
print(X,y)

0       Subject: enron methanol ; meter # : 988291\r\n...
1       Subject: hpl nom for january 9 , 2001\r\n( see...
2       Subject: neon retreat\r\nho ho ho , we ' re ar...
3       Subject: photoshop , windows , office . cheap ...
4       Subject: re : indian springs\r\nthis deal is t...
                              ...                        
5166    Subject: put the 10 on the ft\r\nthe transport...
5167    Subject: 3 / 4 / 2000 and following noms\r\nhp...
5168    Subject: calpine daily gas nomination\r\n>\r\n...
5169    Subject: industrial worksheets for august 2000...
5170    Subject: important online banking alert\r\ndea...
Name: text, Length: 5171, dtype: object 0       1
1       1
2       1
3       0
4       1
       ..
5166    1
5167    1
5168    1
5169    1
5170    0
Name: label, Length: 5171, dtype: object


In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3)

In [25]:
print(X.shape)
print(X_train.shape)
print(X_test.shape)

(5171,)
(4136,)
(1035,)


Feature Extraction

In [27]:
# transform the text data to feature vectors that can be used as input to the Logistic regression

feature_extraction = TfidfVectorizer(min_df = 1, stop_words='english', lowercase=True)

X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

# convert Y_train and Y_test values as integers

y_train =y_train.astype('int')
y_test = y_test.astype('int')

In [28]:
print(X_train)

2209    Subject: hplc to wellhead\r\ndaren here is the...
2000    Subject: mobil chemical - hpl meter # 1256 - e...
5030    Subject: revised nom 5 / 5 - eastrans ; revise...
1376    Subject: re : exxon company , usa global # 960...
1564    Subject: your pharmacy nx\r\nwant a cheap pain...
                              ...                        
789     Subject: incr ' ease yo ' ur man ' hood by 4 -...
968     Subject: subscribers receive first notice on r...
1667    Subject: neon for march 28\r\nhere is the neon...
3321    Subject: re : first delivery - pure resources ...
1688    Subject: enhance your chest size\r\nemail is l...
Name: text, Length: 4136, dtype: object


In [29]:
print(X_train_features)

  (0, 38946)	0.01685805739459834
  (0, 21882)	0.15822963156663866
  (0, 43279)	0.15279926161032253
  (0, 13656)	0.0442446131354994
  (0, 26040)	0.06237658334565303
  (0, 13860)	0.05933334362320758
  (0, 29166)	0.04619404683559966
  (0, 32538)	0.08215570069958765
  (0, 42165)	0.1301153463744466
  (0, 38303)	0.12444307809779734
  (0, 37119)	0.09423749412823572
  (0, 28593)	0.09000082107218554
  (0, 16156)	0.3284546097301849
  (0, 12663)	0.17173706521192092
  (0, 1179)	0.13387711316973605
  (0, 27743)	0.44166331773844575
  (0, 2645)	0.12444307809779734
  (0, 1177)	0.13387711316973605
  (0, 2844)	0.11843023142166303
  (0, 31384)	0.14556222812251965
  (0, 836)	0.14556222812251965
  (0, 3875)	0.14556222812251965
  (0, 16637)	0.24438399643390496
  (0, 19429)	0.14556222812251965
  (0, 517)	0.14556222812251965
  :	:
  (4135, 18858)	0.08197239345561386
  (4135, 41043)	0.08197239345561386
  (4135, 39629)	0.08197239345561386
  (4135, 14442)	0.08197239345561386
  (4135, 42179)	0.08197239345561386
 

Training the Model

Logistic Regression

In [30]:
model = LogisticRegression()

In [31]:
# training the Logistic Regression model with the training data
model.fit(X_train_features, y_train)

Evaluating the trained model

In [33]:
# prediction on training data

prediction_on_training_data = model.predict(X_train_features)
accuracy_on_training_data = accuracy_score(y_train, prediction_on_training_data)

In [34]:
print('Accuracy on training data : ', accuracy_on_training_data)

Accuracy on training data :  0.9968568665377177


In [35]:
# prediction on test data

prediction_on_test_data = model.predict(X_test_features)
accuracy_on_test_data = accuracy_score(y_test, prediction_on_test_data)

In [36]:
print('Accuracy on test data : ', accuracy_on_test_data)

Accuracy on test data :  0.9806763285024155


Building a Predictive System

In [37]:
input_mail = ["I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times"]

# convert text to feature vectors
input_data_features = feature_extraction.transform(input_mail)

# making prediction

prediction = model.predict(input_data_features)
print(prediction)


if (prediction[0]==1):
  print('Ham mail')

else:
  print('Spam mail')

[0]
Spam mail
