In [40]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer

Data Collection & Pre-processing

In [41]:
# Load the labelled e-mail dataset from the working directory and preview
# the first five rows.
mail_df = pd.read_csv(filepath_or_buffer='spam_ham_dataset.csv')
mail_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [42]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
mail_df.shape

(5171, 4)

In [43]:
# Count missing values per column (isna is the canonical name of isnull).
mail_df.isna().sum()

Unnamed: 0,0
Unnamed: 0,0
label,0
text,0
label_num,0


In [44]:
# Class balance according to the numeric target column.
mail_df.loc[:, 'label_num'].value_counts()

Unnamed: 0_level_0,count
label_num,Unnamed: 1_level_1
0,3672
1,1499


spam ---> 1

ham ---> 0

In [45]:
# Class balance according to the textual label column.
mail_df.loc[:, 'label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
ham,3672
spam,1499


In [46]:
# List the column labels currently present in the frame.
mail_df.columns

Index(['Unnamed: 0', 'label', 'text', 'label_num'], dtype='object')

In [47]:
# Keep only the e-mail text and the numeric target: drop the leftover
# 'Unnamed: 0' column and the textual 'label' column.
# Rebinding (instead of inplace=True) together with errors='ignore' makes the
# cell idempotent: running it a second time no longer raises KeyError because
# the columns are already gone.
mail_df = mail_df.drop(columns=['Unnamed: 0', 'label'], errors='ignore')
mail_df.head()

Unnamed: 0,text,label_num
0,Subject: enron methanol ; meter # : 988291\r\n...,0
1,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,"Subject: photoshop , windows , office . cheap ...",1
4,Subject: re : indian springs\r\nthis deal is t...,0


In [48]:
# Separate the frame into model input (raw e-mail text) and target
# (numeric label).
X, Y = mail_df['text'], mail_df['label_num']

In [49]:
# Show the text series followed by the label series, one after the other.
print(X, Y, sep='\n')

0       Subject: enron methanol ; meter # : 988291\r\n...
1       Subject: hpl nom for january 9 , 2001\r\n( see...
2       Subject: neon retreat\r\nho ho ho , we ' re ar...
3       Subject: photoshop , windows , office . cheap ...
4       Subject: re : indian springs\r\nthis deal is t...
                              ...                        
5166    Subject: put the 10 on the ft\r\nthe transport...
5167    Subject: 3 / 4 / 2000 and following noms\r\nhp...
5168    Subject: calpine daily gas nomination\r\n>\r\n...
5169    Subject: industrial worksheets for august 2000...
5170    Subject: important online banking alert\r\ndea...
Name: text, Length: 5171, dtype: object
0       0
1       0
2       0
3       1
4       0
       ..
5166    0
5167    0
5168    0
5169    0
5170    1
Name: label_num, Length: 5171, dtype: int64


In [50]:
# Hold out 20% of the e-mails for testing. The split is stratified on the
# label so both parts keep the same spam/ham ratio, and the fixed seed makes
# it reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=3,
    stratify=Y,
)

In [51]:
# Print the shape of each split, one per line, in the order:
# train text, test text, train labels, test labels.
print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape, sep='\n')

(4136,)
(1035,)
(4136,)
(1035,)


 Feature extraction

In [57]:
# Turn the raw e-mail text into TF-IDF feature vectors that can be fed to
# logistic regression. The vectorizer is fitted on the training split only;
# the test split is transformed with that already-fitted vectorizer.
feature_extraction = TfidfVectorizer(
    min_df=1,
    stop_words='english',
    lowercase=True,
)
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

In [58]:
# Print the TF-IDF training matrix produced by fit_transform; the recorded
# output shows it is a sparse matrix, listed as (row, column) -> value entries.
print(X_train_features)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 271005 stored elements and shape (4136, 44782)>
  Coords	Values
  (0, 38848)	0.0233968588143813
  (0, 27550)	0.10508568721241728
  (0, 40166)	0.13945534705545934
  (0, 37401)	0.15822773419334218
  (0, 8341)	0.1262471229642913
  (0, 14965)	0.18058370056817458
  (0, 32023)	0.19253541556619574
  (0, 31211)	0.0943927865303455
  (0, 21873)	0.19253541556619574
  (0, 7854)	0.10262058210296071
  (0, 26929)	0.12057194258560895
  (0, 43603)	0.12970134158427699
  (0, 36778)	0.2805527474213879
  (0, 26050)	0.09780429065409588
  (0, 9440)	0.1858045587336128
  (0, 415)	0.06936462200543111
  (0, 40000)	0.07925814404276134
  (0, 28462)	0.1479761622175791
  (0, 36179)	0.1519511995740034
  (0, 43760)	0.13193130040473205
  (0, 26054)	0.2805527474213879
  (0, 13852)	0.1760943876720704
  (0, 14786)	0.09120985904078249
  (0, 43613)	0.09193944159143777
  (0, 25508)	0.11571383368358455
  :	:
  (4135, 26414)	0.09526535852763258
  (4135, 3653)	0.1036

Training the model


In [54]:
# Fit a logistic-regression classifier (library defaults) on the TF-IDF
# training features and their labels.
lg_model = LogisticRegression()
lg_model.fit(X=X_train_features, y=Y_train)


In [55]:
# Accuracy on the data the model was trained on.
y_train_pred = lg_model.predict(X_train_features)
# accuracy_score expects (y_true, y_pred). The original call had them swapped,
# which happens to give the same number for accuracy but would silently give
# wrong results if the metric were replaced by an asymmetric one
# (precision, recall, ...).
training_accuracy = accuracy_score(Y_train, y_train_pred)
print("Training Accuracy:", training_accuracy)

Training Accuracy: 0.9961315280464217


In [56]:
# Accuracy on the held-out test split.
y_test_pred = lg_model.predict(X_test_features)
# accuracy_score expects (y_true, y_pred). The original call had them swapped,
# which happens to give the same number for accuracy but would silently give
# wrong results if the metric were replaced by an asymmetric one
# (precision, recall, ...).
test_accuracy = accuracy_score(Y_test, y_test_pred)
print("test Accuracy:", test_accuracy)

test Accuracy: 0.9874396135265701


Building a prediction system

In [61]:
# Classify a single hand-written e-mail with the fitted vectorizer and model.
# The sample is stored in `input_mail` rather than `input`, so the Python
# builtin `input()` is not shadowed for the rest of the kernel session.
input_mail = ["Subject: vlc , 0 dln for sale , no prior pres . crip . tion neededenjoy up to 80 % off"]

# Convert the text to feature vectors using the already-fitted vectorizer.
input_data_features = feature_extraction.transform(input_mail)

# Make the prediction; the model returns one label per input document.
prediction = lg_model.predict(input_data_features)
print(prediction)

# Label 1 is reported as spam, anything else as ham.
if prediction[0] == 1:
    print('spam mail')
else:
    print('ham mail')

[1]
spam mail
