In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [3]:
# loading the data from csv file to a pandas Dataframe
raw_mail_data = pd.read_csv('dataset.csv')

In [4]:
print(raw_mail_data)

       Category                                            Message
0          True  But what has the bomb to do with what I wish y...
1          True              The bomb was approached with caution.
2          True  The road was ripped open, as if by a massive b...
3          True  He wasn't about to believe Death until he saw ...
4          True  Colin finds an unexploded bomb in the Leisure ...
...         ...                                                ...
52411     False  She fell in love with a trapper and he took he...
52412     False                          Maybe love was like that.
52413     False       Oh, I'd love to come here and fish sometime.
52414     False  Mary had never been the jealous type, but then...
52415     False  Was she blinded by love now, or had she merely...

[52416 rows x 2 columns]


In [5]:
# replace the null values with a null string
mail_data = raw_mail_data.where((pd.notnull(raw_mail_data)),'')

In [6]:
# printing the first 5 rows of the dataframe
mail_data.head()

Unnamed: 0,Category,Message
0,True,But what has the bomb to do with what I wish y...
1,True,The bomb was approached with caution.
2,True,"The road was ripped open, as if by a massive b..."
3,True,He wasn't about to believe Death until he saw ...
4,True,Colin finds an unexploded bomb in the Leisure ...


In [7]:
# checking the number of rows and columns in the dataframe
mail_data.shape

(52416, 2)

In [8]:
# label spam mail as 1;  ham mail as 0;

mail_data.loc[mail_data['Category'] == 'True', 'Category',] = 1
mail_data.loc[mail_data['Category'] == 'False', 'Category',] = 0

true here - ie 1 sus

false here - ie 0 non sus

In [9]:
# separating the data as texts and label

X = mail_data['Message']

Y = mail_data['Category']

In [10]:
print(X)

0        But what has the bomb to do with what I wish y...
1                    The bomb was approached with caution.
2        The road was ripped open, as if by a massive b...
3        He wasn't about to believe Death until he saw ...
4        Colin finds an unexploded bomb in the Leisure ...
                               ...                        
52411    She fell in love with a trapper and he took he...
52412                            Maybe love was like that.
52413         Oh, I'd love to come here and fish sometime.
52414    Mary had never been the jealous type, but then...
52415    Was she blinded by love now, or had she merely...
Name: Message, Length: 52416, dtype: object


In [11]:
print(Y)

0         True
1         True
2         True
3         True
4         True
         ...  
52411    False
52412    False
52413    False
52414    False
52415    False
Name: Category, Length: 52416, dtype: object


Splitting the data into training data & test data

In [12]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=3)

In [13]:
print(X.shape)
print(X_train.shape)
print(X_test.shape)

(52416,)
(41932,)
(10484,)


Feature Extraction

In [14]:
# transform the text data to feature vectors that can be used as input to the Logistic regression

feature_extraction = TfidfVectorizer(min_df = 1, stop_words='english', lowercase='True')

X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

# convert Y_train and Y_test values as integers

Y_train = Y_train.astype('int')
Y_test = Y_test.astype('int')

In [15]:
print(X_train)

21133    Mom would have been horrified to see her do th...
3984     Above this is the kamarchin, a tunic of colore...
38046    The same phenomena have been witnessed, not on...
2915     This example is similar to cases among the Pol...
7967       Well, technically I have to eat, just not food.
                               ...                        
25544    Revelation is a divine source of knowledge, of...
48056    Abul Fazl died by the hand of an assassin, whi...
11513    They can make you laugh, lust or cry with lyri...
1688     Spiritual death is their goal and their cackli...
5994     While he must beware of hasty speech, he has o...
Name: Message, Length: 41932, dtype: object


In [16]:
print(X_train_features)

  (0, 4354)	0.24489889618490185
  (0, 4726)	0.3218501497073004
  (0, 7627)	0.5282227133660254
  (0, 5136)	0.5553124652782079
  (0, 6759)	0.4990449306141462
  (1, 11208)	0.2627566245892988
  (1, 7938)	0.17696005376460877
  (1, 8304)	0.23096110712357404
  (1, 11426)	0.15369539145561134
  (1, 10437)	0.12940925506614978
  (1, 474)	0.1852860122975374
  (1, 3680)	0.2627566245892988
  (1, 4720)	0.21321692732779993
  (1, 10987)	0.24501244479352474
  (1, 9063)	0.2627566245892988
  (1, 9450)	0.23248469716963652
  (1, 9332)	0.24501244479352474
  (1, 5830)	0.2627566245892988
  (1, 5807)	0.14126645609539937
  (1, 2235)	0.2416036238600553
  (1, 1796)	0.2627566245892988
  (1, 2309)	0.24383099392760726
  (1, 10710)	0.2627566245892988
  (1, 5796)	0.2627566245892988
  (2, 2098)	0.24635977937724748
  :	:
  (41930, 1774)	0.3759129704729533
  (41930, 9730)	0.3005367009911988
  (41930, 5172)	0.25031041600738246
  (41930, 4707)	0.3390834916266274
  (41930, 2949)	0.22699150735432225
  (41930, 5986)	0.13994231

Training the Model

Logistic Regression

In [17]:
model = LogisticRegression()

In [18]:
# training the Logistic Regression model with the training data
model.fit(X_train_features, Y_train)

LogisticRegression()

Evaluating the trained model

In [19]:
# prediction on training 

prediction_on_training_data = model.predict(X_train_features)
accuracy_on_training_data = accuracy_score(Y_train, prediction_on_training_data)

In [20]:
print('Accuracy on training data : ', accuracy_on_training_data)

Accuracy on training data :  0.9989268339215873


In [21]:
# prediction on test 

prediction_on_test_data = model.predict(X_test_features)
accuracy_on_test_data = accuracy_score(Y_test, prediction_on_test_data)

In [22]:
print('Accuracy on test data : ', accuracy_on_test_data)

Accuracy on test data :  0.9981877146127432


Building a Predictive System

In [23]:
input_mail = ["we will assasin you"]

input_data_features = feature_extraction.transform(input_mail)

# making prediction

prediction = model.predict(input_data_features)
print(prediction)


if (prediction[0]==1):
  print('Suspicious message')

else:
  print('Not suspicious')

[1]
Suspicious message


In [26]:
input_mail = ["kill you"]


input_data_features = feature_extraction.transform(input_mail)

# making prediction

prediction = model.predict(input_data_features)
print(prediction)


if (prediction[0]==1):
  print('Suspicious message')

else:
  print('Not suspicious')

[1]
Suspicious message


In [25]:
input_mail = ["i will go for a walk"]

input_data_features = feature_extraction.transform(input_mail)

# making prediction

prediction = model.predict(input_data_features)
print(prediction)


if (prediction[0]==1):
  print('Suspicious message')

else:
  print('Not suspicious')

[0]
Not suspicious
