In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the mail dataset from a CSV file in the working directory and preview
# the first rows (the preview shows a Category label and a Message text column).

df = pd.read_csv('mail_data.csv')
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# Count the missing values in each column.

df.isna().sum()

Category    0
Message     0
dtype: int64

In [4]:
# Dataset dimensions as (rows, columns).
df.shape

(5572, 2)

### Label Encoding

* Label spam as 0; ham as 1

In [5]:
# Distinct label values present in the Category column.
df['Category'].unique()

array(['ham', 'spam'], dtype=object)

In [6]:
# Encode the label as an integer column: 1 where Category is 'ham',
# 0 for every other value.
is_ham = df['Category'] == 'ham'
df['Category_new'] = np.where(is_ham, 1, 0)
df.head()

Unnamed: 0,Category,Message,Category_new
0,ham,"Go until jurong point, crazy.. Available only ...",1
1,ham,Ok lar... Joking wif u oni...,1
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,0
3,ham,U dun say so early hor... U c already then say...,1
4,ham,"Nah I don't think he goes to usf, he lives aro...",1


In [7]:
# Drop the original text label now that the numeric Category_new column exists.
# errors='ignore' keeps this cell idempotent: re-running it after the column
# has already been removed is a no-op instead of raising KeyError.

df = df.drop(columns=['Category'], errors='ignore')
df.head()

Unnamed: 0,Message,Category_new
0,"Go until jurong point, crazy.. Available only ...",1
1,Ok lar... Joking wif u oni...,1
2,Free entry in 2 a wkly comp to win FA Cup fina...,0
3,U dun say so early hor... U c already then say...,1
4,"Nah I don't think he goes to usf, he lives aro...",1


In [13]:
# Summarize column dtypes and non-null counts of the encoded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Message       5572 non-null   object
 1   Category_new  5572 non-null   int32 
dtypes: int32(1), object(1)
memory usage: 65.4+ KB


In [8]:
# Separate the message texts from their labels.
# X: independent feature (raw message text)
# y: dependent feature (encoded label)

X, y = df['Message'], df['Category_new']

### Train Test Split

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
# Hold out 25% of the rows for testing; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [11]:
# Print the shape of each split, in the order: train texts, test texts,
# train labels, test labels.
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

(4179,)
(1393,)
(4179,)
(1393,)


### TF-IDF

* Transforming the text data into TF-IDF feature vectors

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [14]:
# Build TF-IDF features from the raw message text.
# min_df: ignore terms that have a document frequency strictly lower than the given threshold.
# The vectorizer is fitted on the training split only; the test split is
# transformed with that same fitted vectorizer.

tv = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    min_df=1,
)
X_train_features = tv.fit_transform(X_train)
X_test_features = tv.transform(X_test)

In [15]:
# Print the training feature matrix; each printed entry is a
# (row, column) index pair followed by its TF-IDF weight.
print(X_train_features)

  (0, 3269)	0.22054637918276537
  (0, 268)	0.24268694798090706
  (0, 6770)	0.22054637918276537
  (0, 3683)	0.29820405490236296
  (0, 1759)	0.2170940285037797
  (0, 193)	0.29820405490236296
  (0, 1703)	0.3463668729960997
  (0, 5412)	0.26659686450403225
  (0, 5072)	0.18515987150578092
  (0, 697)	0.2603077338178364
  (0, 5272)	0.29820405490236296
  (0, 5624)	0.21107975758257638
  (0, 1994)	0.19237243410818475
  (0, 4440)	0.21107975758257638
  (0, 6774)	0.2393992502064106
  (0, 7026)	0.23108043306518108
  (1, 2292)	0.3492579692442415
  (1, 3877)	0.28721518875566576
  (1, 1542)	0.26346904298985996
  (1, 6331)	0.42282226614314494
  (1, 3562)	0.42282226614314494
  (1, 5750)	0.3615497364476182
  (1, 3245)	0.24301471455207635
  (1, 5581)	0.42282226614314494
  (2, 1710)	0.5953778026871843
  :	:
  (4174, 2361)	0.4566133962329911
  (4174, 7074)	0.3957305909097366
  (4174, 3858)	0.4566133962329911
  (4174, 3187)	0.2651298270698404
  (4174, 2162)	0.2706455499867595
  (4174, 2988)	0.23095874540003675

In [16]:
# Print the test feature matrix; each printed entry is a
# (row, column) index pair followed by its TF-IDF weight.
print(X_test_features)

  (0, 4797)	0.2249854341740055
  (0, 3978)	0.28142633196821903
  (0, 3838)	0.3091529161984464
  (0, 3288)	0.6591729096397505
  (0, 3127)	0.32958645481987525
  (0, 2105)	0.24988724470478607
  (0, 1998)	0.2956936883872987
  (0, 1692)	0.2856408585418783
  (1, 6956)	0.3958833420713546
  (1, 6787)	0.2492928312395227
  (1, 6447)	0.3238476862667447
  (1, 6351)	0.2203215476723988
  (1, 5276)	0.38477401788178034
  (1, 3923)	0.32100202325847027
  (1, 3335)	0.3238476862667447
  (1, 1910)	0.36137462920657265
  (1, 1307)	0.3756970390973034
  (2, 6376)	0.2618780102833671
  (2, 5440)	0.3785175534180398
  (2, 4240)	0.3785175534180398
  (2, 3400)	0.34667462215073225
  (2, 2988)	0.1914572370643152
  (2, 2971)	0.19119695512710608
  (2, 2301)	0.3785175534180398
  (2, 1239)	0.5551556989491848
  :	:
  (1389, 5636)	0.49471725723951215
  (1389, 5624)	0.39169552471782315
  (1389, 4122)	0.3970328650072385
  (1389, 2988)	0.26677113133733354
  (1389, 2971)	0.26640846180380934
  (1389, 2359)	0.3970328650072385
  (

### Model Training

In [17]:
from sklearn.linear_model import LogisticRegression

In [19]:
# Fit a logistic-regression classifier (constructor defaults) on the
# TF-IDF features of the training split.
model = LogisticRegression()
model.fit(X_train_features,y_train)

### Evaluating the trained model

In [21]:
# Prediction (training data): labels the fitted model assigns to the same
# rows it was trained on.

y_pred = model.predict(X_train_features)

### Accuracy

In [20]:
from sklearn.metrics import accuracy_score

In [22]:
# Fraction of training labels the model predicts correctly.
accuracy_score(y_true=y_train, y_pred=y_pred)

0.9653027039961714

##### 96.5% accuracy score on the training data

In [23]:
# Prediction (test data): score the model on the held-out split.

y_pred_test = model.predict(X_test_features)
accuracy_score(y_true=y_test, y_pred=y_pred_test)

0.9662598707824839

##### 96.6% accuracy score on the test data

### Building Predictive System

In [24]:
# Sample message taken from the dataset, wrapped in a list because the
# vectorizer expects an iterable of documents.

input_mail = [
    'SIX chances to win CASH! From 100 to 20,000 pounds txt> CSH11 and send to 87575. '
    'Cost 150p/day, 6days, 16+ TsandCs apply Reply HL 4 info'
]

In [25]:
# Converting text to feature vectors with the already-fitted TF-IDF
# vectorizer (transform only, so the vectorizer is not refitted).

input_data_feature = tv.transform(input_mail)

In [26]:
# Predicting the encoded label for the sample message
# (1 = ham, 0 = spam, per the label encoding earlier in the notebook).

pred = model.predict(input_data_feature)
print(pred)

[0]


In [27]:
# Translate the encoded prediction into a readable label: 1 is ham,
# anything else is reported as spam.
verdict = "Ham Mail" if pred[0] == 1 else "Spam mail"
print(verdict)

Spam mail
