In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# import dataset

In [3]:
# Load the mail dataset into a DataFrame from a hard-coded absolute path.
# NOTE(review): "/content/..." looks like a Colab session path — confirm and adjust when running elsewhere.
mail_data=pd.read_csv("/content/mail_data.csv")

In [4]:
# Let's look at the first five rows

In [5]:
mail_data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
# In this data we have only two columns, Category and Message, and both have the object dtype

In [7]:
# Let's check its shape

In [8]:
mail_data.shape

(5572, 2)

In [9]:
mail_data.isnull().sum() # count of missing values per column; there are none

Category    0
Message     0
dtype: int64

In [10]:
# Let's check for duplicate rows

In [11]:
mail_data.duplicated().sum()

415

In [12]:
mail_data[mail_data.duplicated()] # show the rows flagged as duplicates; the dataset contains 415 of them

Unnamed: 0,Category,Message
103,ham,As per your request 'Melle Melle (Oru Minnamin...
154,ham,As per your request 'Melle Melle (Oru Minnamin...
207,ham,"As I entered my cabin my PA said, '' Happy B'd..."
223,ham,"Sorry, I'll call later"
326,ham,No calls..messages..missed calls
...,...,...
5524,spam,You are awarded a SiPix Digital Camera! call 0...
5535,ham,"I know you are thinkin malaria. But relax, chi..."
5539,ham,Just sleeping..and surfing
5553,ham,Hahaha..use your brain dear


In [13]:
mail_data=mail_data.drop_duplicates() # remove the duplicated rows; note this rebinds mail_data, so the raw frame is no longer available


In [14]:
#now check shape of data

In [15]:
mail_data.shape

(5157, 2)

In [16]:
# Label Encoding in Category column

In [17]:
# SPAM MAIL AS 0 ,  AND HAM MAIL AS 1

In [18]:
# Encode the labels in the Category column: "spam" -> "0", "ham" -> "1".
# The replacement values are strings here; y_train / y_test are cast to int
# in a later cell before the model is fitted.
mail_data=mail_data.replace({"Category":{"spam":"0","ham":"1"}})

In [19]:
mail_data.head()

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [20]:
mail_data["Category"].value_counts() # only 641 mails are spam (label 0) versus 4516 ham (label 1): the classes are imbalanced

1    4516
0     641
Name: Category, dtype: int64

In [21]:
#Separating the data as text and label

In [22]:
# X holds the raw message text, Y the encoded label ("0" = spam, "1" = ham).
X=mail_data["Message"]
Y=mail_data["Category"]

In [23]:
# Splitting the data into training and testing sets

In [24]:
# Hold out 20% of the rows for testing; random_state=3 makes the split repeatable.
# NOTE(review): the labels are imbalanced (641 spam vs 4516 ham) and no `stratify`
# argument is passed, so the spam ratio may differ between the train and test
# sets — consider stratify=Y.
x_train,x_test,y_train,y_test=train_test_split(X,Y,test_size=0.2,random_state=3)

In [25]:
print(x_train.shape,x_test.shape,X.shape)

(4125,) (1032,) (5157,)


In [26]:
print(y_train.shape,y_test.shape,Y.shape)

(4125,) (1032,) (5157,)


**Feature Extraction**
# transform the text data into numerical form

In [27]:
# TF-IDF vectorizer: lowercases the text and drops English stop words.
# min_df=1 keeps every term that occurs in at least one document.
feature_ext=TfidfVectorizer(min_df=1,stop_words='english',lowercase=True)

In [28]:
# Fit the vectorizer on the training messages only, then vectorize them.
x_train_feat=feature_ext.fit_transform(x_train)

In [29]:
# Vectorize the test messages with the already-fitted vectorizer (transform only, no refit).
x_test_feat=feature_ext.transform(x_test)

In [30]:
print(x_train_feat)

  (0, 6692)	0.48303813512243965
  (0, 6605)	0.4898673616987752
  (0, 1247)	0.5538832733861689
  (0, 2400)	0.4689535663823655
  (1, 1592)	0.5594126567616489
  (1, 5859)	0.5964494866231046
  (1, 6492)	0.5755914257195885
  (2, 300)	0.16113294658934302
  (2, 820)	0.24488128414489752
  (2, 6835)	0.24488128414489752
  (2, 5064)	0.22967525805125708
  (2, 7355)	0.21568440262445418
  (2, 682)	0.22967525805125708
  (2, 25)	0.24488128414489752
  (2, 2564)	0.19505111090227498
  (2, 798)	0.24488128414489752
  (2, 4393)	0.24488128414489752
  (2, 4520)	0.24488128414489752
  (2, 4519)	0.24488128414489752
  (2, 4084)	0.2644704778405301
  (2, 6828)	0.13848562282513652
  (2, 3252)	0.18176623831152225
  (2, 3695)	0.21568440262445418
  (2, 4522)	0.2363176943466334
  (2, 4089)	0.2196593047164816
  :	:
  (4120, 1223)	0.5846719438819551
  (4120, 6456)	0.4751891362657192
  (4120, 3181)	0.4285103741434733
  (4120, 3921)	0.39220487670240334
  (4120, 2101)	0.30807173962343376
  (4121, 5968)	0.4786671802830861
  (

In [31]:
print(x_test_feat)

  (0, 6086)	0.3944231765529361
  (0, 4738)	0.32748577740962176
  (0, 4018)	0.33853958897969616
  (0, 3886)	0.41026591111704
  (0, 1518)	0.6739853501828288
  (1, 6643)	0.34343632198724217
  (1, 4277)	0.6049436351033556
  (1, 3080)	0.3405927969937548
  (1, 874)	0.6325276576749704
  (2, 7259)	0.8099051408446086
  (2, 5662)	0.5865608773464819
  (3, 854)	1.0
  (4, 7330)	0.1709848146862032
  (4, 7157)	0.22907651727073883
  (4, 7049)	0.2351824570884995
  (4, 6857)	0.18128246190114575
  (4, 6828)	0.15580311274041234
  (4, 6802)	0.24712748312101787
  (4, 6668)	0.2755034460856627
  (4, 6556)	0.20586305286327689
  (4, 6255)	0.31681116547744975
  (4, 4927)	0.24712748312101787
  (4, 4720)	0.20728837222143673
  (4, 3393)	0.22160345089830902
  (4, 3329)	0.26586898786839125
  :	:
  (1027, 6925)	0.16321241341070308
  (1027, 6813)	0.3158063487602124
  (1027, 6605)	0.23500573210717166
  (1027, 6098)	0.27669654882822414
  (1027, 5753)	0.2799727568730548
  (1027, 4786)	0.33671482458181523
  (1027, 4453)	0.

In [32]:
#change data type

In [33]:
# The labels were encoded as the strings "0"/"1"; cast them to integers
# before fitting and scoring.
y_train=y_train.astype("int")
y_test=y_test.astype('int')

In [34]:
#model creating

In [35]:
# Logistic regression classifier; no arguments are passed, so library defaults apply.
model=LogisticRegression()

In [37]:
# Train on the TF-IDF features of the training messages and their integer labels.
model.fit(x_train_feat,y_train)

In [38]:
# Check Accuracy

In [39]:
pred_train_data=model.predict(x_train_feat)

In [40]:
pred_test=model.predict(x_test_feat)

In [41]:
#train accuracy

In [43]:
# accuracy_score is documented as accuracy_score(y_true, y_pred): pass the
# ground-truth labels first. Accuracy is symmetric in its two arguments, so
# the reported value does not change.
accuracy_score(y_train,pred_train_data)

0.961939393939394

In [44]:
#test accuracy

In [46]:
# accuracy_score is documented as accuracy_score(y_true, y_pred): pass the
# ground-truth labels first. Accuracy is symmetric in its two arguments, so
# the reported value does not change.
accuracy_score(y_test,pred_test)

0.9544573643410853

In [47]:
#check score how good our model is.

In [48]:
from sklearn.metrics import r2_score

In [51]:
# NOTE(review): r2_score is a regression metric, and it is called here as
# (predictions, labels) whereas scikit-learn documents r2_score(y_true, y_pred);
# unlike accuracy, R^2 is not symmetric in its arguments. A value above 0.5 is
# therefore not a sound criterion for judging this classifier — prefer
# precision / recall / F1 or a confusion matrix, especially given the class
# imbalance (641 spam vs 4516 ham).
r2_score(pred_train_data,y_train) # R^2 computed on the training predictions; see the review note above

0.5136183117589275

In [52]:
# Building a predictive system

In [64]:
def mail_func(input_data):
  """Classify mail text as spam or ham and print the result.

  Uses the notebook-level fitted ``feature_ext`` vectorizer and ``model``
  classifier, so both must have been fitted before this is called.

  Parameters
  ----------
  input_data : str or iterable of str
      One message, or a collection of messages, to classify. A single
      string is treated as one message.

  Returns
  -------
  None
      The raw prediction array is printed first, followed by one
      "Ham Mail" / "Spam Mail" line per message (1 = ham, 0 = spam).
  """
  # The vectorizer expects an iterable of documents, so wrap a bare string
  # in a list instead of passing it through as-is.
  if isinstance(input_data, str):
    input_data = [input_data]

  # Convert the text into TF-IDF feature vectors using the fitted vectorizer.
  input_data_feature = feature_ext.transform(input_data)

  # Predict the label of every message.
  prediction = model.predict(input_data_feature)
  print(prediction)

  # Report a label for each message, not only the first one.
  for label in prediction:
    if label == 1:
      print("Ham Mail")
    else:
      print("Spam Mail")

In [65]:
mail_func(["Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"])

[0]
Spam Mail
