In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# TfidfVectorizer converts raw text documents into TF-IDF weighted feature vectors.


In [2]:
# Load the dataset from "mail_data.csv"; the relative path is resolved against
# the current working directory.
raw_mail_data=pd.read_csv("mail_data.csv")

In [3]:
# Rich display of the frame exactly as loaded from the CSV.
raw_mail_data

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


Label encoding: spam → 0, ham → 1

In [4]:
# Replace missing values with empty strings. A new frame is produced, so
# raw_mail_data itself is left untouched.
mail_data = raw_mail_data.where(raw_mail_data.notna(), other="")

In [5]:
# Display the frame after null values were replaced with empty strings.
mail_data

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [6]:
# (rows, columns) of the cleaned frame.
mail_data.shape

(5572, 2)

In [8]:
# Encode the labels in place: spam -> 0, ham -> 1.
mail_data.loc[mail_data["Category"].eq("spam"), "Category"] = 0
mail_data.loc[mail_data["Category"].eq("ham"), "Category"] = 1

In [9]:
# Display the frame after label encoding; Category now holds 0 (spam) / 1 (ham).
mail_data

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,0,This is the 2nd time we have tried 2 contact u...
5568,1,Will ü b going to esplanade fr home?
5569,1,"Pity, * was in mood for that. So...any other s..."
5570,1,The guy did some bitching but I acted like i'd...


In [10]:
# Features are the raw message texts; the target is the encoded category.
X, y = mail_data["Message"], mail_data["Category"]

In [13]:
# Inspect the target labels. Their dtype is still object at this point, which
# is why they are cast to int before training.
print(y)

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: object


In [15]:
# Hold out 20% of the messages for testing; random_state pins the shuffle so
# the split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=3,
)

In [16]:
# Sanity-check the split sizes: full set, training part, test part (one per line).
print(X.shape, X_train.shape, X_test.shape, sep="\n")

(5572,)
(4457,)
(1115,)


In [17]:
# Text preprocessing: build the TF-IDF vectorizer.
# - min_df=1 keeps every term that appears in at least one document.
# - stop_words="english" drops scikit-learn's built-in English stop words.
# - lowercase must be the boolean True; the string "True" is rejected by
#   scikit-learn's parameter validation (>= 1.2) when the vectorizer is fitted.
feature_extraction = TfidfVectorizer(min_df=1, stop_words="english", lowercase=True)

In [19]:
# Learn the vocabulary and IDF weights from the training messages only ...
X_train_feat = feature_extraction.fit_transform(X_train)
# ... then reuse that fitted vocabulary for the test messages, so nothing from
# the test set influences the features.
X_test_feat = feature_extraction.transform(X_test)

In [20]:
# Convert the label Series to an integer dtype before fitting the classifier.
y_train, y_test = y_train.astype("int"), y_test.astype("int")

In [22]:
# Print the sparse TF-IDF matrix: each entry is shown as
# (row index, feature index) followed by its weight.
print(X_train_feat)

  (0, 5413)	0.6198254967574347
  (0, 4456)	0.4168658090846482
  (0, 2224)	0.413103377943378
  (0, 3811)	0.34780165336891333
  (0, 2329)	0.38783870336935383
  (1, 4080)	0.18880584110891163
  (1, 3185)	0.29694482957694585
  (1, 3325)	0.31610586766078863
  (1, 2957)	0.3398297002864083
  (1, 2746)	0.3398297002864083
  (1, 918)	0.22871581159877646
  (1, 1839)	0.2784903590561455
  (1, 2758)	0.3226407885943799
  (1, 2956)	0.33036995955537024
  (1, 1991)	0.33036995955537024
  (1, 3046)	0.2503712792613518
  (1, 3811)	0.17419952275504033
  (2, 407)	0.509272536051008
  (2, 3156)	0.4107239318312698
  (2, 2404)	0.45287711070606745
  (2, 6601)	0.6056811524587518
  (3, 2870)	0.5864269879324768
  (3, 7414)	0.8100020912469564
  (4, 50)	0.23633754072626942
  (4, 5497)	0.15743785051118356
  :	:
  (4454, 4602)	0.2669765732445391
  (4454, 3142)	0.32014451677763156
  (4455, 2247)	0.37052851863170466
  (4455, 2469)	0.35441545511837946
  (4455, 5646)	0.33545678464631296
  (4455, 6810)	0.29731757715898277
  (4

In [23]:
# Logistic-regression classifier with default hyperparameters.
model=LogisticRegression()
# Fit on the TF-IDF training features; left as the cell's last expression so
# the fitted estimator's repr is displayed.
model.fit(X_train_feat,y_train)

LogisticRegression()

In [24]:
# Score the model on the data it was trained on (an optimistic baseline).
pred_on_train = model.predict(X_train_feat)
accuracy_on_train = accuracy_score(y_true=y_train, y_pred=pred_on_train)

In [25]:
# Accuracy on the training split (fraction of correctly classified messages).
print(accuracy_on_train)

0.9670181736594121


In [26]:
# Score the model on the held-out test split.
pred_on_test = model.predict(X_test_feat)
accuracy_on_test = accuracy_score(y_true=y_test, y_pred=pred_on_test)

In [27]:
# Accuracy on the held-out test split (fraction of correctly classified messages).
accuracy_on_test

0.9659192825112107

In [38]:
# Building a predictive system: vectorize one sample message with the
# already-fitted TF-IDF vectorizer. The text (including the "receivea" typo)
# is kept verbatim so the extracted features are unchanged.
input_mail = [
    "WINNER!! As a valued network customer you have been selected to "
    "receivea £900 prize reward! To claim call 09061701461. "
    "Claim code KL341. Valid 12 hours only."
]
input_data_feat = feature_extraction.transform(input_mail)

In [39]:
# Classify the vectorized message; label 1 means ham, anything else is
# reported as spam.
prediction = model.predict(input_data_feat)

print("Ham email" if prediction[0] == 1 else "Spam Email")

Spam Email
