# Task 2 --- Oasis Infobyte

# Email Spam Detection using Machine Learning

# By -- Rahul Sharma

In [18]:
#importing required libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings
warnings.filterwarnings('ignore')

In [19]:
# Load spam.csv into a DataFrame, decoding the file as latin-1
df = pd.read_csv('spam.csv', encoding='latin-1')
# Show the loaded frame as the cell output
df

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [20]:
# Remove the unwanted 'Unnamed: 2', 'Unnamed: 3' and 'Unnamed: 4' columns.
# The result is re-assigned rather than dropped in place, and labels that are
# already absent are ignored, so re-running this cell does not raise KeyError.
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')
df

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [21]:
# Check the dataset's dimensions; the cell output is a (rows, columns) tuple
df.shape

(5572, 2)

In [22]:
# Rename v1/v2 to descriptive column names (applied to df in place)
column_labels = {'v1': 'Category', 'v2': 'Messages'}
df.rename(columns=column_labels, inplace=True)

In [23]:
# Confirm that the old column names were replaced by the new ones
df.head()

Unnamed: 0,Category,Messages
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [24]:
# Build df1 from df, substituting an empty string wherever a value is missing
has_value = df.notna()
df1 = df.where(has_value, '')

In [25]:
# Display df1, the copy of df with missing values replaced by empty strings
df1

Unnamed: 0,Category,Messages
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


# Label Encoding

In [26]:
# Encode the Category column in place: spam -> 0, ham -> 1
label_codes = {'spam': 0, 'ham': 1}
df1.replace({'Category': label_codes}, inplace=True)

# Separating the features and target variables

In [28]:
# Feature variable: the message text column
x=df1['Messages']

In [29]:
# Target variable: the encoded Category column (spam = 0, ham = 1)
y=df1['Category']

In [30]:
# Display the feature Series
x

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                Will Ì_ b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Messages, Length: 5572, dtype: object

In [31]:
# Display the target Series
y

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: int64

# Splitting the data into testing and training data

In [32]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split repeatable between runs
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.30,
    random_state=60,
)

In [33]:
# Size of the training feature split
x_train.shape

(3900,)

In [34]:
# Size of the testing feature split
x_test.shape

(1672,)

In [35]:
# Size of the training label split
y_train.shape

(3900,)

In [36]:
# Size of the testing label split
y_test.shape

(1672,)

# Feature Extraction

In [37]:
# Turn the raw message text into TF-IDF feature vectors.
# `lowercase` takes a boolean; a string such as 'True' only works through
# truthiness and is rejected by scikit-learn versions that validate
# parameter types.
feature_extraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)
# Fit the vectorizer on the training messages only ...
x_train_features = feature_extraction.fit_transform(x_train)
# ... and reuse the fitted vectorizer to transform the test messages
x_test_features = feature_extraction.transform(x_test)

In [38]:
# Cast both label vectors to integer dtype
y_train, y_test = y_train.astype('int'), y_test.astype('int')

In [40]:
# Print the training features returned by the vectorizer's fit_transform
print(x_train_features)

  (0, 1965)	0.30835275135011947
  (0, 4304)	0.39944922457263615
  (0, 3024)	0.45496434802508123
  (0, 1976)	0.35533640057435256
  (0, 4162)	0.49832213926634444
  (0, 2936)	0.4046942397577723
  (1, 6030)	0.6210509952633732
  (1, 5457)	0.6616154121527643
  (1, 4418)	0.4201912751168229
  (2, 1496)	0.42793975570796267
  (2, 926)	0.580650913514294
  (2, 6133)	0.692612505026891
  (3, 654)	0.673197393273669
  (3, 3163)	0.4807743859509671
  (3, 6619)	0.36588897598215814
  (3, 3680)	0.42636430051979635
  (4, 1477)	0.5313085630538509
  (4, 4259)	0.5079689969418486
  (4, 3568)	0.2858894880016133
  (4, 4269)	0.31202709313907856
  (4, 3750)	0.2839018017574477
  (4, 6161)	0.33237531168384077
  (4, 6207)	0.2991845950426612
  (5, 4196)	0.5570689447104746
  (5, 5141)	0.5307307676001095
  :	:
  (3895, 4378)	0.2858834297897797
  (3895, 6767)	0.22405694750968386
  (3895, 2208)	0.2313347167317831
  (3895, 4049)	0.2153106689830759
  (3895, 2169)	0.1680717209364572
  (3895, 5516)	0.26558771820988236
  (3895,

# Training the model

# Logistic Regression

In [41]:
from sklearn.linear_model import LogisticRegression

# Create the classifier with scikit-learn's default settings
lg = LogisticRegression()

In [42]:
# Fit the classifier on the TF-IDF training features and their labels
lg.fit(X=x_train_features, y=y_train)

LogisticRegression()

# Evaluating the Model

In [43]:
# Predict labels for the training features (the data the model was fitted on)
predlg=lg.predict(x_train_features)

In [44]:
# Display the predicted training labels
predlg

array([1, 1, 1, ..., 1, 1, 1])

In [45]:
from sklearn.metrics import accuracy_score

# Accuracy on the training data. accuracy_score's signature is
# (y_true, y_pred), so the ground-truth labels are passed first. The score
# itself is the same either way, but the correct order keeps the call
# consistent with metrics where the order matters.
print(accuracy_score(y_train, predlg))

0.9666666666666667


In [47]:
# Predict labels for the held-out test features
pred_lg=lg.predict(x_test_features)

In [48]:
# Accuracy on the test data, with arguments in accuracy_score's
# (y_true, y_pred) order
print(accuracy_score(y_test, pred_lg))

0.9551435406698564


# Building a Predictive System

In [50]:
# One or more raw messages to classify
input_mail=["I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times.,,,"]

# Convert the text to feature vectors with the already-fitted vectorizer
input_mail_features = feature_extraction.transform(input_mail)

# Predict one label per message
prediction = lg.predict(input_mail_features)
print(prediction)

# Report each prediction individually (1 = ham, anything else = spam).
# Looping avoids using a whole array as an `if` condition, which raises
# ValueError as soon as more than one message is classified.
for label in prediction:
    if label == 1:
        print('Mail is Ham')
    else:
        print('Mail is Spam')



[1]
Mail is Ham
