In [64]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from warnings import filterwarnings
filterwarnings('ignore')

In [65]:
df=pd.read_csv('SMSSpam',sep='\t',names=['label','message'])
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [66]:
df.shape

(5572, 2)

In [67]:
# replacing unwanted charcters
df['message_new']=df['message'].str.replace('[^a-zA-Z0-9]+',' ')
df.head()

Unnamed: 0,label,message,message_new
0,ham,"Go until jurong point, crazy.. Available only ...",Go until jurong point crazy Available only in ...
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say
4,ham,"Nah I don't think he goes to usf, he lives aro...",Nah I don t think he goes to usf he lives arou...


In [68]:
# convert to lowercase
df['message_new']=df['message_new'].str.lower()
df.head()

Unnamed: 0,label,message,message_new
0,ham,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only in ...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...
3,ham,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah i don t think he goes to usf he lives arou...


In [69]:
# stripping to spaces
df['message_new']=df['message_new'].str.strip()
df.head()

Unnamed: 0,label,message,message_new
0,ham,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only in ...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...
3,ham,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah i don t think he goes to usf he lives arou...


In [70]:
df.isnull().sum()

label          0
message        0
message_new    0
dtype: int64

In [71]:
df[df.duplicated()]

Unnamed: 0,label,message,message_new
103,ham,As per your request 'Melle Melle (Oru Minnamin...,as per your request melle melle oru minnaminun...
154,ham,As per your request 'Melle Melle (Oru Minnamin...,as per your request melle melle oru minnaminun...
207,ham,"As I entered my cabin my PA said, '' Happy B'd...",as i entered my cabin my pa said happy b day b...
223,ham,"Sorry, I'll call later",sorry i ll call later
326,ham,No calls..messages..missed calls,no calls messages missed calls
...,...,...,...
5524,spam,You are awarded a SiPix Digital Camera! call 0...,you are awarded a sipix digital camera call 09...
5535,ham,"I know you are thinkin malaria. But relax, chi...",i know you are thinkin malaria but relax child...
5539,ham,Just sleeping..and surfing,just sleeping and surfing
5553,ham,Hahaha..use your brain dear,hahaha use your brain dear


In [72]:
# dropping the duplicates
df.drop_duplicates(inplace=True)

In [73]:
df.shape

(5169, 3)

In [74]:
import nltk
# nltk.download('wordnet')
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

In [76]:
# stopwords with stemming
corpus=[]
lm=PorterStemmer()
for i in range(len(df)):
    message=df.message_new.iloc[i].split()
    message=[lm.stem(word) for word in message if not word in stopwords.words('english')]
    message=' '.join(message)
    corpus.append(message)
df['stemming']=corpus

In [77]:
df.head()

Unnamed: 0,label,message,message_new,stemming
0,ham,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only in ...,go jurong point crazi avail bugi n great world...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni,ok lar joke wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...,free entri 2 wkli comp win fa cup final tkt 21...
3,ham,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say,u dun say earli hor u c alreadi say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah i don t think he goes to usf he lives arou...,nah think goe usf live around though


In [80]:
print(df.message_new.iloc[1],df.stemming.iloc[1],sep='\n')

ok lar joking wif u oni
ok lar joke wif u oni


In [81]:
# Encoding
df['label']=df['label'].replace({'ham':1,'spam':0})
df.head()

Unnamed: 0,label,message,message_new,stemming
0,1,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only in ...,go jurong point crazi avail bugi n great world...
1,1,Ok lar... Joking wif u oni...,ok lar joking wif u oni,ok lar joke wif u oni
2,0,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...,free entri 2 wkli comp win fa cup final tkt 21...
3,1,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say,u dun say earli hor u c alreadi say
4,1,"Nah I don't think he goes to usf, he lives aro...",nah i don t think he goes to usf he lives arou...,nah think goe usf live around though


In [82]:
from sklearn.model_selection import train_test_split
x=df.stemming
y=df.label
xtrain,xtest,ytrain,ytest=train_test_split(x,y,test_size=0.3,random_state=0)

## Bag of Words

In [83]:
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(ngram_range=(1,2))

In [84]:
xtrain_cv=cv.fit_transform(xtrain)
xtest_cv=cv.transform(xtest)

In [86]:
cv.get_feature_names()

['00',
 '00 easter',
 '00 per',
 '00 sub',
 '000',
 '000 bonu',
 '000 cash',
 '000 homeown',
 '000 pound',
 '000 price',
 '000 xma',
 '008704050406',
 '008704050406 sp',
 '0089',
 '0089 last',
 '0121',
 '0121 2025050',
 '01223585236',
 '01223585236 xx',
 '0125698789',
 '0125698789 ring',
 '02',
 '02 06',
 '02 09',
 '02 user',
 '0207',
 '0207 083',
 '0207 153',
 '02072069400',
 '02072069400 bx',
 '02073162414',
 '02073162414 cost',
 '02085076972',
 '02085076972 repli',
 '021',
 '021 3680',
 '03',
 '03 05',
 '03 2nd',
 '03 final',
 '03 marsm',
 '04',
 '05',
 '05 05',
 '05 prize',
 '050703',
 '050703 csbcm4235wc1n3xx',
 '0578',
 '06',
 '06 03',
 '06 05',
 '06 11',
 '07',
 '07 11',
 '07046744435',
 '07046744435 arrang',
 '07090201529',
 '07099833605',
 '07099833605 schedul',
 '07123456789',
 '07123456789 87077',
 '0721072',
 '0721072 find',
 '07742676969',
 '07742676969 show',
 '0776xxxxxxx',
 '0776xxxxxxx invit',
 '07781482378',
 '07781482378 com',
 '07786200117',
 '078',
 '07801543489',


In [104]:
from sklearn.linear_model import LogisticRegression
lr=LogisticRegression()
model_cv=lr.fit(xtrain_cv,ytrain)
ypred_cv=model.predict(xtest_cv)
ypred_cv

array([1, 1, 1, ..., 1, 1, 0], dtype=int64)

In [105]:
from sklearn.metrics import accuracy_score,f1_score,classification_report

In [106]:
accuracy_score(ytest,ypred_cv)

0.9742101869761445

In [107]:
f1_score(ytest,ypred_cv)

0.9853587115666178

In [108]:
print(classification_report(ytest,ypred_cv))

              precision    recall  f1-score   support

           0       0.98      0.82      0.89       201
           1       0.97      1.00      0.99      1350

    accuracy                           0.97      1551
   macro avg       0.98      0.91      0.94      1551
weighted avg       0.97      0.97      0.97      1551



## Tfidfvectorizer 

In [102]:
# tfidf
from sklearn.feature_extraction.text import TfidfVectorizer
tf=TfidfVectorizer()
xtrain_tf=tf.fit_transform(xtrain)
xtest_tf=tf.transform(xtest)

In [110]:
from sklearn.linear_model import LogisticRegression
lr=LogisticRegression()
model_tf=lr.fit(xtrain_tf,ytrain)
ypred_tf=model_tf.predict(xtest_tf)
ypred_tf

array([1, 1, 1, ..., 1, 1, 1], dtype=int64)

In [111]:
accuracy_score(ytest,ypred_tf)

0.9638942617666022

In [112]:
f1_score(ytest,ypred_tf)

0.9796215429403202

In [113]:
print(classification_report(ytest,ypred_tf))

              precision    recall  f1-score   support

           0       0.97      0.74      0.84       201
           1       0.96      1.00      0.98      1350

    accuracy                           0.96      1551
   macro avg       0.97      0.87      0.91      1551
weighted avg       0.96      0.96      0.96      1551



## Word2vec

In [115]:
from gensim.models import Word2Vec
w2v=Word2Vec

In [133]:
#Build the model
model = Word2Vec( #Word list
#                                min_count=10, #Ignore all words with total frequency lower than this                           
#                                workers=4, #Number of CPU Cores
#                                size=50,  #Embedding size
#                                window=5, #Maximum Distance between current and predicted word
#                                iter=10   #Number of iterations over the text corpus
                              )  

In [134]:
model.build_vocab(df.stemming,progress_per=1000)

In [126]:
model.predict_output_word

<bound method Word2Vec.predict_output_word of <gensim.models.word2vec.Word2Vec object at 0x00000238BED45B80>>

In [127]:
model

<gensim.models.word2vec.Word2Vec at 0x238bed45b80>

In [128]:
model.wv

<gensim.models.keyedvectors.KeyedVectors at 0x238bed45bb0>