In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.utils import resample
import spacy
import nltk
from nltk.stem import PorterStemmer
from spacy.lang.en.stop_words import STOP_WORDS
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

In [7]:
# Load the labelled SMS dataset from the working directory. The shape and
# preview shown below report two columns: Category ('ham'/'spam') and Message.
df = pd.read_csv('spam.csv')
print(df.shape)
df.head()

(5572, 2)


Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
# Spam upsampling: draw spam rows with replacement until the spam class is as
# large as the ham class (fixed seed, so the sample is reproducible).
# NOTE(review): this runs on the full dataset, before the train/test split
# further down, so copies of the same spam message can land in both splits and
# inflate the reported test metrics. Consider splitting first and resampling
# only the training portion.
Spam_upsampling = resample(
    df[df.Category == 'spam'],
    replace=True,
    n_samples=len(df[df.Category == 'ham']),
    random_state=42,
)

In [9]:
# Keep every ham row as-is; only the spam class is resampled.
ham = df.loc[df.Category == 'ham']

In [10]:
# Stack the upsampled spam rows on top of the ham rows to get a balanced frame.
# The original row labels are kept (no ignore_index), so spam labels repeat.
df_new = pd.concat([Spam_upsampling, ham], axis=0)
df_new

Unnamed: 0,Category,Message
713,spam,08714712388 between 10am-7pm Cost 10p
3230,spam,Ur cash-balance is currently 500 pounds - to m...
1929,spam,Call from 08702490080 - tells u 2 call 0906635...
738,spam,Hi. Customer Loyalty Offer:The NEW Nokia6650 M...
505,spam,#ERROR!
...,...,...
5565,ham,Huh y lei...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [11]:
# Binary target: 1 for spam, 0 for ham. Derived from df_new's own Category
# column so the labels never depend on index alignment with the original `df`
# (df_new carries repeated row labels after upsampling).
df_new['Spam'] = df_new.Category.eq('spam').astype('int64')

In [12]:
# Load spaCy's small English pipeline. `nlp` is the module-level object that
# preprocess() calls to tokenise text and read stop-word / punctuation flags.
# The 'en_core_web_sm' model package must already be installed.
nlp = spacy.load('en_core_web_sm')

In [14]:
def preprocess(text):
    """Remove stop words and punctuation from a message.

    The text is run through the module-level spaCy pipeline ``nlp``; tokens
    flagged ``is_stop`` or ``is_punct`` are dropped and the original text of
    the surviving tokens is joined with single spaces.

    Args:
        text: Raw message string.

    Returns:
        A space-separated string of the remaining tokens (empty if none remain).
    """
    kept_tokens = []
    for token in nlp(text):
        if token.is_stop or token.is_punct:
            continue
        kept_tokens.append(token.text)
    return " ".join(kept_tokens)


def stemmer(text):
    """Apply Porter stemming to every whitespace-separated word of ``text``.

    Args:
        text: String of words separated by whitespace.

    Returns:
        The stemmed words joined by single spaces, with no leading or
        trailing whitespace ('' for empty or whitespace-only input).
    """
    # Local is named `porter` to avoid shadowing the enclosing function `stemmer`.
    porter = PorterStemmer()
    # str.join builds the result in one pass rather than growing a string with `+=`.
    return " ".join(porter.stem(word) for word in text.split())

# Two-stage cleaning: strip stop words / punctuation, then stem what is left.
# Each stage is stored in its own column so the intermediate text stays inspectable.
df_new['Message_new'] = df_new['Message'].apply(preprocess)
df_new['Message_Stremmed'] = df_new['Message_new'].apply(stemmer)
df_new

Unnamed: 0,Category,Message,Spam,Message_new,Message Stremmed,Message_Stremmed
713,spam,08714712388 between 10am-7pm Cost 10p,1,08714712388 10am-7pm Cost 10p,08714712388 10am-7pm cost 10p,08714712388 10am-7pm cost 10p
3230,spam,Ur cash-balance is currently 500 pounds - to m...,1,Ur cash balance currently 500 pounds maximize ...,ur cash balanc current 500 pound maxim ur cash...,ur cash balanc current 500 pound maxim ur cash...
1929,spam,Call from 08702490080 - tells u 2 call 0906635...,1,08702490080 tells u 2 09066358152 claim £ 5000...,08702490080 tell u 2 09066358152 claim £ 5000 ...,08702490080 tell u 2 09066358152 claim £ 5000 ...
738,spam,Hi. Customer Loyalty Offer:The NEW Nokia6650 M...,1,Hi Customer Loyalty Offer NEW Nokia6650 Mobile...,hi custom loyalti offer new nokia6650 mobil £ ...,hi custom loyalti offer new nokia6650 mobil £ ...
505,spam,#ERROR!,1,ERROR,error,error
...,...,...,...,...,...,...
5565,ham,Huh y lei...,0,Huh y lei,huh y lei,huh y lei
5568,ham,Will ü b going to esplanade fr home?,0,ü b going esplanade fr home,ü b go esplanad fr home,ü b go esplanad fr home
5569,ham,"Pity, * was in mood for that. So...any other s...",0,Pity mood suggestions,piti mood suggest,piti mood suggest
5570,ham,The guy did some bitching but I acted like i'd...,0,guy bitching acted like interested buying week...,guy bitch act like interest buy week gave free,guy bitch act like interest buy week gave free


In [15]:
# Features are the stemmed message text; the target is the 0/1 spam flag.
x = df_new['Message_Stremmed']
y = df_new['Spam']

In [16]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
# NOTE(review): df_new already contains the upsampled spam duplicates, so the
# test split is not independent of the training split.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [17]:
# Bag-of-words vectorizer counting unigrams, bigrams and trigrams.
v = CountVectorizer(ngram_range = (1,3))

In [18]:
# Learn the n-gram vocabulary from the training text only, then encode the
# test text with that same vocabulary.
xtrain_cv = v.fit_transform(xtrain.values)
xtest_cv = v.transform(xtest)

# Both matrices share the same column count (one column per vocabulary entry).
print(xtrain_cv.shape, xtest_cv.shape, sep='\n')

(7720, 53528)
(1930, 53528)


In [20]:
# Fit a multinomial Naive Bayes classifier on the n-gram counts
# (fit() returns the estimator itself, so the call can be chained).
model_final = MultinomialNB().fit(xtrain_cv, ytrain)

# Per-class precision / recall / F1 on the held-out rows.
ypred = model_final.predict(xtest_cv)
print(classification_report(y_true=ytest, y_pred=ypred))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99       945
           1       0.98      1.00      0.99       985

    accuracy                           0.99      1930
   macro avg       0.99      0.99      0.99      1930
weighted avg       0.99      0.99      0.99      1930



In [None]:
# NOTE(review): consider moving this import into the imports cell at the top.
import pickle

# Persist the trained classifier to 'model_pickle' in the working directory.
# NOTE(review): only the model is saved here; predicting on new text also needs
# the fitted CountVectorizer `v`, so make sure it is persisted as well.
with open ('model_pickle' , 'wb') as f:
    pickle.dump(model_final , f)
    
    
with    