In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.utils import resample
import spacy
import nltk
from nltk.stem import PorterStemmer
from spacy.lang.en.stop_words import STOP_WORDS
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

In [2]:
df = pd.read_csv('spam.csv')
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
df.shape

(5572, 2)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [5]:
df.describe()

Unnamed: 0,Category,Message
count,5572,5572
unique,2,5157
top,ham,"Sorry, I'll call later"
freq,4825,30


In [6]:
df.isnull().sum()

Category    0
Message     0
dtype: int64

In [7]:
df.Category.value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

In [8]:
#Spam upsampling
Spam_upsampling = resample(df[df.Category == 'spam'] ,n_samples = len(df[df.Category == 'ham']) , replace=True , random_state=42)

In [9]:
Spam_upsampling

Unnamed: 0,Category,Message
713,spam,08714712388 between 10am-7pm Cost 10p
3230,spam,Ur cash-balance is currently 500 pounds - to m...
1929,spam,Call from 08702490080 - tells u 2 call 0906635...
738,spam,Hi. Customer Loyalty Offer:The NEW Nokia6650 M...
505,spam,#ERROR!
...,...,...
4248,spam,Text PASS to 69669 to collect your polyphonic ...
3675,spam,You have won a Nokia 7250i. This is what you g...
3620,spam,8007 25p 4 Alfie Moon's Children in Need song ...
3501,spam,Dorothy@kiefer.com (Bank of Granite issues Str...


In [10]:
ham = df[df.Category == 'ham']

In [11]:
ham

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
6,ham,Even my brother is not like to speak with me. ...
...,...,...
5565,ham,Huh y lei...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [12]:
df_new = pd.concat([Spam_upsampling , ham])

In [13]:
df_new

Unnamed: 0,Category,Message
713,spam,08714712388 between 10am-7pm Cost 10p
3230,spam,Ur cash-balance is currently 500 pounds - to m...
1929,spam,Call from 08702490080 - tells u 2 call 0906635...
738,spam,Hi. Customer Loyalty Offer:The NEW Nokia6650 M...
505,spam,#ERROR!
...,...,...
5565,ham,Huh y lei...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [14]:
df_new['Spam'] = df.Category.apply(lambda x : 1 if x == 'spam' else 0)
df_new.head()

Unnamed: 0,Category,Message,Spam
713,spam,08714712388 between 10am-7pm Cost 10p,1
3230,spam,Ur cash-balance is currently 500 pounds - to m...,1
1929,spam,Call from 08702490080 - tells u 2 call 0906635...,1
738,spam,Hi. Customer Loyalty Offer:The NEW Nokia6650 M...,1
505,spam,#ERROR!,1


In [15]:
df_new.Spam.value_counts()

Spam
1    4825
0    4825
Name: count, dtype: int64

In [24]:
nlp = spacy.load('en_core_web_sm')

In [25]:
def preprocess(text):
    doc = nlp(text)
    no_stop_words = [token.text for token in doc if not token.is_stop and not token.is_punct]
    return " ".join(no_stop_words)   

In [26]:
df_new['Message_new'] = df_new.Message.apply(preprocess)
df_new.head()

Unnamed: 0,Category,Message,Spam,Message_new
713,spam,08714712388 between 10am-7pm Cost 10p,1,08714712388 10am-7pm Cost 10p
3230,spam,Ur cash-balance is currently 500 pounds - to m...,1,Ur cash balance currently 500 pounds maximize ...
1929,spam,Call from 08702490080 - tells u 2 call 0906635...,1,08702490080 tells u 2 09066358152 claim £ 5000...
738,spam,Hi. Customer Loyalty Offer:The NEW Nokia6650 M...,1,Hi Customer Loyalty Offer NEW Nokia6650 Mobile...
505,spam,#ERROR!,1,ERROR


In [27]:
def stemmer(text):
    text = text.split()
    words = ''
    stemmer = PorterStemmer()
    for i in text:
        words += (stemmer.stem(i)) + ' '
    return words.strip()  

In [28]:
df_new['Message Stremmed'] = df_new.Message_new.apply(stemmer)
df_new

Unnamed: 0,Category,Message,Spam,Message_new,Message Stremmed
713,spam,08714712388 between 10am-7pm Cost 10p,1,08714712388 10am-7pm Cost 10p,08714712388 10am-7pm cost 10p
3230,spam,Ur cash-balance is currently 500 pounds - to m...,1,Ur cash balance currently 500 pounds maximize ...,ur cash balanc current 500 pound maxim ur cash...
1929,spam,Call from 08702490080 - tells u 2 call 0906635...,1,08702490080 tells u 2 09066358152 claim £ 5000...,08702490080 tell u 2 09066358152 claim £ 5000 ...
738,spam,Hi. Customer Loyalty Offer:The NEW Nokia6650 M...,1,Hi Customer Loyalty Offer NEW Nokia6650 Mobile...,hi custom loyalti offer new nokia6650 mobil £ ...
505,spam,#ERROR!,1,ERROR,error
...,...,...,...,...,...
5565,ham,Huh y lei...,0,Huh y lei,huh y lei
5568,ham,Will ü b going to esplanade fr home?,0,ü b going esplanade fr home,ü b go esplanad fr home
5569,ham,"Pity, * was in mood for that. So...any other s...",0,Pity mood suggestions,piti mood suggest
5570,ham,The guy did some bitching but I acted like i'd...,0,guy bitching acted like interested buying week...,guy bitch act like interest buy week gave free


In [30]:
x = df_new.Message_new
y = df_new.Spam

In [31]:
x.head()

713                         08714712388 10am-7pm Cost 10p
3230    Ur cash balance currently 500 pounds maximize ...
1929    08702490080 tells u 2 09066358152 claim £ 5000...
738     Hi Customer Loyalty Offer NEW Nokia6650 Mobile...
505                                                 ERROR
Name: Message_new, dtype: object

In [32]:
y.head()

713     1
3230    1
1929    1
738     1
505     1
Name: Spam, dtype: int64

In [33]:
x.shape

(9650,)

In [34]:
y.shape

(9650,)

In [35]:
xtrain , xtest , ytrain , ytest = train_test_split(x , y , test_size = 0.2 , random_state = 42)

In [36]:
xtrain.shape

(7720,)

In [37]:
xtest.shape

(1930,)

In [38]:
ytrain.shape

(7720,)

In [39]:
ytest.shape

(1930,)

In [40]:
v = CountVectorizer()

In [41]:
xtrain_cv = v.fit_transform(xtrain.values)

In [42]:
xtrain_cv.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [43]:
xtest_cv = v.transform(xtest)

In [44]:
xtest_cv.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [45]:
print(xtrain_cv.shape)
print(xtest_cv.shape)

(7720, 7747)
(1930, 7747)


In [47]:
v.vocabulary_

{'amanda': 990,
 'regard': 5689,
 'renewing': 5726,
 'upgrading': 7203,
 'current': 2176,
 'mobile': 4577,
 'handset': 3360,
 'free': 3022,
 'charge': 1782,
 'offer': 4933,
 'ends': 2640,
 'today': 6951,
 'tel': 6767,
 '0845': 72,
 '021': 13,
 '3680': 495,
 'subject': 6568,
 'seeing': 6008,
 'weird': 7460,
 'shit': 6110,
 'bein': 1358,
 'woah': 7570,
 'realising': 5628,
 'actually': 867,
 'reasonable': 5638,
 'oh': 4945,
 'secret': 5997,
 'admirer': 882,
 'looking': 4224,
 'contact': 2035,
 'find': 2889,
 'reveal': 5789,
 'thinks': 6863,
 'ur': 7213,
 'special': 6372,
 '09058094599': 204,
 'friends': 3044,
 'help': 3441,
 'problems': 5459,
 'stupid': 6561,
 'suggestion': 6603,
 'lands': 4030,
 'problem': 5458,
 'helps': 3448,
 'forgt': 2992,
 'previous': 5432,
 'usually': 7242,
 'person': 5185,
 'unconscious': 7149,
 'children': 1840,
 'adults': 894,
 'behave': 1356,
 'abnormally': 822,
 'll': 4185,
 'monthly': 4614,
 'password': 5129,
 'wap': 7400,
 'mobsi': 4584,
 'com': 1959,
 '3917

In [48]:
v.get_feature_names_out()[1450:1500]

array(['blood', 'bloody', 'bloomberg', 'blow', 'blowing', 'blown', 'blu',
       'blue', 'bluetooth', 'bluetoothhdset', 'bluff', 'bluray', 'bmw',
       'board', 'boat', 'boatin', 'bob', 'body', 'boggy', 'bognor',
       'bold', 'bold2', 'boltblue', 'bomb', 'bone', 'bong', 'bonus',
       'boo', 'boobs', 'book', 'booked', 'bookedthe', 'booking',
       'bookmark', 'books', 'bookshelf', 'boooo', 'boost', 'booty',
       'bootydelious', 'borderline', 'bored', 'borin', 'boring', 'born',
       'borrow', 'boss', 'boston', 'bot', 'both'], dtype=object)

In [49]:
model = MultinomialNB()
model.fit(xtrain_cv, ytrain)

In [50]:
ypred = model.predict(xtest_cv)

In [51]:
print(classification_report(ytest, ypred))

              precision    recall  f1-score   support

           0       0.98      0.97      0.98       945
           1       0.97      0.98      0.98       985

    accuracy                           0.98      1930
   macro avg       0.98      0.98      0.98      1930
weighted avg       0.98      0.98      0.98      1930



In [52]:
#using Bag of Words
v = CountVectorizer(ngram_range = (1,3))

In [53]:
xtrain_cv = v.fit_transform(xtrain.values)
xtest_cv = v.transform(xtest)

In [54]:
model = MultinomialNB()
model.fit(xtrain_cv, ytrain)

ypred = model.predict(xtest_cv)
print(classification_report(ytest, ypred))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99       945
           1       0.98      1.00      0.99       985

    accuracy                           0.99      1930
   macro avg       0.99      0.99      0.99      1930
weighted avg       0.99      0.99      0.99      1930



In [55]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [56]:
from sklearn.pipeline import Pipeline

clf = Pipeline(
    [
        ('Vectorizer' , TfidfVectorizer()) , 
        ('Model' , MultinomialNB())
    ]
)

In [57]:
clf.fit(xtrain, ytrain)

y_pred = clf.predict(xtest)

print(classification_report(ytest, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.97      0.98       945
           1       0.97      0.98      0.98       985

    accuracy                           0.98      1930
   macro avg       0.98      0.98      0.98      1930
weighted avg       0.98      0.98      0.98      1930



In [58]:
df_new.head()

Unnamed: 0,Category,Message,Spam,Message_new,Message Stremmed
713,spam,08714712388 between 10am-7pm Cost 10p,1,08714712388 10am-7pm Cost 10p,08714712388 10am-7pm cost 10p
3230,spam,Ur cash-balance is currently 500 pounds - to m...,1,Ur cash balance currently 500 pounds maximize ...,ur cash balanc current 500 pound maxim ur cash...
1929,spam,Call from 08702490080 - tells u 2 call 0906635...,1,08702490080 tells u 2 09066358152 claim £ 5000...,08702490080 tell u 2 09066358152 claim £ 5000 ...
738,spam,Hi. Customer Loyalty Offer:The NEW Nokia6650 M...,1,Hi Customer Loyalty Offer NEW Nokia6650 Mobile...,hi custom loyalti offer new nokia6650 mobil £ ...
505,spam,#ERROR!,1,ERROR,error


In [59]:
nlp = spacy.load("en_core_web_lg")

In [60]:
df_new['vector'] = df_new['Message Stremmed'].apply(lambda text : nlp(text).vector)

In [61]:
df_new.head()

Unnamed: 0,Category,Message,Spam,Message_new,Message Stremmed,vector
713,spam,08714712388 between 10am-7pm Cost 10p,1,08714712388 10am-7pm Cost 10p,08714712388 10am-7pm cost 10p,"[0.76285756, -0.5446782, -1.0078951, -0.132202..."
3230,spam,Ur cash-balance is currently 500 pounds - to m...,1,Ur cash balance currently 500 pounds maximize ...,ur cash balanc current 500 pound maxim ur cash...,"[-0.48464194, 0.09029646, -0.6008689, 1.244997..."
1929,spam,Call from 08702490080 - tells u 2 call 0906635...,1,08702490080 tells u 2 09066358152 claim £ 5000...,08702490080 tell u 2 09066358152 claim £ 5000 ...,"[-0.54434526, -1.2294583, -1.9976357, -0.18263..."
738,spam,Hi. Customer Loyalty Offer:The NEW Nokia6650 M...,1,Hi Customer Loyalty Offer NEW Nokia6650 Mobile...,hi custom loyalti offer new nokia6650 mobil £ ...,"[-0.17209114, -1.0705323, -1.7756789, 0.423812..."
505,spam,#ERROR!,1,ERROR,error,"[0.54302, -1.3744, 3.4711, -0.045181, 5.9612, ..."


In [63]:
df_new.vector.head()

713     [0.76285756, -0.5446782, -1.0078951, -0.132202...
3230    [-0.48464194, 0.09029646, -0.6008689, 1.244997...
1929    [-0.54434526, -1.2294583, -1.9976357, -0.18263...
738     [-0.17209114, -1.0705323, -1.7756789, 0.423812...
505     [0.54302, -1.3744, 3.4711, -0.045181, 5.9612, ...
Name: vector, dtype: object

In [64]:
x = df_new.vector
y = df_new.Spam

In [67]:
x.head()

713     [0.76285756, -0.5446782, -1.0078951, -0.132202...
3230    [-0.48464194, 0.09029646, -0.6008689, 1.244997...
1929    [-0.54434526, -1.2294583, -1.9976357, -0.18263...
738     [-0.17209114, -1.0705323, -1.7756789, 0.423812...
505     [0.54302, -1.3744, 3.4711, -0.045181, 5.9612, ...
Name: vector, dtype: object

In [68]:
xtrain ,xtest , ytrain , ytest = train_test_split(x.values , y , test_size = 0.2 , random_state = 42)

In [70]:
xtrain

array([array([-0.52962446,  1.3999765 , -1.3469889 , -1.4543858 ,  1.5599191 ,
              -0.04191551,  0.2991285 ,  0.7533139 , -2.2313097 , -2.346805  ,
               1.9278944 , -0.88171256, -3.6105113 ,  0.8815495 ,  0.31929296,
               0.4531819 ,  2.2829392 , -1.2099172 , -1.6781559 , -0.90747994,
              -1.340752  ,  1.6054169 , -0.9405115 ,  0.4785843 , -0.70824367,
               0.32641354, -0.38028902,  0.22155957, -0.6622351 ,  0.7628514 ,
               1.6710083 ,  0.7377454 , -1.0969187 , -1.4177198 ,  1.6631384 ,
              -0.04821927,  0.9628499 ,  0.46304426,  1.7572334 ,  0.9963199 ,
               0.32249248,  0.006051  , -0.05477653,  0.8832761 , -0.8809126 ,
               2.2452884 ,  0.0833539 , -1.6537975 ,  1.0495325 ,  0.951543  ,
              -0.21529599, -1.9706892 ,  0.4155895 , -2.2342849 , -1.9676981 ,
               2.310335  , -0.42960396,  2.1430647 ,  0.14888522,  0.193995  ,
               2.7360187 , -0.24276802, -2.981199  ,

In [71]:
ytrain

4149    1
4741    0
1623    1
5385    0
520     0
       ..
1071    0
430     0
661     0
4797    1
2835    0
Name: Spam, Length: 7720, dtype: int64

In [72]:
xtrain = np.stack(xtrain)
xtest = np.stack(xtest)

In [73]:
xtrain

array([[-0.52962446,  1.3999765 , -1.3469889 , ..., -1.6283016 ,
        -1.8753376 ,  1.9062907 ],
       [ 0.88571334,  0.26109666, -1.6135668 , ...,  1.25285   ,
        -1.9823767 ,  0.45039463],
       [ 0.2560361 ,  0.49593082, -0.27429077, ..., -1.080024  ,
        -2.3950205 , -0.13956155],
       ...,
       [-0.4439909 , -2.1364942 , -0.78174293, ...,  0.39900988,
        -0.9983758 ,  0.51671803],
       [-0.931806  , -1.88904   , -2.3927748 , ..., -1.0539123 ,
         0.08801281,  2.490285  ],
       [-1.47078   ,  0.39235005, -1.3470668 , ...,  0.9086667 ,
        -2.7441666 ,  1.0271333 ]], dtype=float32)

In [74]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaled_train = scaler.fit_transform(xtrain)
scaled_test = scaler.transform(xtest)

model = MultinomialNB()
model.fit(scaled_train, ytrain)

In [75]:
ypred = model.predict(scaled_test)

In [76]:
print(classification_report(ytest, ypred))

              precision    recall  f1-score   support

           0       0.82      0.83      0.83       945
           1       0.84      0.83      0.83       985

    accuracy                           0.83      1930
   macro avg       0.83      0.83      0.83      1930
weighted avg       0.83      0.83      0.83      1930



In [77]:
from  sklearn.neighbors import KNeighborsClassifier


clf = KNeighborsClassifier(n_neighbors = 5, metric = 'euclidean')
clf.fit(xtrain, ytrain)
y_pred = clf.predict(xtest)

print(classification_report(ytest, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.88      0.93       945
           1       0.90      0.99      0.94       985

    accuracy                           0.94      1930
   macro avg       0.94      0.94      0.94      1930
weighted avg       0.94      0.94      0.94      1930

