In [66]:
import spacy
import pandas as pd
import numpy as np

In [67]:
df = pd.read_csv("spam.csv")
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [68]:
df['spam'] = df['Category'].apply(lambda x : 1 if x == 'spam' else 0)
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [70]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df.Message, df.spam, test_size=0.2)

In [71]:
X_train

5398    Hi. Hope you had a good day. Have a better night.
5503    Perhaps * is much easy give your account ident...
3085    Ok lor. I ned 2 go toa payoh 4 a while 2 retur...
3268                    Ok then i come n pick u at engin?
3768                  Sir Goodmorning, Once free call me.
                              ...                        
1482                     I'm a guy, browsin is compulsory
772     Lol! U drunkard! Just doing my hair at d momen...
2943    Hello. No news on job, they are making me wait...
4703                                           Anytime...
1383    Its ok my arm is feeling weak cuz i got a shot...
Name: Message, Length: 4457, dtype: object

In [72]:
from sklearn.feature_extraction.text import CountVectorizer

v = CountVectorizer()
X_train_cv = v.fit_transform(X_train)
X_train_cv

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 59135 stored elements and shape (4457, 7676)>

In [73]:
X_train_cv.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], shape=(4457, 7676))

In [75]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train_cv, y_train)

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [76]:
X_test_cv = v.transform(X_test)

In [77]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test_cv)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       954
           1       0.98      0.90      0.94       161

    accuracy                           0.98      1115
   macro avg       0.98      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [78]:
emails = [
    'Hi would you like to play football with me tomorrow?',
    'Urgent! Win $2000 dollars and redeem them in your bank account immediately!'
]

emails_count = v.transform(emails)
model.predict(emails_count)

array([0, 1])

In [80]:
# Doin the same using pipelines
from sklearn.pipeline import Pipeline

clf = Pipeline([
    ('Vectorizer', CountVectorizer()),
    ('model', MultinomialNB())
])
clf.fit(X_train, y_train)

0,1,2
,steps,"[('Vectorizer', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'
,ngram_range,"(1, ...)"

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [81]:
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       954
           1       0.98      0.90      0.94       161

    accuracy                           0.98      1115
   macro avg       0.98      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [83]:
v.vocabulary_

{'hi': 3403,
 'hope': 3471,
 'you': 7639,
 'had': 3275,
 'good': 3162,
 'day': 2173,
 'have': 3335,
 'better': 1323,
 'night': 4736,
 'perhaps': 5101,
 'is': 3723,
 'much': 4592,
 'easy': 2504,
 'give': 3127,
 'your': 7645,
 'account': 788,
 'identification': 3570,
 'so': 6210,
 'will': 7453,
 'tomorrow': 6894,
 'at': 1116,
 'uni': 7091,
 'ok': 4873,
 'lor': 4176,
 'ned': 4685,
 'go': 3141,
 'toa': 6868,
 'payoh': 5075,
 'while': 7425,
 'return': 5701,
 'smth': 6194,
 'wan': 7309,
 'send': 5941,
 'me': 4373,
 'there': 6763,
 'or': 4927,
 'wat': 7332,
 'then': 6758,
 'come': 1891,
 'pick': 5151,
 'engin': 2585,
 'sir': 6113,
 'goodmorning': 3167,
 'once': 4893,
 'free': 2972,
 'call': 1602,
 'uncle': 7068,
 'boye': 1464,
 'need': 4686,
 'movies': 4566,
 'oh': 4868,
 'guide': 3256,
 'plus': 5210,
 'know': 3929,
 'torrents': 6922,
 'are': 1049,
 'not': 4788,
 'particularly': 5040,
 'legal': 4037,
 'here': 3391,
 'and': 959,
 'the': 6747,
 'system': 6602,
 'slowing': 6164,
 'down': 2416,
 

In [85]:
v.get_feature_names_out()[7645]

'your'

In [88]:
X_train_np = X_train_cv.toarray()
X_train_np

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], shape=(4457, 7676))

In [89]:
np.where(X_train_np[0] != 0)

(array([1323, 2173, 3162, 3275, 3335, 3403, 3471, 4736, 7639]),)

In [90]:
v.get_feature_names_out()[1323]

'better'

In [97]:
X_train_np[0][1323]

np.int64(1)