In [24]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

In [16]:
df = pd.read_csv('./spam.csv')
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [17]:
df['Category'].value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

In [18]:
df['Category'].value_counts() / len(df) * 100

Category
ham     86.593683
spam    13.406317
Name: count, dtype: float64

In [19]:
df['spam'] = df['Category'].apply(lambda x : 1 if x=='spam' else 0)

In [20]:
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [21]:
df.shape

(5572, 3)

#### Train test splite

In [23]:
x_train, x_test, y_train, y_test = train_test_split(df['Message'], df['spam'], test_size=.25)

#### Create bag of words using countVectorizer

In [25]:
v = CountVectorizer()

x_train = v.fit_transform(x_train)
x_test = v.transform(x_test)

In [33]:
x_train_np = x_train.toarray()

In [34]:
x_train_np.shape

(4179, 7480)

In [35]:
v.get_feature_names_out()

array(['00', '000', '000pes', ..., 'èn', 'ú1', '〨ud'],
      shape=(7480,), dtype=object)

In [36]:
v.vocabulary_

{'tmr': 6679,
 'then': 6588,
 'brin': 1453,
 'lar': 3874,
 'aiya': 865,
 'later': 3885,
 'come': 1832,
 'mayb': 4238,
 'neva': 4586,
 'set': 5823,
 'properly': 5277,
 'got': 3088,
 'da': 2056,
 'help': 3274,
 'sheet': 5863,
 'wif': 7255,
 'alright': 900,
 'see': 5771,
 'you': 7443,
 'in': 3508,
 'bit': 1324,
 'putting': 5320,
 'it': 3618,
 'on': 4771,
 'now': 4676,
 'should': 5908,
 'be': 1232,
 'ready': 5407,
 'for': 2829,
 'lt': 4101,
 'time': 6653,
 'gt': 3144,
 'will': 7264,
 'if': 3469,
 'we': 7162,
 'propose': 5279,
 'going': 3060,
 'back': 1158,
 'again': 836,
 'tomorrow': 6709,
 'not': 4664,
 'possession': 5141,
 'especially': 2544,
 'first': 2766,
 'offense': 4733,
 'no': 4630,
 'few': 2713,
 'hours': 3389,
 'before': 1258,
 'went': 7207,
 'to': 6685,
 'hair': 3177,
 'cut': 2048,
 'up': 6933,
 'wan': 7117,
 'lor': 4060,
 'but': 1519,
 'din': 2234,
 'any': 968,
 'stripes': 6298,
 'skirt': 5994,
 'go': 3048,
 'chase': 1675,
 'after': 831,
 'her': 3283,
 'and': 937,
 'run': 5645,

In [38]:
np.where(x_train_np[0] != 0)

(array([ 865, 1453, 1832, 2056, 3088, 3274, 3874, 3885, 4238, 4586, 5277,
        5823, 5863, 6588, 6679, 7255]),)

### Naive Bayes

In [43]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

In [40]:
model = MultinomialNB()
model.fit(x_train_np, y_train)

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [42]:
y_pred = model.predict(x_test)

accuracy_score(y_test, y_pred)

0.9892318736539842

In [46]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1212
           1       0.99      0.93      0.96       181

    accuracy                           0.99      1393
   macro avg       0.99      0.96      0.98      1393
weighted avg       0.99      0.99      0.99      1393



##### test random data

In [50]:
message = {"Upto 100% free on parking, exclusing offer just for you"}

message_cnt = v.transform(message)

model.predict(message_cnt)

array([1])