In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Read the labelled SMS messages into a DataFrame.
df = pd.read_csv(filepath_or_buffer='/content/spam.csv')

In [None]:
# Preview the first five rows (head() defaults to n=5).
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# Absolute number of messages per class label.
df['Category'].value_counts()

Unnamed: 0_level_0,count
Category,Unnamed: 1_level_1
ham,4825
spam,747


In [None]:
# Class balance as a percentage of all rows.
(df['Category'].value_counts() / len(df)) * 100

Unnamed: 0_level_0,count
Category,Unnamed: 1_level_1
ham,86.593683
spam,13.406317


In [None]:
# Binary target: 1 for 'spam', 0 for every other Category value.
# A vectorized comparison replaces the row-by-row apply(lambda ...) and
# produces the same integer column.
df['spam'] = (df['Category'] == 'spam').astype(int)

In [None]:
# Check that the new 'spam' column lines up with 'Category'.
df.head(n=5)

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [None]:
# Load the same CSV a second time into an independent frame
# (presumably so the replace() experiment below cannot affect df).
new_df = pd.read_csv(filepath_or_buffer='/content/spam.csv')

In [None]:
# Series.replace returns a NEW Series; the result is only displayed here and is
# not assigned back, so new_df itself stays unchanged. Only 'ham' is mapped
# (to 0); 'spam' values are left as strings.
new_df.Category.replace(to_replace={'ham': 0})

In [None]:
# Inspect new_df after the replace() call to see whether it was modified.
new_df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# Back to the working frame: first five rows, including the 'spam' target.
df.iloc[:5]

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [None]:
# (rows, columns) of the working frame.
df.shape

(5572, 3)

Train and Test Split

In [None]:
# Hold out 25% of the messages for evaluation.
# random_state pins the shuffle so a fresh "Restart & Run All" reproduces the
# same split (and hence the same vocabulary, row indices and metrics below).
# stratify keeps the ham/spam ratio identical in both partitions, which matters
# because spam is only ~13% of the data.
X_train, X_test, y_train, y_test = train_test_split(
    df.Message,
    df.spam,
    test_size=0.25,
    random_state=42,
    stratify=df.spam,
)

In [None]:
# Number of training messages (1-D Series, so a single-element tuple).
X_train.shape

(4179,)

In [None]:
# Number of held-out test messages.
X_test.shape

(1393,)

In [None]:
# First four training messages (positional; the index keeps the original row ids).
X_train.head(4)

Unnamed: 0,Message
2279,Hmm...Bad news...Hype park plaza $700 studio t...
3718,I'm gonna rip out my uterus.
1311,"I.ll always be there, even if its just in spir..."
2937,"And stop wondering ""wow is she ever going to s..."


In [None]:
# Labels for the same four training rows (positional slice).
y_train.iloc[:4]

Unnamed: 0,spam
2279,0
3718,0
1311,0
2937,0


Create bag of words using CountVectorizer

In [None]:
# Learn the vocabulary from the training messages only and encode them as a
# sparse document-term count matrix (one row per message, one column per token).
v = CountVectorizer()
X_train_cv = v.fit_transform(X_train.to_numpy())
X_train_cv

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 55223 stored elements and shape (4179, 7470)>

In [None]:
# Encode the test messages with the vocabulary fitted on the training set
# (transform only, no re-fitting), so both matrices share the same columns.
X_test_cv = v.transform(raw_documents=X_test)
X_test_cv

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 17504 stored elements and shape (1393, 7470)>

In [None]:
# (training messages, vocabulary size) of the sparse count matrix.
X_train_cv.shape

(4179, 7470)

In [None]:
# Dense count vector of the first training message.
# Slice the sparse matrix first so only that single row is densified instead
# of materialising the whole document-term matrix just to look at one row.
X_train_cv[0].toarray()[0]

array([0, 0, 0, ..., 0, 0, 0])

In [None]:
# Token stored at column 1771 of the fitted vocabulary.
# NOTE(review): the column -> token mapping depends on the training split.
feature_names = v.get_feature_names_out()
feature_names[1771]

'clos1'

In [None]:
# vocabulary_ maps each token to its column index in the count matrix.
# Show a small sample instead of dumping the whole dictionary (thousands of
# entries) into the notebook output.
dict(list(v.vocabulary_.items())[:20])

{'hmm': 3316,
 'bad': 1137,
 'news': 4582,
 'hype': 3432,
 'park': 4899,
 'plaza': 5058,
 '700': 579,
 'studio': 6297,
 'taken': 6443,
 'only': 4763,
 'left': 3908,
 'bedrm': 1223,
 '900': 694,
 'gonna': 3059,
 'rip': 5575,
 'out': 4827,
 'my': 4484,
 'uterus': 6960,
 'll': 3996,
 'always': 891,
 'be': 1206,
 'there': 6576,
 'even': 2549,
 'if': 3461,
 'its': 3619,
 'just': 3724,
 'in': 3499,
 'spirit': 6143,
 'get': 3000,
 'bb': 1190,
 'soon': 6077,
 'trying': 6797,
 'to': 6674,
 'sure': 6378,
 'need': 4546,
 'it': 3612,
 'and': 919,
 'stop': 6253,
 'wondering': 7308,
 'wow': 7339,
 'is': 3601,
 'she': 5844,
 'ever': 2554,
 'going': 3051,
 'tm': 6667,
 'ing': 3534,
 'me': 4232,
 'because': 1216,
 'will': 7244,
 'you': 7432,
 'whenever': 7210,
 'want': 7106,
 'are': 1001,
 'mine': 4310,
 'laughs': 3880,
 'dont': 2303,
 'know': 3808,
 'exactly': 2577,
 'could': 1943,
 'ask': 1043,
 'chechi': 1679,
 'dunno': 2374,
 'they': 6584,
 'close': 1772,
 'oredi': 4801,
 'not': 4646,
 'ma': 4112,


In [None]:
# Dense copy of the training count matrix for row-level inspection;
# X_train_np is reused by the following cell.
X_train_np = X_train_cv.toarray()
# Count vector of the training message at position 122.
X_train_np[122, :]

array([0, 0, 0, ..., 0, 0, 0])

In [None]:
# Column indices of the tokens that occur in training message 122.
np.nonzero(X_train_np[122])

(array([1685, 1695, 1829, 2091, 2097, 2905, 3061, 3066, 3229, 3499, 3601,
        3789, 4115, 4238, 4974, 5010, 5825, 6558, 7218, 7316]),)

Naive Bayes Classifier

In [None]:
# Fit a multinomial Naive Bayes classifier on the token counts.
# fit() returns the estimator itself, so it can be chained; the bare `model`
# keeps the estimator as the cell's displayed value.
model = MultinomialNB().fit(X_train_cv, y_train)
model

In [None]:
# Predicted labels (1 = spam, 0 = ham) for the held-out messages.
y_pred = model.predict(X=X_test_cv)

In [None]:
# Per-class precision / recall / F1 on the test set; the report is a
# pre-formatted string, so it is printed rather than displayed.
report_text = classification_report(y_true=y_test, y_pred=y_pred)
print(report_text)

              precision    recall  f1-score   support

           0       0.98      0.99      0.99      1197
           1       0.96      0.90      0.93       196

    accuracy                           0.98      1393
   macro avg       0.97      0.95      0.96      1393
weighted avg       0.98      0.98      0.98      1393



In [None]:
# Sanity-check the model on a hand-written message.
# Use a list rather than a set: a list keeps the documents in order and keeps
# duplicates, so predictions stay aligned with inputs if more messages are added.
message = ["lottery win"]

In [None]:
# Encode the message with the fitted vocabulary, then classify it
# (1 = spam, 0 = ham).
message_cnt = v.transform(raw_documents=message)
model.predict(X=message_cnt)

array([1])

In [None]:
# Second hand-written example. A list (not a set) keeps the documents ordered
# and preserves duplicates if more messages are added later.
message1 = ["Off , is only for you"]

In [None]:
# Encode the second message with the fitted vocabulary and classify it
# (1 = spam, 0 = ham).
message_cnt1 = v.transform(raw_documents=message1)
model.predict(X=message_cnt1)

array([0])