### Packages Required

In [20]:
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

### Dataset

In [21]:
sms = pd.read_table('C:/Users/Nithin/Downloads/Spam messages_AI_text Minning/Datasets/sms.tsv',header = None,names=['label','message'])

In [22]:
sms.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    5572 non-null   object
 1   message  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [23]:
sms.isna().sum()

label      0
message    0
dtype: int64

### Train & Test split

In [24]:
train_x,test_x,train_y,test_y = train_test_split(sms['message'],sms['label'],random_state=1)

### Count Vectorization

In [25]:
cnt_vec = CountVectorizer(max_df=0.5,min_df=20,stop_words='english',ngram_range=(1,2))

In [26]:
#Train Data :
train_x_dtm = cnt_vec.fit_transform(train_x)
train_x_dtm = pd.DataFrame(train_x_dtm.toarray(), columns = cnt_vec.get_feature_names_out())
train_x_dtm

Unnamed: 0,000,10,100,1000,150p,150ppm,16,18,1st,2000,...,work,world,www,xxx,ya,yeah,year,yes,yo,yup
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4174,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4175,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4176,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4177,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
#Train Data :
test_x_dtm = cnt_vec.transform(test_x)
test_x_dtm = pd.DataFrame(test_x_dtm.toarray(), columns = cnt_vec.get_feature_names_out())
test_x_dtm

Unnamed: 0,000,10,100,1000,150p,150ppm,16,18,1st,2000,...,work,world,www,xxx,ya,yeah,year,yes,yo,yup
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1388,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1389,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1390,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1391,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


### Naive Bayes

In [28]:
nb = MultinomialNB()
nb.fit(train_x_dtm,train_y)
nb_pred = nb.predict(test_x_dtm)

**--Evaluation**

In [29]:
#Evaluation
print('MAE:', metrics.accuracy_score(test_y, nb_pred))

MAE: 0.9777458722182341


In [30]:
metrics.confusion_matrix(test_y,nb_pred)

array([[1191,   17],
       [  14,  171]], dtype=int64)

In [31]:
nb_pred = pd.Series(nb_pred)

**--False Positives**

In [32]:
len(test_x[(test_y == 'ham') & (nb_pred == 'spam')])

50

**--False Negatives**

In [33]:
len(test_x[(test_y == 'spam') & (nb_pred == 'ham')])

41

**--True Positive**

In [34]:
len(test_x[(test_y == 'spam') & (nb_pred == 'spam')])

11

**--True Negatives**

In [35]:
len(test_x[(test_y == 'ham') & (nb_pred == 'ham')])

253

**--Counting the frequency of words**

In [36]:
train_x_tokens = cnt_vec.get_feature_names_out()
ham_tokens = nb.feature_count_[0,:]
spam_tokens = nb.feature_count_[1,:]

tokens = pd.DataFrame({'tokens': train_x_tokens,'ham': ham_tokens, 'spam': spam_tokens}).set_index('tokens')

In [37]:
#Top 5 Spam word
tokens.spam.sort_values(ascending=False).head(5)

tokens
free    158.0
txt     114.0
ur      101.0
text     99.0
stop     97.0
Name: spam, dtype: float64

In [38]:
#Top 5 ham word
tokens.ham.sort_values(ascending=False).head(5)

tokens
gt       234.0
lt       231.0
ok       220.0
just     219.0
lt gt    205.0
Name: ham, dtype: float64

In [43]:
# add 1 to ham and spam counts to avoid dividing by 0
tokens['ham'] = tokens.ham + 1
tokens['spam'] = tokens.spam + 1

# convert the ham and spam counts into frequencies
tokens['ham'] = tokens.ham / nb.class_count_[0]
tokens['spam'] = tokens.spam / nb.class_count_[1]

# calculate the ratio of spam-to-ham for each token
tokens['spam_ratio'] = tokens.spam / tokens.ham
tokens.sample(5, random_state=6)

# examine the DataFrame sorted by spam_ratio
# note: use sort() instead of sort_values() for pandas 0.16.2 and earlier
tokens.sort_values('spam_ratio', ascending=False)

Unnamed: 0_level_0,ham,spam,spam_ratio
tokens,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
claim,0.000276,0.158363,572.798932
prize,0.000276,0.135231,489.131673
150p,0.000276,0.087189,315.361210
tone,0.000276,0.085409,308.925267
guaranteed,0.000276,0.076512,276.745552
...,...,...,...
lor,0.032900,0.001779,0.054084
da,0.032900,0.001779,0.054084
lt gt,0.056953,0.001779,0.031242
lt,0.064142,0.001779,0.027741
