In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

In [14]:
# Load the spam/ham message dataset and preview its first rows.
# NOTE(review): this is an absolute, machine-specific path — consider a
# configurable data directory so the notebook runs on other machines.
spam_csv_path = "D:/Users/NITIN VERMA/Desktop/NLP/resources/spam.csv"
data = pd.read_csv(spam_csv_path)
data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [15]:
# Count how many messages fall into each Category value.
data.Category.value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

In [16]:
# Encode the label column as integers: ham -> 1, anything else (spam) -> 0.
# The check also accepts an already-encoded 1, so re-running this cell leaves
# the column unchanged instead of collapsing every label to 0 (which is what a
# plain `x == "ham"` test does once the strings have been replaced by ints).
data['Category'] = (
    data['Category'].eq("ham") | data['Category'].eq(1)
).astype("int64")

In [17]:
# Re-check the class counts now that Category holds integer codes.
data.Category.value_counts()

1    4825
0     747
Name: Category, dtype: int64

In [18]:
# Hold out 20% of the messages for evaluation; the fixed random_state keeps
# the split reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    data["Message"],
    data["Category"],
    test_size=0.2,
    random_state=23,
)

In [141]:
# Peek at five training messages (positions 10 through 14).
x_train.iloc[10:15]

2862    I am not at all happy with what you saying or ...
5476    Yes princess! I want to please you every night...
4981                               So what u doing today?
1398    Then we wait 4 u lor... No need 2 feel bad lar...
4674    I forgot 2 ask ü all smth.. There's a card on ...
Name: Message, dtype: object

In [142]:
# Bag-of-words features: the vocabulary is fitted on the training messages
# only, and the test messages are then encoded with that same fitted
# vocabulary (transform, not fit_transform).
v= CountVectorizer()
x_train_cv= v.fit_transform(x_train)
x_test_cv= v.transform(x_test)

In [143]:
# Dense array copy of the training count matrix, used for inspection in the
# cells below; the model itself is fitted on x_train_cv.
x_train_np= x_train_cv.toarray()
x_train_np

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [158]:
# Terms learned by the fitted vectorizer; display a 30-term slice of them.
arr= v.get_feature_names_out()
arr[1050:1080]


array(['00', '000', '000pes', ..., 'zyada', 'èn', '〨ud'], dtype=object)

In [145]:
# Mapping from each vocabulary term to its integer index (a dict, despite the
# "arr" in the name).
vocab_arr= v.vocabulary_
vocab_arr

{'dear': 2206,
 'how': 3548,
 'you': 7741,
 'are': 1070,
 'ok': 4944,
 'been': 1314,
 'running': 5896,
 'but': 1596,
 'only': 4976,
 'managed': 4365,
 'minutes': 4530,
 'and': 978,
 'then': 6857,
 'needed': 4755,
 'oxygen': 5077,
 'might': 4502,
 'have': 3384,
 'to': 6968,
 'resort': 5784,
 'the': 6845,
 'roller': 5857,
 'option': 5003,
 'nah': 4708,
 'perpetual': 5199,
 'dd': 2200,
 'what': 7508,
 'will': 7555,
 'we': 7450,
 'do': 2390,
 'in': 3667,
 'shower': 6168,
 'baby': 1212,
 've': 7290,
 'trying': 7091,
 'reach': 5634,
 'him': 3464,
 'without': 7586,
 'success': 6609,
 'oh': 4939,
 'must': 4687,
 'taken': 6721,
 'your': 7746,
 'real': 5645,
 'valentine': 7271,
 'out': 5042,
 'shopping': 6150,
 'first': 2899,
 'jokin': 3866,
 'oni': 4973,
 'lar': 4039,
 'busy': 1595,
 'wun': 7673,
 'disturb': 2375,
 'yup': 7765,
 'elaborating': 2577,
 'on': 4965,
 'safety': 5914,
 'aspects': 1125,
 'some': 6336,
 'other': 5032,
 'issues': 3780,
 'accordingly': 795,
 'repeat': 5754,
 'just': 3897

In [146]:
# First training message, for comparison with its count vector.
x_train.head(1)

1627    Dear how you. Are you ok?
Name: Message, dtype: object

In [147]:
# First row of the dense training count matrix.
x_train_np[:1]

array([[0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [149]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes classifier trained on the bag-of-words counts.
model = MultinomialNB()
model.fit(X=x_train_cv, y=y_train)

In [156]:
from sklearn.metrics import classification_report

# Predict on the vectorized held-out messages and print the per-class
# precision / recall / F1 summary.
y_pred = model.predict(x_test_cv)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.98      0.93      0.95       156
           1       0.99      1.00      0.99       959

    accuracy                           0.99      1115
   macro avg       0.98      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115



In [22]:
# The separate vectorize-then-classify steps can be bundled into a single
# scikit-learn Pipeline that is fitted directly on the raw text.

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Fresh, unfitted estimators; note that this rebinds `v` and `model`.
v = CountVectorizer()
model = MultinomialNB()

clf = Pipeline(steps=[("Vectorizer", v), ("nb", model)])

clf.fit(x_train, y_train)


In [28]:
from sklearn.metrics import classification_report

# Evaluate the pipeline end-to-end on the raw held-out messages.
y_pred = clf.predict(x_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.98      0.93      0.95       156
           1       0.99      1.00      0.99       959

    accuracy                           0.99      1115
   macro avg       0.98      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115

