<h2 align='center'>NLP Tutorial: Text Representation - Bag Of Words (BOW)</h2>

In [4]:
import pandas as pd
import numpy as np

In [5]:
df = pd.read_csv("spam.csv")
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
df.Category.value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

In [7]:
df['spam'] = df['Category'].apply(lambda x: 1 if x =='spam' else 0)

In [8]:
df.shape

(5572, 3)

In [9]:
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


<h3>Train test split</h3>

In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.Message, df.spam, test_size=0.2)

In [11]:
X_train.shape

(4457,)

In [12]:
X_test.shape

(1115,)

In [13]:
type(X_train)

pandas.core.series.Series

In [14]:
X_train[:4]

870     What do U want for Xmas? How about 100 free te...
477                          Love you aathi..love u lot..
1231    I want to send something that can sell fast.  ...
2267    <Forwarded from 88877>FREE entry into our £250...
Name: Message, dtype: object

In [15]:
type(y_train)

pandas.core.series.Series

In [16]:
y_train[:4]

870     1
477     0
1231    0
2267    1
Name: spam, dtype: int64

In [17]:
type(X_train.values)

numpy.ndarray

<h3>Create bag of words representation using CountVectorizer</h3>

In [18]:
from sklearn.feature_extraction.text import CountVectorizer

v = CountVectorizer()

X_train_cv = v.fit_transform(X_train.values)
X_train_cv

<4457x7670 sparse matrix of type '<class 'numpy.int64'>'
	with 59532 stored elements in Compressed Sparse Row format>

In [19]:
X_train_cv.toarray()[:2][0]

array([0, 0, 0, ..., 0, 0, 0])

In [20]:
X_train_cv.shape

(4457, 7670)

In [21]:
v.get_feature_names_out()[1771]

'chicken'

In [22]:
v.vocabulary_

{'what': 7392,
 'do': 2373,
 'want': 7294,
 'for': 2924,
 'xmas': 7577,
 'how': 3510,
 'about': 753,
 '100': 255,
 'free': 2971,
 'text': 6720,
 'messages': 4419,
 'new': 4717,
 'video': 7207,
 'phone': 5135,
 'with': 7466,
 'half': 3288,
 'price': 5353,
 'line': 4094,
 'rental': 5647,
 'call': 1601,
 'now': 4799,
 'on': 4891,
 '0800': 45,
 '0721072': 26,
 'to': 6869,
 'find': 2838,
 'out': 4963,
 'more': 4548,
 'love': 4194,
 'you': 7632,
 'aathi': 741,
 'lot': 4181,
 'send': 5955,
 'something': 6252,
 'that': 6745,
 'can': 1625,
 'sell': 5947,
 'fast': 2769,
 'lt': 4217,
 'gt': 3243,
 'is': 3732,
 'not': 4786,
 'easy': 2512,
 'money': 4533,
 'forwarded': 2949,
 'from': 3008,
 '88877': 697,
 'entry': 2615,
 'into': 3701,
 'our': 4961,
 '250': 367,
 'weekly': 7358,
 'comp': 1909,
 'just': 3858,
 'the': 6748,
 'word': 7512,
 'enter': 2605,
 '18': 319,
 'www': 7566,
 'textcomp': 6725,
 'com': 1890,
 'she': 6020,
 'replying': 5661,
 'has': 3330,
 'boye': 1466,
 'changed': 1716,
 'his': 34

In [23]:
X_train_np = X_train_cv.toarray()
X_train_np[0]

array([0, 0, 0, ..., 0, 0, 0])

In [41]:
np.where(X_train_np[0]!=0)

(array([ 755,  916, 1125, 1127, 1279, 1483, 1771, 1842, 1928, 2046, 2263,
        2290, 2327, 2464, 2744, 3018, 3151, 3154, 3195, 3272, 3348, 3391,
        3541, 3717, 3770, 4138, 4218, 4237, 4311, 4706, 4804, 4908, 4951,
        5117, 5423, 5425, 5432, 6474, 6588, 6847, 6966, 7184, 7191, 7302],
       dtype=int64),)

<h3>Training the naive bayes model</h3>

In [24]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train_cv, y_train)

In [25]:
X_test_cv = v.transform(X_test)

<h3>Evaluate Performance</h3>

In [26]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test_cv)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       961
           1       0.97      0.91      0.94       154

    accuracy                           0.98      1115
   macro avg       0.98      0.95      0.97      1115
weighted avg       0.98      0.98      0.98      1115


In [27]:
emails = [
    'Hey mohan, can we get together to watch footbal game tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!'
]

emails_count = v.transform(emails)
model.predict(emails_count)

array([0, 1])

<h3>Training the model using sklearn pipeline to reduce number of lines of code</h3>

In [29]:
from sklearn.pipeline import Pipeline

clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])

In [30]:
clf.fit(X_train, y_train)

In [31]:
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       961
           1       0.97      0.91      0.94       154

    accuracy                           0.98      1115
   macro avg       0.98      0.95      0.97      1115
weighted avg       0.98      0.98      0.98      1115
