# NLP Tutorial: Text Representation - Bag Of Words (BOW)


In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('spam.csv')
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
df.Category.value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

In [33]:
df['spam'] = df['Category'].apply(lambda x: 1 if x =='spam' else 0)

In [34]:
df.shape

(5572, 3)

In [35]:
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


<b> Train test split </b>

In [36]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df.Message, df.spam, test_size=0.2)

In [37]:
X_train.shape

(4457,)

In [38]:
X_test.shape

(1115,)

In [39]:
type(X_train)

pandas.core.series.Series

In [40]:
X_train[:4]

1546    Good afternoon, my love ! Any job prospects ? ...
4577    Congratulations ur awarded 500 of CD vouchers ...
547     I know but you need to get hotel now. I just g...
3915    Today is ACCEPT DAY..U Accept me as? Brother S...
Name: Message, dtype: object

In [41]:
type(y_train)

pandas.core.series.Series

In [42]:
y_train[:4]

1546    0
4577    1
547     0
3915    0
Name: spam, dtype: int64

In [43]:
type(X_train.values)

numpy.ndarray

<b> Create bag of words representation using CountVectorizer</b>

In [44]:
from sklearn.feature_extraction.text import CountVectorizer

v = CountVectorizer()

X_train_cv = v.fit_transform(X_train.values)
X_train_cv

<4457x7729 sparse matrix of type '<class 'numpy.int64'>'
	with 59147 stored elements in Compressed Sparse Row format>

In [45]:
X_train_cv.toarray()[:2][0]

array([0, 0, 0, ..., 0, 0, 0])

In [46]:
X_train_cv.shape

(4457, 7729)

In [47]:
v.get_feature_names_out()[1771]

'chicken'

In [48]:
v.vocabulary_

{'good': 3178,
 'afternoon': 849,
 'my': 4639,
 'love': 4208,
 'any': 988,
 'job': 3825,
 'prospects': 5452,
 'are': 1046,
 'you': 7692,
 'missing': 4489,
 'me': 4380,
 'what': 7456,
 'do': 2366,
 'being': 1306,
 'lazy': 4029,
 'and': 956,
 'bleak': 1381,
 'hmmm': 3454,
 'or': 4947,
 'happy': 3332,
 'filled': 2838,
 'with': 7529,
 'congratulations': 1965,
 'ur': 7183,
 'awarded': 1166,
 '500': 527,
 'of': 4864,
 'cd': 1691,
 'vouchers': 7310,
 '125gift': 276,
 'guaranteed': 3256,
 'free': 2981,
 'entry': 2616,
 '100': 249,
 'wkly': 7542,
 'draw': 2442,
 'txt': 7078,
 'music': 4630,
 'to': 6923,
 '87066': 681,
 'know': 3955,
 'but': 1571,
 'need': 4703,
 'get': 3112,
 'hotel': 3510,
 'now': 4815,
 'just': 3871,
 'got': 3193,
 'invitation': 3719,
 'had': 3287,
 'apologise': 1011,
 'cali': 1600,
 'is': 3741,
 'sweet': 6639,
 'for': 2935,
 'come': 1901,
 'some': 6283,
 'english': 2597,
 'bloke': 1393,
 'weddin': 7411,
 'today': 6929,
 'accept': 768,
 'day': 2177,
 'as': 1079,
 'brother': 1

In [49]:
X_train_np = X_train_cv.toarray()
X_train_np[0]

array([0, 0, 0, ..., 0, 0, 0])

In [50]:
np.where(X_train_np[0]!=0)

(array([ 849,  956,  988, 1046, 1306, 1381, 2366, 2838, 3178, 3332, 3454,
        3825, 4029, 4208, 4380, 4489, 4639, 4947, 5452, 7456, 7529, 7692]),)

In [52]:
X_train_np[0][1771]

0

<b> Train the naive bayes model </b>

In [53]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train_cv, y_train)

In [54]:
X_test_cv = v.transform(X_test)

<b>Evaluate Performance</b>

In [55]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test_cv)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       962
           1       0.96      0.92      0.94       153

    accuracy                           0.98      1115
   macro avg       0.97      0.96      0.97      1115
weighted avg       0.98      0.98      0.98      1115



In [56]:
emails = [
    'Hey mohan, can we get together to watch footbal game tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!'
]

emails_count = v.transform(emails)
model.predict(emails_count)

array([0, 1])

<b>Train the model using sklearn pipeline and reduce number of lines of code </b>

In [57]:
from sklearn.pipeline import Pipeline

clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])

In [58]:
clf.fit(X_train, y_train)

In [59]:

y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       962
           1       0.96      0.92      0.94       153

    accuracy                           0.98      1115
   macro avg       0.97      0.96      0.97      1115
weighted avg       0.98      0.98      0.98      1115

