# NLP Tutorial: Text Representation - Bag Of Words (BOW)

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('spam.csv')
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
df.Category.value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

In [4]:
df.shape

(5572, 2)

In [5]:
# Create a new column with 1 for spam and 0 for ham
df['spam'] = df.Category.apply(lambda x: 1 if x == 'spam' else 0)

In [6]:
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.Message, df.spam, test_size=0.2)

In [8]:
X_train.shape

(4457,)

In [9]:
X_test.shape

(1115,)

In [10]:
type(X_train)

pandas.core.series.Series

In [11]:
X_train[:5]

5431                   If I was I wasn't paying attention
2910    URGENT! Your Mobile number has been awarded wi...
227     Will u meet ur dream partner soon? Is ur caree...
1755    How is your schedule next week? I am out of to...
1022    Guess what! Somebody you know secretly fancies...
Name: Message, dtype: object

In [12]:
type(y_train)

pandas.core.series.Series

In [13]:
y_train[:5]

5431    0
2910    1
227     1
1755    0
1022    1
Name: spam, dtype: int64

In [14]:
type(X_train.values)

numpy.ndarray

# Create bag of words representation using CountVectorizer

In [15]:
from sklearn.feature_extraction.text import CountVectorizer
v = CountVectorizer()
X_train_cv = v.fit_transform(X_train.values)
X_train_cv



<4457x7756 sparse matrix of type '<class 'numpy.int64'>'
	with 59571 stored elements in Compressed Sparse Row format>

In [16]:
X_train_cv.toarray()[:2][0]

array([0, 0, 0, ..., 0, 0, 0])

In [17]:
X_train_cv.shape

(4457, 7756)

In [18]:
v.get_feature_names_out()[6521]

'stop'

In [19]:
v.vocabulary_

{'if': 3618,
 'was': 7405,
 'wasn': 7407,
 'paying': 5143,
 'attention': 1139,
 'urgent': 7222,
 'your': 7724,
 'mobile': 4564,
 'number': 4878,
 'has': 3362,
 'been': 1298,
 'awarded': 1174,
 'with': 7562,
 '2000': 350,
 'prize': 5444,
 'guaranteed': 3277,
 'call': 1610,
 '09058094454': 180,
 'from': 3038,
 'land': 4016,
 'line': 4138,
 'claim': 1834,
 '3030': 435,
 'valid': 7261,
 '12hrs': 289,
 'only': 4965,
 'will': 7534,
 'meet': 4443,
 'ur': 7219,
 'dream': 2450,
 'partner': 5111,
 'soon': 6343,
 'is': 3766,
 'career': 1663,
 'off': 4914,
 'flyng': 2928,
 'start': 6473,
 'find': 2865,
 'out': 5028,
 'free': 3002,
 'txt': 7118,
 'horo': 3521,
 'followed': 2937,
 'by': 1594,
 'star': 6467,
 'sign': 6176,
 'aries': 1070,
 'how': 3541,
 'schedule': 5962,
 'next': 4787,
 'week': 7452,
 'am': 941,
 'of': 4912,
 'town': 7027,
 'this': 6878,
 'weekend': 7453,
 'guess': 3282,
 'what': 7490,
 'somebody': 6321,
 'you': 7719,
 'know': 3973,
 'secretly': 5998,
 'fancies': 2779,
 'wanna': 7390

In [28]:
X_train_np = X_train_cv.toarray()
X_train_np[0]

array([0, 0, 0, ..., 0, 0, 0])

In [29]:
np.where(X_train_np[0]!=0)

(array([1139, 3618, 5143, 7405, 7407]),)

In [30]:
X_train[:4]

5431                   If I was I wasn't paying attention
2910    URGENT! Your Mobile number has been awarded wi...
227     Will u meet ur dream partner soon? Is ur caree...
1755    How is your schedule next week? I am out of to...
Name: Message, dtype: object

In [32]:
X_train[:4][5431]

"If I was I wasn't paying attention"

In [41]:
X_train_np[0][1139]

1

# Train the naive bayes model

In [34]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train_cv, y_train)

In [35]:
X_test_cv = v.transform(X_test)


# Evaluate Performance

In [36]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test_cv)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       969
           1       0.98      0.87      0.92       146

    accuracy                           0.98      1115
   macro avg       0.98      0.93      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [37]:
emails = [
    'Hey mohan, can we get together to watch footbal game tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!'
]

emails_count = v.transform(emails)
model.predict(emails_count)

array([0, 1])

In [38]:
from sklearn.pipeline import Pipeline

clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])

In [39]:
clf.fit(X_train, y_train)

In [40]:
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       969
           1       0.98      0.87      0.92       146

    accuracy                           0.98      1115
   macro avg       0.98      0.93      0.96      1115
weighted avg       0.98      0.98      0.98      1115

