<h2 align='center'>NLP Tutorial: Text Representation - Bag Of Words (BOW)</h2>

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the labelled messages from spam.csv in the working directory.
# The displayed frame has two columns: Category (ham/spam) and Message (raw text).
df = pd.read_csv("spam.csv")
df

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [4]:
# Class balance check: number of messages per label.
df["Category"].value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

In [5]:
# (rows, columns) of the loaded frame.
df.shape

(5572, 2)

In [6]:
# Display the frame again (pandas truncates the rendering to the first and last rows).
df

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


<h3>Train test split</h3>

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the messages for evaluation; random_state pins the shuffle
# so the split is reproducible.
# NOTE(review): the split is not stratified although the labels are imbalanced
# (4825 ham vs 747 spam) — consider stratify=df["Category"] if class
# proportions in train/test matter.
X_train, X_test, y_train, y_test = train_test_split(
    df["Message"],
    df["Category"],
    test_size=0.2,
    random_state=1,
)

In [8]:
# Number of training messages (80% of the data).
X_train.shape

(4457,)

In [9]:
# Number of held-out test messages (20% of the data).
X_test.shape

(1115,)

In [10]:
# Peek at the test messages; the index keeps the original row labels from df.
X_test

1078                         Yep, by the pretty sculpture
4028        Yes, princess. Are you going to make me moan?
958                            Welp apparently he retired
4642                                              Havent.
4674    I forgot 2 ask ü all smth.. There's a card on ...
                              ...                        
324     That would be great. We'll be at the Guild. Co...
1163    Free entry in 2 a wkly comp to win FA Cup fina...
86      For real when u getting on yo? I only need 2 m...
4214                     I attended but nothing is there.
90      Yeah do! Don‘t stand to close tho- you‘ll catc...
Name: Message, Length: 1115, dtype: object

In [11]:
# Peek at the training messages; the index keeps the original row labels from df.
X_train

1642    Hi , where are you? We're at  and they're not ...
2899          If you r @ home then come down within 5 min
480     When're you guys getting back? G said you were...
3485    Tell my  bad character which u Dnt lik in me. ...
157                           I'm leaving my house now...
                              ...                        
905     We're all getting worried over here, derek and...
5192    Oh oh... Den muz change plan liao... Go back h...
3980    CERI U REBEL! SWEET DREAMZ ME LITTLE BUDDY!! C...
235     Text & meet someone sexy today. U can find a d...
5157                              K k:) sms chat with me.
Name: Message, Length: 4457, dtype: object

In [12]:
# First four training messages (positional, not by index label).
X_train.head(4)

1642    Hi , where are you? We're at  and they're not ...
2899          If you r @ home then come down within 5 min
480     When're you guys getting back? G said you were...
3485    Tell my  bad character which u Dnt lik in me. ...
Name: Message, dtype: object

In [None]:
# Check what kind of object train_test_split returned for the labels.
type(y_train)

In [None]:
# Labels of the first four training messages (positional, not by index label).
y_train.head(4)

In [23]:
# Underlying array of raw message strings, without the pandas index.
X_train.values

array(["Hi , where are you? We're at  and they're not keen to go out i kind of am but feel i shouldn't so can we go out tomo, don't mind do you?",
       'If you r @ home then come down within 5 min',
       "When're you guys getting back? G said you were thinking about not staying for mcr",
       ...,
       'CERI U REBEL! SWEET DREAMZ ME LITTLE BUDDY!! C YA 2MORO! WHO NEEDS BLOKES',
       'Text & meet someone sexy today. U can find a date or even flirt its up to U. Join 4 just 10p. REPLY with NAME & AGE eg Sam 25. 18 -msg recd@thirtyeight pence',
       'K k:) sms chat with me.'], dtype=object)

In [None]:
# Shape of the raw message array: one entry per training message.
X_train.values.shape

In [None]:
# Confirm the container type handed to the vectorizer below.
type(X_train.values)

<h3>Create bag of words representation using CountVectorizer</h3>

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer with default settings; used first on a toy example.
v = CountVectorizer()

In [20]:
# Toy corpus: two short messages that share the words "discount" and "Upto".
emails = [
    'Upto 20% discount reward!',
    'Hey mohan, discount Upto',
]

# Learn the vocabulary from the toy corpus and encode it as a sparse
# document-term count matrix (one row per message, one column per token).
emails_cv = v.fit_transform(raw_documents=emails)
emails_cv

<2x6 sparse matrix of type '<class 'numpy.int64'>'
	with 8 stored elements in Compressed Sparse Row format>

In [21]:
# Dense view of the toy count matrix: rows are messages, columns are vocabulary tokens.
emails_cv.toarray()

array([[1, 1, 0, 0, 1, 1],
       [0, 1, 1, 1, 0, 1]], dtype=int64)

In [17]:
# Token for each column of the count matrix (lower-cased; the '%' and punctuation are dropped).
v.get_feature_names_out()

array(['20', 'discount', 'hey', 'mohan', 'reward', 'upto'], dtype=object)

In [22]:
# Label the count matrix columns with their tokens for easier reading.
pd.DataFrame(
    data=emails_cv.toarray(),
    columns=v.get_feature_names_out(),
)

Unnamed: 0,20,discount,hey,mohan,reward,upto
0,1,1,0,0,1,1
1,0,1,1,1,0,1


In [24]:
# Start from a fresh vectorizer so the toy-example vocabulary is discarded.
v = CountVectorizer()

# Learn the vocabulary from the training messages only and encode them as a
# sparse document-term count matrix.
X_train_cv = v.fit_transform(raw_documents=X_train.values)
X_train_cv

<4457x7711 sparse matrix of type '<class 'numpy.int64'>'
	with 58978 stored elements in Compressed Sparse Row format>

In [25]:
# Dense view of the training counts; almost every entry is zero because each
# message uses only a handful of the vocabulary tokens.
X_train_cv.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [26]:
# (number of training messages, vocabulary size).
X_train_cv.shape

(4457, 7711)

In [None]:
# Count vector of the first training message as a dense 1 x vocabulary-size array.
X_train_cv[0].toarray()

In [27]:
# Vocabulary learned from the training messages, in column order.
v.get_feature_names_out()

array(['00', '000', '008704050406', ..., 'zyada', 'èn', '〨ud'],
      dtype=object)

In [28]:
# Training counts as a labelled frame: one row per message, one column per token.
pd.DataFrame(
    data=X_train_cv.toarray(),
    columns=v.get_feature_names_out(),
)

Unnamed: 0,00,000,008704050406,0121,01223585236,01223585334,0125698789,02,0207,02072069400,...,zed,zeros,zhong,zindgi,zoe,zoom,zouk,zyada,èn,〨ud
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4452,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4453,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4454,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4455,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Mapping from each token to its column index in the count matrix.
v.vocabulary_

In [None]:
# Keep a dense copy of the training counts for the index lookups below,
# then show the count vector of the first training message.
X_train_np = X_train_cv.toarray()
X_train_np[0]

In [None]:
# Column indices of the tokens that actually occur in the first training message.
np.nonzero(X_train_np[0])

In [None]:
# Inspect one count that is non-zero in the first training message.
# The column index is derived from the data rather than hard-coded (the
# original used 1771): column positions depend on the fitted vocabulary, so a
# literal index silently points at a different token whenever the split or
# the vectorizer settings change.
first_nonzero_col = np.flatnonzero(X_train_np[0])[0]
X_train_np[0][first_nonzero_col]

<h3>Train the Naive Bayes model</h3>

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes classifier trained on the token counts of the
# training messages and their ham/spam labels.
model = MultinomialNB()
model.fit(X=X_train_cv, y=y_train)

<h3>Evaluate Performance</h3>

In [None]:
# Encode the test messages with the vocabulary learned from the training set
# (transform only — no re-fitting — so the columns line up with X_train_cv).
X_test_cv = v.transform(raw_documents=X_test)

In [None]:
# (number of test messages, vocabulary size). The sparse matrix exposes
# .shape directly, so there is no need to build a dense copy just to read it;
# this also matches how X_train_cv.shape is inspected earlier in the notebook.
X_test_cv.shape

In [None]:
# Mean accuracy of the classifier on the held-out test messages.
model.score(X=X_test_cv, y=y_test)

In [None]:
from sklearn.metrics import classification_report

# Predict labels for the test messages, then summarise per-class
# precision, recall and F1.
y_pred = model.predict(X=X_test_cv)

report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

In [None]:
# Two unseen example messages: a promotional one and a personal one.
promo_email = 'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!'
personal_email = 'Hey mohan, can we get together to watch footbal game tomorrow?'

emails = [promo_email, personal_email]

In [None]:
# Fit a separate vectorizer on just the two example emails. Its vocabulary
# comes only from those two messages, so the column count shown here differs
# from the training matrix.
# NOTE(review): emails_count produced here is not compatible with `model`;
# it is re-computed with the trained vectorizer `v` in the next cell.
CV = CountVectorizer()
emails_count = CV.fit_transform(raw_documents=emails)
emails_count.toarray().shape

In [None]:
# Encode the example emails with the vectorizer fitted on the training data,
# so the columns match what the model was trained on.
emails_count = v.transform(raw_documents=emails)
emails_count.toarray().shape

In [None]:
# Predicted label (ham/spam) for each example email, using the count matrix
# produced by the trained vectorizer `v`.
model.predict(emails_count)

<h3>Train the model using an sklearn Pipeline to reduce the number of lines of code</h3>

In [None]:
from sklearn.pipeline import Pipeline

# Chain vectorization and classification so raw text can be passed straight
# to fit/predict; each step is a (name, estimator) pair.
pipeline_steps = [
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB()),
]
clf = Pipeline(steps=pipeline_steps)

In [None]:
# Fit the whole pipeline on the raw training messages and their labels.
clf.fit(X=X_train, y=y_train)

In [None]:
# Predict directly from raw test messages (the pipeline vectorizes internally),
# then summarise per-class precision, recall and F1.
y_pred = clf.predict(X=X_test)

report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)