In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [3]:
mail = pd.read_csv("spam.csv")

In [4]:
mail.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
mail.Category.value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

In [6]:
mail["spam"] = mail["Category"].apply(lambda x: 1 if x == "spam" else 0)

In [7]:
mail.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [8]:
mail.shape

(5572, 3)

In [9]:
X_train, X_test, y_train, y_test = train_test_split(mail.Message, mail.spam, test_size = 0.2)

In [10]:
X_train.shape, X_test.shape

((4457,), (1115,))

In [11]:
type(X_train)

pandas.core.series.Series

In [12]:
X_train[:5]

4860    Hey, a guy I know is breathing down my neck to...
2171    CAN I PLEASE COME UP NOW IMIN TOWN.DONTMATTER ...
1557    Good sleep is about rhythm. The person has to ...
5476    Yes princess! I want to please you every night...
4528    Understand. his loss is my gain :) so do you w...
Name: Message, dtype: object

In [13]:
type(y_train)

pandas.core.series.Series

In [14]:
y_train[:5]

4860    0
2171    0
1557    0
5476    0
4528    0
Name: spam, dtype: int64

In [15]:
type(X_train.values)

numpy.ndarray

<p>Bag Of words</p>

In [16]:
from sklearn.feature_extraction.text import CountVectorizer

In [17]:
cv = CountVectorizer()


In [18]:
X_train_cv = cv.fit_transform(X_train.values)
X_train_cv

<4457x7723 sparse matrix of type '<class 'numpy.int64'>'
	with 59179 stored elements in Compressed Sparse Row format>

In [19]:
X_train_cv.toarray()[:2][0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [20]:
X_train_cv.shape

(4457, 7723)

In [21]:
cv.vocabulary_

{'hey': 3401,
 'guy': 3254,
 'know': 3953,
 'is': 3738,
 'breathing': 1494,
 'down': 2429,
 'my': 4661,
 'neck': 4716,
 'to': 6919,
 'get': 3101,
 'him': 3414,
 'some': 6276,
 'bud': 1536,
 'anyway': 993,
 'you': 7684,
 'be': 1265,
 'able': 749,
 'half': 3274,
 'track': 6990,
 'usf': 7207,
 'tonight': 6951,
 'can': 1625,
 'please': 5242,
 'come': 1907,
 'up': 7166,
 'now': 4834,
 'imin': 3609,
 'town': 6986,
 'dontmatter': 2412,
 'if': 3583,
 'urgoin': 7190,
 'outl8r': 5006,
 'just': 3869,
 'reallyneed': 5593,
 '2docd': 378,
 'dontplease': 2413,
 'dontignore': 2411,
 'mycalls': 4662,
 'no': 4785,
 'thecd': 6799,
 'isv': 3749,
 'important': 3617,
 'tome': 6939,
 '2moro': 392,
 'good': 3164,
 'sleep': 6201,
 'about': 751,
 'rhythm': 5762,
 'the': 6794,
 'person': 5155,
 'has': 3319,
 'establish': 2636,
 'that': 6791,
 'body': 1411,
 'will': 7498,
 'learn': 4052,
 'and': 951,
 'use': 7201,
 'want': 7359,
 'more': 4573,
 'yes': 7669,
 'princess': 5405,
 'every': 2654,
 'night': 4765,
 'you

In [22]:
cv.get_feature_names()[6814]

AttributeError: 'CountVectorizer' object has no attribute 'get_feature_names'

In [23]:
X_train_np = X_train_cv.toarray()
X_train_np[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [24]:
np.where(X_train_np[0]!= 0)

(array([ 749,  993, 1265, 1494, 1536, 2429, 3101, 3254, 3274, 3401, 3414,
        3738, 3953, 4661, 4716, 6276, 6919, 6951, 6990, 7207, 7684],
       dtype=int64),)

In [25]:
X_train[:5][3370]

KeyError: 3370

In [None]:
X_train_np[0][6170]

Naive Bayes

In [26]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()

In [27]:
model.fit(X_train_cv, y_train)

In [28]:
X_test_cv = cv.transform(X_test)

In [29]:
y_pred = model.predict(X_test_cv)

In [30]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.99      0.99       956
           1       0.96      0.91      0.93       159

    accuracy                           0.98      1115
   macro avg       0.97      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115


In [31]:
emails = [
    "How's it going? Got any exciting karaoke type activities planned? I'm debating whether to play football this eve. Feeling lazy though."
]
emails_count = cv.transform(emails)
model.predict(emails_count)

array([0], dtype=int64)

In [32]:
#using sklearn pipeline and reduce number of lines of code
from sklearn.pipeline import Pipeline

clf = Pipeline([
    ("vectorizer", CountVectorizer()),
    ("nb", MultinomialNB())
])

In [33]:
clf.fit(X_train, y_train)

In [34]:
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.99      0.99       956
           1       0.96      0.91      0.93       159

    accuracy                           0.98      1115
   macro avg       0.97      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115


## Exercises 

1. read the data provided in the same directory with name 'movies_sentiment_data.csv' and store it in df variable
2. print the shape of the data
3. print top 5 datapoints

#creating a new column "Category" which represent 1 if the sentiment is positive or 0 if it is negative.
#check the distribution of 'Category' and see whether the Target labels are balanced or not.
#Do the 'train-test' splitting with test size of 20%

Exercise 1

Using sklearn pipeline module create a classification pipeline to classify the movie review's positive or negative.

*use CountVectorizer for pre-processing the text.

*use Random Forest as the classifier with estimators as 50 and criterion as entropy.

*print the classification report.

1. create a pipeline object
2. fit with X_train and y_train
3. get the predictions for X_test and store it in y_pred
4. print the classfication report

Solution1

In [35]:
#import necessary libraries

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from  sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

In [36]:
#1. read the data provided in the same directory with name 'movies_sentiment_data.csv' and store it in df variable
df = pd.read_csv("movies_sentiment_data.csv")

In [37]:
#2. print the shape of the data
print(df.shape)

(19000, 2)


In [38]:
#3. print top 5 datapoints
df.head()

Unnamed: 0,review,sentiment
0,I first saw Jake Gyllenhaal in Jarhead (2005) ...,positive
1,I enjoyed the movie and the story immensely! I...,positive
2,I had a hard time sitting through this. Every ...,negative
3,It's hard to imagine that anyone could find th...,negative
4,This is one military drama I like a lot! Tom B...,positive


In [39]:
#creating a new column "Category" which represent 1 if the sentiment is positive or 0 if it is negative
df["Category"] = df["sentiment"].apply(lambda x: 1 if x == "positive" else 0)

In [40]:
#check the distribution of 'Category' and see whether the Target labels are balanced or not.
df["Category"].value_counts()

1    9500
0    9500
Name: Category, dtype: int64

In [41]:
#Do the 'train-test' splitting with test size of 20%
X_train, X_test, y_train, y_test = train_test_split(df.review, df.Category, test_size = 0.2)

Solution Exercise 1

In [42]:
#1. create a pipeline object

clf = Pipeline([
    ("vectorizer", CountVectorizer()), #initializing the vectorizer
    ("random_forest", (RandomForestClassifier(n_estimators = 50, criterion = "entropy"))) #using the RandomForest classifier
])

In [43]:
#2. fit with X_train and y_train
clf.fit(X_train, y_train)

In [44]:
#3. get the predictions for X_test and store it in y_pred
y_pred = clf.predict(X_test)

In [45]:
#4. print the classfication report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.83      0.83      1910
           1       0.83      0.83      0.83      1890

    accuracy                           0.83      3800
   macro avg       0.83      0.83      0.83      3800
weighted avg       0.83      0.83      0.83      3800


Exercise 2

Using sklearn pipeline module create a classification pipeline to classify the movie review's positive or negative..
Note:

*use CountVectorizer for pre-processing the text.

*use KNN as the classifier with n_neighbors of 10 and metric as 'euclidean'.

*print the classification report.

1. create a pipeline object
2. fit with X_train and y_train
3. get the predictions for X_test and store it in y_pred
4. print the classfication report

Solution Exercise 2

In [46]:
#1. create a pipeline object
clf = Pipeline([
            ("vectorizer", CountVectorizer()),   
            ("KNN", (KNeighborsClassifier(n_neighbors = 10, metric = "euclidean")))   #using the KNN classifier with 10 neighbors 
])

In [47]:
#2. fit with X_train and y_train
clf.fit(X_train, y_train)

In [48]:
#3. get the predictions for X_test and store it in y_pred
y_pred = clf.predict(X_test)

In [49]:
#4. print the classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.64      0.62      0.63      1910
           1       0.63      0.65      0.64      1890

    accuracy                           0.64      3800
   macro avg       0.64      0.64      0.64      3800
weighted avg       0.64      0.64      0.64      3800


Exercise 3

Using sklearn pipeline module create a classification pipeline to classify the movie review's positive or negative..
Note:

*use CountVectorizer for pre-processing the text.

*use Multinomial Naive Bayes as the classifier.

*print the classification report.

1. create a pipeline object
2. fit with X_train and y_train
3. get the predictions for X_test and store it in y_pred
4. print the classfication report

Solution Exercise 3

In [50]:
#1. create a pipeline object
clf = Pipeline([
            ("vectorizer", CountVectorizer()),   
            ("Multi NB", MultinomialNB())   #using the Multinomial Naive Bayes classifier 
])

In [51]:
#2. fit with X_train and y_train
clf.fit(X_train, y_train)

In [52]:
#3. get the predictions for X_test and store it in y_pred
y_pred = clf.predict(X_test)

In [53]:
#4. print the classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.87      0.85      1910
           1       0.86      0.83      0.85      1890

    accuracy                           0.85      3800
   macro avg       0.85      0.85      0.85      3800
weighted avg       0.85      0.85      0.85      3800
