In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [4]:
mail = pd.read_csv("spam_data.csv")

In [5]:
mail.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
mail.Category.value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

In [7]:
mail["spam"] = mail["Category"].apply(lambda x: 1 if x == "spam" else 0)

In [8]:
mail.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [9]:
mail.shape

(5572, 3)

In [10]:
X_train, X_test, y_train, y_test = train_test_split(mail.Message, mail.spam, test_size = 0.2)

In [11]:
X_train.shape, X_test.shape

((4457,), (1115,))

In [12]:
type(X_train)

pandas.core.series.Series

In [13]:
X_train[:5]

3891    Double Mins & 1000 txts on Orange tariffs. Lat...
4145     That's a shame! Maybe cld meet for few hrs tomo?
3788                          WHORE YOU ARE UNBELIEVABLE.
4532    I wish things were different. I wonder when i ...
4964                   I want to see your pretty pussy...
Name: Message, dtype: object

In [14]:
type(y_train)

pandas.core.series.Series

In [15]:
y_train[:5]

3891    1
4145    0
3788    0
4532    0
4964    0
Name: spam, dtype: int64

In [16]:
type(X_train.values)

numpy.ndarray

<p>Bag Of words</p>

In [17]:
from sklearn.feature_extraction.text import CountVectorizer

In [18]:
cv = CountVectorizer()


In [19]:
X_train_cv = cv.fit_transform(X_train.values)
X_train_cv

<4457x7811 sparse matrix of type '<class 'numpy.int64'>'
	with 59072 stored elements in Compressed Sparse Row format>

In [20]:
X_train_cv.toarray()[:2][0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [21]:
X_train_cv.shape

(4457, 7811)

In [22]:
cv.vocabulary_

{'double': 2445,
 'mins': 4557,
 '1000': 264,
 'txts': 7164,
 'on': 5008,
 'orange': 5051,
 'tariffs': 6776,
 'latest': 4086,
 'motorola': 4664,
 'sonyericsson': 6386,
 'nokia': 4876,
 'with': 7614,
 'bluetooth': 1436,
 'free': 3013,
 'call': 1631,
 'mobileupd8': 4611,
 '08000839402': 49,
 'or': 5047,
 'call2optout': 1633,
 'hf8': 3469,
 'that': 6871,
 'shame': 6135,
 'maybe': 4460,
 'cld': 1857,
 'meet': 4484,
 'for': 2968,
 'few': 2849,
 'hrs': 3573,
 'tomo': 7016,
 'whore': 7566,
 'you': 7773,
 'are': 1071,
 'unbelievable': 7198,
 'wish': 7606,
 'things': 6906,
 'were': 7525,
 'different': 2330,
 'wonder': 7645,
 'when': 7546,
 'will': 7583,
 'be': 1295,
 'able': 777,
 'to': 6992,
 'show': 6200,
 'how': 3560,
 'much': 4700,
 'value': 7314,
 'pls': 5331,
 'continue': 2013,
 'the': 6875,
 'brisk': 1544,
 'walks': 7431,
 'no': 4867,
 'drugs': 2489,
 'without': 7617,
 'askin': 1122,
 'me': 4466,
 'please': 5325,
 'and': 983,
 'find': 2880,
 'laugh': 4089,
 'about': 779,
 'love': 4290,
 

In [23]:
cv.get_feature_names()[6814]

AttributeError: 'CountVectorizer' object has no attribute 'get_feature_names'

In [24]:
X_train_np = X_train_cv.toarray()
X_train_np[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [25]:
np.where(X_train_np[0]!= 0)

(array([  49,  264, 1436, 1631, 1633, 2445, 3013, 3469, 4086, 4557, 4611,
        4664, 4876, 5008, 5047, 5051, 6386, 6776, 7164, 7614], dtype=int64),)

In [26]:
X_train[:5][3370]

KeyError: 3370

In [None]:
X_train_np[0][6170]

Naive Bayes

In [33]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()

In [34]:
model.fit(X_train_cv, y_train)

In [35]:
X_test_cv = cv.transform(X_test)

In [36]:
y_pred = model.predict(X_test_cv)

In [37]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       942
           1       1.00      0.91      0.95       173

    accuracy                           0.99      1115
   macro avg       0.99      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115


In [38]:
emails = [
    "How's it going? Got any exciting karaoke type activities planned? I'm debating whether to play football this eve. Feeling lazy though."
]
emails_count = cv.transform(emails)
model.predict(emails_count)

array([0], dtype=int64)

In [39]:
#using sklearn pipeline and reduce number of lines of code
from sklearn.pipeline import Pipeline

clf = Pipeline([
    ("vectorizer", CountVectorizer()),
    ("nb", MultinomialNB())
])

In [40]:
clf.fit(X_train, y_train)

In [41]:
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       942
           1       1.00      0.91      0.95       173

    accuracy                           0.99      1115
   macro avg       0.99      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115


## Exercises 

1. read the data provided in the same directory with name 'movies_sentiment_data.csv' and store it in df variable
2. print the shape of the data
3. print top 5 datapoints

### creating a new column "Category" which represent 1 if the sentiment is positive or 0 if it is negative.
### check the distribution of 'Category' and see whether the Target labels are balanced or not.
### Do the 'train-test' splitting with test size of 20%

Exercise 1

Using sklearn pipeline module create a classification pipeline to classify the movie review's positive or negative.

*use CountVectorizer for pre-processing the text.

*use Random Forest as the classifier with estimators as 50 and criterion as entropy.

*print the classification report.

1. create a pipeline object
2. fit with X_train and y_train
3. get the predictions for X_test and store it in y_pred
4. print the classfication report

Solution1

In [42]:
#import necessary libraries

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from  sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

In [43]:
#1. read the data provided in the same directory with name 'movies_sentiment_data.csv' and store it in df variable
df = pd.read_csv("movies_sentiment_data.csv")

In [44]:
#2. print the shape of the data
print(df.shape)

(19000, 2)


In [45]:
#3. print top 5 datapoints
df.head()

Unnamed: 0,review,sentiment
0,I first saw Jake Gyllenhaal in Jarhead (2005) ...,positive
1,I enjoyed the movie and the story immensely! I...,positive
2,I had a hard time sitting through this. Every ...,negative
3,It's hard to imagine that anyone could find th...,negative
4,This is one military drama I like a lot! Tom B...,positive


In [46]:
#creating a new column "Category" which represent 1 if the sentiment is positive or 0 if it is negative
df["Category"] = df["sentiment"].apply(lambda x: 1 if x == "positive" else 0)

In [47]:
#check the distribution of 'Category' and see whether the Target labels are balanced or not.
df["Category"].value_counts()

1    9500
0    9500
Name: Category, dtype: int64

In [48]:
#Do the 'train-test' splitting with test size of 20%
X_train, X_test, y_train, y_test = train_test_split(df.review, df.Category, test_size = 0.2)

Solution Exercise 1

In [49]:
#1. create a pipeline object

clf = Pipeline([
    ("vectorizer", CountVectorizer()), #initializing the vectorizer
    ("random_forest", (RandomForestClassifier(n_estimators = 50, criterion = "entropy"))) #using the RandomForest classifier
])

In [50]:
#2. fit with X_train and y_train
clf.fit(X_train, y_train)

In [None]:
#3. get the predictions for X_test and store it in y_pred
y_pred = clf.predict(X_test)

In [None]:
#4. print the classfication report
print(classification_report(y_test, y_pred))

Exercise 2

Using sklearn pipeline module create a classification pipeline to classify the movie review's positive or negative..
Note:

*use CountVectorizer for pre-processing the text.

*use KNN as the classifier with n_neighbors of 10 and metric as 'euclidean'.

*print the classification report.

1. create a pipeline object
2. fit with X_train and y_train
3. get the predictions for X_test and store it in y_pred
4. print the classfication report

Solution Exercise 2

In [None]:
#1. create a pipeline object
clf = Pipeline([
            ("vectorizer", CountVectorizer()),   
            ("KNN", (KNeighborsClassifier(n_neighbors = 10, metric = "euclidean")))   #using the KNN classifier with 10 neighbors 
])

In [None]:
#2. fit with X_train and y_train
clf.fit(X_train, y_train)

In [None]:
#3. get the predictions for X_test and store it in y_pred
y_pred = clf.predict(X_test)

In [None]:
#4. print the classification report
print(classification_report(y_test, y_pred))

Exercise 3

Using sklearn pipeline module create a classification pipeline to classify the movie review's positive or negative..
Note:

*use CountVectorizer for pre-processing the text.

*use Multinomial Naive Bayes as the classifier.

*print the classification report.

1. create a pipeline object
2. fit with X_train and y_train
3. get the predictions for X_test and store it in y_pred
4. print the classfication report

Solution Exercise 3

In [None]:
#1. create a pipeline object
clf = Pipeline([
            ("vectorizer", CountVectorizer()),   
            ("Multi NB", MultinomialNB())   #using the Multinomial Naive Bayes classifier 
])

In [None]:
#2. fit with X_train and y_train
clf.fit(X_train, y_train)

In [None]:
#3. get the predictions for X_test and store it in y_pred
y_pred = clf.predict(X_test)

In [None]:
#4. print the classification report
print(classification_report(y_test, y_pred))