<h1 style="text-align:center;color:mediumvioletred">Exercise - Bag of Words</h1>

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the labelled movie reviews from a CSV file in the working directory
# and preview the first rows.
df = pd.read_csv("movies_sentiment_data.csv")
df.head()

Unnamed: 0,review,sentiment
0,I first saw Jake Gyllenhaal in Jarhead (2005) ...,positive
1,I enjoyed the movie and the story immensely! I...,positive
2,I had a hard time sitting through this. Every ...,negative
3,It's hard to imagine that anyone could find th...,negative
4,This is one military drama I like a lot! Tom B...,positive


In [4]:
df.shape

(19000, 2)

In [5]:
df.sentiment.value_counts()

sentiment
positive    9500
negative    9500
Name: count, dtype: int64

In [6]:
# Encode the sentiment label as a binary target: 1 for 'positive', 0 otherwise.
# A vectorized comparison replaces the per-row Python lambda; the explicit
# 'int64' cast keeps the column dtype independent of the platform default.
df['positive'] = (df['sentiment'] == 'positive').astype('int64')

In [7]:
df.head()

Unnamed: 0,review,sentiment,positive
0,I first saw Jake Gyllenhaal in Jarhead (2005) ...,positive,1
1,I enjoyed the movie and the story immensely! I...,positive,1
2,I had a hard time sitting through this. Every ...,negative,0
3,It's hard to imagine that anyone could find th...,negative,0
4,This is one military drama I like a lot! Tom B...,positive,1


In [8]:
# Features are the raw review text; the target is the binary 'positive' flag.
X, y = df['review'], df['positive']

In [9]:
X.head()

0    I first saw Jake Gyllenhaal in Jarhead (2005) ...
1    I enjoyed the movie and the story immensely! I...
2    I had a hard time sitting through this. Every ...
3    It's hard to imagine that anyone could find th...
4    This is one military drama I like a lot! Tom B...
Name: review, dtype: object

In [10]:
y.head()

0    1
1    1
2    0
3    0
4    1
Name: positive, dtype: int64

In [11]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the split
# repeatable. No stratify argument is passed, so the class balance of each
# split is whatever the shuffle produces.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=88)

In [12]:
X_train.shape

(15200,)

In [13]:
X_test.shape

(3800,)

## Model Building

In [103]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

from sklearn.pipeline import Pipeline

### Logistic Regression

In [67]:
# Bag-of-words counts feeding a logistic-regression classifier
# (SAGA solver, seeded for repeatable fits).
lr_model = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),
    ('lr', LogisticRegression(C=1, solver='saga', max_iter=10000, n_jobs=-1, random_state=42)),
])

In [68]:
lr_model.fit(X_train, y_train)

In [69]:
lr_model.score(X_test, y_test)

0.8794736842105263

In [70]:
from sklearn.metrics import classification_report, confusion_matrix

y_pred = lr_model.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.87      0.88      1892
           1       0.88      0.89      0.88      1908

    accuracy                           0.88      3800
   macro avg       0.88      0.88      0.88      3800
weighted avg       0.88      0.88      0.88      3800



### Random Forest Classifier

In [100]:
rf_model.fit(X_train, y_train)

In [99]:
rf_model = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('rf', RandomForestClassifier(
        n_estimators=1000,
        criterion='entropy',
        max_depth=None,
        min_samples_split=5,
        max_features='sqrt',       
        oob_score=True,
        n_jobs=-1
    ))
])

In [101]:
rf_model.score(X_test, y_test)

0.8647368421052631

In [102]:
from sklearn.metrics import classification_report, confusion_matrix

y_pred = rf_model.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.85      0.86      1892
           1       0.86      0.88      0.87      1908

    accuracy                           0.86      3800
   macro avg       0.86      0.86      0.86      3800
weighted avg       0.86      0.86      0.86      3800



### KNN

In [107]:
# Bag-of-words counts feeding a 10-nearest-neighbours classifier that
# measures Euclidean distance between count vectors.
knn_model = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),
    ('knn', KNeighborsClassifier(n_neighbors=10, metric='euclidean', n_jobs=-1)),
])

In [108]:
knn_model.fit(X_train, y_train)

In [109]:
knn_model.score(X_test, y_test)

0.6344736842105263

In [112]:
from sklearn.metrics import classification_report

y_pred = knn_model.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.63      0.64      0.64      1892
           1       0.64      0.63      0.63      1908

    accuracy                           0.63      3800
   macro avg       0.63      0.63      0.63      3800
weighted avg       0.63      0.63      0.63      3800



### Naive Bayes

In [113]:
# Bag-of-words counts feeding a multinomial Naive Bayes classifier.
# The classifier step is named 'nb' so that named_steps / parameter prefixes
# describe the estimator they actually hold.
nb_model = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])

In [114]:
nb_model.fit(X_train, y_train)

In [116]:
nb_model.score(X_test, y_test)

0.8460526315789474

In [117]:
from sklearn.metrics import classification_report

y_pred = nb_model.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.87      0.85      1892
           1       0.87      0.82      0.84      1908

    accuracy                           0.85      3800
   macro avg       0.85      0.85      0.85      3800
weighted avg       0.85      0.85      0.85      3800

