In [34]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score

In [35]:
# Load the labelled movie-review dataset and preview its first rows.
csv_path = "../Dataset/movies_sentiment_data.csv"
df = pd.read_csv(csv_path)
print(df.head())

                                              review sentiment
0  I first saw Jake Gyllenhaal in Jarhead (2005) ...  positive
1  I enjoyed the movie and the story immensely! I...  positive
2  I had a hard time sitting through this. Every ...  negative
3  It's hard to imagine that anyone could find th...  negative
4  This is one military drama I like a lot! Tom B...  positive


In [36]:
# Give both columns a movie_ prefix, then preview the frame and print its shape.
column_names = {
    'review': 'movie_review',
    'sentiment': 'movie_sentiment',
}
df = df.rename(columns=column_names)
print(df.head())
print(df.shape)

                                        movie_review movie_sentiment
0  I first saw Jake Gyllenhaal in Jarhead (2005) ...        positive
1  I enjoyed the movie and the story immensely! I...        positive
2  I had a hard time sitting through this. Every ...        negative
3  It's hard to imagine that anyone could find th...        negative
4  This is one military drama I like a lot! Tom B...        positive
(19000, 2)


In [37]:
# Encode the label as an integer column: 1 where movie_sentiment is exactly
# 'positive', 0 for every other value (including missing values).
# A vectorized comparison replaces the row-by-row Python lambda; the resulting
# values and int64 dtype are the same.
df['Category'] = df['movie_sentiment'].eq('positive').astype('int64')
print(df.head())

                                        movie_review movie_sentiment  Category
0  I first saw Jake Gyllenhaal in Jarhead (2005) ...        positive         1
1  I enjoyed the movie and the story immensely! I...        positive         1
2  I had a hard time sitting through this. Every ...        negative         0
3  It's hard to imagine that anyone could find th...        negative         0
4  This is one military drama I like a lot! Tom B...        positive         1


In [38]:
# Features are the review text column; the target is the 0/1 Category column.
X, y = df['movie_review'], df['Category']

In [39]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [40]:
# Learn the bag-of-words vocabulary from the training reviews only and encode
# them as a sparse document-term count matrix.
vector = CountVectorizer()
x_train_vector = vector.fit_transform(X_train)
# Slice the sparse matrix BEFORE densifying: calling .toarray() on the full
# matrix allocates a dense (n_documents x vocabulary_size) array just to show
# two rows, which can exhaust memory. The printed output is unchanged.
print(x_train_vector[:2].toarray())

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [41]:
# Train a multinomial Naive Bayes classifier on the training count matrix.
model = MultinomialNB()
model.fit(X=x_train_vector, y=y_train)

In [42]:
# Encode the held-out reviews with the vectorizer already fitted on the
# training set (transform only, no refit), then predict their labels with the
# Naive Bayes model.
x_test_vector = vector.transform(raw_documents=X_test)
y_pred = model.predict(X=x_test_vector)

In [43]:
# Score the Naive Bayes predictions against the held-out labels.
# Label 0 is reported as 'negative' and label 1 as 'positive'.
label_names = [ 'negative','positive']
nb_accuracy = accuracy_score(y_test, y_pred)
nb_confusion = confusion_matrix(y_test, y_pred)
nb_report = classification_report(y_test, y_pred, target_names=label_names)

print("Accuracy: ", nb_accuracy)
print("Confusion Matrix:\n", nb_confusion)
print("Classification Report:\n", nb_report)

Accuracy:  0.8436842105263158
Confusion Matrix:
 [[1635  229]
 [ 365 1571]]
Classification Report:
               precision    recall  f1-score   support

    negative       0.82      0.88      0.85      1864
    positive       0.87      0.81      0.84      1936

    accuracy                           0.84      3800
   macro avg       0.85      0.84      0.84      3800
weighted avg       0.85      0.84      0.84      3800



In [44]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

In [45]:
# Bag-of-words features feeding a random forest, wrapped in a single pipeline
# so the vectorizer is fitted together with the classifier on whatever data
# the pipeline's fit() receives.
# random_state is pinned because the forest is randomized; without a seed the
# reported metrics change from run to run. 42 matches the seed used for the
# train/test split.
# NOTE(review): the name `random` shadows the stdlib module of the same name;
# it is kept because later cells call random.fit / random.predict.
random = Pipeline([
    ('vector', CountVectorizer()),
    ('random', RandomForestClassifier(random_state=42)),
])

In [46]:
# Fit the vectorizer + random-forest pipeline on the raw training reviews.
random.fit(X=X_train, y=y_train)

In [47]:
# Predict held-out labels with the random-forest pipeline.
# NOTE: this rebinds y_pred, replacing the Naive Bayes predictions made earlier.
y_pred = random.predict(X=X_test)

In [48]:
# Report the random-forest pipeline's metrics on the held-out set.
# Label 0 is reported as 'negative' and label 1 as 'positive'.
print("Random Forest Accuracy is: ",accuracy_score(y_test,y_pred))
print("Confusion Matrix is: \n",confusion_matrix(y_test,y_pred))
# Heading spelling corrected (was "Classfication").
print("Classification Report: \n",classification_report(y_test,y_pred,target_names=['negative','positive']))

Random Forest Accuracy is:  0.8481578947368421
Confusion Matrix is: 
 [[1590  274]
 [ 303 1633]]
Classfication Report: 
               precision    recall  f1-score   support

    negative       0.84      0.85      0.85      1864
    positive       0.86      0.84      0.85      1936

    accuracy                           0.85      3800
   macro avg       0.85      0.85      0.85      3800
weighted avg       0.85      0.85      0.85      3800

