In [32]:


import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import classification_report

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))



/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv


In [2]:
df = pd.read_csv("/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv")
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
df.shape

(50000, 2)

In [4]:
df['category'] = df['sentiment'].apply(lambda x: 1 if x=='positive' else 0)

In [5]:
df['category'].value_counts()

category
1    25000
0    25000
Name: count, dtype: int64

In [11]:
X_train,X_test,y_train,y_test = train_test_split(df['review'],
                                                 df['category'],
                                                 test_size=0.2,
                                                 stratify = df['category'],
                                                 random_state=42)

# Bag of words preprocessing technique and Random Forest Classifier

In [13]:
pipe = Pipeline([
    ('vectorizer',CountVectorizer()),
    ('rf',(RandomForestClassifier(n_estimators=50,criterion='entropy')))
])

In [16]:
pipe.fit(X_train,y_train)

In [18]:
train_pred = pipe.predict(X_train)
print(classification_report(train_pred,y_train))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     20000
           1       1.00      1.00      1.00     20000

    accuracy                           1.00     40000
   macro avg       1.00      1.00      1.00     40000
weighted avg       1.00      1.00      1.00     40000



In [19]:
test_pred = pipe.predict(X_test)
print(classification_report(test_pred,y_test))

              precision    recall  f1-score   support

           0       0.85      0.83      0.84      5096
           1       0.83      0.85      0.84      4904

    accuracy                           0.84     10000
   macro avg       0.84      0.84      0.84     10000
weighted avg       0.84      0.84      0.84     10000



# Bag of words preprocessing technique and KNN Classifier

In [23]:
pipe = Pipeline([
    ('vectorizer',CountVectorizer()),
    ('knn',(KNeighborsClassifier(n_neighbors=10,metric='euclidean')))
])

In [24]:
pipe.fit(X_train,y_train)

In [25]:
train_pred = pipe.predict(X_train)
print(classification_report(train_pred,y_train))

              precision    recall  f1-score   support

           0       0.74      0.74      0.74     20048
           1       0.74      0.74      0.74     19952

    accuracy                           0.74     40000
   macro avg       0.74      0.74      0.74     40000
weighted avg       0.74      0.74      0.74     40000



In [26]:
test_pred = pipe.predict(X_test)
print(classification_report(test_pred,y_test))

              precision    recall  f1-score   support

           0       0.65      0.65      0.65      5001
           1       0.65      0.65      0.65      4999

    accuracy                           0.65     10000
   macro avg       0.65      0.65      0.65     10000
weighted avg       0.65      0.65      0.65     10000



# Bag of words preprocessing technique and Naive Bayes

In [28]:
pipe = Pipeline([
    ('vectorizer',CountVectorizer()),
    ('naive_bayes',(MultinomialNB()))
])

In [29]:
pipe.fit(X_train,y_train)

In [30]:
train_pred = pipe.predict(X_train)
print(classification_report(train_pred,y_train))

              precision    recall  f1-score   support

           0       0.92      0.87      0.89     21366
           1       0.86      0.92      0.89     18634

    accuracy                           0.89     40000
   macro avg       0.89      0.89      0.89     40000
weighted avg       0.89      0.89      0.89     40000



In [31]:
test_pred = pipe.predict(X_test)
print(classification_report(test_pred,y_test))

              precision    recall  f1-score   support

           0       0.88      0.83      0.86      5310
           1       0.82      0.87      0.85      4690

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000



# Bag of words preprocessing technique and Decision tree

In [33]:
pipe = Pipeline([
    ('vectorizer',CountVectorizer()),
    ('naive_bayes',(DecisionTreeClassifier()))
])

In [34]:
pipe.fit(X_train,y_train)

In [35]:
train_pred = pipe.predict(X_train)
print(classification_report(train_pred,y_train))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     20000
           1       1.00      1.00      1.00     20000

    accuracy                           1.00     40000
   macro avg       1.00      1.00      1.00     40000
weighted avg       1.00      1.00      1.00     40000



In [36]:
test_pred = pipe.predict(X_test)
print(classification_report(test_pred,y_test))

              precision    recall  f1-score   support

           0       0.73      0.73      0.73      4995
           1       0.73      0.73      0.73      5005

    accuracy                           0.73     10000
   macro avg       0.73      0.73      0.73     10000
weighted avg       0.73      0.73      0.73     10000



In [37]:
# checking with naive bayes

In [38]:
pipe = Pipeline([
    ('vectorizer',CountVectorizer()),
    ('naive_bayes',(MultinomialNB()))
])

In [39]:
pipe.fit(X_train,y_train)

In [50]:
reviews = [
    "well",
    "Not too bad",
    "Terrible",
    "Absolutely terrible writing and dragged-out unnecessary dialogue",
    "Amazing"
]

In [51]:
pipe.predict(reviews)

array([1, 0, 0, 0, 1])