In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

In [8]:
df = pd.read_csv('IMDB Dataset.csv')

In [7]:
df.shape

(50000, 2)

In [10]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


# Create new column (Category)

In [14]:
df['Category'] = df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)

In [15]:
df.head()

Unnamed: 0,review,sentiment,Category
0,One of the other reviewers has mentioned that ...,positive,1
1,A wonderful little production. <br /><br />The...,positive,1
2,I thought this was a wonderful way to spend ti...,positive,1
3,Basically there's a family where a little boy ...,negative,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1


In [21]:
df.isnull().sum()

review       0
sentiment    0
Category     0
dtype: int64

In [29]:
df['Category'].value_counts()

Category
1    25000
0    25000
Name: count, dtype: int64

In [16]:
X_train, X_test, y_train, y_test = train_test_split(df.review, df.Category, test_size=0.2)

In [18]:
print(df.shape)
print(X_train.shape)
print(X_test.shape)

(50000, 3)
(40000,)
(10000,)


# CountVectorizer and RandomForestClassifier

In [23]:
clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('rf', RandomForestClassifier()) 
])

clf.fit(X_train, y_train)

In [24]:
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.85      0.85      4968
           1       0.85      0.85      0.85      5032

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000



# CountVectorizer and KNeighborsClassifier

In [25]:
knn = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('knn', KNeighborsClassifier())
])

knn.fit(X_train, y_train)

In [26]:
y_pred = knn.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.65      0.58      0.61      4968
           1       0.63      0.69      0.66      5032

    accuracy                           0.64     10000
   macro avg       0.64      0.64      0.64     10000
weighted avg       0.64      0.64      0.64     10000



# CountVectorizer and MultinomialNB

In [27]:
mnb = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])

mnb.fit(X_train, y_train)

In [28]:
y_pred = mnb.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.88      0.85      4968
           1       0.88      0.82      0.85      5032

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000

