In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy as sp
import csv
import nltk
import pickle
import operator
import tkinter as tk
from tkinter import *

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error

import time

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subfolders, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))
    
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Loading the train data and the test data**

In [None]:
# Read the two CSV files of the dataset and pool them, so the notebook can carve
# out its own train / dev / test partitions.
train_data = pd.read_csv("/kaggle/input/imdb-50k-movie-reviews-test-your-bert/train.csv")
test_data = pd.read_csv("/kaggle/input/imdb-50k-movie-reviews-test-your-bert/test.csv")

movie_data = [train_data, test_data]
# ignore_index=True gives the pooled frame a unique 0..n-1 index; without it the
# row labels of the first file would be repeated by the second file.
final_data = pd.concat(movie_data, ignore_index=True)

# Positional 80% / 10% / 10% split. Rows are taken in file order (no shuffling).
split_1 = int(0.8 * len(final_data))
split_2 = int(0.9 * len(final_data))

# .copy() makes every partition an independent frame, so adding or rewriting a
# column on it later does not raise SettingWithCopyWarning.
train_data = final_data.iloc[:split_1].copy()
dev_data = final_data.iloc[split_1:split_2].copy()
test_data = final_data.iloc[split_2:].copy()

In [None]:
# Call info() (the bare attribute only shows the bound-method repr) to print the
# column names, non-null counts and dtypes of the training split.
train_data.info()

In [None]:
# Call info() (the bare attribute only shows the bound-method repr) to print the
# column names, non-null counts and dtypes of the test split.
test_data.info()

In [None]:
# Call info() (the bare attribute only shows the bound-method repr) to print the
# column names, non-null counts and dtypes of the dev split.
dev_data.info()

**We add another column, `sentiment_int`, in which negative reviews are labelled 0 and positive reviews are labelled 1**

In [None]:
# Preview the first five rows of the training split (head() defaults to n=5).
train_data.head()

In [None]:
# Preview the first five rows of the test split (head() defaults to n=5).
test_data.head()


In [None]:
# Preview the first five rows of the dev split (head() defaults to n=5).
dev_data.head()

In [None]:
def _sentiment_to_int(frame):
    """Return a 0/1 Series for ``frame['sentiment']``.

    A row gets 1 when its ``sentiment`` value equals ``'pos'`` and 0 for any
    other value, which is the rule the per-row loop applied before.
    """
    return (frame['sentiment'] == 'pos').astype(int)


# One vectorised comparison per split replaces three copy-pasted iterrows loops.
# ``assign`` returns a new frame instead of setting a column on the existing
# object, so no SettingWithCopyWarning is raised when the split is a slice of a
# larger frame.
train_data = train_data.assign(sentiment_int=_sentiment_to_int(train_data))
test_data = test_data.assign(sentiment_int=_sentiment_to_int(test_data))
dev_data = dev_data.assign(sentiment_int=_sentiment_to_int(dev_data))

In [None]:
# First ten rows of the training split, now including sentiment_int.
train_data.head(n=10)

In [None]:
# First ten rows of the test split, now including sentiment_int.
test_data.head(n=10)

In [None]:
# First five rows of the dev split, now including sentiment_int.
dev_data.head(5)

In [None]:
# Report the dimensions of the training split.
nRow = train_data.shape[0]
nCol = train_data.shape[1]
print(f'There are {nRow} rows and {nCol} columns of training data')


In [None]:
# Report the dimensions of the test split.
nRow = test_data.shape[0]
nCol = test_data.shape[1]
print(f'There are {nRow} rows and {nCol} columns of test data')

In [None]:
# Report the dimensions of the dev split.
nRow = dev_data.shape[0]
nCol = dev_data.shape[1]
print(f'There are {nRow} rows and {nCol} columns of development data')

In [None]:
# Histogram of the 0/1 labels in the training split.
train_labels = train_data['sentiment_int']
train_labels.plot(kind='hist')
plt.show()

In [None]:
# Histogram of the 0/1 labels in the test split.
test_data['sentiment_int'].plot.hist()
# Render the figure explicitly, as the training-split histogram cell does, so
# the cell output is the plot only and not the Axes repr.
plt.show()

In [None]:
# Histogram of the 0/1 labels in the dev split.
dev_data['sentiment_int'].plot.hist()
# Render the figure explicitly, as the training-split histogram cell does, so
# the cell output is the plot only and not the Axes repr.
plt.show()

In [None]:
# Size of the training split and the number of rows carrying each label.
train_len = train_data.shape[0]
print('total train records:', train_len)

is_positive = train_data['sentiment_int'] == 1
is_negative = train_data['sentiment_int'] == 0
pos_len = len(train_data[is_positive])
neg_len = len(train_data[is_negative])

print('positive records:', pos_len)
print('negative records:', neg_len)

In [None]:
# Size of the test split and the number of rows carrying each label.
test_len = len(test_data)
# The label names the split being counted (it used to say "train" here).
print('total test records:', test_len)

pos_len = len(test_data[test_data['sentiment_int'] == 1])
neg_len = len(test_data[test_data['sentiment_int'] == 0])

print('positive records:', pos_len)
print('negative records:', neg_len)

In [None]:
# Size of the dev split and the number of rows carrying each label.
dev_len = len(dev_data)
# The label names the split being counted (it used to say "train" here).
print('total dev records:', dev_len)

pos_len = len(dev_data[dev_data['sentiment_int'] == 1])
neg_len = len(dev_data[dev_data['sentiment_int'] == 0])

print('positive records:', pos_len)
print('negative records:', neg_len)

**We now check each split for duplicate rows and remove them, since duplicates might affect the accuracy**

In [None]:
# Count rows of the training split that exactly repeat an earlier row.
train_duplicate_count = train_data.duplicated().sum()
print(f'Number of duplicates in the data = {train_duplicate_count}')

In [None]:
# Keep the first occurrence of every fully identical row (the drop_duplicates
# defaults: subset=None, keep='first').
# NOTE(review): the later cells that vectorise the text and fit the models read
# train_data, not movie_train — confirm whether the de-duplicated frame was
# meant to be used there.
movie_train = train_data.drop_duplicates()
movie_train

In [None]:
# Count rows of the test split that exactly repeat an earlier row.
test_duplicate_count = test_data.duplicated().sum()
print(f'Number of duplicates in the data = {test_duplicate_count}')

In [None]:
# Keep the first occurrence of every fully identical row (the drop_duplicates
# defaults: subset=None, keep='first').
# NOTE(review): the later cells that vectorise the text and score the models
# read test_data, not movie_test — confirm whether the de-duplicated frame was
# meant to be used there.
movie_test = test_data.drop_duplicates()
movie_test

In [None]:
# Count rows of the dev split that exactly repeat an earlier row.
dev_duplicate_count = dev_data.duplicated().sum()
print(f'Number of duplicates in the data = {dev_duplicate_count}')

In [None]:
# Keep the first occurrence of every fully identical row (the drop_duplicates
# defaults: subset=None, keep='first').
# NOTE(review): the later cells that vectorise the text and score the models
# read dev_data, not movie_dev — confirm whether the de-duplicated frame was
# meant to be used there.
movie_dev = dev_data.drop_duplicates()
movie_dev

**Now we check for missing (null) values so that they can be removed; luckily, the data has none**

In [None]:
# Number of missing values in each column of the training split
# (isna is the same check as isnull).
train_data.isna().sum()

In [None]:
# Lower-case the review text of the training split.
# ``assign`` returns a new frame with the replaced column instead of writing
# into the existing object, so no SettingWithCopyWarning is raised when
# train_data is a slice of a larger frame. Re-running the cell is harmless:
# lower-casing already lower-cased text changes nothing.
train_data = train_data.assign(text=train_data["text"].str.lower())
train_data

In [None]:
# Display the training frame as the cell output.
train_data

In [None]:
# Tally the training rows per label, then order the tally by label value,
# highest label first.
label_tally = train_data['sentiment_int'].value_counts()
ratings_counts = label_tally.sort_index(ascending=False)
print("Count of each rating value:\n", ratings_counts)

In [None]:
# Smallest and largest per-label counts; they set the y-axis window below.
min_count = min(count for _label, count in ratings_counts.items())
max_count = max(count for _label, count in ratings_counts.items())

# (label, count) pairs ordered by label value, lowest label first.
ratings_count_list = sorted(ratings_counts.items(), key=lambda pair: pair[0])

ratingVal = [label for label, _count in ratings_count_list]
countVal = [count for _label, count in ratings_count_list]

# One bar per label; the y-axis is cropped to a window around the counts rather
# than starting at zero.
df = pd.DataFrame({'Rating Count': countVal}, index=ratingVal)
ax = df.plot.bar(rot=0)
ax.set(ylim=[min_count - 10000, max_count + 5000])

In [None]:
# Draw 80% of the rows of each label, so the sample keeps the label proportions
# of the training split.
# random_state pins the draw: re-running the cell (or Restart & Run All) now
# yields the same sample instead of a different one every time.
small_df = train_data.groupby('sentiment_int').apply(
    lambda x: x.sample(frac=0.8, random_state=42)
)
original_len_small_df = len(small_df)
print(original_len_small_df)

**Now we implement multiple classifiers**

In [None]:
# Fit the TF-IDF vectoriser on the training text only and keep the training
# matrix as a dense array.
tfidfconverter = TfidfVectorizer(min_df=0.002)
X_train = tfidfconverter.fit_transform(train_data['text']).toarray()

# Dev and test text go through the already-fitted vectoriser (transform only).
# X_dev / X_test are the matrices returned by transform(); the *_arr variants
# are dense copies of them.
X_dev = tfidfconverter.transform(dev_data['text'])
X_test = tfidfconverter.transform(test_data['text'])
X_dev_arr = X_dev.toarray()
X_test_arr = X_test.toarray()

# Targets: the 0/1 sentiment_int column of each split as a NumPy array.
y_train = train_data['sentiment_int'].to_numpy()
y_dev = dev_data['sentiment_int'].to_numpy()
y_test = test_data['sentiment_int'].to_numpy()


In [None]:
# Result registries, all keyed by classifier display name:
# dev-set MSE, dev-set accuracy (in percent) and the fitted estimator itself.
mse_dict, accuracy_dict, classifier_dict = {}, {}, {}

First we implement the Multinomial Naive Bayes classifier

In [None]:
start_time = time.time()

# Fit Multinomial Naive Bayes on the training features and predict the dev split.
multinomialNB = MultinomialNB(alpha=1)
multinomialNB.fit(X_train, y_train)
classifier_dict["Multinomial Naive Bayes"] = multinomialNB
y_pred_mnb_dev = multinomialNB.predict(X_dev)

# The predictions were made on X_dev, so they must be scored against y_dev.
# Scoring them against y_test would compare labels of unrelated rows.
mse_mnb_dev = mean_squared_error(y_dev, y_pred_mnb_dev)
accuracy_mnb_dev = accuracy_score(y_dev, y_pred_mnb_dev) * 100

# Print the Mean Squared Error and Accuracy (accuracy is in percent)
print("Using Multinomial Naive Bayes:")
print("Mean Squared Error:", mse_mnb_dev)
print("Accuracy:", accuracy_mnb_dev)

# Store the Mean Squared Error and Accuracy in dictionaries
mse_dict["Multinomial Naive Bayes"] = mse_mnb_dev
accuracy_dict["Multinomial Naive Bayes"] = accuracy_mnb_dev

end_time = time.time()
print("runtime: %s sec" % (end_time - start_time))

**We implement the Support Vector Machine (SVM) classifier**

SVM where C=1

In [None]:
start_time = time.time()

# Fit a linear SVM with C=1 on the training features and predict the dev split.
linearSVC1 = LinearSVC(C=1, dual=False)
linearSVC1.fit(X_train, y_train)
classifier_dict["Linear SVC (C=1)"] = linearSVC1
y_pred_lsvc = linearSVC1.predict(X_dev)

# The predictions were made on X_dev, so they must be scored against y_dev.
# Scoring them against y_test would compare labels of unrelated rows.
mse_lsvc1_dev = mean_squared_error(y_dev, y_pred_lsvc)
accuracy_lsvc1_dev = accuracy_score(y_dev, y_pred_lsvc) * 100

# Print the Mean Squared Error and Accuracy (accuracy is in percent)
print("Using Linear SVC (C=1):")
print('Mean Squared Error:', mse_lsvc1_dev)
print('Accuracy:', accuracy_lsvc1_dev)

# Store the Mean Squared Error and Accuracy in dictionaries
mse_dict["Linear SVC (C=1)"] = mse_lsvc1_dev
accuracy_dict["Linear SVC (C=1)"] = accuracy_lsvc1_dev

end_time = time.time()
print("runtime: %s sec" % (end_time - start_time))

SVM where C=100

In [None]:
start_time = time.time()

# Fit a linear SVM with C=100 on the training features and predict the dev split.
linearSVC100 = LinearSVC(C=100, dual=False)
linearSVC100.fit(X_train, y_train)
classifier_dict["Linear SVC (C=100)"] = linearSVC100
y_pred_lsvc = linearSVC100.predict(X_dev)

# The predictions were made on X_dev, so they must be scored against y_dev.
# Scoring them against y_test would compare labels of unrelated rows.
mse_lsvc100_dev = mean_squared_error(y_dev, y_pred_lsvc)
accuracy_lsvc100_dev = accuracy_score(y_dev, y_pred_lsvc) * 100

# Print the Mean Squared Error and Accuracy (accuracy is in percent)
print("Using Linear SVC (C=100):")
print('Mean Squared Error:', mse_lsvc100_dev)
print('Accuracy:', accuracy_lsvc100_dev)

# Store the Mean Squared Error and Accuracy in dictionaries
mse_dict["Linear SVC (C=100)"] = mse_lsvc100_dev
accuracy_dict["Linear SVC (C=100)"] = accuracy_lsvc100_dev

end_time = time.time()
print("runtime: %s sec" % (end_time - start_time))

In [None]:
start_time = time.time()

# Fit a linear SVM with C=1000 on the training features and predict the dev split.
linearSVC1000 = LinearSVC(C=1000, dual=False)
linearSVC1000.fit(X_train, y_train)
classifier_dict["Linear SVC (C=1000)"] = linearSVC1000
y_pred_lsvc = linearSVC1000.predict(X_dev)

# The predictions were made on X_dev, so they must be scored against y_dev.
# Scoring them against y_test would compare labels of unrelated rows.
mse_lsvc1000_dev = mean_squared_error(y_dev, y_pred_lsvc)
accuracy_lsvc1000_dev = accuracy_score(y_dev, y_pred_lsvc) * 100

# Print the Mean Squared Error and Accuracy (accuracy is in percent)
print("Using Linear SVC (C=1000):")
print('Mean Squared Error:', mse_lsvc1000_dev)
print('Accuracy:', accuracy_lsvc1000_dev)

# Store the Mean Squared Error and Accuracy in dictionaries
mse_dict["Linear SVC (C=1000)"] = mse_lsvc1000_dev
accuracy_dict["Linear SVC (C=1000)"] = accuracy_lsvc1000_dev

end_time = time.time()
print("runtime: %s sec" % (end_time - start_time))

**RandomForest Classifier**

In [None]:
start_time = time.time()

# Fit a random forest with 10 trees on the training features and predict the
# dev split. random_state fixes the forest's randomness so a re-run reproduces
# the same model and the same scores.
randomForest10 = RandomForestClassifier(max_depth=100, n_estimators=10, max_features=1, random_state=42)
randomForest10.fit(X_train, y_train)
classifier_dict["Random Forest Classifier (n_estimators=10)"] = randomForest10
y_pred_rfc = randomForest10.predict(X_dev)

# The predictions were made on X_dev, so they must be scored against y_dev.
# Scoring them against y_test would compare labels of unrelated rows.
mse_rfc10_dev = mean_squared_error(y_dev, y_pred_rfc)
accuracy_rfc10_dev = accuracy_score(y_dev, y_pred_rfc) * 100

# Print the Mean Squared Error and Accuracy (accuracy is in percent), like the
# other classifier cells do.
print("Using Random Forest Classifier:")
print('Mean Squared Error:', mse_rfc10_dev)
print('Accuracy:', accuracy_rfc10_dev)

# Store the Mean Squared Error and Accuracy in dictionaries
mse_dict["Random Forest Classifier (n_estimators=10)"] = mse_rfc10_dev
accuracy_dict["Random Forest Classifier (n_estimators=10)"] = accuracy_rfc10_dev

end_time = time.time()
print("runtime: %s sec" % (end_time - start_time))

In [None]:
start_time = time.time()

# Fit a random forest with 50 trees on the training features and predict the
# dev split. random_state fixes the forest's randomness so a re-run reproduces
# the same model and the same scores.
randomForest50 = RandomForestClassifier(max_depth=100, n_estimators=50, max_features=1, random_state=42)
randomForest50.fit(X_train, y_train)
classifier_dict["Random Forest Classifier (n_estimators=50)"] = randomForest50
y_pred_rfc = randomForest50.predict(X_dev)

# The predictions were made on X_dev, so they must be scored against y_dev.
# Scoring them against y_test would compare labels of unrelated rows.
mse_rfc50_dev = mean_squared_error(y_dev, y_pred_rfc)
accuracy_rfc50_dev = accuracy_score(y_dev, y_pred_rfc) * 100

# Print the Mean Squared Error and Accuracy (accuracy is in percent)
print("Using Random Forest Classifier (n_estimators=50):")
print('Mean Squared Error:', mse_rfc50_dev)
print('Accuracy:', accuracy_rfc50_dev)

# Store the Mean Squared Error and Accuracy in dictionaries
mse_dict["Random Forest Classifier (n_estimators=50)"] = mse_rfc50_dev
accuracy_dict["Random Forest Classifier (n_estimators=50)"] = accuracy_rfc50_dev

end_time = time.time()
print("runtime: %s sec" % (end_time - start_time))

Comparing the accuracies of all the classifiers

In [None]:
# Rank the (name, score) pairs: lowest MSE first, highest accuracy first.
mse_dict_list = sorted(mse_dict.items(), key=lambda entry: entry[1])
accuracy_dict_list = sorted(accuracy_dict.items(), key=lambda entry: entry[1], reverse=True)
accuracy_dict_list

In [None]:
# accuracy_dict_list holds (classifier name, accuracy) pairs. Bar heights AND
# bar labels are both taken from it, so every bar is labelled with the
# classifier its accuracy belongs to. Taking the labels from the separately
# sorted MSE list only lines up if both rankings happen to have the same order.
graph_accuracy_list = [item[1] for item in accuracy_dict_list]
graph_classifier_list = [item[0] for item in accuracy_dict_list]
graph_mse_list = [item[1] for item in mse_dict_list]

# y-axis runs from 0 to a little above the best accuracy.
minY = 0
maxY = max(graph_accuracy_list)

df = pd.DataFrame({'Accuracy': graph_accuracy_list}, index=graph_classifier_list)
ax = df.plot(figsize=(7,5), kind='bar', stacked=True)

ax.set(xlabel="Classifiers used", ylabel="Accuracy")

ax.set(ylim=[minY, maxY+2])

**Best Classifier**

In [None]:
# accuracy_dict_list is sorted best-first, so its head is the winning
# (classifier name, accuracy) pair.
highest_accuracy_classifier = next(iter(accuracy_dict_list))
print("Best Classifier considering highest accuracy:", highest_accuracy_classifier)

In [None]:
# Look up the fitted estimator that belongs to the top-ranked classifier name.
top_entry = accuracy_dict_list[0]
best_classifier_name = top_entry[0]
bestClassifier = classifier_dict.get(best_classifier_name)
print(bestClassifier)

In [None]:
# Score the selected classifier on the test split: predictions on X_test are
# compared with y_test and the accuracy is reported in percent.
y_pred_test = bestClassifier.predict(X_test)
test_fraction_correct = accuracy_score(y_test, y_pred_test)
accuracy_test = test_fraction_correct * 100
print("Using Best Classifier:\n")
print('Accuracy:', accuracy_test)

In [None]:
# Classify one review typed by the user with the fitted vectoriser and the
# selected classifier.
input_review = input('Enter your review: ')

# transform() takes an iterable of documents, so a one-element list is enough;
# no intermediate DataFrame is needed.
X_single = tfidfconverter.transform([input_review])
y_single = bestClassifier.predict(X_single)

# y_single is indexed with [0] to get the one predicted label. Compare that
# scalar rather than the whole array, whose truth value is only defined while
# it holds exactly one element.
predicted_label = y_single[0]
print("review: ", predicted_label)
if predicted_label == 1:
    print('positive review')
else:
    print('Negative')

