In [1]:
import os
import codecs 
import sys
import pandas as pd

def create_dataset(directory):
	data = []

	# Files counter.
	file_count = 0

	# Loop for files.
	for filename in os.listdir(directory):
		
		file = os.path.join(directory, filename)
		if os.path.isfile(file):
			with codecs.open(file, "r", encoding="utf-8") as f:
				
				# File name parsing to get id and rating.
				split_extension = filename.split(".")
				split_id_rating = split_extension[0].split("_")
				id_str = split_id_rating[0]
				rating_str = split_id_rating[1]
				rating = -1

				try:
					rating = int(rating_str)
				except ValueError:
					sys.exit("Error casting rating to int")

				review = f.read()
				sentiment = 1 if directory.__contains__("pos") else 0
				data.append([review, rating, sentiment])
				file_count += 1
				
	print("file_count : {}".format(file_count))
	return pd.DataFrame(data, columns=["review", "rating", "sentiment"])

In [2]:
df_train_pos = create_dataset("./data/train/pos/")
df_train_pos

file_count : 12500


Unnamed: 0,review,rating,sentiment
0,Bromwell High is a cartoon comedy. It ran at t...,9,1
1,Homelessness (or Houselessness as George Carli...,8,1
2,Brilliant over-acting by Lesley Ann Warren. Be...,10,1
3,This is easily the most underrated film inn th...,7,1
4,This is not the typical Mel Brooks film. It wa...,8,1
...,...,...,...
12495,"Seeing as the vote average was pretty low, and...",9,1
12496,"The plot had some wretched, unbelievable twist...",8,1
12497,I am amazed at how this movie(and most others ...,10,1
12498,A Christmas Together actually came before my t...,8,1


In [3]:
df_train_neg = create_dataset("./data/train/neg/")
df_train_neg

file_count : 12500


Unnamed: 0,review,rating,sentiment
0,Story of a man who has unnatural feelings for ...,3,0
1,Airport '77 starts as a brand new luxury 747 p...,4,0
2,This film lacked something I couldn't put my f...,4,0
3,"Sorry everyone,,, I know this is supposed to b...",1,0
4,When I was little my parents took me along to ...,1,0
...,...,...,...
12495,"Towards the end of the movie, I felt it was to...",4,0
12496,This is the kind of movie that my enemies cont...,3,0
12497,I saw 'Descent' last night at the Stockholm Fi...,3,0
12498,Some films that you pick up for a pound turn o...,1,0


In [4]:
df_train = pd.concat([df_train_pos, df_train_neg], axis=0)
df_train.reset_index(drop=True, inplace=True)
df_train

Unnamed: 0,review,rating,sentiment
0,Bromwell High is a cartoon comedy. It ran at t...,9,1
1,Homelessness (or Houselessness as George Carli...,8,1
2,Brilliant over-acting by Lesley Ann Warren. Be...,10,1
3,This is easily the most underrated film inn th...,7,1
4,This is not the typical Mel Brooks film. It wa...,8,1
...,...,...,...
24995,"Towards the end of the movie, I felt it was to...",4,0
24996,This is the kind of movie that my enemies cont...,3,0
24997,I saw 'Descent' last night at the Stockholm Fi...,3,0
24998,Some films that you pick up for a pound turn o...,1,0


In [5]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

from nltk.tag import pos_tag

from nltk.tokenize import word_tokenize

import string
from bs4 import BeautifulSoup

DEBUG = False
TEST = 2000

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\micka\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
def preprocess(df):
	corpus = []
	for index, row in df.iterrows():
		review = row["review"]
		sentiment = row["sentiment"]
		
		review = BeautifulSoup(review, "html.parser").text

		verbs = ["VBP", "VBN", "VBG", "VBD", "VB"]
		proper_noun = ["NNS", "NNP", "NNPS"]
		review = ' '.join(
			[lemmatizer.lemmatize(word, pos = "v") if tag in verbs 
			else lemmatizer.lemmatize(word) 
			for word, tag in pos_tag(word_tokenize(review)) 
			if tag not in proper_noun])
		
		review = review.translate(str.maketrans('', '', string.punctuation))
		
		review = ' '.join([word.lower() for word in review.split()])

		review = ' '.join([word for word in review.split() if word.lower() not in ["movie", "film", "story"]])
		
		review = ' '.join([word for word in review.split() if not word.isnumeric()])
		
		review = ' '.join([word for word in review.split() if word.lower() not in stop_words])

		corpus.append((review, sentiment))

	return corpus

In [7]:
if DEBUG:
	df_train = df_train.sample(TEST)

In [8]:
corpus_train = preprocess(df_train)



In [9]:
corpus_train[0]

('cartoon comedy run time school life teaching profession lead believe satire much closer reality scramble survive financially insightful see right pathetic pomp pettiness whole situation remind know saw episode student repeatedly try burn school immediately recall classic line inspector sack one student welcome expect many age think far fetch pity nt',
 1)

In [10]:
df_test_pos = create_dataset("./data/test/pos/")
df_test_neg = create_dataset("./data/test/neg/")
df_test = pd.concat([df_test_pos, df_test_neg], axis=0)
df_test.reset_index(drop=True, inplace=True)
df_test

file_count : 12500
file_count : 12500


Unnamed: 0,review,rating,sentiment
0,I went and saw this movie last night after bei...,10,1
1,Actor turned director Bill Paxton follows up h...,7,1
2,As a recreational golfer with some knowledge o...,9,1
3,"I saw this film in a sneak preview, and it is ...",8,1
4,Bill Paxton has taken the true story of the 19...,8,1
...,...,...,...
24995,I occasionally let my kids watch this garbage ...,1,0
24996,When all we have anymore is pretty much realit...,1,0
24997,The basic genre is a thriller intercut with an...,3,0
24998,Four things intrigued me as to this film - fir...,3,0


In [11]:
if DEBUG:
	df_test = df_test.sample(TEST)

In [12]:
corpus_test = preprocess(df_test)



In [13]:
corpus_test[0]

('go saw last night coax mine admit reluctant see know able comedy wrong play character well play professionalism sign good toy one exactly entire theater sell overcome laughter first half move second half exit theater saw many many full grow well try desperately let anyone see cry great suggest go see judge',
 1)

In [15]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features=1500)
corpus_train_review = [review for review, sentiment in corpus_train]
X_train = cv.fit_transform(corpus_train_review).toarray()
y_train = df_train["sentiment"].values

X_train.shape, y_train.shape

((25000, 1500), (25000,))

In [23]:
from sklearn.naive_bayes import (
    BernoulliNB,
    ComplementNB,
    MultinomialNB,
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
# from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [17]:
classifiers = {
    "BernoulliNB": BernoulliNB(),
    "ComplementNB": ComplementNB(),
    "MultinomialNB": MultinomialNB(),
    "KNeighborsClassifier": KNeighborsClassifier(),
    "DecisionTreeClassifier": DecisionTreeClassifier(),
    "RandomForestClassifier": RandomForestClassifier(),
    # "LogisticRegression": LogisticRegression(),
    "MLPClassifier": MLPClassifier(max_iter=1000),
    "AdaBoostClassifier": AdaBoostClassifier(),
}

In [19]:
corpus_test_review = [review for review, sentiment in corpus_test]
X_test = cv.fit_transform(corpus_test_review).toarray()
y_test = df_test["sentiment"].values

X_test.shape, y_test.shape

((25000, 1500), (25000,))

In [22]:
from sklearn.metrics import confusion_matrix, classification_report

for name, classifier in classifiers.items():
	print(name)
	classifier.fit(X_train, y_train)
	y_pred = classifier.predict(X_test)
	print(confusion_matrix(y_test, y_pred))
	print(classification_report(y_test, y_pred))

BernoulliNB
[[6854 5646]
 [3304 9196]]
              precision    recall  f1-score   support

           0       0.67      0.55      0.60     12500
           1       0.62      0.74      0.67     12500

    accuracy                           0.64     25000
   macro avg       0.65      0.64      0.64     25000
weighted avg       0.65      0.64      0.64     25000

ComplementNB
[[7817 4683]
 [4200 8300]]
              precision    recall  f1-score   support

           0       0.65      0.63      0.64     12500
           1       0.64      0.66      0.65     12500

    accuracy                           0.64     25000
   macro avg       0.64      0.64      0.64     25000
weighted avg       0.64      0.64      0.64     25000

MultinomialNB
[[7817 4683]
 [4200 8300]]
              precision    recall  f1-score   support

           0       0.65      0.63      0.64     12500
           1       0.64      0.66      0.65     12500

    accuracy                           0.64     25000
   macro

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[[9780 2720]
 [7269 5231]]
              precision    recall  f1-score   support

           0       0.57      0.78      0.66     12500
           1       0.66      0.42      0.51     12500

    accuracy                           0.60     25000
   macro avg       0.62      0.60      0.59     25000
weighted avg       0.62      0.60      0.59     25000

MLPClassifier
[[8286 4214]
 [5503 6997]]
              precision    recall  f1-score   support

           0       0.60      0.66      0.63     12500
           1       0.62      0.56      0.59     12500

    accuracy                           0.61     25000
   macro avg       0.61      0.61      0.61     25000
weighted avg       0.61      0.61      0.61     25000

AdaBoostClassifier
[[7417 5083]
 [4222 8278]]
              precision    recall  f1-score   support

           0       0.64      0.59      0.61     12500
           1       0.62      0.66      0.64     12500

    accuracy                           0.63     25000
   macro avg  