In [14]:
import random
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

In [8]:
# Method to get data from each file 
def get_data(filename):
    file = open(filename, 'r')
    text = file.read()
    file.close()
    return text    

In [9]:
# Train data
train_pos = os.listdir('data/train/pos') #12500 files
train_neg = os.listdir('data/train/neg')
# Test data
test_pos = os.listdir('data/test/pos')
test_neg = os.listdir('data/test/neg')

In [15]:
train_data = {"text" : [], "sentiment": []}
test_data = {"text": [], "sentiment": []}
pathname = os.getcwd() 
for i in range(len(train_pos)):
    # Fetch positive and negative sentimental data from both train and test set
    train_pos_text = get_data(str(pathname + '/data/train/pos/' + train_pos[i]))
    train_neg_text = get_data(str(pathname + '/data/train/neg/' + train_neg[i]))
    test_pos_text = get_data(str(pathname + '/data/test/pos/' + test_pos[i]))
    test_neg_text = get_data(str(pathname + '/data/test/neg/' + test_neg[i]))
    
    # Append the text and sentiment to the new train dataset
    train_data['text'].append(train_pos_text)
    # Positive sentiment as 1
    train_data['sentiment'].append(1)
    train_data['text'].append(train_neg_text)
    # Negative sentiment as 0
    train_data['sentiment'].append(0)
    
    # Append the text and sentiment to the new test dataset
    test_data['text'].append(test_pos_text)
    # Positive sentiment as 1
    test_data['sentiment'].append(1)
    test_data['text'].append(test_neg_text)
    # Negative sentiment as 0
    test_data['sentiment'].append(0)
    
# Creating pandas datafram for train and test data    
train_data_df = pd.DataFrame(train_data)
test_data_df = pd.DataFrame(test_data)

In [11]:
# Creating csv files for merged test and train data
test_data_df.to_csv(os.getcwd() + '/data/test_data.csv', index=False, header=True) # Test data
train_data_df.to_csv(os.getcwd() + '/data/train_data.csv', index=False, header=True) # Train data

In [12]:
train_data_df.head()

Unnamed: 0,text,sentiment
0,For a movie that gets no respect there sure ar...,1
1,Working with one of the best Shakespeare sourc...,0
2,Bizarre horror movie filled with famous faces ...,1
3,"Well...tremors I, the original started off in ...",0
4,"A solid, if unremarkable film. Matthau, as Ein...",1


In [7]:
# Method for cleaning the dataframe 
def clean_text_content(text):
    # Remove special characters
    pattern = r'[^a-zA-z0-9\s]'
    text = re.sub(pattern, '', text)
    # Remove square brackers 
    text = re.sub('\[[^]]*\][.;:!\'?,\"()\[\]] ', '', text)
    # Converting the text to lowercase 
    text = text.lower()
    # Remove break elements from the text 
    text = re.sub("(<br\s*/><br\s*/>)|(\-)|(\/)", '', text)
    return text

# Cleaing the text part of the datframes 
train_data_df['text'] = train_data_df['text'].apply(clean_text_content)
test_data_df['text'] = test_data_df['text'].apply(clean_text_content)
train_data_df['text'][80]


'mesmerizing breathtaking and horrifying this hauntingly beautiful film is the apocalypse now without fiction slow in pace quiet in mood it gives good glimpses of the poisoned patches of earth that may well be signs of an inevitable doombr br there is no doubt in my mind  the nature is plagued and we are the disease greed the very essence of humanity that drives evolution and progress has turned us into something like cancer on its way to consume the host and die with itbr br manufactured landscapes is quite an unforgettable viewing experience  at least ill never regard my toaster and iron the same way again'

In [8]:
# Tokenize the words 
train_data_df['text'] = train_data_df['text'].apply(word_tokenize)
test_data_df['text'] = test_data_df['text'].apply(word_tokenize)

In [9]:
# Method to remove stopwords from the dataframe
stop_words = set(stopwords.words("english"))
def remove_stopwords(text):
    final_words = [word for word in text if word not in stop_words]
    return " ".join(final_words)
# Removing stopwords from the train and test data reviews 
train_data_df['text'] = train_data_df['text'].apply(remove_stopwords)
test_data_df['text'] = test_data_df['text'].apply(remove_stopwords)

In [10]:
# Creating bag of words model 
normalised_array_train = train_data_df['text']
normalised_array_test = test_data_df['text']
vectorizer = CountVectorizer(min_df=0, max_df=1, binary=False, ngram_range=(1, 3))
count_train_vectorizer = vectorizer.fit_transform(normalised_array_train)
count_test_vectorizer = vectorizer.fit_transform(normalised_array_test)

# Read the unique words with uni, bi and trigrams
vectorizer.get_feature_names()
# Reference: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

['00 agent difficult',
 '00 agent level',
 '00 comes',
 '00 comes back',
 '00 including',
 '00 including unusual',
 '00 schneider directly',
 '00 schneider murderervillain',
 '00 schneider opinion',
 '000',
 '000 000',
 '000 000 overboard',
 '000 overboard',
 '00000001',
 '00000001 population',
 '00000001 population states',
 '001',
 '001 believe',
 '001 believe chases',
 '002',
 '002 hope',
 '002 hope could',
 '0069',
 '0069 tries',
 '0069 tries vacillate',
 '007 active',
 '007 active young',
 '007 aficionado',
 '007 aficionado nevertheless',
 '007 appearances',
 '007 appearances script',
 '007 atmospherebr',
 '007 atmospherebr br',
 '007 back',
 '007 back one',
 '007 best',
 '007 best movie',
 '007 blue',
 '007 blue water',
 '007 cool',
 '007 cool tough',
 '007 debut',
 '007 debut goldeneyebr',
 '007 decides',
 '007 decides oldschool',
 '007 didnt',
 '007 didnt swing',
 '007 difficulty',
 '007 difficulty modern',
 '007 eighties',
 '007 eighties yet',
 '007 facing',
 '007 facing quest

In [11]:
# Fitting a Naive Bayes Model 
X_train = count_train_vectorizer.toarray()
Y_train = train_data_df['sentiment'].values
X_test = count_test_vectorizer.toarray()
Y_test = test_data_df['sentiment']
gnb = GaussianNB()
# Predicted values of sentiments
y_prediction = gnb.fit(X_train, Y_train).predict(X_test)

In [12]:
# Getting the accuracy of the model
print('Accuracy: ', accuracy_score(Y_test, Y_prediction))
# 0.82c

# Getting the confusion matrix
print('Confusion Matrix:\n', confusion_matrix(Y_test, Y_prediction))

# Precision score
print('Precision Score: ', precision_score(Y_test, Y_prediction))

# Recall score
print('Recall Score: ', recall_score(Y_test, Y_prediction))

# TRUE POSITIVE