In [1]:
import random
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import re
import nltk
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB

[nltk_data] Downloading package punkt to /Users/neeraj/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Method to get data from each file 
def get_data(filename):
    file = open(filename, 'r')
    text = file.read()
    file.close()
    return text    

In [3]:
# Train data
train_pos = os.listdir('data/train/pos') #12500 files
train_neg = os.listdir('data/train/neg')
# Test data
test_pos = os.listdir('data/test/pos')
test_neg = os.listdir('data/test/neg')

In [4]:
train_data = {"text" : [], "sentiment": []}
test_data = {"text": [], "sentiment": []}
pathname = os.getcwd() 
for i in range(len(train_pos)):
    # Fetch positive and negative sentimental data from both train and test set
    train_pos_text = get_data(str(pathname + '/data/train/pos/' + train_pos[i]))
    train_neg_text = get_data(str(pathname + '/data/train/neg/' + train_neg[i]))
    test_pos_text = get_data(str(pathname + '/data/test/pos/' + test_pos[i]))
    test_neg_text = get_data(str(pathname + '/data/test/neg/' + test_neg[i]))
    
    # Append the text and sentiment to the new train dataset
    train_data['text'].append(train_pos_text)
    # Positive sentiment as 1
    train_data['sentiment'].append(1)
    train_data['text'].append(train_neg_text)
    # Negative sentiment as 0
    train_data['sentiment'].append(0)
    
    # Append the text and sentiment to the new test dataset
    test_data['text'].append(test_pos_text)
    # Positive sentiment as 1
    test_data['sentiment'].append(1)
    test_data['text'].append(test_neg_text)
    # Negative sentiment as 0
    test_data['sentiment'].append(0)
    
# Creating pandas datafram for train and test data    
train_data_df = pd.DataFrame(train_data)
test_data_df = pd.DataFrame(test_data)

In [5]:
# Creating csv files for merged test and train data
test_data_df.to_csv(os.getcwd() + '/data/test_data.csv', index=False, header=True) # Test data
train_data_df.to_csv(os.getcwd() + '/data/train_data.csv', index=False, header=True) # Train data

In [6]:
train_data_df.head()

Unnamed: 0,text,sentiment
0,For a movie that gets no respect there sure ar...,1
1,Working with one of the best Shakespeare sourc...,0
2,Bizarre horror movie filled with famous faces ...,1
3,"Well...tremors I, the original started off in ...",0
4,"A solid, if unremarkable film. Matthau, as Ein...",1


In [7]:
# Method for cleaning the dataframe 
def clean_text_content(text):
    # Remove special characters
    pattern = r'[^a-zA-z0-9\s]'
    text = re.sub(pattern, '', text)
    # Remove square brackers 
    text = re.sub('\[[^]]*\][.;:!\'?,\"()\[\]] ', '', text)
    # Converting the text to lowercase 
    text = text.lower()
    # Remove break elements from the text 
    text = re.sub("(<br\s*/><br\s*/>)|(\-)|(\/)", '', text)
    return text

# Cleaing the text part of the datframes 
train_data_df['text'] = train_data_df['text'].apply(clean_text_content)
test_data_df['text'] = test_data_df['text'].apply(clean_text_content)
train_data_df['text'][80]


'mesmerizing breathtaking and horrifying this hauntingly beautiful film is the apocalypse now without fiction slow in pace quiet in mood it gives good glimpses of the poisoned patches of earth that may well be signs of an inevitable doombr br there is no doubt in my mind  the nature is plagued and we are the disease greed the very essence of humanity that drives evolution and progress has turned us into something like cancer on its way to consume the host and die with itbr br manufactured landscapes is quite an unforgettable viewing experience  at least ill never regard my toaster and iron the same way again'

In [7]:
# Tokenize the words 
train_data_df['text'] = train_data_df['text'].apply(word_tokenize)
test_data_df['text'] = test_data_df['text'].apply(word_tokenize)

In [8]:
# Method to remove stopwords from the dataframe
stop_words = set(stopwords.words("english"))
def remove_stopwords(text):
    final_words = [word for word in text if word not in stop_words]
    return " ".join(final_words)
# Removing stopwords from the train and test data reviews 
train_data_df['text'] = train_data_df['text'].apply(remove_stopwords)
test_data_df['text'] = test_data_df['text'].apply(remove_stopwords)

In [9]:
# Creating bag of words model 
normalised_array_train = train_data_df['text']
normalised_array_test = test_data_df['text']
vectorizer = CountVectorizer()
count_train_vectorizer = vectorizer.fit_transform(normalised_array_train)
count_test_vectorizer = vectorizer.transform(normalised_array_test)

# Read the unique words
vectorizer.get_feature_names()

['00',
 '000',
 '0000000000001',
 '00001',
 '00015',
 '000s',
 '001',
 '003830',
 '006',
 '007',
 '0079',
 '0080',
 '0083',
 '0093638',
 '00am',
 '00pm',
 '00s',
 '01',
 '01pm',
 '02',
 '020410',
 '029',
 '03',
 '04',
 '041',
 '05',
 '050',
 '06',
 '06th',
 '07',
 '08',
 '087',
 '089',
 '08th',
 '09',
 '0f',
 '0ne',
 '0r',
 '0s',
 '10',
 '100',
 '1000',
 '1000000',
 '10000000000000',
 '1000lb',
 '1000s',
 '1001',
 '100b',
 '100k',
 '100m',
 '100min',
 '100mph',
 '100s',
 '100th',
 '100x',
 '100yards',
 '101',
 '101st',
 '102',
 '102nd',
 '103',
 '104',
 '1040',
 '1040a',
 '1040s',
 '105',
 '1050',
 '105lbs',
 '106',
 '106min',
 '107',
 '108',
 '109',
 '10am',
 '10lines',
 '10mil',
 '10min',
 '10minutes',
 '10p',
 '10pm',
 '10s',
 '10star',
 '10th',
 '10x',
 '10yr',
 '11',
 '110',
 '1100',
 '11001001',
 '1100ad',
 '111',
 '112',
 '1138',
 '114',
 '1146',
 '115',
 '116',
 '117',
 '11f',
 '11m',
 '11th',
 '12',
 '120',
 '1200',
 '1200f',
 '1201',
 '1202',
 '123',
 '12383499143743701',
 '1

In [10]:
# Fitting a Multinomial Naive Bayes Model 
X_train = count_train_vectorizer
Y_train = train_data_df['sentiment'].values
X_test = count_test_vectorizer
Y_test = test_data_df['sentiment'].values

mnb = MultinomialNB()
# Predicted values of sentiments
Y_prediction = mnb.fit(X_train, Y_train).predict(X_test)

In [11]:
# Getting the accuracy of the model
print('Accuracy: ', accuracy_score(Y_test, Y_prediction))
# 0.82c

# Getting the confusion matrix
print('Confusion Matrix:\n', confusion_matrix(Y_test, Y_prediction))

# Precision score
print('Precision Score: ', precision_score(Y_test, Y_prediction))

# Recall score
print('Recall Score: ', recall_score(Y_test, Y_prediction))

# TRUE POSITIVE

Accuracy:  0.8232
Confusion Matrix:
 [[11026  1474]
 [ 2946  9554]]
Precision Score:  0.8663402248821183
Recall Score:  0.76432


In [None]:
# For Gaussian naive bayes, take a dense graph
# Fitting a Naive Bayes Model 
X_train_ = count_train_vectorizer.toarray()
Y_train_ = train_data_df['sentiment'].values
X_test_ = count_test_vectorizer.toarray()
Y_test_ = test_data_df['sentiment']
gnb = GaussianNB()
# Predicted values of sentiments
Y_prediction_ = gnb.fit(X_train_, Y_train).predict(X_test_)

In [None]:
# Getting the accuracy of the model
print('Accuracy: ', accuracy_score(Y_test_, Y_prediction_))
# 0.67

# Getting the confusion matrix
print('Confusion Matrix:\n', confusion_matrix(Y_test_, Y_prediction_))

# Precision score
print('Precision Score: ', precision_score(Y_test_, Y_prediction_))

# Recall score
print('Recall Score: ', recall_score(Y_test_, Y_prediction_))

# TRUE POSITIVE