In [61]:
import pandas as pd
import nltk
import glob
import os
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import naive_bayes
from sklearn.metrics import roc_auc_score

In [62]:
#reading all the files for training using pandas and concatenating data from all the training files
all_files = ["imdb_labelled.txt","amazon_cells_labelled.txt","yelp_labelled.txt"]
dataframe = pd.concat(pd.read_csv(file, sep='\t', names = ['txt', 'label'],index_col=None, header=0) for file in all_files)

In [63]:
# Creating a normalized version of Naive Bayes Classifier
# Normalisation of text is done here
stop_set = set(stopwords.words('english'))
vectorizer = TfidfVectorizer(use_idf=True, lowercase=True, strip_accents='ascii', stop_words=stop_set)

y = dataframe.label    #setting dependent variables, the labels 0 for negative and 1 for positive sentiment
x = vectorizer.fit_transform(dataframe.txt)    #transforming data in the dataframe to features from text

# Training testing split
# Using a random state to guarantee the same results whenever training is done
x_training, x_testing, y_training, y_testing = train_test_split(x, y, random_state=23)

# Training the normalized Naive Bayes Classifier
norm_nbClassifier = naive_bayes.MultinomialNB()
norm_nbClassifier.fit(x_training, y_training)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [64]:
# Checking the accuracy of the normalized Naive Bayes classifier
roc_auc_score(y_testing, norm_nbClassifier.predict_proba(x_testing)[:,1])

0.8901946818613484

In [71]:
# Using TFIDF, short for term frequency–inverse document frequency
# This transforms text to feature vectors

# Creating an unnormalized version of Naive Bayes Classifier
unNorm_vectorizer = TfidfVectorizer(use_idf=False, lowercase=False, strip_accents=None)

unNorm_y = dataframe.label    #setting dependent variables, the labels 0 for negative and 1 for positive sentiment
unNorm_x = unNorm_vectorizer.fit_transform(dataframe.txt)    #transforming data in the dataframe to features from text

# Training testing split
# Using a random state to guarantee the same results whenever training is done
unNorm_x_training, unNorm_x_testing, unNorm_y_training, unNorm_y_testing = train_test_split(unNorm_x, unNorm_y, random_state=23)

# Training the unnormalized Naive Bayes Classifier
unNorm_nbClassifier = naive_bayes.MultinomialNB()
unNorm_nbClassifier.fit(unNorm_x_training, unNorm_y_training)


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [72]:
# Checking the accuracy of the unnormalized Naive Bayes classifier
roc_auc_score(unNorm_y_testing, unNorm_nbClassifier.predict_proba(unNorm_x_testing)[:,1])

0.8850435829602497

In [None]:
def testNormNB(testfile):
    