In [186]:
import pandas as pd
import nltk
import glob
import os
import numpy as np
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import naive_bayes
from sklearn.metrics import roc_auc_score

In [187]:
# Load every labelled training file (tab-separated: sentence text, then its label)
# and stack the rows into a single DataFrame used by the training functions.
# NOTE(review): header=0 combined with names=... makes pandas treat the first line
# of each file as a header row and discard it. Confirm the files really start with
# a header line; if they do not, the first sample of each file is being dropped.
all_files = ["imdb_labelled.txt", "amazon_cells_labelled.txt", "yelp_labelled.txt"]
dataframe = pd.concat(
    [
        pd.read_csv(path, sep='\t', names=['txt', 'label'], index_col=None, header=0)
        for path in all_files
    ]
)

In [188]:
def trainNormNB():
    """Train the "normalized" Naive Bayes sentiment classifier.

    The text column of the module-level ``dataframe`` is lower-cased, folded to
    ASCII (accents stripped), filtered against NLTK's English stop words and
    turned into TF-IDF (term frequency-inverse document frequency) features.
    A multinomial Naive Bayes model is then fitted on the training portion of a
    fixed-seed train/test split.

    Note:
        The vectorizer is fitted on every row of ``dataframe`` before the split,
        so its vocabulary and IDF weights also reflect the held-out rows.

    Returns:
        tuple: ``(classifier, vectorizer)`` - the fitted ``MultinomialNB`` and the
        fitted ``TfidfVectorizer`` that must be used to transform new text
        before passing it to the classifier.

    Raises:
        LookupError: from NLTK when the ``stopwords`` corpus is not installed.
    """
    # scikit-learn validates ``stop_words`` as 'english', a list or None; recent
    # releases reject a set, so hand over a de-duplicated, sorted list instead.
    stop_words = sorted(set(stopwords.words('english')))
    norm_vectorizer = TfidfVectorizer(
        use_idf=True,
        lowercase=True,
        strip_accents='ascii',
        stop_words=stop_words,
    )

    # Labels: 0 for negative and 1 for positive sentiment.
    labels = dataframe.label
    # Fit the vocabulary / IDF weights and convert the text to feature vectors.
    features = norm_vectorizer.fit_transform(dataframe.txt)

    # Only the training portion is needed here; the fixed random_state keeps the
    # split (and therefore the trained model) identical from run to run.
    x_training, _, y_training, _ = train_test_split(features, labels, random_state=29)

    norm_nbClassifier = naive_bayes.MultinomialNB()
    norm_nbClassifier.fit(x_training, y_training)

    return norm_nbClassifier, norm_vectorizer

In [189]:
def trainUnNormNB():
    """Train the "unnormalized" Naive Bayes sentiment classifier.

    The text column of the module-level ``dataframe`` is vectorized without IDF
    weighting, without lower-casing and without accent stripping, and no stop
    words are removed. A multinomial Naive Bayes model is then fitted on the
    training portion of a fixed-seed train/test split.

    Note:
        The vectorizer is fitted on every row of ``dataframe`` before the split,
        so its vocabulary also reflects the held-out rows.

    Returns:
        tuple: ``(classifier, vectorizer)`` - the fitted ``MultinomialNB`` and the
        fitted ``TfidfVectorizer`` that must be used to transform new text
        before passing it to the classifier.
    """
    unNorm_vectorizer = TfidfVectorizer(use_idf=False, lowercase=False, strip_accents=None)

    # Labels: 0 for negative and 1 for positive sentiment.
    unNorm_y = dataframe.label
    # Fit the vocabulary and convert the text to feature vectors.
    unNorm_x = unNorm_vectorizer.fit_transform(dataframe.txt)

    # Only the training portion is needed here; the fixed random_state keeps the
    # split (and therefore the trained model) identical from run to run.
    unNorm_x_training, _, unNorm_y_training, _ = train_test_split(
        unNorm_x, unNorm_y, random_state=29
    )

    unNorm_nbClassifier = naive_bayes.MultinomialNB()
    unNorm_nbClassifier.fit(unNorm_x_training, unNorm_y_training)

    # The second element must be the vectorizer: callers use it to transform new
    # text, and the classifier has no ``transform`` method.
    return unNorm_nbClassifier, unNorm_vectorizer


In [200]:
def testNormNB(testfile):
    """Predict sentiment for every row of a test file with the normalized model.

    Trains a fresh model via ``trainNormNB()``, reads ``testfile`` and prints
    the list of predicted labels (one per row, in file order).

    Args:
        testfile: path to a tab-separated file without a header row; the first
            column is read as the text (``txt``), the second as ``label``.

    Returns:
        None. The predictions are printed.
    """
    # header=None means "no header row". Negative header values such as -1 are
    # rejected by current pandas with a ValueError.
    df = pd.read_csv(testfile, sep='\t', names=['txt', 'label'], index_col=None, header=None)

    norm_nbClassifier, norm_vectorizer = trainNormNB()

    # Iterate by position (not by index label) and coerce every cell to str so
    # that missing or non-string values are handled the same way for each row.
    texts = [str(text) for text in df.txt]

    if texts:
        # Vectorize and classify all rows in one call instead of one by one.
        features = norm_vectorizer.transform(texts)
        predict_list = norm_nbClassifier.predict(features).tolist()
    else:
        # An empty file yields no predictions; predict() rejects zero samples.
        predict_list = []

    print(predict_list)

In [201]:
# Train the normalized classifier and print its predicted labels for tester.txt.
testNormNB("tester.txt")

[1, 0, 0, 0, 0, 0, 0, 0, 1]


In [203]:
def testUnNormNB(testfile):
    """Predict sentiment for every row of a test file with the unnormalized model.

    Trains a fresh model via ``trainUnNormNB()``, which is expected to return
    ``(classifier, vectorizer)``, reads ``testfile`` and prints the list of
    predicted labels (one per row, in file order).

    Args:
        testfile: path to a tab-separated file without a header row; the first
            column is read as the text (``txt``), the second as ``label``.

    Returns:
        None. The predictions are printed.
    """
    # header=None means "no header row". Negative header values such as -1 are
    # rejected by current pandas with a ValueError.
    df = pd.read_csv(testfile, sep='\t', names=['txt', 'label'], index_col=None, header=None)

    unNorm_nbClassifier, unNorm_vectorizer = trainUnNormNB()

    # Iterate by position (not by index label) and coerce every cell to str so
    # that missing or non-string values are handled the same way for each row.
    texts = [str(text) for text in df.txt]

    if texts:
        # Vectorize and classify all rows in one call instead of one by one.
        features = unNorm_vectorizer.transform(texts)
        predict_list = unNorm_nbClassifier.predict(features).tolist()
    else:
        # An empty file yields no predictions; predict() rejects zero samples.
        predict_list = []

    print(predict_list)

In [204]:
# Train the unnormalized classifier and print its predicted labels for tester.txt.
testUnNormNB("tester.txt")

NameError: name 'trainUnNormNB' is not defined