In [1]:
import pandas as pd
from sklearn.utils import shuffle
import os
import unicodedata
import re
import nltk
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
import math
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
def unicode_to_ascii(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s)
                   if unicodedata.category(c) != 'Mn')

In [3]:
def remove_html_markup(s):
    tag = False
    quote = False
    out = ""
    for c in s:
        if c == '<' and not quote:
            tag = True
        elif c == '>' and not quote:
            tag = False
        elif (c == '"' or c == "'") and tag:
            quote = not quote
        elif not tag:
            out = out + c
    return out

In [4]:
def preprocess_sentence(w):
    w = remove_html_markup(w)
    w = w.lower().strip()
    if w == '':
        return 0
    else:
        w = unicode_to_ascii(w)
        w = re.sub(r"[^a-z]+", " ", w)
        w = w.strip()
        w = re.sub(r'\s+', ' ', w)
    w = w.split(' ')
    w = [i for i in w if i != '']
    stopwords = set(nltk.corpus.stopwords.words('english'))
    w = [i for i in w if i not in stopwords]
    stemmer = nltk.stem.porter.PorterStemmer()
    w = [stemmer.stem(i) for i in w]
    w = ' '.join(w)
    return w

In [5]:
def lines_to_text(lines, sep):
    text = ''
    for i in range(len(lines)):
        if i == len(lines) - 1:
            text += str(lines[i])
        else:
            text += str(lines[i]) + sep
    return text

In [6]:
def retrieve_frequent_words(unique_words):
    new_unique_words = ['unk']
    for i in list(unique_words.keys()):
        if unique_words[i] > 5:
            new_unique_words.append(i)
    return new_unique_words

In [7]:
def create_vocabulary(pos_lines, neg_lines):
    lines = pos_lines + neg_lines
    text = lines_to_text(lines, ' ')
    unique_words = Counter(text.split(' '))
    print('No. of unique words in dataset: ', len(unique_words.keys()))
    print()
    unique_words = retrieve_frequent_words(unique_words)
    print('New no. of unique words in dataset: ', len(unique_words))
    print()
    word_index = {i: unique_words.index(i) for i in unique_words}
    return word_index

In [8]:
def create_dataset(lines):
    new_lines = []
    for i in lines:
        s = preprocess_sentence(i)
        if s != 0:
            new_lines.append(s)
    return new_lines

In [9]:
def text_retrieve(file_names):
    text_files = []
    for i in file_names:
        with open(i, 'r') as f:
            text_files.append(f.read())
        f.close()
    return text_files

In [10]:
def find_files():
    path = '/Users/preethamganesh/Downloads/aclImdb/'
    pos_files_train = [os.path.join(path + 'train/pos/', f) for f in os.listdir(path + 'train/pos/') if
                       os.path.isfile(os.path.join(path + 'train/pos/', f))]
    pos_files_test = [os.path.join(path + 'test/pos/', f) for f in os.listdir(path + 'test/pos/') if
                      os.path.isfile(os.path.join(path + 'test/pos/', f))]
    pos_files = pos_files_train + pos_files_test
    neg_files_train = [os.path.join(path + 'train/neg/', f) for f in os.listdir(path + 'train/neg/') if
                       os.path.isfile(os.path.join(path + 'train/neg/', f))]
    neg_files_test = [os.path.join(path + 'test/neg/', f) for f in os.listdir(path + 'test/neg/') if
                      os.path.isfile(os.path.join(path + 'test/neg/', f))]
    neg_files = neg_files_train + neg_files_test
    print('No. of positive reviews: ', len(pos_files))
    print('No. of negative reviews: ', len(neg_files))
    print()
    return pos_files, neg_files

In [11]:
def find_probability(word, lines, vocabulary):
    if word.lower() in vocabulary:
        text = lines_to_text(lines, ' ')
        unique_words = Counter(text.split(' '))
        return unique_words[word]/len(text.split(' '))
    else:
        return 0

In [12]:
def remove_unk_words(lines, vocabulary):
    new_lines = []
    for i in lines:
        new_line = []
        for j in i.split(' '):
            if j in vocabulary:
                new_line.append(j)
            else:
                new_line.append('unk')
        line = ' '.join(new_line)
        new_lines.append(line)
    return new_lines

In [13]:
def tokenize(train, test):
    vectorizer = CountVectorizer()
    vectorizer.fit(train)
    train = vectorizer.transform(train)
    test = vectorizer.transform(test)
    train = train.toarray()
    test = test.toarray()
    return train, test, vectorizer

In [14]:
def calculate_probability(x, mean, stdev):
    exponent = math.exp(-((x - mean) ** 2 / (2 * stdev ** 2)))
    return (1 / (math.sqrt(2 * math.pi) * stdev)) * exponent

In [15]:
def class_separate(train_x, train_y):
    classes = np.unique(train_y)
    class_index = {}
    subdata = {}
    cls, counts = np.unique(train_y, return_counts=True)
    class_frequency = dict(zip(cls, counts))
    new_class_frequency = dict(zip(cls, counts))
    for i in classes:
        class_index[i] = np.argwhere(train_y == i)
        subdata[i] = train_x[class_index[i], :]
        subdata[i] = np.reshape(subdata[i], [subdata[i].shape[0], subdata[i].shape[2]])
        new_class_frequency[i] = class_frequency[i] / sum(list(class_frequency.values()))
    return subdata, new_class_frequency

In [16]:
def model_training(train_x, train_y):
    separated_dataset, class_frequency = class_separate(train_x, train_y)
    classes = np.unique(train_y)
    means, stdev = {}, {}
    for i in classes:
        means[i] = np.mean(separated_dataset[i], axis=0)
        stdev[i] = np.std(separated_dataset[i], axis=0)
    return means, stdev, class_frequency, classes

In [17]:
def predict_proba(x, means, stdev, class_frequency, classes):
    probability = {i: 1 for i in classes}
    for i in classes:
        for j in range(len(means)):
            probability[i] *= calculate_probability(x[j], means[i][j], stdev[i][j])
    probability = {i: probability[i] for i in probability}
    return probability

In [18]:
def predict(test_x, means, stdev, class_frequency, classes):
    pred = []
    for i in test_x:
        pred_class, max_prob = None, 0
        for class_index, prob in predict_proba(i, means, stdev, class_frequency, classes).items():
            if prob > max_prob:
                max_prob = prob
                pred_class = class_index
        pred.append(pred_class)
    return pred

In [19]:
def accuracy_score(true, pred):
    acc = 0
    for i, j in zip(true, pred):
        try:
            if int(i) == int(j):
                acc += 1
        except:
            continue
    return acc/len(true)

In [20]:
def cross_validation(data, n_cross_val):
    data_split = np.array_split((data), n_cross_val)
    return data_split

In [27]:
def naive_bayes(train, test):
    n_cross_val = 5
    train_index = cross_validation(train, n_cross_val)
    items = []
    accuracy = []
    for i in range(len(train_index)):
        val = train_index[i]
        new_train = train.drop(val.index)
        train_x, val_x, vectorizer = tokenize(new_train['sentences'], val['sentences'])
        train_y = np.array(new_train['sentiment'])
        val_y = np.array(val['sentiment'])
        means, stdev, class_frequency, classes = model_training(train_x, train_y)
        val_pred = predict(val_x, means, stdev, class_frequency, classes)
        val_acc = accuracy_score(val_y, val_pred)
        accuracy.append(val_acc)
        items.append([means, stdev, class_frequency, classes, vectorizer])
        print('Validation Accuracy for iteration ' + str(i) + ' = ' + str(round(val_acc, 3)))
    min_index = accuracy.index(min(accuracy))
    new_items = items[min_index]
    test_x = new_items[4].transform(test['sentences'])
    test_x = test_x.toarray()
    test_pred = predict(test_x, new_items[0], new_items[1], new_items[2], new_items[3])
    test_y = np.array(test['sentiment'])
    test_acc = accuracy_score(test_y, test_pred)
    print()
    print('Test accuracy = ', round(test_acc, 3))
    print()
    words = new_items[4].get_feature_names()
    pos_mean = means[1]
    neg_mean = means[0]
    pos_ind = pos_mean.argsort()
    neg_ind = neg_mean.argsort()
    pos_words = list(zip(*(sorted(zip(pos_mean, words)))))[1]
    neg_words = list(zip(*(sorted(zip(neg_mean, words)))))[1]
    print('Top 10 positive words: ', pos_words[:10])
    print('Top 10 negative words: ', neg_words[:10])

In [22]:
pos_files, neg_files = find_files()
pos_text = text_retrieve(pos_files)
neg_text = text_retrieve(neg_files)
pos_lines = create_dataset(pos_text)
print('New no. of positive reviews after data preprocessing: ', len(pos_lines))
neg_lines = create_dataset(neg_text)
print('New no. of negative reviews after data preprocessing: ', len(neg_lines))

No. of positive reviews:  25000
No. of negative reviews:  25000

New no. of positive reviews after data preprocessing:  25000
New no. of negative reviews after data preprocessing:  25000


In [23]:
vocabulary = create_vocabulary(pos_lines, neg_lines)
pos_lines = remove_unk_words(pos_lines, vocabulary)
print('Unknown words in positive sentences')
neg_lines = remove_unk_words(neg_lines, vocabulary)
print('Unknown words in negative sentences')

No. of unique words in dataset:  70842

New no. of unique words in dataset:  24437

Unknown words in positive sentences
Unknown words in negative sentences


In [24]:
train_pos, test_pos = train_test_split(pos_lines, test_size=0.1)
train_neg, test_neg = train_test_split(neg_lines, test_size=0.1)
print('No. of positive reviews in training set: ', len(train_pos))
print('No. of negative reviews in training set: ', len(train_neg))
print('No. of positive reviews in testing set: ', len(test_pos))
print('No. of negative reviews in testing set: ', len(test_neg))

No. of positive reviews in training set:  22500
No. of negative reviews in training set:  22500
No. of positive reviews in testing set:  2500
No. of negative reviews in testing set:  2500


In [25]:
train = {}
train['sentences'] = train_pos
train['sentiment'] = [1 for i in range(len(train_pos))]
train['sentences'] = train['sentences'] + train_neg
train['sentiment'] = train['sentiment'] + [0 for i in range(len(train_neg))]
test = {}
test['sentences'] = test_pos
test['sentiment'] = [1 for i in range(len(test_pos))]
test['sentences'] = test['sentences'] + test_neg
test['sentiment'] = test['sentiment'] + [0 for i in range(len(test_neg))]
train = pd.DataFrame(train, columns=['sentences', 'sentiment'])
train = shuffle(train)
test = pd.DataFrame(test, columns=['sentences', 'sentiment'])
test = shuffle(test)

In [28]:
naive_bayes(train, test)

Validation Accuracy for iteration 0 = 0.502
Validation Accuracy for iteration 1 = 0.504
Validation Accuracy for iteration 2 = 0.503
Validation Accuracy for iteration 3 = 0.502
Validation Accuracy for iteration 4 = 0.493

Test accuracy =  0.5

Top 10 positive words:  ('acacia', 'acharya', 'acromegali', 'aden', 'adjl', 'adonijah', 'advani', 'advisori', 'agey', 'agren')
Top 10 negative words:  ('abhay', 'abolitionist', 'acquiesc', 'acrobatti', 'aday', 'ade', 'adelin', 'adentro', 'adroit', 'adventist')


Notebook can be downloaded at https://github.com/preetham7897/website/blob/master/documents/ganesh_03.ipynb