In [556]:
from csv import writer
from typing import Literal

from nltk import word_tokenize, RegexpTokenizer, download

from nltk.util import pad_sequence
from nltk.util import bigrams
from nltk.util import ngrams
from nltk.util import everygrams

from nltk.lm import MLE
from nltk.lm.preprocessing import pad_both_ends
from nltk.lm.preprocessing import flatten
from nltk.lm.preprocessing import padded_everygram_pipeline

from nltk.tokenize.treebank import TreebankWordDetokenizer

from numpy import array, savetxt

import pandas as pd

# Type alias for a binary sentiment label: 0 = negative, 1 = positive.
# ``Literal[0, 1]`` is used because the expression ``0|1`` is a bitwise OR that
# evaluates to the plain integer 1, which carries no meaning as an annotation.
Sentiment = Literal[0, 1]

In [557]:
def insert_dict( x: str, d: dict, s: "Sentiment" ):
    """Increment the count of token ``x`` under sentiment class ``s``.

    Parameters
    ----------
    x : str
        Token whose counter is updated.
    d : dict
        Vocabulary mapping ``token -> {0: negative_count, 1: positive_count}``.
        It is modified in place.
    s : Sentiment
        Class key to increment (0 or 1). The annotation is quoted so it is
        resolved lazily rather than evaluated when the function is defined.

    Raises
    ------
    KeyError
        If ``s`` is not one of the class keys of the token's counter.
    """
    # A token that was never registered gets a zeroed counter on the fly
    # instead of failing with "'NoneType' object is not subscriptable".
    d.setdefault(x, {0: 0, 1: 0})[s] += 1

def init_dict( x: str, d: dict ):
    """Give token ``x`` a fresh, zeroed per-class counter inside ``d``.

    The entry is a new ``{0: 0, 1: 0}`` mapping on every call, so any counts
    previously stored under ``x`` are overwritten. ``d`` is modified in place.
    """
    d[x] = dict.fromkeys((0, 1), 0)

In [558]:
# Load the labelled training reviews; the file is read without a header row,
# so the two columns are named explicitly.
train_df = pd.read_csv("./files/train.csv", names=['text', 'sentiment'])

# Class priors: the share of training rows labelled 0 and labelled 1.
c_prior = train_df['sentiment'].value_counts()
n_labelled = c_prior[0] + c_prior[1]
p_prior = c_prior / n_labelled

# Lower-case each review and split it into runs of word characters.
tokenizer = RegexpTokenizer(r'\w+')
train_df['tok_text'] = train_df['text'].map(lambda review: tokenizer.tokenize(review.lower()))

In [559]:
# Swap the numeric class codes in the index for readable class names.
p_prior = p_prior.rename(index={0: 'negative', 1: 'positive'})

In [560]:
# Flatten the prior Series into a list of (class label, prior) tuples.
p_prior = list(p_prior.items())

In [561]:
def mk_train_vocab(df):
    """Build per-class token counts from a tokenised training frame.

    Parameters
    ----------
    df : DataFrame-like
        Must provide a ``'sentiment'`` column (rows equal to 0 are counted as
        negative, rows equal to 1 as positive) and a ``'tok_text'`` column
        holding an iterable of tokens per row.

    Returns
    -------
    tuple
        ``(vocab, wc_pos, wc_neg)`` where ``vocab`` maps every token seen in
        ``df`` to its ``{0: negative_count, 1: positive_count}`` counter, and
        ``wc_pos`` / ``wc_neg`` are the total number of tokens counted for the
        positive and negative class respectively.
    """
    neg_rows = df.loc[df['sentiment'] == 0, 'tok_text']
    pos_rows = df.loc[df['sentiment'] == 1, 'tok_text']

    vocab = {}
    # Register every token up front so each one owns a zeroed counter for both
    # classes, even when it only ever appears in one of them.
    for tokens in df['tok_text']:
        for token in tokens:
            init_dict(token, vocab)

    # Tally occurrences class by class, negative rows first.
    totals = {0: 0, 1: 0}
    for label, rows in ((0, neg_rows), (1, pos_rows)):
        for tokens in rows:
            for token in tokens:
                insert_dict(token, vocab, label)
                totals[label] += 1

    return vocab, totals[1], totals[0]
    
# Build the per-token class counts and the total token count of each class
# from the tokenised training frame.
vocab, wc_pos, wc_neg = mk_train_vocab(train_df)

In [562]:
from math import log

# Add-one smoothed likelihood of each word given the class:
# (count in class + 1) / (tokens in class + vocabulary size).
pos_denominator = wc_pos + len(vocab)
neg_denominator = wc_neg + len(vocab)

pos_array = []
neg_array = []
for word, counts in vocab.items():
    pos_array.append([word, 'positive', (counts[1] + 1) / pos_denominator])
    neg_array.append([word, 'negative', (counts[0] + 1) / neg_denominator])

maxp = ['', '', 0]
minp = ['', '', 1]

In [563]:
# Take natural logs of the priors and likelihoods; labels are carried through
# unchanged so each row keeps its [class, value] / [word, class, value] shape.
log_p_prior = [[label, log(prior)] for label, prior in p_prior]
logneg_array = [[word, label, log(prob)] for word, label, prob in neg_array]
logpos_array = [[word, label, log(prob)] for word, label, prob in pos_array]

In [564]:
from os import remove, path  # no longer needed here; kept in case other cells use these names

filename = 'model.csv'

# Model file layout:
#   PP                      <- header of the log-prior section
#   <class>,<log prior>     (one row per class)
#   <blank line>
#   LP                      <- header of the log-likelihood section
#   <word>,<class>,<log likelihood>   (all negative rows, then all positive rows)
#
# Mode 'w' truncates a model left over from an earlier run, which replaces the
# previous exists()/remove()/append sequence. newline='' is what the csv module
# asks for: without it, platforms that use \r\n line endings add an extra '\r'
# to every row the writer emits.
with open(filename, 'w', newline='') as f:
    wr = writer(f, delimiter=',')
    f.write('PP\n')
    wr.writerows(log_p_prior)
    f.write('\nLP\n')
    wr.writerows(logneg_array)
    wr.writerows(logpos_array)

In [565]:
# Read the two sections of the model file back in: the first two data rows
# under the 'PP' header are the priors, and everything from the 'LP' header
# (after skipping four lines) is the likelihood table.
# Data rows carry more fields than their one-field header line, so pandas uses
# the leading fields as the index: the class label for the priors and
# (word, class label) for the likelihoods.
# keep_default_na=False stops pandas from treating tokens such as 'nan' or
# 'null' as missing values, which would otherwise turn those vocabulary words
# into NaN and make them impossible to look up.
p_df = pd.read_csv(filename, skiprows=0, nrows=2, keep_default_na=False)
c_df = pd.read_csv(filename, skiprows=4, keep_default_na=False)
p_df = p_df.rename(index={0: 'negative', 1: 'positive'}, columns={'PP': 'probability'})
c_df = c_df.rename(columns={'LP': 'probability'})

In [568]:
def cl_probability( vocab: dict, cl: str, p_model: pd.DataFrame, c_model: pd.DataFrame ):
    """Score a bag of words against a single class.

    Parameters
    ----------
    vocab : dict
        Maps each word to the number of times it occurs in the text.
    cl : str
        Class label used to index both model tables.
    p_model : pd.DataFrame
        Indexed by class label, with a ``'probability'`` column.
    c_model : pd.DataFrame
        Indexed by ``(word, class label)``, with a ``'probability'`` column.

    Returns
    -------
    The ``p_model`` entry for ``cl`` plus, for every word in ``vocab``, that
    word's ``c_model`` entry for ``cl`` multiplied by its count.
    """
    score = p_model.loc[cl, 'probability']
    for word, count in vocab.items():
        score = score + c_model.loc[(word, cl), 'probability'] * count
    return score

In [569]:
def classify( text: str, p_model: pd.DataFrame, c_model: pd.DataFrame ):
    """Predict the sentiment class of ``text``.

    The text is lower-cased and split into runs of word characters. Words that
    are absent from the model are ignored; the remaining words are counted and
    scored against both classes with ``cl_probability``.

    Parameters
    ----------
    text : str
        Raw text to classify.
    p_model : pd.DataFrame
        Prior table, passed through to ``cl_probability``.
    c_model : pd.DataFrame
        Likelihood table whose index has the word as its first level.

    Returns
    -------
    int
        1 when the positive score is strictly greater than the negative score,
        otherwise 0 (ties therefore resolve to 0).
    """
    tokenizer = RegexpTokenizer(r'\w+')
    t_arr = tokenizer.tokenize(text.lower())

    # Take the known words from the model that was passed in rather than from
    # the notebook-level c_df, and hold them in a set so each token costs one
    # hash lookup instead of a scan over the whole word column.
    known_words = set(c_model.index.get_level_values(0))

    vocab = {}
    for word in t_arr:
        if word in known_words:
            vocab[word] = vocab.get(word, 0) + 1

    p_neg = cl_probability( vocab, 'negative', p_model, c_model)
    p_pos = cl_probability( vocab, 'positive', p_model, c_model)
    return 1 if p_neg < p_pos else 0

In [570]:
# Load the held-out reviews (no header row, so the columns are named here)
# and classify each one with the model tables read back from disk.
test_df = pd.read_csv("./files/test.csv", names=['text', 'sentiment'])

predict = [classify( review, p_df, c_df ) for review in test_df['text']]

test_df['prediction'] = predict

In [571]:
# Replace the numeric codes with readable labels: 1 becomes 'positive' and
# every other value becomes 'negative'.
for column in ('sentiment', 'prediction'):
    test_df[column] = test_df[column].map(lambda code: 'positive' if code == 1 else 'negative')

In [572]:
# Rename the ground-truth column so it reads naturally next to 'prediction'.
test_df = test_df.rename(columns={'sentiment': 'actual'})
# Persist the labelled predictions; the frame's index is written as the first column.
test_df.to_csv('test_predictions.csv')
# Display the resulting frame as the cell output.
test_df

Unnamed: 0,text,actual,prediction
0,Server did a great job handling our large rowd...,positive,positive
1,Would come back again if I had a sushi craving...,positive,negative
2,He deserves 5 stars.,positive,positive
3,My boyfriend and I came here for the first tim...,positive,positive
4,They have great dinners.,positive,positive
5,Not my thing.,negative,negative
6,If you are reading this please don't go there.,negative,negative
7,Tonight I had the Elk Filet special...and it s...,negative,negative
8,We ordered some old classics and some new dish...,negative,positive
9,A FLY was in my apple juice.. A FLY!!!!!!!!,negative,negative
