In [343]:
from nltk import word_tokenize, RegexpTokenizer, download

from nltk.util import pad_sequence
from nltk.util import bigrams
from nltk.util import ngrams
from nltk.util import everygrams

from nltk.lm import MLE
from nltk.lm.preprocessing import pad_both_ends
from nltk.lm.preprocessing import flatten
from nltk.lm.preprocessing import padded_everygram_pipeline

from nltk.tokenize.treebank import TreebankWordDetokenizer

from numpy import array, savetxt

from csv import writer

import pandas as pd

Sentiment = 0|1

In [344]:
def insert_dict( x: str, d: dict, s: Sentiment ):
    d[x][s] = d.get(x)[s]+1

def init_dict( x: str, d: dict ):
    d[x] = {0: 0, 1: 0}

In [345]:
train_df = pd.read_csv("./files/yelp_train.csv", names=['text', 'sentiment'])
c_prior = train_df['sentiment'].value_counts()
p_prior = c_prior/(c_prior[0] + c_prior[1])

tokenizer = RegexpTokenizer(r'\w+')
train_df['tok_text'] = train_df.apply(lambda row: tokenizer.tokenize(row['text'].lower()), axis=1)

In [346]:
p_prior = p_prior.rename({0: 'negative', 1: 'positive'})

In [347]:
p_prior = list(zip(p_prior.index, p_prior))

In [351]:
def mk_train_vocab(df):
    wc_pos = 0
    wc_neg = 0
    df_neg = df[df['sentiment'] == 0]
    df_pos = df[df['sentiment'] == 1]
    vocab = {}
    for p in df['tok_text']:
        for q in p:
            init_dict(q, vocab)
    for p in df_neg['tok_text']:
        for q in p:
            insert_dict(q, vocab, 0)
            wc_neg += 1
    for p in df_pos['tok_text']:
        for q in p:
            insert_dict(q, vocab, 1)
            wc_pos += 1
    return vocab, wc_pos, wc_neg
    
vocab, wc_pos, wc_neg = mk_train_vocab(train_df)

In [352]:
from math import log

pos_array = []
neg_array = []
for word in vocab:
    pos_array.append([word, 'positive', (vocab.get(word)[0]+1)/(wc_pos+len(vocab))])
    neg_array.append([word, 'negative', (vocab.get(word)[1]+1)/(wc_neg+len(vocab))])
maxp = ['', '', 0]
minp = ['', '', 1]

In [353]:
for e in neg_array:
    if e[2] > maxp[2]:
        maxp = e
    if e[2] < minp[2]:
        minp = e
print(maxp)
print(minp)

['the', 'negative', 0.026595744680851064]
['crust', 'negative', 0.0026595744680851063]


In [363]:
p_prior

[('negative', 0.5), ('positive', 0.5)]

In [366]:
log_p_prior = list(map(lambda x: [x[0], log(x[1])], p_prior))
logneg_array = list(map(lambda x: [x[0], x[1], log(x[2])],neg_array))
logpos_array = list(map(lambda x: [x[0], x[1], log(x[2])],pos_array))

In [244]:
maxp = ['', '', 100]
minp = ['', '', -100]
for e in logneg_array:
    if e[2] < maxp[2]:
        maxp = e
    if e[2] > minp[2]:
        minp = e
print(maxp)
print(minp)

['crust', 'negative', -6.263398262591624]
['the', 'negative', -3.960813169597578]


In [367]:
from os import remove, path

filename = 'c_model.csv'

if path.exists(filename):
    remove(filename)

with open(filename, 'a') as f:
    wr = writer(f, delimiter=',')
    f.write('PP\n')
    wr.writerows(log_p_prior)
    f.write('\nLP\n')
    wr.writerows(logneg_array)
    wr.writerows(logpos_array)

In [368]:
p_df = pd.read_csv(filename, skiprows=0, nrows=2)
c_df = pd.read_csv(filename, skiprows=4)
p_df = p_df.rename(index={0: 'negative', 1: 'positive'}, columns={'PP': 'probability'})
c_df = c_df.rename(columns={'LP': 'probability'})

In [322]:
c_df.loc[('wow', 'negative'), 'LP']

-5.570251082031678

In [374]:
p_df.loc['negative', 'probability']

-0.6931471805599453

In [376]:
test_df = pd.read_csv("./files/yelp_test.csv", names=['text', 'sentiment'])
test_df

Unnamed: 0,text,sentiment
0,Server did a great job handling our large rowd...,1
1,Would come back again if I had a sushi craving...,1
2,He deserves 5 stars.,1
3,My boyfriend and I came here for the first tim...,1
4,They have great dinners.,1
5,Not my thing.,0
6,If you are reading this please don't go there.,0
7,Tonight I had the Elk Filet special...and it s...,0
8,We ordered some old classics and some new dish...,0
9,A FLY was in my apple juice.. A FLY!!!!!!!!,0


In [377]:
def cl_probability( vocab: dict, cl: str, p_model: pd.DataFrame, c_model: pd.DataFrame ):
    prob = [p_model.loc[cl, 'probability']]
    for word in vocab:
        prob.append(c_model.loc[word, cl, 'probability'])
    return sum(prob)

In [378]:
def classify( text: str, p_model: pd.DataFrame, c_model: pd.DataFrame ):
    tokenizer = RegexpTokenizer(r'\w+')
    t_arr = tokenizer.tokenize(text.lower())
    vocab = {}
    for word in t_arr:
        if word in vocab.keys():
            vocab[word] = vocab.get(word) + 1
        else:
            vocab[word] = 1

    p_neg = cl_probability( vocab, 'negative', p_model, c_model)
    p_pos = cl_probability( vocab, 'positive', p_model, c_model)
    return 'positive' if p_neg < p_pos else 'negative'


In [381]:
classify(test_df.loc[0, 'text'], p_df, c_df)

IndexingError: Too many indexers