In [1]:
# importing libraries for data modeling

import pandas as pd
import numpy as np
import collections
from nltk.tokenize.treebank import TreebankWordTokenizer
from common import utils, vocabulary
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

In [2]:
# loading model data created in data exploration

model_data = pd.read_csv("data/got_comments_top_10_sampled.csv")

train, test = train_test_split(model_data, test_size=0.1)

train_x = train['body']
train_y = train['author']
test_x = test['body']
test_y = test['author']

train_x.to_csv('data/train_x.csv', index = False, header = False)
train_y.to_csv('data/train_y.csv', index = False, header = False)
test_x.to_csv('data/test_x.csv', index = False, header = False)
test_y.to_csv('data/test_y.csv', index = False, header = False)

In [3]:
# tokenize data in comment body using Penn Treebank tokenizer

tokenizer = TreebankWordTokenizer()

train_x_tokens = []
test_x_tokens = []

for x in train_x:
    train_x_tokens.append(tokenizer.tokenize(x))
    
for x in test_x:
    test_x_tokens.append(tokenizer.tokenize(x))

In [4]:
# define vocabulary using w266 common vocab (skipping cannonicalization for now)

train_text_all = [item for sublist in train_x_tokens for item in sublist]

vocab = vocabulary.Vocabulary(train_text_all, size=None)
print("Vocabulary size: {:,}".format(vocab.size))
print("Vocabulary dict: ", vocab.word_to_id)

train_x_ids = []
test_x_ids = []

for x in train_x_tokens:
    train_x_ids.append(vocab.words_to_ids(x))
    
for x in test_x_tokens:
    test_x_ids.append(vocab.words_to_ids(x))

Vocabulary size: 9,709


In [5]:
# count token occurences and convert to feature vector

train_x_fdict = []
test_x_fdict = []

for x in train_x_ids:
    train_x_fdict.append(collections.Counter(x))
    
for x in test_x_ids:
    test_x_fdict.append(collections.Counter(x))

train_x_vector = []
test_x_vector = []

num_features = vocab.size

for x in train_x_fdict:
    train_x_vector.append([x.get(i, 0) for i in range(num_features)])
    
for x in test_x_fdict:
    test_x_vector.append([x.get(i, 0) for i in range(num_features)])

In [6]:
# use w266 common utils to convert id lists to sparse bow matrix

train_x_sparse_bow = utils.id_lists_to_sparse_bow(train_x_fdict, vocab.size)
test_x_sparse_bow = utils.id_lists_to_sparse_bow(test_x_fdict, vocab.size)

In [7]:
# training Multinomial Naive Bayes for simple baseline model

nb = MultinomialNB()
nb.fit(train_x_sparse_bow, train_y)
y_pred = nb.predict(test_x_sparse_bow)

acc = accuracy_score(test_y, y_pred)
print("Accuracy on test set: {:.02%}".format(acc))

Accuracy on test set: 39.00%
