In [355]:
import json
import pandas as pd
import numpy as np
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier as dt
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix
from sklearn import naive_bayes

In [105]:
#Load in the Hu & Liu (2004) word dictionary
negative_words = open('negative-words.txt', 'r').read()
negative_words = negative_words.split('\n')
positive_words = open('positive-words.txt', 'r').read()
positive_words = positive_words.split('\n')
word_list = negative_words + positive_words
word_list = list(set(word_list))

In [5]:
reviews_data = json.load(open("cleaned_reviews_subset.json"))

In [35]:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print "Topic %d:" % (topic_idx)
        print " ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]])

In [360]:
ohio_reviews = []
ohio_stars = []
for review in reviews_data['OH']:
    ohio_reviews.append(review['text'])
    ohio_stars.append(review['stars'])

In [105]:
#Load in the Hu & Liu (2004) word dictionary
negative_words = open('negative-words.txt', 'r').read()
negative_words = negative_words.split('\n')
positive_words = open('positive-words.txt', 'r').read()
positive_words = positive_words.split('\n')
word_list = negative_words + positive_words
word_list = list(set(word_list))

In [114]:
vectorizer.get_feature_names()[1000:1005]

[u'artsy', u'arturo', u'artwork', u'arugula', u'ary']

In [361]:
#View the top words in the LDA representation
no_top_words = 10
tf_feature_names = vectorizer.get_feature_names()
display_topics(lda_fit, tf_feature_names, no_top_words)

Topic 0:
fun warm cheap unfortunately weird awful ridiculously favor cute shake
Topic 1:
good like pretty better bad nice work great wrong bland
Topic 2:
cold horrible disgusting hype crap correct stuck regret overwhelming lacked
Topic 3:
modern shit blind devoid snobby uneven detract fuck ghetto vomit
Topic 4:
promises refuse amenable patronize burns hurts safely lure perplexed danger
Topic 5:
bomb wasted joy celebration pretend disrespectful bullshit hilarious peaceful strongest
Topic 6:
consistently comfortable cons expensive picky boring pros falls annoying happily
Topic 7:
pricey cheaper creative refreshing noisy worry greatest neat romantic charming
Topic 8:
inflated novelty grisly finer suffice scant qualms accuse abomination unnatural
Topic 9:
reasonably pokey quiet lover bizarre steady dissatisfied thinner smelling complements
Topic 10:
best hot love sweet good delicious like fresh fantastic fast
Topic 11:
prosperity divine innovative modest weirdly lying struggling fiery kill

In [382]:
#Train a Random Forest Classifier on the LDA input matrix
vectorizer = CountVectorizer(stop_words='english', vocabulary = word_list)
tf = vectorizer.fit_transform(ohio_reviews)
lda = LatentDirichletAllocation(n_topics=50).fit_transform(tf)
lda_fit = LatentDirichletAllocation(n_topics=50).fit(tf)
#Train
X_train, X_test, y_train, y_test = train_test_split(lda, ohio_stars, test_size=0.2)
#Create a decision tree classifier object
lda_classifier = svm.SVC(kernel='linear')
#kernel='sigmoid'
#Train the Decision Forest Classifier
lda_classifier.fit(X_train, y_train)
#Predict on the test set
test_prediction = lda_classifier.predict(X_test)



In [383]:
#Print the Accuracy
print accuracy_score(y_test, test_prediction)
#Print the Precision
print precision_score(y_test, test_prediction, average='weighted')
#Print the confusion matrix
print confusion_matrix(y_test, test_prediction)

0.473526473526
0.444969595819
[[ 97   1   0  19   6]
 [ 53   3   0  39  14]
 [ 36   1   0  79  19]
 [ 21   0   0 181  83]
 [ 20   0   0 136 193]]


In [377]:
#Train a linear support vector machine on the tf-idf input matrix using the H&L dictionary
tf_vectorizer = TfidfVectorizer(vocabulary = word_list)
tf_features = tf_vectorizer.fit_transform(ohio_reviews)
X_train, X_test, y_train, y_test = train_test_split(tf_features, ohio_stars, test_size=0.2)
#Create a decision tree classifier object
tf_classifier = svm.SVC(kernel='linear')
#Train the Decision Forest Classifier
tf_classifier.fit(X_train, y_train)
#Predict on the test set
test_prediction = tf_classifier.predict(X_test)

In [378]:
#Print the Accuracy
print accuracy_score(y_test, test_prediction)
#Print the Precision
print precision_score(y_test, test_prediction, average='weighted')
#Print the confusion matrix
print confusion_matrix(y_test, test_prediction)

0.508491508492
0.489525644744
[[ 80  20   4  12   5]
 [ 27  27  17  26   6]
 [ 19  15  27  61  26]
 [ 10   2  20 122 128]
 [  8   2   2  82 253]]
