In [167]:
import pandas as pd
import numpy as np
import tqdm
import json
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split as tts
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score

In [23]:
#Display the top features in each LDA category
def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted feature names of every topic in ``model``.

    Parameters
    ----------
    model : object exposing ``components_``
        Each row of ``components_`` is treated as one topic and each column
        as the weight of one feature.
    feature_names : sequence of str
        Feature names, indexed by column position in ``components_``.
    no_top_words : int
        Number of feature names to print per topic.
    """
    for topic_idx, topic in enumerate(model.components_):
        # argsort is ascending, so step backwards from the end to get the
        # largest weights first.
        top_indices = topic.argsort()[:-no_top_words - 1:-1]
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("Topic %d:" % topic_idx)
        print(" ".join([feature_names[i] for i in top_indices]))

In [11]:
#Read in the H&L dictionaries
def _read_word_list(path):
    """Return the non-blank, non-comment entries of the word list at ``path``."""
    # The context manager closes the handle instead of leaking it.
    with open(path, 'r') as word_file:
        raw_text = word_file.read()
    entries = []
    for raw_line in raw_text.split('\n'):
        word = raw_line.strip()
        # Skip blank entries (e.g. from a trailing newline) so they do not
        # end up as vocabulary terms.
        # NOTE(review): assumes ';' marks header/comment lines in these
        # files -- confirm against the actual inputs.
        if word and not word.startswith(';'):
            entries.append(word)
    return entries

negative_words = _read_word_list('../input/negative-words.txt')
positive_words = _read_word_list('../input/positive-words.txt')
# De-duplicate words that occur in both lists, and sort so the ordering
# (and any column indices derived from it) is identical on every run.
total_words = sorted(set(negative_words + positive_words))

In [164]:
#Read in the subsetted JSON data and then create a dataframe 
#with the reviews and stars as separate columns
# Parse inside a context manager so the file handle is closed right away.
with open('../input/cleaned_reviews_subset.json') as review_file:
    review_data = json.load(review_file)
reviews = []
stars = []
# The top-level keys only group the reviews; all groups are flattened into
# two parallel lists.
for state_reviews in review_data.values():
    for review in state_reviews:
        reviews.append(review['text'])
        # Ratings of 4 and above keep their value; everything lower is
        # collapsed into the single label 1.
        stars.append(review['stars'] if review['stars'] >= 4 else 1)
cleaned_data = pd.DataFrame({'review_text': reviews, 'review_stars': stars})

In [126]:
#Learn a 100-topic LDA representation using the full corpus
# Count only the words listed in total_words (the positive/negative lists).
vectorizer = CountVectorizer(stop_words='english', vocabulary=total_words)
tf = vectorizer.fit_transform(cleaned_data['review_text'])
# Seed the fit so the learned topics are the same on every run.
# NOTE(review): scikit-learn 0.19 renamed `n_topics` to `n_components` (the
# old keyword was removed in 0.21) -- switch the keyword when upgrading.
lda_fit = LDA(n_topics=100, learning_offset=1.5, random_state=0).fit(tf)



In [127]:
# Show the ten highest-weighted vocabulary words for each learned topic.
tf_feature_names = vectorizer.get_feature_names()
no_top_words = 10
display_topics(model=lda_fit,
               feature_names=tf_feature_names,
               no_top_words=no_top_words)

Topic 0:
nice happy tender smile disappoint impeccable thrilled easier appropriate generously
Topic 1:
cold fine like poor disgusting stuck bad hype noise overwhelming
Topic 2:
hot fresh like delicious ready liked seasoned die sour incredible
Topic 3:
friendly favorite super clean great fresh fantastic beautiful greasy good
Topic 4:
anxiously well well-established praise imaginative misaligns smallish cleanly expropriation dissolute
Topic 5:
horrible awful like strong smell sadly smelled ruined complained sorry
Topic 6:
great concede revere negation inadequate treacherously hectic cranky curt profound
Topic 7:
juddering perturb stall muddle well-positioned improving conflicted assault avidly droops
Topic 8:
odd correct lacked impressive smoke annoyed compliment hassle complementary ideal
Topic 9:
world-famous inappropriately cuss dissident condemn shortage scornful eccentricity villainous aborted
Topic 10:
lacking bumped suffers disbelief master originality good pretty favorite hard
To

In [140]:
# Turn every review into its LDA representation: vectorize the raw text
# into counts, then pass those counts through the fitted LDA model.
lda_features = lda_fit.transform(
    vectorizer.transform(cleaned_data['review_text'])
)

In [145]:
#Do a baseline test using linear SVM, print classification report
#Hold out 20% of the reviews for testing; seeded so the split is repeatable
X_train, X_test, y_train, y_test = tts(
    lda_features, cleaned_data['review_stars'], test_size=0.2, random_state=0)
#Create a linear SVM classifier object
svm_clf = svm.LinearSVC(random_state=0)
#Train the linear SVM on the LDA features
svm_clf.fit(X_train, y_train)
#Predict on the test set
test_prediction = svm_clf.predict(X_test)
#Print a classification report and the confusion matrix
#(single-argument print(...) works on both Python 2 and 3)
print(classification_report(y_test, test_prediction))
print(confusion_matrix(y_test, test_prediction))

             precision    recall  f1-score   support

          1       0.67      0.76      0.71       378
          4       0.46      0.23      0.31       282
          5       0.59      0.73      0.65       341

avg / total       0.58      0.60      0.58      1001

[[289  43  46]
 [ 86  66 130]
 [ 59  33 249]]


In [147]:
#Build tf-idf features over word trigrams of the raw review text
#(no vocabulary is passed, so the H&L dictionary is not used here)
tf_vectorizer = TfidfVectorizer(ngram_range=(3, 3))
tf_features = tf_vectorizer.fit_transform(cleaned_data['review_text'])
#Hold out 20% for testing. The splitter is imported as `tts`; the name
#`train_test_split` is never bound in this notebook and raised a NameError.
X_train, X_test, y_train, y_test = tts(
    tf_features, cleaned_data['review_stars'], test_size=0.2, random_state=0)
#Create a linear SVM classifier object
tf_classifier = svm.LinearSVC(random_state=0)
#Train the linear SVM on the tf-idf features
tf_classifier.fit(X_train, y_train)
#Predict on the test set
test_prediction = tf_classifier.predict(X_test)
#Print a classification report and the confusion matrix
print(classification_report(y_test, test_prediction))
print(confusion_matrix(y_test, test_prediction))

             precision    recall  f1-score   support

          1       0.60      0.92      0.72       375
          4       0.46      0.12      0.19       279
          5       0.64      0.65      0.64       347

avg / total       0.57      0.60      0.55      1001

[[344  11  20]
 [137  33 109]
 [ 93  28 226]]


In [165]:
#Create a combined feature matrix from the LDA topics and the tf-idf trigrams
tf_idf_mod = TfidfVectorizer(ngram_range=(3, 3))
# NOTE(review): 200 topics here, while the stand-alone LDA fit earlier in the
# notebook uses 100 -- confirm which is intended.
# NOTE(review): scikit-learn 0.19 renamed `n_topics` to `n_components` (the
# old keyword was removed in 0.21) -- switch the keyword when upgrading.
lda_mod = LDA(n_topics=200, learning_offset=1.5, random_state=0)
combined_features = FeatureUnion([
    # Branch 1: word counts restricted to total_words, passed through LDA.
    ('lda', Pipeline([
        ('counts', CountVectorizer(stop_words='english', vocabulary=total_words)),
        # Named for what the step holds (the LDA model), not 'tf_idf'.
        ('topics', lda_mod)
    ])),
    # Branch 2: trigram tf-idf weights computed from the raw text.
    ('tf_idf', tf_idf_mod)])
# fit_transform fits and transforms in one pass instead of running the
# corpus through every branch twice.
comb_features = combined_features.fit_transform(cleaned_data['review_text'])



In [None]:
#TODO: export a pickled version of the fitted classifier (this cell has no code yet)


In [166]:
#We can do better than simply looking at the tf-idf or LDA matrices on their own
#Hold out 20% for testing. The splitter is imported as `tts`; the name
#`train_test_split` is never bound in this notebook and raised a NameError.
X_train, X_test, y_train, y_test = tts(
    comb_features, cleaned_data['review_stars'], test_size=0.2, random_state=0)
#Create a linear SVM classifier object
comb_clf = svm.LinearSVC(random_state=0)
#Train the Classifier
comb_clf.fit(X_train, y_train)
#Predict on the test set
test_prediction = comb_clf.predict(X_test)
#Print a classification report and the confusion matrix
print(classification_report(y_test, test_prediction))
print(confusion_matrix(y_test, test_prediction))

             precision    recall  f1-score   support

          1       0.77      0.88      0.82      2506
          4       0.73      0.47      0.57      2027
          5       0.72      0.82      0.77      2469

avg / total       0.74      0.74      0.73      7002

[[2204  150  152]
 [ 417  956  654]
 [ 226  210 2033]]


In [168]:
#Print the cross validated scores
# NOTE(review): comb_features was fitted on the whole corpus before these
# folds are formed, so each fold's held-out reviews already influenced the
# LDA/tf-idf features -- the scores are likely optimistic.
# NOTE(review): a recorded run of this cell showed the first three folds at
# exactly 1.0; that looks like duplicated or ordered data leaking across
# folds -- confirm before trusting the mean.
scores = cross_val_score(svm.LinearSVC(), comb_features, cleaned_data['review_stars'], cv=12)
# Single-argument print(...) works on both Python 2 and 3.
print(scores)

[ 1.          1.          1.          0.81727803  0.65752485  0.64929722
  0.62495715  0.62461433  0.65923894  0.66815221  0.66918067  0.68015084]


In [169]:
# Summarise the cross-validation folds: mean score and twice the standard
# deviation across folds.
accuracy_summary = (scores.mean(), scores.std() * 2)
print("Accuracy: %0.2f (+/- %0.2f)" % accuracy_summary)

Accuracy: 0.75 (+/- 0.30)
