In [125]:
import numpy as np
import re
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.cm as cmx
import matplotlib.colors as colors
import pandas as pd
from sklearn.linear_model import LogisticRegression as LogReg
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer as Tfidf
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.tree import DecisionTreeClassifier as dt
from sklearn.decomposition import PCA
from sklearn.cross_validation import KFold, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split as sk_split
from sklearn.decomposition import TruncatedSVD as SVD
from sklearn import preprocessing
from sklearn import svm
from sklearn.svm import LinearSVC

import nltk
from nltk.stem.porter import PorterStemmer

%matplotlib inline

In [116]:
data = pd.read_json('json_2015.json')
data = data.sample(frac=1)

In [117]:
data.head()

Unnamed: 0,business_id,cool,date,funny,stars,text,useful,user_id
382418,lPONCVSGRrobKoZtZQh0Gw,2,2015-07-31,1,5,I've been vaping for a while now and have trie...,0,UkASldEgm9snvR0LVfZm4w
445629,iUPJmJvHy9fVfRxsuwwdLQ,1,2015-08-31,1,4,Outstanding set up they have here. Food was br...,1,0QZ1jroPjmJbHtKeX-jk6g
436085,RIPnl1BAUaY2rSW8cPuQWQ,0,2015-08-26,1,4,I get Star Wok when I am at work as I can't le...,0,f8PvRnAj7KYb8s-aPjRhtA
366946,8DKEWD2DaU6XlMNRN4QNlA,0,2015-07-24,0,5,Hek Yeah's brisket is where its at!\n\n It's m...,1,WOFZoOQrzYSLxJ6td6cecg
195840,d_QoB6QfsGVqSzB3cGO_tA,0,2015-04-25,0,3,This place is pretty good when you want a pizz...,0,o19jYvtqRsI9LSO0we8ROg


We'll start by drawing a 50,000 observation sample for the dataset to make parameter tuning reasonable – the full dataset can take a very long time to run.

I've also decided to take only the top 10,000 words in the vocabulary across reviews. Messing with this number seemed to make relatively little difference in accuracy (slight improvement over no limit).

In [118]:
#standardize predictor variables

def scorer_pos(estimator, X, y): #custom scoring functions to get positive and negative accuracy
    y_pred = estimator.predict(X)
    return np.mean(y_pred[y == 1] == y[y == 1])

def scorer_neg(estimator, X, y):
    estimator.fit(X, y)
    y_pred = estimator.predict(X)
    return np.mean(y_pred[y == 0] == y[y == 0])

In [150]:
def test_model(x, binary_y, model, title, coefs=False):
    model.fit(x, binary_y)
    y_pred = model.predict(x)
    binary_y = np.array(binary_y)

    tn, fp, fn, tp = confusion_matrix(binary_y, y_pred).ravel() #from sklearn docs

    print title
    print confusion_matrix(binary_y, y_pred)

    print "False Positive Rate:", fp/float(fp+tn)
    print "False Negative Rate:", fn/float(fn+tp)
    print "True Positive Rate:", tp/float(tp+fp)
    print "True Negative Rate:", tn/float(tn+fn)
    
    print "Positive Accuracy:", cross_val_score(model, x, binary_y, n_jobs=-1, scoring=scorer_pos).mean()
    print "Negative Accuracy:", cross_val_score(model, x, binary_y, n_jobs=-1, scoring=scorer_neg).mean()
    print "Cross Validated Accuracy on Sample:", cross_val_score(model, x, binary_y, n_jobs=-1).mean()
    print "Train Set Accuracy on Sample:", np.mean(y_pred == binary_y)
    print
    if coefs == True:
        mydict = zip(model.coef_[0], vectorizer.get_feature_names())
        words = sorted([(i[0], i[1].encode('utf-8')) for i in mydict], reverse=True, key=lambda x: x[0])
        return words

In [5]:
stemmer = PorterStemmer()
def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

def tokenize(text):
    # remove non letters
    text = re.sub("[^a-zA-Z]", " ", text)
    # tokenize
    tokens = nltk.word_tokenize(text)
    # stem
    stems = stem_tokens(tokens, stemmer)
    return stems

In [142]:
samp = data.sample(50000) #draw samples
samp.shape
frac_useful = 1 - data['useful'].value_counts()[0] / float(data.shape[0])
frac_funny = 1 - data['funny'].value_counts()[0] / float(data.shape[0])
frac_cool = 1 - data['cool'].value_counts()[0] / float(data.shape[0])

In [143]:
vectorizer = CountVectorizer(stop_words='english', 
                             binary=False, 
                             max_features=10000,
                             analyzer='word',
                             tokenizer=tokenize
                            ) #using non-binary Count Vec.
reviews = samp.text.values

#tokenize words
x = vectorizer.fit_transform(reviews)

y_useful = [] #class observations according to whether they have at least one "helpful" vote
y_cool = []
y_funny = []


for score in samp.useful.values:
    i = 1 if score > 0 else 0
    y_useful.append(i)

for score in samp.cool.values:
    i = 1 if score > 0 else 0
    y_cool.append(i)

for score in samp.funny.values:
    i = 1 if score > 0 else 0
    y_funny.append(i)

In [151]:
u_tune = 0.03
f_tune = 0.03
c_tune = 0.03

weights = {0:1/(1 - frac_useful + u_tune), 1:1/(frac_useful - u_tune)}
log_model = LogReg(C=0.01, penalty='l2', class_weight=weights) #initialize logistic regression model

rand = RandomForestClassifier(n_estimators=500, criterion='gini', 
                              max_features= 17, max_depth=15, class_weight='balanced_subsample')


useful_words = test_model(x, y_useful, log_model, "Useful")
funny_words = test_model(x, y_funny, 
                         log_model.set_params(class_weight={0:1/(1 - frac_funny + f_tune), 1:1/(frac_funny - f_tune)}), "Funny")
cool_words = test_model(x, y_cool, 
                        log_model.set_params(class_weight={0:1/(1 - frac_cool + c_tune), 1:1/(frac_cool - c_tune)}), "Cool")

Useful
[[22140  9002]
 [ 6655 12203]]
False Positive Rate: 0.289063001734
False Negative Rate: 0.352900625729
True Positive Rate: 0.575477481726
True Negative Rate: 0.768883486716
Positive Accuracy: 0.56808781419
Negative Accuracy: 0.73412118392
Cross Validated Accuracy on Sample: 0.629859952903
Train Set Accuracy on Sample: 0.68686

Funny
[[28419 12858]
 [ 2193  6530]]
False Positive Rate: 0.311505196599
False Negative Rate: 0.251404333372
True Positive Rate: 0.336806271921
True Negative Rate: 0.928361426891
Positive Accuracy: 0.58936148685
Negative Accuracy: 0.720861496717
Cross Validated Accuracy on Sample: 0.654100021324
Train Set Accuracy on Sample: 0.69898

Cool
[[26204 12752]
 [ 3150  7894]]
False Positive Rate: 0.327343669781
False Negative Rate: 0.285222745382
True Positive Rate: 0.382350092028
True Negative Rate: 0.892689241671
Positive Accuracy: 0.582217048629
Negative Accuracy: 0.702767218898
Cross Validated Accuracy on Sample: 0.627839954011
Train Set Accuracy on Sample: 0

In [None]:
print "10 Coolest Words:"
for word in cool_words[:10]:
    print word[1], word[0]

print "\n10 Least Cool Words:"
for word in cool_words[-10:]:
    print word[1], word[0]
    
print "\n10 Funniest Words:"
for word in funny_words[:10]:
    print word[1], word[0]

print "\n10 Least Funny Words:"
for word in funny_words[-10:]:
    print word[1], word[0]
    
print "\n10 Most Useful Words:"
for word in useful_words[:10]:
    print word[1], word[0]

print "\n10 Least Useful Words:"
for word in useful_words[-10:]:
    print word[1], word[0]

In [152]:
u_tune = 0.0
f_tune = 0.0
c_tune = 0.0

weights = {0:1/(1 - frac_useful + u_tune), 1:1/(frac_useful - u_tune)}
svm = LinearSVC(C=0.001, penalty='l2', class_weight='balanced') #initialize logistic regression model

rand = RandomForestClassifier(n_estimators=500, criterion='gini', 
                              max_features= 17, max_depth=15, class_weight='balanced_subsample')


useful_words = test_model(x, y_useful, svm, "Useful")
funny_words = test_model(x, y_funny, 
                         svm, "Funny")
cool_words = test_model(x, y_cool, 
                        svm, "Cool")

Useful
[[24406  6736]
 [ 8607 10251]]
False Positive Rate: 0.21629953118
False Negative Rate: 0.456411072224
True Positive Rate: 0.603461470536
True Negative Rate: 0.739284524278
Positive Accuracy: 0.487061194188
Negative Accuracy: 0.794875048699
Cross Validated Accuracy on Sample: 0.652779944521
Train Set Accuracy on Sample: 0.69314

Funny
[[32815  8462]
 [ 3510  5213]]
False Positive Rate: 0.205005208712
False Negative Rate: 0.402384500745
True Positive Rate: 0.381206581353
True Negative Rate: 0.903372333104
Positive Accuracy: 0.476441827163
Negative Accuracy: 0.810863192577
Cross Validated Accuracy on Sample: 0.726599962181
Train Set Accuracy on Sample: 0.76056

Cool
[[30468  8488]
 [ 4761  6283]]
False Positive Rate: 0.217886846699
False Negative Rate: 0.431093806592
True Positive Rate: 0.42536050369
True Negative Rate: 0.86485565869
Positive Accuracy: 0.474556500113
Negative Accuracy: 0.797001782851
Cross Validated Accuracy on Sample: 0.695439996626
Train Set Accuracy on Sample: 0