In [105]:
import numpy as np
import re
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.cm as cmx
import matplotlib.colors as colors
import pandas as pd
from sklearn.linear_model import LogisticRegression as LogReg
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer as Tfidf
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.tree import DecisionTreeClassifier as dt
from sklearn.decomposition import PCA
from sklearn.cross_validation import KFold, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split as sk_split
from sklearn.decomposition import TruncatedSVD as SVD
from sklearn import preprocessing
from sklearn import svm
from sklearn.svm import LinearSVC

import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer

%matplotlib inline

In [3]:
data = pd.read_json('json_2015.json')
data = data.sample(frac=1)

In [4]:
data.head()

Unnamed: 0,business_id,cool,date,funny,stars,text,useful,user_id
376221,eltTG6JOr9cg5FRAgGy8Jw,0,2015-07-28,0,5,This place is always great! A lovely atmospher...,1,R0b6BJOdOgNyK4P6ENIF-w
659818,Xo9a2Vm83Fao6MpprxWcKw,0,2015-12-28,0,5,Shelly and her husband know good food for a gr...,1,C0XLbD06lHEudAgsKDfl-Q
48448,h7R_IMg3nss8I5nV7sbmfg,0,2015-01-30,0,5,Had a great time with Courtney the bartender! ...,0,z6i2Ue7rXouGrlG4Hm7QdA
456797,_aM3SuEk5xWO535ayY3qIA,0,2015-09-06,0,5,Got a full set of french tips with a design. F...,0,uHXGRXA0q4fAujhq2XZypA
314146,ylCLWOLH7eAi2aEtsS9I-Q,0,2015-06-28,0,4,The Veal Picatta & the Veal Milanese were both...,1,YNM-Rmrh79x0ARxElGbjSw


We'll start by drawing a 50,000 observation sample for the dataset to make parameter tuning reasonable – the full dataset can take a very long time to run.

I've also decided to take only the top 10,000 words in the vocabulary across reviews. Messing with this number seemed to make relatively little difference in accuracy (slight improvement over no limit).

In [146]:
#standardize predictor variables

def scorer_pos(estimator, X, y): #custom scoring functions to get positive and negative accuracy
    y_pred = estimator.predict(X)
    return np.float(np.mean(y_pred[y == 1] == y[y == 1]))

def scorer_neg(estimator, X, y):
    estimator.fit(X, y)
    y_pred = estimator.predict(X)
    return np.float(np.mean(y_pred[y == 0] == y[y == 0]))




In [147]:
def test_model(x, binary_y, model, title, coefs=False):
    model.fit(x, binary_y)
    y_pred = model.predict(x)
    binary_y = np.array(binary_y)

    tn, fp, fn, tp = confusion_matrix(binary_y, y_pred).ravel() #from sklearn docs

    print title
    print confusion_matrix(binary_y, y_pred)

    print "False Positive Rate:", fp/float(fp+tn)
    print "False Negative Rate:", fn/float(fn+tp)
    print "True Positive Rate:", tp/float(tp+fp)
    print "True Negative Rate:", tn/float(tn+fn)
    
    print "Positive Accuracy:", cross_val_score(model, x, binary_y, n_jobs=-1, scoring=scorer_pos).mean()
    print "Negative Accuracy:", cross_val_score(model, x, binary_y, n_jobs=-1, scoring=scorer_neg).mean()
    print "Cross Validated Accuracy on Sample:", cross_val_score(model, x, binary_y, n_jobs=-1).mean()
    print "Train Set Accuracy on Sample:", np.mean(y_pred == binary_y)
    print
    if coefs == True:
        mydict = zip(model.coef_[0], vectorizer.get_feature_names())
        words = sorted([(i[0], i[1].encode('utf-8')) for i in mydict], reverse=True, key=lambda x: x[0])
        return words

In [110]:
stemmer = PorterStemmer()
def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

def tokenize(text):
    # remove non letters
    text = re.sub("[^a-zA-Z]", " ", text)
    # tokenize
    tokens = nltk.word_tokenize(text)
    # stem
    stems = stem_tokens(tokens, stemmer)
    return stems

In [143]:
samp = data.sample(300000) #draw samples

#lengths = [len(i[1]['text']) for i in samp.iterrows()]

samp.shape
frac_useful = 1 - data['useful'].value_counts()[0] / float(data.shape[0])
frac_funny = 1 - data['funny'].value_counts()[0] / float(data.shape[0])
frac_cool = 1 - data['cool'].value_counts()[0] / float(data.shape[0])

In [144]:
vectorizer = TfidfVectorizer(stop_words='english', 
                             binary=False, 
                             max_features=10000,
                             analyzer='word',
                             tokenizer=tokenize,
                             sublinear_tf=False
                            ) #using non-binary Count Vec.
reviews = samp.text.values

#tokenize words
x = vectorizer.fit_transform(reviews)

y_useful = [] #class observations according to whether they have at least one "helpful" vote
y_cool = []
y_funny = []


for score in samp.useful.values:
    i = 1 if score > 0 else 0
    y_useful.append(i)

for score in samp.cool.values:
    i = 1 if score > 0 else 0
    y_cool.append(i)

for score in samp.funny.values:
    i = 1 if score > 0 else 0
    y_funny.append(i)

In [149]:
u_tune = 0.0
f_tune = 0.0
c_tune = 0.0

weights = {0:1/(1 - frac_useful + u_tune), 1:1/(frac_useful - u_tune)}
log_model = LogReg(C=0.01, penalty='l2', class_weight=weights) #initialize logistic regression model

rand = RandomForestClassifier(n_estimators=500, criterion='gini', 
                              max_features= 17, max_depth=15, class_weight='balanced_subsample')


useful_words = test_model(x, y_useful, log_model, "Useful")
funny_words = test_model(x, y_funny, 
                         log_model.set_params(class_weight={0:1/(1 - frac_funny + f_tune), 1:1/(frac_funny - f_tune)}), "Funny")
cool_words = test_model(x, y_cool, 
                        log_model.set_params(class_weight={0:1/(1 - frac_cool + c_tune), 1:1/(frac_cool - c_tune)}), "Cool")

Useful
[[119849  67053]
 [ 43179  69919]]
False Positive Rate: 0.358760205883
False Negative Rate: 0.381783939592
True Positive Rate: 0.510461992232
True Negative Rate: 0.735143656304
Positive Accuracy: 0.611894117369
Negative Accuracy: 0.632058510638
Cross Validated Accuracy on Sample: 0.625123325275
Train Set Accuracy on Sample: 0.63256

Funny
[[161265  86719]
 [ 17749  34267]]
False Positive Rate: 0.349695948126
False Negative Rate: 0.341221931713
True Positive Rate: 0.28323111765
True Negative Rate: 0.900851330064
Positive Accuracy: 0.645282138911
Negative Accuracy: 0.643497150266
Cross Validated Accuracy on Sample: 0.64454999311
Train Set Accuracy on Sample: 0.651773333333

Cool
[[154500  79916]
 [ 26291  39293]]
False Positive Rate: 0.340915295884
False Negative Rate: 0.400875213467
True Positive Rate: 0.329614374754
True Negative Rate: 0.854577938061
Positive Accuracy: 0.585737297126
Negative Accuracy: 0.652911907872
Cross Validated Accuracy on Sample: 0.637823329043
Train Set A

In [None]:
print "10 Coolest Words:"
for word in cool_words[:10]:
    print word[1], word[0]

print "\n10 Least Cool Words:"
for word in cool_words[-10:]:
    print word[1], word[0]
    
print "\n10 Funniest Words:"
for word in funny_words[:10]:
    print word[1], word[0]

print "\n10 Least Funny Words:"
for word in funny_words[-10:]:
    print word[1], word[0]
    
print "\n10 Most Useful Words:"
for word in useful_words[:10]:
    print word[1], word[0]

print "\n10 Least Useful Words:"
for word in useful_words[-10:]:
    print word[1], word[0]

In [142]:
u_tune = 0.0
f_tune = 0.0
c_tune = 0.0

weights = {0:1/(1 - frac_useful + u_tune), 1:1/(frac_useful - u_tune)}
svm = LinearSVC(C=0.001, penalty='l2', class_weight='balanced') #initialize logistic regression model

rand = RandomForestClassifier(n_estimators=500, criterion='gini', 
                              max_features= 17, max_depth=15, class_weight='balanced_subsample')


useful_words = test_model(x, y_useful, svm, "Useful")
funny_words = test_model(x, y_funny, 
                         svm, "Funny")
cool_words = test_model(x, y_cool, 
                        svm, "Cool")

Useful
[[19473 11585]
 [ 7167 11775]]
False Positive Rate: 0.373011784403
False Negative Rate: 0.378365536902
True Positive Rate: 0.504066780822
True Negative Rate: 0.730968468468
Positive Accuracy: 0.611656636047
Negative Accuracy: 0.626988226227
Cross Validated Accuracy on Sample: 0.614820048493
Train Set Accuracy on Sample: 0.62496

Funny
[[26449 14789]
 [ 3020  5742]]
False Positive Rate: 0.358625539551
False Negative Rate: 0.344670166629
True Positive Rate: 0.279674638352
True Negative Rate: 0.897519427195
Positive Accuracy: 0.635013459455
Negative Accuracy: 0.645472622339
Cross Validated Accuracy on Sample: 0.636559954508
Train Set Accuracy on Sample: 0.64382

Cool
[[25088 13780]
 [ 4416  6716]]
False Positive Rate: 0.354533292168
False Negative Rate: 0.396694214876
True Positive Rate: 0.327673692428
True Negative Rate: 0.85032537961
Positive Accuracy: 0.58740528329
Negative Accuracy: 0.645440979726
Cross Validated Accuracy on Sample: 0.6260799457
Train Set Accuracy on Sample: 0.

In [84]:
u_tune = 0.00
f_tune = 0.00
c_tune = 0.00

weights = {0:1/(1 - frac_useful + u_tune), 1:1/(frac_useful - u_tune)}

log_model = LogReg(C=0.01, penalty='l2', class_weight=weights) #initialize logistic regression model
log_model.fit(np.array(lengths[:40000]).reshape(-1,1), np.array(y_useful[:40000]).reshape(-1,1))

print log_model.score(np.array(lengths[:40000]).reshape(-1,1), np.array(y_useful[:40000]).reshape(-1,1))

test_model(np.array(lengths[:40000]).reshape(-1,1), np.array(y_useful[:40000]).reshape(-1,1), log_model, "Useful")

 0.650525
Useful
[[18822  6155]
 [ 7824  7199]]
False Positive Rate: 0.246426712576
False Negative Rate: 0.520801437795
True Positive Rate: 0.539089411412
True Negative Rate: 0.70637243864
Positive Accuracy:

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


IndexError: too many indices for array