In [3]:
import numpy as np
import re
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.cm as cmx
import matplotlib.colors as colors
import pandas as pd
from sklearn.linear_model import LogisticRegression as LogReg
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer as Tfidf
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.tree import DecisionTreeClassifier as dt
from sklearn.decomposition import PCA
from sklearn.cross_validation import KFold, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split as sk_split
from sklearn.decomposition import TruncatedSVD as SVD
from sklearn import preprocessing
from sklearn import svm
from sklearn.svm import LinearSVC

import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer

%matplotlib inline

In [4]:
data = pd.read_json('json_2015.json')
data = data.sample(frac=1)

ValueError: Expected object or value

In [None]:
data.head()

In [None]:
#standardize predictor variables

def scorer_pos(estimator, X, y): #custom scoring functions to get positive and negative accuracy
    y_pred = estimator.predict(X)
    return np.float(np.mean(y_pred[y == 1] == y[y == 1]))

def scorer_neg(estimator, X, y):
    estimator.fit(X, y)
    y_pred = estimator.predict(X)
    return np.float(np.mean(y_pred[y == 0] == y[y == 0]))

In [5]:
def test_model(x, binary_y, model, title, coefs=False):
    model.fit(x, binary_y)
    y_pred = model.predict(x)
    binary_y = np.array(binary_y)

    tn, fp, fn, tp = confusion_matrix(binary_y, y_pred).ravel() #from sklearn docs

    print title
    print confusion_matrix(binary_y, y_pred)

    print "False Positive Rate:", fp/float(fp+tn)
    print "False Negative Rate:", fn/float(fn+tp)
    print "True Positive Rate:", tp/float(tp+fp)
    print "True Negative Rate:", tn/float(tn+fn)
    
    print "Positive Accuracy:", cross_val_score(model, x, binary_y, n_jobs=-1, scoring=scorer_pos).mean()
    print "Negative Accuracy:", cross_val_score(model, x, binary_y, n_jobs=-1, scoring=scorer_neg).mean()
    print "Cross Validated Accuracy on Sample:", cross_val_score(model, x, binary_y, n_jobs=-1).mean()
    print "Cross Validated AUC on Sample:", cross_val_score(model, x, binary_y, n_jobs=-1, scoring='roc_auc').mean()
    print "Train Set Accuracy on Sample:", np.mean(y_pred == binary_y)
    print
    if coefs == True:
        mydict = zip(model.coef_[0], vectorizer.get_feature_names())
        words = sorted([(i[0], i[1].encode('utf-8')) for i in mydict], reverse=True, key=lambda x: x[0])
        return words

In [6]:
stemmer = PorterStemmer()
def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

def tokenize(text):
    text = re.sub("[^a-zA-Z]", " ", text)
    tokens = nltk.word_tokenize(text)
    stems = stem_tokens(tokens, stemmer)
    return stems

In [6]:
samp = data.sample(100000) #draw samples

#lengths = [len(i[1]['text']) for i in samp.iterrows()]

samp.shape
frac_useful = 1 - data['useful'].value_counts()[0] / float(data.shape[0])
frac_funny = 1 - data['funny'].value_counts()[0] / float(data.shape[0])
frac_cool = 1 - data['cool'].value_counts()[0] / float(data.shape[0])

NameError: name 'data' is not defined

In [36]:
vectorizer = TfidfVectorizer(stop_words='english', 
                             binary=False, 
                             max_features=10000,
                             analyzer='word',
                             #tokenizer=tokenize,
                             sublinear_tf=False
                            ) #using non-binary Count Vec.
reviews = samp.text.values

#tokenize words
x = vectorizer.fit_transform(reviews)

y_useful = [] #class observations according to whether they have at least one "helpful" vote
y_cool = []
y_funny = []


for score in samp.useful.values:
    i = 1 if score > 0 else 0
    y_useful.append(i)

for score in samp.cool.values:
    i = 1 if score > 0 else 0
    y_cool.append(i)

for score in samp.funny.values:
    i = 1 if score > 0 else 0
    y_funny.append(i)

In [2]:
u_tune = 0.00
f_tune = 0.00
c_tune = 0.00

weights = {0:1/(1 - frac_useful + u_tune), 1:1/(frac_useful - u_tune)}
log_model = LogReg(C=0.01, penalty='l2', class_weight=weights) #initialize logistic regression model

rand = RandomForestClassifier(n_estimators=300, criterion='gini', 
                              max_features= 17, max_depth=5, class_weight='balanced')



useful_words = test_model(x, y_useful, 
                          log_model,
                          "Useful",
                          coefs=True)
funny_words = test_model(x, y_funny, 
                         log_model.set_params(class_weight={0:1/(1 - frac_funny + f_tune), 1:1/(frac_funny - f_tune)}), 
                         "Funny",
                          coefs=True)
cool_words = test_model(x, y_cool, 
                        log_model.set_params(class_weight={0:1/(1 - frac_cool + c_tune), 1:1/(frac_cool - c_tune)}),
                        #rand,
                        "Cool",
                          coefs=True)

NameError: name 'frac_useful' is not defined

In [1]:
print "10 Coolest Words:"
for word in cool_words[:10]:
    print word[1], word[0]

print "\n10 Least Cool Words:"
for word in cool_words[-10:]:
    print word[1], word[0]
    
print "\n10 Funniest Words:"
for word in funny_words[:10]:
    print word[1], word[0]

print "\n10 Least Funny Words:"
for word in funny_words[-10:]:
    print word[1], word[0]
    
print "\n10 Most Useful Words:"
for word in useful_words[:10]:
    print word[1], word[0]

print "\n10 Least Useful Words:"
for word in useful_words[-10:]:
    print word[1], word[0]

10 Coolest Words:


NameError: name 'cool_words' is not defined

In [31]:
u_tune = 0.0
f_tune = 0.0
c_tune = 0.0

weights = {0:1/(1 - frac_useful + u_tune), 1:1/(frac_useful - u_tune)}
svm = LinearSVC(C=0.001, penalty='l2', class_weight='balanced') #initialize logistic regression model

rand = RandomForestClassifier(n_estimators=500, criterion='gini', 
                              max_features= 17, max_depth=15, class_weight='balanced_subsample')


useful_words = test_model(x, y_useful, svm, "Useful")
funny_words = test_model(x, y_funny, 
                         svm, "Funny")
cool_words = test_model(x, y_cool, 
                        svm, "Cool")

Useful
[[166388  82731]
 [ 58412  92469]]
False Positive Rate: 0.332094300314
False Negative Rate: 0.387139533805
True Positive Rate: 0.52779109589
True Negative Rate: 0.740160142349
Positive Accuracy: 0.599452569747
Negative Accuracy: 0.661314481825
Cross Validated Accuracy on Sample: 0.634405008157
Cross Validated AUC on Sample: 0.673775543874
Train Set Accuracy on Sample: 0.6471425

Funny
[[224376 106569]
 [ 23641  45414]]
False Positive Rate: 0.322014231972
False Negative Rate: 0.342350300485
True Positive Rate: 0.298809735299
True Negative Rate: 0.904679921134
Positive Accuracy: 0.628861110506
Negative Accuracy: 0.672997023675
Cross Validated Accuracy on Sample: 0.662530004008
Cross Validated AUC on Sample: 0.70180118236
Train Set Accuracy on Sample: 0.674475

Cool
[[214325  98403]
 [ 34632  52640]]
False Positive Rate: 0.314660024046
False Negative Rate: 0.396828306903
True Positive Rate: 0.348510026946
True Negative Rate: 0.860891639922
Positive Accuracy: 0.577413101408
Negative