In [70]:
import pandas as pd
import numpy as np


# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer,TfidfTransformer

from sklearn.decomposition import PCA

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# SK-learn libraries for feature extraction from text.
from sklearn.feature_extraction.text import *

# Download the train and test JSON files from the project's GitHub repository
# into DataFrames. Requires network access every time this cell is run.
pd_train = pd.read_json('https://raw.githubusercontent.com/mdemaster/w207_Final_Project/master/train.json', orient='columns')
pd_test = pd.read_json('https://raw.githubusercontent.com/mdemaster/w207_Final_Project/master/test.json', orient='columns')

# Convert to plain NumPy arrays; later cells select columns by integer position
# (np_train[:,7], np_train[:,22], np_test[:,1], np_test[:,2]), so the column
# order produced by read_json matters.
np_test = np.array(pd_test)
np_train = np.array(pd_train)

# Sanity check: (rows, columns) of the training array.
print np_train.shape


(4040, 32)


In [71]:


X = np_train[:,7]
Y = np_train[:,22]
shuffle = np.random.permutation(np.arange(X.shape[0]))
X, Y = X[shuffle], Y[shuffle]

print 'data shape: ', X.shape
print 'label shape:', Y.shape

l=len(X)
train_data, train_labels = X[:l/2], Y[:l/2]
dev_data, dev_labels = X[l/2:(3*l)/4], Y[l/2:(3*l)/4]
test_data, test_labels = X[(3*l)/4:], Y[(3*l)/4:]


categories = ['Didn\'t get pizza','Got Pizza']


data shape:  (4040,)
label shape: (4040,)


In [72]:
import re 

def better_preprocessor(s):
    """Normalise raw text into lowercase words separated by single spaces.

    Steps, in order:
      1. every '&' is replaced by the word ' and ';
      2. the text is lowercased;
      3. each ASCII digit is replaced by its English name padded with spaces;
      4. every run of characters other than a-z collapses to one space.

    Leading/trailing spaces produced by step 4 are not stripped.
    """
    digit_words = (
        ('0', ' zero '), ('1', ' one '), ('2', ' two '), ('3', ' three '),
        ('4', ' four '), ('5', ' five '), ('6', ' six '), ('7', ' seven '),
        ('8', ' eight '), ('9', ' nine '),
    )
    text = re.sub('&', ' and ', s).lower()
    for digit, word in digit_words:
        text = text.replace(digit, word)
    return re.sub('[^a-z]+',' ', text)


# Binarize the labels of the train, dev and test splits with np.where:
# entries that compare equal to True become 1, every other entry becomes 0.
b=train_labels
trainlabels=np.where(b==True, 1, 0)

bl=dev_labels
devlabels=np.where(bl==True, 1, 0)

b2=test_labels
testlabels=np.where(b2==True, 1, 0)

# Class balance per split: number of 1-labels, then number of 0-labels.
# (The recorded output shows each of these lines printed as a tuple.)
print('Sum of train(Got pizza)', sum(trainlabels),' (Didn\'t get pizza:)', len(trainlabels) - sum(trainlabels))
print('Sum of dev(Got pizza)', sum(devlabels),' (Didn\'t get pizza:)', len(devlabels) - sum(devlabels))
print('Sum of test(Got pizza)', sum(testlabels),' (Didn\'t get pizza:)', len(testlabels) - sum(testlabels))


('Sum of train(Got pizza)', 472, " (Didn't get pizza:)", 1548)
('Sum of dev(Got pizza)', 258, " (Didn't get pizza:)", 752)
('Sum of test(Got pizza)', 264, " (Didn't get pizza:)", 746)


In [38]:
#Run initial vectorizer and fit_transform on train_data and find vocab size from shape attribute.
vect=CountVectorizer(ngram_range=(1, 2))
data=vect.fit_transform(train_data).toarray()
devdata=vect.transform(dev_data).toarray()


print 'Baseline Scores...'
#Run MultinomialNB Classifier
best_nb = []
alphas = [0.0, 0.0001, 0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 10.0]
for k in range(len(alphas)):
    mnb_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1, 2),preprocessor=better_preprocessor)), 
                        ('mnclf', MultinomialNB(alpha=alphas[k]))])
    mnb_clf = mnb_clf.fit(train_data, trainlabels)
    pred = mnb_clf.predict(dev_data)
    metrics.accuracy_score(devlabels,pred)
    best_nb.append(metrics.accuracy_score(devlabels,pred))
bestAlphaAccuracy = max(best_nb)
bestAlphaValue = alphas[best_nb.index(bestAlphaAccuracy)]
print 'Naive Bayes Baseline:'
print 'Best Alpha =', bestAlphaValue, ' accuracy:', bestAlphaAccuracy
print ''



#Run Logistic Regression classifier
log_clf = Pipeline([('vect', CountVectorizer()),('lgclf', LogisticRegression(C=0.5))])
log_clf = log_clf.fit(train_data, trainlabels) 
pred = log_clf.predict(dev_data)        
score2= metrics.accuracy_score(devlabels,pred)
#print 'Logistic Regression Score:',score2
best_logit = []
C = [0.0001, 0.001, 0.01, 0.1, 0.5, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
for k in range(len(C)):
    log_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1, 2),preprocessor=better_preprocessor)),
                     ('lgclf', LogisticRegression(C=C[k], tol=0.1))]);
    log_clf = log_clf.fit(train_data, trainlabels)
    pred = log_clf.predict(dev_data)
    metrics.accuracy_score(devlabels,pred)
    best_logit.append(metrics.accuracy_score(devlabels,pred))
    weights = log_clf.named_steps['lgclf'].coef_
bestCAccuracy = max(best_logit)
bestCValue = C[best_logit.index(bestCAccuracy)]
print 'Logistic Regression Baseline:'
print 'Best C =', bestCValue, ' accuracy:', bestCAccuracy
print ''



Baseline Scores...
Naive Bayes Baseline:
Best Alpha = 1.0  accuracy: 0.750495049505

Logistic Regression Baseline:
Best C = 0.0001  accuracy: 0.750495049505



In [191]:

# Refit Naive Bayes (unigram counts, MultinomialNB with its default settings)
# on the train split and evaluate it on the held-out test split.
mnb_clf = Pipeline([('vect', CountVectorizer(preprocessor=better_preprocessor)), 
                        ('mnclf', MultinomialNB())])
mnb_clf = mnb_clf.fit(train_data, trainlabels)
pred = mnb_clf.predict(test_data)
acc = metrics.accuracy_score(testlabels,pred)
print('Naive Bayes Baseline:')
# Labels are 0/1, so sum() counts the predicted / actual positives.
print('Pred sum(got pizza):',sum(pred))
print('Acutal sum(got pizza):',sum(testlabels))
print('accuracy:', acc)
# Per-class precision / recall / F1, labelled with the names in `categories`.
print metrics.classification_report(testlabels, pred,
               target_names=categories)
print('')

log_clf = Pipeline([('vect', TfidfVectorizer(preprocessor=better_preprocessor,
                                             ngram_range=(3,5),max_df=0.5, min_df=3)),
                     ('lgclf', LogisticRegression(C=100, tol=0.1))]);
print train_data.shape
log_clf = log_clf.fit(train_data, trainlabels)
pred = log_clf.predict(test_data)
predProb = log_clf.predict_log_proba(test_data)
acc = metrics.accuracy_score(testlabels,pred)


vect = log_clf.named_steps['vect']
lgclf = log_clf.named_steps['lgclf']
features = tfidvec.get_feature_names();
weights = lgclf.coef_

print 'positive weights:'
weight_indexes = []
positive_features = []
weight_index = weights[0].argsort()[-5:][::-1].tolist()
weight_indexes += (weight_index)    
for i in range(len(weight_indexes)):
        index = weight_indexes[i]
        positive_features.append(features[index])
        print 'Feature Name:', features[index]
        print weights[0][index]
        print ''
print positive_features
print ''
print 'negative weights:'
weight_indexes = []
negative_features = []
weight_index = weights[0].argsort()[:5].tolist()
weight_indexes += (weight_index)    
for i in range(len(weight_indexes)):
        index = weight_indexes[i]
        negative_features.append(features[index])
        print 'Feature Name:', features[index]
        print weights[0][index]
        print ''
print negative_features
print ''


print 'predicted prob:', predProb

print('Logistic Regression Baseline:')
print('Pred sum(got pizza):',sum(pred))
print('Acutal sum(got pizza):',sum(testlabels))
print('accuracy:', acc)
print metrics.classification_report(testlabels, pred,
               target_names=categories)
print('')


# Score the separately loaded test file with the fitted logistic-regression
# pipeline. Columns are picked by position: column 1 is used as the request id
# and column 2 as the text fed to the model.
# NOTE(review): presumably request_id and the request text -- confirm against
# pd_test.columns.
test_ids = np_test[:,1]
test_X = np_test[:,2]
predictions = log_clf.predict(test_X)


print('Actual Test data:')
print('Test data shape: ', test_X.shape)
print('Pred sum(got pizza):',sum(predictions))
#print(sum(np.where(predictions==1, 1, 0)))
# Assemble the id -> prediction table; it is built here but not written to disk
# in this cell.
d = {
    'request_id' :test_ids, 
    'requester_received_pizza':predictions
}
submission = pd.DataFrame(d)
#print(submission)

Naive Bayes Baseline:
('Pred sum(got pizza):', 37)
('Acutal sum(got pizza):', 249)
('accuracy:', 0.73069306930693068)
                  precision    recall  f1-score   support

Didn't get pizza       0.75      0.96      0.84       761
       Got Pizza       0.19      0.03      0.05       249

     avg / total       0.61      0.73      0.65      1010


(2020,)
positive weights:
Feature Name: my first check
7.26684098015

Feature Name: because of the
7.00823699016

Feature Name: to ask for
6.51943502784

Feature Name: an empty stomach
6.24212919068

Feature Name: some kind redditor
5.87033434187

[u'my first check', u'because of the', u'to ask for', u'an empty stomach', u'some kind redditor']

negative weights:
Feature Name: some pizza to
-4.99748697751

Feature Name: would be awesome
-4.99389088301

Feature Name: to get pizza
-4.80528637052

Feature Name: me and the
-4.54810714418

Feature Name: started new job
-4.38352364647

[u'some pizza to', u'would be awesome', u'to get pizza', u'm