In [11]:
"""

Baseline submission for Random Acts of Pizza
Michael Alexander

"""

%matplotlib inline

import json
import pandas
from pprint import pprint
import matplotlib.pyplot as plt
import numpy as np
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier 
from sklearn.ensemble import AdaBoostClassifier 



In [51]:
#Loading in initial json data from kaggle

# NOTE(review): absolute, machine-specific directory. The trailing slash is
# required because file names are appended by plain string concatenation.
path = 'C:/Users/malexander/Dropbox/MIDS/W207/Final_Project/Data/RAOP/'
train_file = 'train.json'
test_file = 'test.json'

#loading json files to dataframes
# Context managers close each file handle as soon as it has been parsed,
# instead of leaving it open until garbage collection.
with open(path+train_file) as train_handle:
    train_json = json.load(train_handle)
with open(path+test_file) as test_handle:
    test_json = json.load(test_handle)

train_df = pandas.io.json.json_normalize(train_json)
test_df = pandas.io.json.json_normalize(test_json)

#Creating label, true/false
# Captured before the column subset below, which keeps only test-set columns.
train_label_df = train_df.requester_received_pizza

#Subset of columns in train that are available in test
# Also reorders the train columns to match the test frame's column order, so
# the same positional indices can be applied to both arrays further down.
train_df = train_df[list(test_df)]

#Converting to np arrays
train_base = np.array(train_df.values)
test_base = np.array(test_df.values)
train_labels_base = np.array(train_label_df.values)

#Shuffling data and splitting into train and dev for training
# Fixed seed so the permutation (and therefore the split) is repeatable.
np.random.seed(0)
shuffle = np.random.permutation(np.arange(train_base.shape[0]))
# The same permutation is applied to rows and labels to keep them aligned.
train_base, train_labels_base = train_base[shuffle], train_labels_base[shuffle]

#Only using numeric variables for now
# Positional column indices into the test-aligned frames. They depend on the
# column order produced by json_normalize -- TODO confirm which fields these
# positions correspond to before relying on them with another pandas version.
numeric_variables = [4,5,6,7,8,9,10,12,13,15,16]

# First 3000 shuffled rows are used for training, the remainder as the dev set.
train_data, train_labels = train_base[:3000,numeric_variables], train_labels_base[:3000]
dev_data, dev_labels = train_base[3000:,numeric_variables], train_labels_base[3000:]
test_data = test_base[:,numeric_variables]

In [52]:
# Training decision tree, random forest, and ada boost models
#
# Each model is fit on train_data/train_labels and scored (mean accuracy) on
# the held-out dev split. Results are printed as one pre-formatted string so
# the output is identical under the Python 2 print statement and the Python 3
# print function (the original `print 'label', value` form is a SyntaxError
# on Python 3).

dt = DecisionTreeClassifier(criterion="entropy", splitter="best", random_state=0)
dt.fit(train_data, train_labels)

print('Accuracy (a decision tree): %s' % dt.score(dev_data, dev_labels))

# NOTE(review): no random_state is passed to the forest or to AdaBoost, so
# re-running this cell on its own may give slightly different scores.
rfc = RandomForestClassifier(n_estimators=100)
rfc.fit(train_data, train_labels)

print('Accuracy (a random forest): %s' % rfc.score(dev_data, dev_labels))

# Boosted depth-1 trees (decision stumps) with a small learning rate.
abc = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1), n_estimators=100, learning_rate=0.1)

abc.fit(train_data, train_labels)
print('Accuracy (adaboost with decision trees): %s' % abc.score(dev_data, dev_labels))

Accuracy (a decision tree): 0.655769230769
Accuracy (a random forest): 0.718269230769
Accuracy (adaboost with decision trees): 0.749038461538


In [73]:
# Creating test predictions
# The AdaBoost model fitted earlier labels every row of the test feature matrix.
test_predictions = abc.predict(test_data)

# Column 1 of the raw test array supplies the ids written to the submission.
test_request_ids = test_base[:,1]

# Converting predictions to binary
# int() is applied to each predicted label to get a plain Python integer.
test_predictions_int = [int(prediction) for prediction in test_predictions]

# Exporting submission csv
# One row per test request; the DataFrame index is left out of the file.
submission_columns = {
    "request_id" : test_request_ids,
    "requester_received_pizza" : test_predictions_int,
}
submission_df = pandas.DataFrame(submission_columns)
submission_df.to_csv(path+"Submission.csv", index=False)
