In [1]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import re
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import *

import pandas as pd

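# Julia sample code, kept for reference only: everything from here down to the
# `writetable(...)` call is Julia, not Python, and will not run in a Python kernel.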
using DataFrames
using MachineLearning
using JSON

# Read the JSON file `file_name` and return its contents as a DataFrame.
#
# The parsed JSON must be a non-empty, 1-indexable collection of records that
# can all be indexed by the keys of the first record: those keys become the
# column names, and each column is built by reading that key from every record.
function read_data(file_name)
    # The do-block form closes the file even when reading or parsing throws,
    # which the previous explicit open/close pair did not guarantee.
    json = open(file_name) do f
        JSON.parse(readall(f))
    end

    # Column names come from the first record only.
    colnames = keys(json[1])
    # One vector per column, gathered across all records.
    columns  = Any[[json[i][name] for i=1:length(json)] for name=colnames]
    DataFrame(columns, Symbol[name for name=colnames])
end

# Load both data sets through read_data (defined above) and report their sizes.
train = read_data("../data/train.json")
test  = read_data("../data/test.json")

println(@sprintf("There are %d rows in the training set", nrow(train)))
println(@sprintf("There are %d rows in the test set", nrow(test)))

# Columns used as model inputs.
feature_names = Symbol["requester_account_age_in_days_at_request",
                       "requester_days_since_first_post_on_raop_at_request",
                       "requester_number_of_comments_at_request",
                       "requester_number_of_comments_in_raop_at_request",
                       "requester_number_of_posts_at_request",
                       "requester_number_of_posts_on_raop_at_request",
                       "requester_number_of_subreddits_at_request",
                       "requester_upvotes_minus_downvotes_at_request",
                       "requester_upvotes_plus_downvotes_at_request",
                       "unix_timestamp_of_request_utc"]

# Pass every feature column through float64() in both frames, replacing the
# column in place.
for feature = feature_names
    train[feature] = float64(train[feature])
    test[feature]  = float64(test[feature])
end

# The training frame handed to fit() holds the features plus the target column.
columns_to_keep = cat(1, feature_names, [:requester_received_pizza])

# Fit on the selected columns with :requester_received_pizza as the target.
# NOTE(review): presumably a 200-tree random forest classifier with progress
# output enabled - confirm against the MachineLearning package documentation.
rf = fit(train[columns_to_keep], :requester_received_pizza, classification_forest_options(num_trees=200, display=true))
println("")
println(rf)
println("")
# Keep only the second column of the predict_probs result.
# NOTE(review): presumably that column is the probability of the positive
# ("received pizza") class - confirm the column order.
predictions = predict_probs(rf, test)[:,2]
# Pair each test request_id with its prediction and write the table to disk.
submission = DataFrame(request_id=test[:request_id], requester_received_pizza=predictions)
writetable("simple_julia_benchmark.csv", submission)

In [7]:

# Display the "request_text" column of the training frame.
# NOTE(review): `train_data` is assigned in the "# Load Data" cell, which appears
# later in this notebook, so on a fresh kernel this cell raises NameError unless
# that cell has been run first.
train_data["request_text"]

404     The title pretty much explains it all. I've be...
405     I'm going into a residential rehab tomorrow fo...
406     My son will be 1 on Friday. He does eat pizza ...
407     Would love pizza for me and the family on Tues...
408     Here's a list of current edible food (edible, ...
409                                                      
410     Alright...so, I'll try to keep it brief, not s...
411     I'm not poor or anything but my fellow reddito...
412     I've been debating whether to post here or not...
413     Hi my cousin kellie told me about this website...
414     We just moved to the area, and as a result of ...
415     I only have a pizza hut nearby plus some local...
416     It's not about the food. I have dinner already...
417     To whom it may concern, I just bombed a Math t...
418                CAN WE STILL CELEBRATE WITH A PIZZA???
419     Busy and hungry university student in Edmonton...
420     I would like to request pizza this evening. A ...
421     https:

In [2]:
# Load Data
# Read the labelled training file and separate the target column from the features.
raw_train = pd.read_json('./data/train.json')
all_labels = raw_train['requester_received_pizza']
# `axis=1` is passed by keyword: newer pandas no longer accepts it positionally.
all_features = raw_train.drop('requester_received_pizza', axis=1)

# Hold out the first 10% of the rows as a development set.
dev_size = int(all_features.shape[0]*.1)

# Both splits are cut from the FULL frame. Previously train_data was sliced to
# [dev_size:] first and dev_data was then taken from that slice, which made the
# dev set a subset of the training rows rather than a held-out set.
dev_data, dev_labels = all_features[:dev_size], all_labels[:dev_size]
train_data, train_labels = all_features[dev_size:], all_labels[dev_size:]
assert dev_data.index.intersection(train_data.index).empty, "dev and train rows overlap"

names = list(train_data.columns.values)

# The test file has no 'requester_received_pizza' column, so there is no label
# to split off here.
test_data = pd.read_json('./data/test.json')

print(train_data.shape)
print(train_labels.shape)
print(test_data.shape)
print(dev_data.shape)
print(list(test_data.columns.values))
print(dev_size)

(3636, 31)
(3636,)
(1631, 17)
(404, 31)
['giver_username_if_known', 'request_id', 'request_text_edit_aware', 'request_title', 'requester_account_age_in_days_at_request', 'requester_days_since_first_post_on_raop_at_request', 'requester_number_of_comments_at_request', 'requester_number_of_comments_in_raop_at_request', 'requester_number_of_posts_at_request', 'requester_number_of_posts_on_raop_at_request', 'requester_number_of_subreddits_at_request', 'requester_subreddits_at_request', 'requester_upvotes_minus_downvotes_at_request', 'requester_upvotes_plus_downvotes_at_request', 'requester_username', 'unix_timestamp_of_request', 'unix_timestamp_of_request_utc']
404


In [3]:
# Columns that hold text or identifiers rather than numeric values; every other
# column is treated as numeric.
obj_columns = [
    'giver_username_if_known',
    'request_id',
    'request_text',
    'request_text_edit_aware',
    'request_title',
    'requester_subreddits_at_request',
    'requester_user_flair',
    'requester_username',
]
num_columns = [name for name in names if name not in obj_columns]

# Partition the test-set columns the same way: numeric columns are those not
# listed in obj_columns, object columns are the ones that are.
test_names = test_data.columns.tolist()
test_num_columns = [name for name in test_names if name not in obj_columns]
test_obj_columns = [name for name in test_names if name in obj_columns]

print(len(test_num_columns), len(test_obj_columns), sep='\n')

11
6


In [4]:
# Baseline: logistic regression on the raw numeric columns, selecting the same
# column list (test_num_columns) from both the training and the test frame.
lr = LogisticRegression(C=1).fit(train_data[test_num_columns], train_labels)

y_pred = lr.predict(test_data[test_num_columns])
print(y_pred.shape)
print(type(y_pred))

# Submission frame: one row per test request with the predicted label as 0/1.
preds = pd.DataFrame({
    'request_id': test_data['request_id'],
    'requester_received_pizza': y_pred.astype(int),
})
num = sum(preds['requester_received_pizza'])

# Number of predicted positives next to the number of positives in the training labels.
print(num, sum(train_labels))

preds.to_csv('./data/submission1.csv', index=False)

(1631,)
<class 'numpy.ndarray'>
0 894


In [14]:
#Normalized Dataframe
# Standardise the numeric columns to zero mean / unit variance. The statistics
# come from the training rows only and are then applied to BOTH frames.
train_num = train_data[test_num_columns]
col_mean = train_num.mean()
col_std = train_num.std()

num_train_data = (train_num - col_mean) / col_std
# Previously the model was fitted on standardised features but asked to predict
# on the raw test columns, so the inputs were on a different scale from the
# data the coefficients were learned on.
num_test_data = (test_data[test_num_columns] - col_mean) / col_std

lr = LogisticRegression(C=1)
lr.fit(num_train_data, train_labels)

y_pred = lr.predict(num_test_data)
print(y_pred.shape)
print(type(y_pred))

preds = pd.DataFrame()
preds['request_id'] = test_data['request_id']
preds['requester_received_pizza'] = y_pred.astype(int)
num = sum(preds['requester_received_pizza'])

# Number of predicted positives next to the number of positives in the training labels.
print(num, sum(train_labels))

# NOTE(review): this is the same path the other model cells write to, so each
# run overwrites the previous submission file.
preds.to_csv('./data/submission1.csv', index=False)

(1631,)
<class 'numpy.ndarray'>
0 894


In [5]:
#Try looking at request only
# Bag-of-words model on the request text: learn the vocabulary on the training
# text, then encode the test text with that same fitted vectorizer.
train_req = train_data['request_text_edit_aware']
test_req = test_data['request_text_edit_aware']
vec = CountVectorizer()
train = vec.fit_transform(train_req)
# Feature names in column order, read from the fitted vocabulary mapping. This
# avoids get_feature_names(), which newer scikit-learn releases no longer provide.
vocab_train = sorted(vec.vocabulary_, key=vec.vocabulary_.get)
# Use transform(), not fit_transform(), on the test text: the vocabulary is
# already fixed by the training data, so a second vectorizer is unnecessary.
test = vec.transform(test_req)

mb =  MultinomialNB()
mb.fit(train, train_labels)
y_pred = mb.predict(test)

print(y_pred.shape)
print(type(y_pred))

preds = pd.DataFrame()
preds['request_id'] = test_data['request_id']
preds['requester_received_pizza'] = y_pred.astype(int)
num = sum(preds['requester_received_pizza'])

# Number of predicted positives next to the number of positives in the training labels.
print(num, sum(train_labels))

preds.to_csv('./data/submission1.csv', index=False)

(1631,)
<class 'numpy.ndarray'>
81 894
