In [2]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import re
import numpy as np
import matplotlib.pyplot as plt

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# SK-learn library for importing the newsgroup data.
from sklearn.datasets import fetch_20newsgroups

# SK-learn libraries for feature extraction from text.
from sklearn.feature_extraction.text import *

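# Julia sample code (reference only). NOTE(review): everything from here to the end of this cell is Julia, not Python, so it cannot execute in a Python kernel -- consider moving it to a raw/markdown cell.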
using DataFrames
using MachineLearning
using JSON

# Read a JSON file whose top-level value is an array of objects and return it
# as a DataFrame with one column per key of the first object.
#
# Arguments:
#   file_name - path of the JSON file to read.
#
# Returns:
#   A DataFrame whose column names are the keys of the first record (as
#   Symbols); row i holds the values of record i for those keys.
#
# The column set is taken from the first record only, so indexing fails if the
# array is empty or if a later record lacks one of those keys.
function read_data(file_name)
    # The do-block form of `open` closes the file on every exit path; the
    # previous explicit open/close pair left the handle open whenever reading
    # or parsing threw.
    records = open(file_name) do f
        JSON.parse(readall(f))
    end

    colnames = keys(records[1])
    columns  = Any[[records[i][name] for i=1:length(records)] for name=colnames]
    DataFrame(columns, Symbol[name for name=colnames])
end

# Load the training and test sets.
train = read_data("../data/train.json")
test  = read_data("../data/test.json")

@printf("There are %d rows in the training set\n", nrow(train))
@printf("There are %d rows in the test set\n", nrow(test))

# Request-time columns used as model inputs.
feature_names = Symbol["requester_account_age_in_days_at_request",
                       "requester_days_since_first_post_on_raop_at_request",
                       "requester_number_of_comments_at_request",
                       "requester_number_of_comments_in_raop_at_request",
                       "requester_number_of_posts_at_request",
                       "requester_number_of_posts_on_raop_at_request",
                       "requester_number_of_subreddits_at_request",
                       "requester_upvotes_minus_downvotes_at_request",
                       "requester_upvotes_plus_downvotes_at_request",
                       "unix_timestamp_of_request_utc"]

# Pass every feature column of both data sets through float64().
for feature in feature_names
    for frame in (train, test)
        frame[feature] = float64(frame[feature])
    end
end

# Training frame = feature columns followed by the target column.
columns_to_keep = vcat(feature_names, [:requester_received_pizza])

# Fit a 200-tree classification forest on the target column.
rf = fit(
    train[columns_to_keep],
    :requester_received_pizza,
    classification_forest_options(num_trees=200, display=true),
)
println()
println(rf)
println()

# Column 2 of the probability matrix is written out as the score.
# NOTE(review): presumably that is the probability of the positive class --
# confirm against the class ordering used by MachineLearning.jl.
predictions = predict_probs(rf, test)[:,2]
submission = DataFrame(request_id=test[:request_id], requester_received_pizza=predictions)
writetable("simple_julia_benchmark.csv", submission)

In [4]:
#json libraries
import os
import json

# Load Data
# Each file is read inside a `with` block so its handle is closed as soon as
# parsing finishes; the bare json.load(open(...)) form never closed it.
with open("../data/train.json") as train_file:
    train_data = json.load(train_file)
print("Train len:",len(train_data))
with open("../data/test.json") as test_file:
    test_data = json.load(test_file)
print("Test len:",len(test_data))

# Quick look at the record type and one example record from each set.
print(type(train_data[0]))
print("Train example:\n",train_data[2])
print("Test example:\n",test_data[8])

# split train set into train and development for initial testing before submitting final predictions on test.json

# DATA SUMMARY:
# list of dictionaries
# 'request_id' - sample point identifier, for final submission
# 'requester_received_pizza' - boolean True, False
# 'request_text' - raw text request, not always populated
# 'request_text_edit_aware' - with "edit" comment removed, e.g. "thanks for pizza"

FileNotFoundError: [Errno 2] No such file or directory: '../data/train.json'

In [47]:
# Collect the edit-aware request text of every train and test record, plus the
# 'requester_received_pizza' label of every training record.
train_raw_text_list = [record['request_text_edit_aware'] for record in train_data]
train_labels = [record['requester_received_pizza'] for record in train_data]
print(len(train_raw_text_list), len(train_labels))
print(train_raw_text_list[0:2], "\n", train_labels[0:2])

test_raw_text_list = [record['request_text_edit_aware'] for record in test_data]
print(len(test_raw_text_list))

4040 4040
['Hi I am in need of food for my 4 children we are a military family that has really hit hard times and we have exahusted all means of help just to be able to feed my family and make it through another night is all i ask i know our blessing is coming so whatever u can find in your heart to give is greatly appreciated', 'I spent the last money I had on gas today. Im broke until next Thursday :('] 
 [False, False]
1631


In [49]:
#vectorize

#preprocess
def preprocessor(s):
    """Return the string ``s`` converted to lower case.

    Used as the ``preprocessor`` callable of the ``CountVectorizer`` built in
    this cell.
    """
    return s.lower()

#vectorize text
# The vocabulary is fitted on the training requests only.
vectorizer = CountVectorizer(preprocessor=preprocessor)
train_text_feats = vectorizer.fit_transform(train_raw_text_list)
print("(obs, features):", train_text_feats.shape)

#train Naive Bayes classifier
nb = BernoulliNB().fit(train_text_feats, train_labels)
nb_train_preds = nb.predict(train_text_feats)
# The score is computed on the same rows the model was fitted on, so it is a
# training-set figure rather than a held-out estimate.
# NOTE(review): with average='micro' on a two-class target this value
# presumably coincides with plain accuracy -- confirm that is the metric
# intended by the "F1 score" label.
train_f1_score = metrics.f1_score(
    y_true=train_labels,
    y_pred=nb_train_preds,
    average='micro',
)
print("Training set F1 score:", format(train_f1_score, ".4"))

#predict on test dataset
# transform (not fit_transform): reuse the vectorizer fitted on the train text.
test_feats = vectorizer.transform(test_raw_text_list)
nb_test_preds = nb.predict(test_feats)
print(nb_test_preds.shape)
print(nb_test_preds[:50])

(obs, features): (4040, 12317)
Training set F1 score: 0.7656
(1631,)
[False False False False False False False False False False False False
 False False False False False False False False False False False False
  True  True False False False False False False False False  True  True
 False False False  True False  True False False False  True False False
 False False]
