In [163]:
'''
This cell's function:
Import all libraries that will be needed throughout document
'''

# General libraries.
import re
import numpy as np
import matplotlib.pyplot as plt
import json
from pprint import pprint
import datetime
import pandas as pd

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB
from sklearn.grid_search import GridSearchCV

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# SK-learn libraries for feature extraction from text.
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer



In [176]:
class data_handler():
    
    if __name__ == "__main__":
        dfTrain_full = pd.DataFrame()
        dfTest = pd.DataFrame()
        dfTrain = pd.DataFrame()
        dfDev = pd.DataFrame()
        trainColumnNames = []
        testColumnNames = []
    
    def initialize_data(self,train_json, test_json, prop_train):
        #Load in data as panda dataframes
        with open(train_json,'r') as fp: 
            json_data = json.load(fp)
        self.dfTrain_full = pd.io.json.json_normalize(json_data)

        with open(test_json,'r') as fp: 
            json_data = json.load(fp)
        self.dfTest = pd.io.json.json_normalize(json_data)

        # Set np seed
        np.random.seed(0)

        #Shuffle train data and split into train and dev
        self.dfTrain_full.reindex(np.random.permutation(self.dfTrain_full.index)) #shuffle
        nTrain_full = self.dfTrain_full.shape[0]
        self.dfTrain = self.dfTrain_full[:int(nTrain_full*prop_train)]
        self.dfDev = self.dfTrain_full[int(nTrain_full*prop_train):]
        
        #Save number of observations in train and dev
        nTrain = self.dfTrain.shape[0]
        nDev = self.dfDev.shape[0]

        #Save column names for reference
        self.trainColumnNames = self.dfTrain.columns.tolist()
        self.testColumnNames = self.dfTest.columns.tolist() #Note test features is only a subset!
        
        return nTrain_full, nTrain, nDev

# Load in our data to master_data. Proportion in train vs dev is set here. Changing it will take effect throughout everything else in the script (the magic of classes!)
master_data = data_handler()
master_data.initialize_data('train.json', 'test.json',.75)


(4040, 3030, 1010)

In [179]:
class model():
    if __name__ == "__main__":
        train_data = pd.DataFrame
        test_data = pd.DataFrame
        train_labels = np.array
        prediction = np.array
    
    def init_test(self):
        self.train_data = master_data.dfTrain
        self.test_data = master_data.dfDev
        self.prediction = np.zeros((len(self.test_data.values)))
        self.train_labels = self.train_data['requester_received_pizza'].values
    
    def init_final(self):
        self.train_data = master_data.dfTrain_full
        self.test_data = master_data.dfTest
        self.prediction = np.zeros((len(self.test_data.values)))
        self.train_labels = self.train_data['requester_received_pizza'].values
    
    def test(self):
        try:
            dev_labels = self.test_data['requester_received_pizza'].values
            return metrics.classification_report(dev_labels,self.prediction)
        except:
            return "Failed! Did you initialize as test?"
    
    def finalize(self,fileName='submit_to_kaggle.csv'):
        try:
            '''
            Ensure the test data hasn't been shuffled or your labels won't match the request_id's.
            '''
            #extract request_id so we can match against predictions for submission to kaggle
            req = self.test_data['request_id']
            #make prediction into a pandas series
            print self.prediction.astype(int)
            pred_series = pd.Series(self.prediction.astype(int),name="requester_received_pizza")
            #now join into data frame
            out = pd.concat([req,pred_series], axis=1)
            #write data frame to csv (using kaggles sample submission csv for correct format)
            out.to_csv(fileName,index=False)
        except:
            return "Failed! Did you initialize as final?"

In [217]:
class baseline(model):
    '''
    Any model just needs to implicitly inherit the model class
    Test this with baseline.init_test() and baseline.test()
    Generate output with baseline.init_final() and baseline.finalize()
    '''

    def run_model(self):
        train = self.train_data['request_text_edit_aware'].values
        test = self.test_data['request_text_edit_aware'].values
        v_train, v_test = self.vectorize(train,test)
        self.prediction = self.log_reg(v_train,v_test)

    def vectorize(self,train,test):
        # transform the train data
        vectorizer_train = CountVectorizer()
        v_train = vectorizer_train.fit_transform(train)
        vocab_train = vectorizer_train.get_feature_names()
        # transform the dev data using the same vocab
        v_test = vectorizer_train.transform(test)     # 'transform' function will preserve previous vocab
        return v_train, v_test

    def log_reg(self,v_train,v_test):
        lor = LogisticRegression()
        lor.fit(v_train, self.train_labels)
        lor_pred = lor.predict(v_test)
        # Return the prediction matrix, coefficients
        return lor_pred

In [218]:
'''
To use:
1) instantiate your model class
2) initialize it as either test or final
3) run it and either test or finalize it

'''

'\n1) look at my examples and figure out how to make superclasses work\n2) implement superclass baseline(model)\n3) make this script just something that gets called in to actual model scripts\n\n'

In [219]:
baseline_model = baseline()

In [220]:
# Run test of baseline
baseline_model.init_test()
baseline_model.run_model()
print baseline_model.test()

(1010L,)
             precision    recall  f1-score   support

      False       0.78      0.85      0.81       761
       True       0.36      0.25      0.30       249

avg / total       0.67      0.70      0.68      1010



In [221]:
# Run final of baseline
baseline_model.init_final()
baseline_model.run_model()
print baseline_model.finalize('test_finalize.csv')

[0 0 0 ..., 0 1 0]
None
