Notes:
    
14Apr15: *Changed Model's 'test' method to take the type of report as input. We should be using AUC most of the time (that is how the comp is scored) but might be useful to use classificaition report sometimes.
*Classes start with caps, functions/methods in camel case

In [18]:
'''
This cell's function:
Import all libraries that will be needed throughout document
'''

# General libraries.
import re
import numpy as np
import matplotlib.pyplot as plt
import json
from pprint import pprint
import datetime
import pandas as pd

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB
from sklearn.grid_search import GridSearchCV

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# SK-learn libraries for feature extraction from text.
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer



In [25]:
class Data_handler():
    
    if __name__ == "__main__":
        dfTrain_full = pd.DataFrame()
        dfTest = pd.DataFrame()
        dfTrain = pd.DataFrame()
        dfDev = pd.DataFrame()
        trainColumnNames = []
        testColumnNames = []
    
    def initialize_data(self,train_json, test_json, prop_train):
        #Load in data as panda dataframes
        with open(train_json,'r') as fp: 
            json_data = json.load(fp)
        self.dfTrain_full = pd.io.json.json_normalize(json_data)

        with open(test_json,'r') as fp: 
            json_data = json.load(fp)
        self.dfTest = pd.io.json.json_normalize(json_data)

        # Set np seed
        np.random.seed(0)

        #Shuffle train data and split into train and dev
        self.dfTrain_full.reindex(np.random.permutation(self.dfTrain_full.index)) #shuffle
        nTrain_full = self.dfTrain_full.shape[0]
        self.dfTrain = self.dfTrain_full[:int(nTrain_full*prop_train)]
        self.dfDev = self.dfTrain_full[int(nTrain_full*prop_train):]
        
        #Save number of observations in train and dev
        nTrain = self.dfTrain.shape[0]
        nDev = self.dfDev.shape[0]

        #Save column names for reference
        self.trainColumnNames = self.dfTrain.columns.tolist()
        self.testColumnNames = self.dfTest.columns.tolist() #Note test features is only a subset!
        
        return nTrain_full, nTrain, nDev ################# WHY DOES THIS GET RETURNED? WHAT IS IT USED FOR?
    
    def getTrainFull(self):
        return self.dfTrain_full   
    def getTest(self):
        return self.dfTest   
    def getTrain(self):
        return self.dfTrain   
    def getDev(self):
        return self.dfDev    
    def getTrainColumnNames(self):
        return self.trainColumnNames   
    def getTestColumnNames(self):
        return self.testColumnNames
    
# Load in our data to master_data. Proportion in train vs dev is set here. Changing it will take effect throughout everything else in the script (the magic of classes!)
master_data = Data_handler()
master_data.initialize_data('train.json', 'test.json',.75)


(4040, 3030, 1010)

In [41]:
class FeatureEngineer():
    '''
    Module that contains some of our feature engineering methods. 
    Does not contain instance variables!
    '''
    def separateTimestamp(self,df):
        '''
        separates time stamp (UTC) into month, day, hour. If user's local time is of interest, 
        use the non UTC data
        input: any dataFrame containing the timestamp data
        '''

        timeStamps = df['unix_timestamp_of_request_utc'].values #numpy array of timestamps
        timeStampsSeparate = [] #init new

        # Loop over timestamps
        for ts in timeStamps:
            # Pull out relevant time info
            month = datetime.datetime.fromtimestamp(ts).strftime("%m")
            day_of_month = datetime.datetime.fromtimestamp(ts).strftime("%d")
            hour = datetime.datetime.fromtimestamp(ts).strftime("%H")
            # Append to results
            timeStampsSeparate.append([int(month),int(day_of_month),int(hour)])

        #convert from python list to ndarray
        return np.asarray(timeStampsSeparate)

fe = FeatureEngineer()

In [26]:
class Model():
    if __name__ == "__main__":
        train_data = pd.DataFrame
        test_data = pd.DataFrame
        train_labels = np.array
        prediction = np.array
    
    def init_test(self):
        self.train_data = master_data.getTrain()
        self.test_data = master_data.getDev()
        self.prediction = np.zeros((len(self.test_data.values))) 
        self.train_labels = self.train_data['requester_received_pizza'].values
    
    def init_final(self):
        self.train_data = master_data.getTrainFull()
        self.test_data = master_data.getTest()
        self.prediction = np.zeros((len(self.test_data.values)))
        self.train_labels = self.train_data['requester_received_pizza'].values
    
    def test(self,criteria=metrics.roc_auc_score):
        try:
            dev_labels = self.test_data['requester_received_pizza'].values
            return criteria(dev_labels,self.prediction)
        except:
            return "Failed! Did you initialize as test? What criteria did you use?"
    
    def finalize(self,fileName='submit_to_kaggle.csv'):
        try:
            '''
            Ensure the test data hasn't been shuffled or your labels won't match the request_id's.
            '''
            #extract request_id so we can match against predictions for submission to kaggle
            req = self.test_data['request_id']
            #make prediction into a pandas series
            print self.prediction.astype(int)
            pred_series = pd.Series(self.prediction.astype(int),name="requester_received_pizza")
            #now join into data frame
            out = pd.concat([req,pred_series], axis=1)
            #write data frame to csv (using kaggles sample submission csv for correct format)
            out.to_csv(fileName,index=False)
        except:
            return "Failed! Did you initialize as final?"

In [27]:
class Baseline(Model):
    '''
    Any model just needs to implicitly inherit the model class
    Test this with baseline.init_test() and baseline.test()
    Generate output with baseline.init_final() and baseline.finalize()
    
    This is the first model we submitted to Kaggle
    '''

    def run_model(self):
        train = self.train_data['request_text_edit_aware'].values
        test = self.test_data['request_text_edit_aware'].values
        v_train, v_test = self.vectorize(train,test)
        self.prediction = self.log_reg(v_train,v_test)

    def vectorize(self,train,test):
        # transform the train data
        vectorizer_train = CountVectorizer()
        v_train = vectorizer_train.fit_transform(train)
        vocab_train = vectorizer_train.get_feature_names()
        # transform the dev data using the same vocab
        v_test = vectorizer_train.transform(test)     # 'transform' function will preserve previous vocab
        return v_train, v_test

    def log_reg(self,v_train,v_test):
        lor = LogisticRegression()
        lor.fit(v_train, self.train_labels)
        lor_pred = lor.predict(v_test)
        # Return the prediction matrix, coefficients
        return lor_pred

In [56]:

class NumericModel(Model):
    '''
    Simple models with numeric data straight out of df
    '''
    def run_model(self):
        colNames = [master_data.getTestColumnNames()[i] for i in [5,7,8,9]] #hand picked to be plausible 
        #numeric data
        train_num = self.train_data[colNames].values
        test_num = self.test_data[colNames].values
        #convert time stamp into nice format
        train_time = fe.separateTimestamp(self.train_data)
        test_time = fe.separateTimestamp(self.test_data)
        #merge
        train = np.column_stack((train_num,train_time))
        test = np.column_stack((test_num,test_time))
        #predict
        self.prediction = self.classify(train,test)    
    
    def classify(self,train,test):
        nb = GaussianNB()
        print train
        nb.fit(train, self.train_labels)
        return nb.predict(test) 


In [29]:
'''
TUTORIAL: This cell shows how the dataframes above get accessed and turned into usable numpy arrays
'''

###Task: Extracting message text AND title text into a feature vector:
#first find name of column by printing out the list of names
trainColumnNames = master_data.getTrainColumnNames()
pprint(trainColumnNames) #looks like we want 'request_text_edit_aware' and 'request_title'
#find which number this is or manually type column name
print '\n'
print trainColumnNames[7]
print trainColumnNames[8]

#two ways to get data we want:
print '\n'
X_train = master_data.getTrain()[['request_text_edit_aware','request_title']] #method 1
X_train = master_data.getTrain()[[trainColumnNames[7],trainColumnNames[8]]] #method 2
print X_train.head() #.head() just prints the first 5 rows

#The above X_train is still a pandas dataframe. Converting to numpy array for sklearn is as simple as:
print '\n'
X_train = X_train.values
print type(X_train)
print X_train.shape

#In summary (quick way):
X_train = master_data.getTrain()[['request_text_edit_aware','request_title']].values 

###Task: Join 2 numpy arrays horizontally (e.g. merge train and dev for final submission)
train_data = master_data.getTrain()['request_text_edit_aware'].values
dev_data = master_data.getDev()['request_text_edit_aware'].values
merged_data = np.concatenate((train_data,dev_data),axis=0)
print '\n'
print train_data.shape,' ',dev_data.shape,' ',merged_data.shape

###Task: Join 2 numpy arrays vertically (e.g. add a bunch of features)
train_data1 = master_data.getTrain()['request_text_edit_aware'].values
#now we want more features... say from some feature engineering process
train_data2 = master_data.getTrain()['request_title'].values
train_data_merged = np.column_stack((train_data1,train_data2)) #<---- where the action is at!
print '\n'
print train_data1.shape,' ',train_data2.shape,' ',train_data_merged.shape

###Task: Use the Classes
'''
To use:
1) instantiate your model class
2) initialize it as either test or final
3) run it and either test or finalize it

'''

[u'giver_username_if_known',
 u'number_of_downvotes_of_request_at_retrieval',
 u'number_of_upvotes_of_request_at_retrieval',
 u'post_was_edited',
 u'request_id',
 u'request_number_of_comments_at_retrieval',
 u'request_text',
 u'request_text_edit_aware',
 u'request_title',
 u'requester_account_age_in_days_at_request',
 u'requester_account_age_in_days_at_retrieval',
 u'requester_days_since_first_post_on_raop_at_request',
 u'requester_days_since_first_post_on_raop_at_retrieval',
 u'requester_number_of_comments_at_request',
 u'requester_number_of_comments_at_retrieval',
 u'requester_number_of_comments_in_raop_at_request',
 u'requester_number_of_comments_in_raop_at_retrieval',
 u'requester_number_of_posts_at_request',
 u'requester_number_of_posts_at_retrieval',
 u'requester_number_of_posts_on_raop_at_request',
 u'requester_number_of_posts_on_raop_at_retrieval',
 u'requester_number_of_subreddits_at_request',
 u'requester_received_pizza',
 u'requester_subreddits_at_request',
 u'requester_upvo

'\nTo use:\n1) instantiate your model class\n2) initialize it as either test or final\n3) run it and either test or finalize it\n\n'

In [30]:
baseline_model = Baseline()

In [31]:
# Run test of baseline
baseline_model.init_test()
baseline_model.run_model()
print baseline_model.test()

0.55160457863


In [13]:
# Run final of baseline
baseline_model.init_final()
baseline_model.run_model()
print baseline_model.finalize('test_finalize.csv')

[0 0 0 ..., 0 1 0]
None


In [57]:
#Run NumericModel
numeric_model = NumericModel()
numeric_model.init_test()
numeric_model.run_model()
print numeric_model.test()

[[  0.          0.          0.        ...,  10.          6.          8.       ]
 [  0.          0.         15.        ...,   3.         25.         15.       ]
 [  0.          0.          0.        ...,  10.         27.          3.       ]
 ..., 
 [  3.8171412   1.          0.        ...,  10.          2.          8.       ]
 [  0.          0.          0.        ...,   6.         21.          1.       ]
 [  0.          0.          0.        ...,   7.         22.         15.       ]]
0.528009541451
