Notes:
    
14Apr15: *Changed Model's 'test' method to take the type of report as input. We should be using AUC most of the time (that is how the comp is scored) but might be useful to use classificaition report sometimes.
*Classes start with caps, functions/methods in camel case

TODO:
*Add getter to "model" for classification results
*Fix init (Mike to do) to make pythonic. Also have it accept params so our classes can be used for grid searching WITHOUT breaking existing classes

In [81]:
'''
This cell's function:
Import all libraries that will be needed throughout document
'''

# General libraries.
import re
import numpy as np
import matplotlib.pyplot as plt
import json
from pprint import pprint
import datetime
import pandas as pd

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB
from sklearn.grid_search import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# SK-learn libraries for feature extraction from text.
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer



In [25]:
class Data_handler():
    
    if __name__ == "__main__":
        dfTrain_full = pd.DataFrame()
        dfTest = pd.DataFrame()
        dfTrain = pd.DataFrame()
        dfDev = pd.DataFrame()
        trainColumnNames = []
        testColumnNames = []
    
    def initialize_data(self,train_json, test_json, prop_train):
        #Load in data as panda dataframes
        with open(train_json,'r') as fp: 
            json_data = json.load(fp)
        self.dfTrain_full = pd.io.json.json_normalize(json_data)

        with open(test_json,'r') as fp: 
            json_data = json.load(fp)
        self.dfTest = pd.io.json.json_normalize(json_data)

        # Set np seed
        np.random.seed(0)

        #Shuffle train data and split into train and dev
        self.dfTrain_full.reindex(np.random.permutation(self.dfTrain_full.index)) #shuffle
        nTrain_full = self.dfTrain_full.shape[0]
        self.dfTrain = self.dfTrain_full[:int(nTrain_full*prop_train)]
        self.dfDev = self.dfTrain_full[int(nTrain_full*prop_train):]
        
        #Save number of observations in train and dev
        nTrain = self.dfTrain.shape[0]
        nDev = self.dfDev.shape[0]

        #Save column names for reference
        self.trainColumnNames = self.dfTrain.columns.tolist()
        self.testColumnNames = self.dfTest.columns.tolist() #Note test features is only a subset!
        
        return nTrain_full, nTrain, nDev ################# WHY DOES THIS GET RETURNED? WHAT IS IT USED FOR?
    
    def getTrainFull(self):
        return self.dfTrain_full   
    def getTest(self):
        return self.dfTest   
    def getTrain(self):
        return self.dfTrain   
    def getDev(self):
        return self.dfDev    
    def getTrainColumnNames(self):
        return self.trainColumnNames   
    def getTestColumnNames(self):
        return self.testColumnNames
    
# Load in our data to master_data. Proportion in train vs dev is set here. Changing it will take effect throughout everything else in the script (the magic of classes!)
master_data = Data_handler()
master_data.initialize_data('train.json', 'test.json',.75)


(4040, 3030, 1010)

In [251]:
class FeatureEngineer():
    '''
    Module that contains some of our feature engineering methods. 
    Does not contain instance variables!
    Outputs np arrays or dataframes as needed
    '''
    def separateTimestamp(self,df):
        '''
        separates time stamp (UTC) into month, day, hour. If user's local time is of interest, 
        use the non UTC data
        input: any dataFrame containing the timestamp data
        '''

        timeStamps = df['unix_timestamp_of_request_utc'].values #numpy array of timestamps
        timeStampsSeparate = [] #init new

        # Loop over timestamps
        for ts in timeStamps:
            # Pull out relevant time info
            d = datetime.datetime.fromtimestamp(ts)
            month = d.strftime("%m")
            day_of_month = d.strftime("%d")
            hour = d.strftime("%H")
            # Append to results
            timeStampsSeparate.append([int(month),int(day_of_month),int(hour)])

        #convert from python list to ndarray
        return np.asarray(timeStampsSeparate)

    def newTimeInfo(self,df):
        '''
        New expressions of time e.g day of week
        '''
        
        timeStamps = df['unix_timestamp_of_request'].values #LOCAL time
        timeStampsSeparate = [] #init new

        # Loop over timestamps
        for ts in timeStamps:
            # Pull out relevant time info
            d = datetime.datetime.fromtimestamp(ts)
            day_of_week = d.isoweekday()
            local_time =d.strftime("%H")
            # Append to results
            timeStampsSeparate.append([int(day_of_week),int(local_time)])
        
        #convert from python list to ndarray
        return np.asarray(timeStampsSeparate)
        
    def selectedNumericFeatures(self,df):
        '''
        Some hand picked numeric features that seem to work well in logistic regression
        Dependency: separateTimestamp
        '''
        colNames = [master_data.getTestColumnNames()[i] for i in [5,7,8,9]] #hand picked to be plausible, not optimal 
        #numeric data
        dfNum = df[colNames].values
        #convert time stamp into nice format
        dfTime = self.separateTimestamp(df)
        #merge
        combinedData = np.column_stack((dfNum,dfTime))
        return combinedData
    
    def allNumericFeatures(self,df):
        '''
        All of the numeric features that come standard.
        Dependency: separateTimestamp
        '''
        colNames = [master_data.getTestColumnNames()[i] for i in [4,5,6,7,8,9,10,12,13]] 
        #numeric data
        dfNum = df[colNames].values
        #convert time stamp into nice format
        dfTime = self.separateTimestamp(df)
        #merge
        combinedData = np.column_stack((dfNum,dfTime))
        return combinedData
    
    def simpleNewFeatures(self,df):
        '''
        Basic self explanatory features.
        Input: whole df
        '''
        
        #From text
        title_length = [len(t) for t in df['request_title'].values]
        req_length = [len(t) for t in df['request_text_edit_aware'].values]
        avg_word_length = [float(len(t))/len(t.split(' ')) for t in df['request_text_edit_aware'].values] #complexity of lang
        title_caps_norm = [sum(1 for c in t if c.isupper())/(float(len(t))+1) for t in df['request_title'].values]
        req_caps_norm = [sum(1 for c in t if c.isupper())/(float(len(t))+1) for t in df['request_text_edit_aware'].values]
        #num_all_caps_words
        #num_repeated_words #perhaps normalize and/or remove shorter words
        
        ###special chars in text
        num_numbers_norm = [sum(1 for c in t if c.isdigit())/(float(len(t))+1) for t in df['request_text_edit_aware'].values]
        num_currency_chars_norm = [sum(1 for c in t if c in '$')/(float(len(t))+1) for t in df['request_text_edit_aware'].values]
        num_exclamation_norm = [sum(1 for c in t if c in '!')/(float(len(t))+1) for t in df['request_text_edit_aware'].values]
        num_commas_norm = [sum(1 for c in t if c in ',')/(float(len(t))+1) for t in df['request_text_edit_aware'].values]
        
        ###language
        #grammar_errors #normalized for request length
        #spelling_errors #normalized for request length
        
        return np.asarray([title_length,
                          req_length,
                          avg_word_length,
                          title_caps_norm,
                          req_caps_norm,
                          num_numbers_norm,
                          num_currency_chars_norm,
                          num_exclamation_norm,
                          num_commas_norm]).T
    
    def augNumericFeatures(self,df):
        f1 = self.simpleNewFeatures(df)
        f2 = self.allNumericFeatures(df)
        return np.column_stack((f1,f2))
    
    def customNGrams(self,df):
        '''
        N Grams built by intuition
        '''
        #pay_it_forward = some regex to find phrase "pay it forward"
        raise NotImplementedError
        
fe = FeatureEngineer()

In [246]:
fe.simpleNewFeatures(master_data.getTrain()).shape


(3030L, 9L)

In [269]:
class Model():
    if __name__ == "__main__":
        train_data = pd.DataFrame
        test_data = pd.DataFrame
        train_labels = np.array
        prediction = np.array
    
    def init_test(self):
        self.train_data = master_data.getTrain()
        self.test_data = master_data.getDev()
        self.prediction = np.zeros((len(self.test_data.values))) 
        self.train_labels = self.train_data['requester_received_pizza'].values
    
    def mod_for_ensemble(self):
        '''
        Replace test data with train again. 
        Use this after init_test or init_final if you want to predict on the same data you fit with.
        '''
        self.test_data = self.train_data
        self.prediction = np.zeros((len(self.test_data.values))) 
    
    def init_final(self):
        self.train_data = master_data.getTrainFull()
        self.test_data = master_data.getTest()
        self.prediction = np.zeros((len(self.test_data.values)))
        self.train_labels = self.train_data['requester_received_pizza'].values
    
    def test(self,criteria=metrics.roc_auc_score):
        try:
            predict_labels = self.test_data['requester_received_pizza'].values
            return criteria(predict_labels,self.prediction)
        except:
            return "Failed! Did you initialize as test? What criteria did you use?"
    
    def finalize(self,fileName='submit_to_kaggle.csv'):
        try:
            '''
            Ensure the test data hasn't been shuffled or your labels won't match the request_id's.
            '''
            #extract request_id so we can match against predictions for submission to kaggle
            req = self.test_data['request_id']
            #make prediction into a pandas series
            print self.prediction.astype(int)
            pred_series = pd.Series(self.prediction.astype(int),name="requester_received_pizza")
            #now join into data frame
            out = pd.concat([req,pred_series], axis=1)
            #write data frame to csv (using kaggles sample submission csv for correct format)
            out.to_csv(fileName,index=False)
        except:
            return "Failed! Did you initialize as final?"
    
    def getPrediction(self):
        return self.prediction

In [270]:
class Baseline(Model):
    '''
    Any model just needs to implicitly inherit the model class
    Test this with baseline.init_test() and baseline.test()
    Generate output with baseline.init_final() and baseline.finalize()
    
    This is the first model we submitted to Kaggle
    '''

    def run_model(self):
        train = self.train_data['request_text_edit_aware'].values
        test = self.test_data['request_text_edit_aware'].values
        v_train, v_test = self.vectorize(train,test)
        self.prediction = self.log_reg(v_train,v_test)

    def vectorize(self,train,test):
        # transform the train data
        vectorizer_train = CountVectorizer()
        v_train = vectorizer_train.fit_transform(train)
        vocab_train = vectorizer_train.get_feature_names()
        # transform the dev data using the same vocab
        v_test = vectorizer_train.transform(test)     # 'transform' function will preserve previous vocab
        return v_train, v_test

    def log_reg(self,v_train,v_test):
        lor = LogisticRegression()
        lor.fit(v_train, self.train_labels)
        lor_pred = lor.predict(v_test)
        # Return the prediction matrix, coefficients
        return lor_pred

In [281]:
class NumericModel(Model):
    '''
    Simple models with numeric data straight out of df
    '''
    def run_model(self):
        train = fe.augNumericFeatures(self.train_data)
        test = fe.augNumericFeatures(self.test_data)
        #predict
        self.prediction = self.classify(train,test)    
    
    def classify(self,train,test):
        nb = GaussianNB()
        nb.fit(train, self.train_labels)
        return nb.predict(test) 


In [260]:
class DecisionTreeModel(Model):
    '''
    Basic decision tree. Runs on numeric features
    '''
    def run_model(self):
        train = fe.allNumericFeatures(self.train_data)
        test = fe.allNumericFeatures(self.test_data)
        #predict
        self.prediction = self.classify(train,test)  
    
    def classify(self,train,test):
        dt = DecisionTreeClassifier(criterion='entropy') #criterion can be 'gini' or 'entropy'
        dt.fit(train, self.train_labels)
        return dt.predict(test)
        
        
        

In [278]:
class RandomForestModel(Model):
    '''
    Random Forest
    -Hypothesise that the decision function of people granting pizza is quite short, 
    maybe max of 5 steps (counting the request text as one step)
    
    '''
    def run_model(self):
        train = fe.augNumericFeatures(self.train_data)
        test = fe.augNumericFeatures(self.test_data)
        #predict
        self.prediction = self.classify(train,test)  
    
    def classify(self,train,test):
        rf = RandomForestClassifier(n_estimators=100,
                                    criterion='gini')#,
                                    #max_features=10) #criterion can be 'gini' or 'entropy'
        rf.fit(train, self.train_labels)
        return rf.predict(test)
    

In [279]:
class EnsembleForest(RandomForestModel):
    '''
    RF that incorporates text data via votes of another classifier
    BROKEN: Because ensemble1 init_test and doesn't ever init_final, you can't use this model's 
    init_final or finalize.
    '''
    def ensemble1(self):
        e = Baseline()
        e.init_test()
        e.run_model()
        #get dev data prediction
        devPrediction = e.getPrediction()
        #now reset internal data so we can get test data prediction
        e.mod_for_ensemble()
        e.run_model()
        testPrediction = e.getPrediction()
        return testPrediction, devPrediction
    
    def init_final(self):
        raise NotImplementedError('Low level ensemble is hard coded to init_test at the moment')
    
    def run_model(self):
        train1 = fe.augNumericFeatures(self.train_data)
        test1 = fe.augNumericFeatures(self.test_data)
        train2,test2 = self.ensemble1()
        train = np.column_stack((train1,train2))
        test = np.column_stack((test1,test2))
        print train.shape
        print train1.shape
        print train2.shape
        print test.shape
        print test1.shape
        print test2.shape
        #predict
        self.prediction = self.classify(train,test)      

In [284]:
class EnsembleNB(NumericModel):
    '''
    NB that incorporates text data via votes of another classifier
    BROKEN: Because ensemble1 init_test and doesn't ever init_final, you can't use this model's 
    init_final or finalize.
    '''
    def ensemble1(self):
        e = Baseline()
        e.init_test()
        e.run_model()
        #get dev data prediction
        devPrediction = e.getPrediction()
        #now reset internal data so we can get test data prediction
        e.mod_for_ensemble()
        e.run_model()
        testPrediction = e.getPrediction()
        return testPrediction, devPrediction
    
    def init_final(self):
        raise NotImplementedError('Low level ensemble is hard coded to init_test at the moment')
    
    def run_model(self):
        train1 = fe.augNumericFeatures(self.train_data)
        test1 = fe.augNumericFeatures(self.test_data)
        train2,test2 = self.ensemble1()
        train = np.column_stack((train1,train2))
        test = np.column_stack((test1,test2))
        print train.shape
        print train1.shape
        print train2.shape
        print test.shape
        print test1.shape
        print test2.shape
        #predict
        self.prediction = self.classify(train,test)  

In [60]:
class StackedRegression():
    '''
    Stacking with logistic regression.
    Makes a linear combination of outputs of other models
    http://link.springer.com/article/10.1007%2FBF00117832
    '''
    raise NotImplementedError("Need a 28 hour day")

NotImplementedError: Need a 28 hour day

In [29]:
'''
TUTORIAL: This cell shows how the dataframes above get accessed and turned into usable numpy arrays
'''

###Task: Extracting message text AND title text into a feature vector:
#first find name of column by printing out the list of names
trainColumnNames = master_data.getTrainColumnNames()
pprint(trainColumnNames) #looks like we want 'request_text_edit_aware' and 'request_title'
#find which number this is or manually type column name
print '\n'
print trainColumnNames[7]
print trainColumnNames[8]

#two ways to get data we want:
print '\n'
X_train = master_data.getTrain()[['request_text_edit_aware','request_title']] #method 1
X_train = master_data.getTrain()[[trainColumnNames[7],trainColumnNames[8]]] #method 2
print X_train.head() #.head() just prints the first 5 rows

#The above X_train is still a pandas dataframe. Converting to numpy array for sklearn is as simple as:
print '\n'
X_train = X_train.values
print type(X_train)
print X_train.shape

#In summary (quick way):
X_train = master_data.getTrain()[['request_text_edit_aware','request_title']].values 

###Task: Join 2 numpy arrays horizontally (e.g. merge train and dev for final submission)
train_data = master_data.getTrain()['request_text_edit_aware'].values
dev_data = master_data.getDev()['request_text_edit_aware'].values
merged_data = np.concatenate((train_data,dev_data),axis=0)
print '\n'
print train_data.shape,' ',dev_data.shape,' ',merged_data.shape

###Task: Join 2 numpy arrays vertically (e.g. add a bunch of features)
train_data1 = master_data.getTrain()['request_text_edit_aware'].values
#now we want more features... say from some feature engineering process
train_data2 = master_data.getTrain()['request_title'].values
train_data_merged = np.column_stack((train_data1,train_data2)) #<---- where the action is at!
print '\n'
print train_data1.shape,' ',train_data2.shape,' ',train_data_merged.shape

###Task: Use the Classes
'''
To use:
1) instantiate your model class
2) initialize it as either test or final
3) run it and either test or finalize it

'''

[u'giver_username_if_known',
 u'number_of_downvotes_of_request_at_retrieval',
 u'number_of_upvotes_of_request_at_retrieval',
 u'post_was_edited',
 u'request_id',
 u'request_number_of_comments_at_retrieval',
 u'request_text',
 u'request_text_edit_aware',
 u'request_title',
 u'requester_account_age_in_days_at_request',
 u'requester_account_age_in_days_at_retrieval',
 u'requester_days_since_first_post_on_raop_at_request',
 u'requester_days_since_first_post_on_raop_at_retrieval',
 u'requester_number_of_comments_at_request',
 u'requester_number_of_comments_at_retrieval',
 u'requester_number_of_comments_in_raop_at_request',
 u'requester_number_of_comments_in_raop_at_retrieval',
 u'requester_number_of_posts_at_request',
 u'requester_number_of_posts_at_retrieval',
 u'requester_number_of_posts_on_raop_at_request',
 u'requester_number_of_posts_on_raop_at_retrieval',
 u'requester_number_of_subreddits_at_request',
 u'requester_received_pizza',
 u'requester_subreddits_at_request',
 u'requester_upvo

'\nTo use:\n1) instantiate your model class\n2) initialize it as either test or final\n3) run it and either test or finalize it\n\n'

In [262]:
baseline_model = Baseline()

In [263]:
# Run test of baseline
baseline_model.init_test()
baseline_model.run_model()
print baseline_model.test()

0.55160457863


In [13]:
# Run final of baseline
baseline_model.init_final()
baseline_model.run_model()
print baseline_model.finalize('test_finalize.csv')

[0 0 0 ..., 0 1 0]
None


In [264]:
#Run NumericModel
numeric_model = NumericModel()
numeric_model.init_test()
numeric_model.run_model()
print numeric_model.test()

0.528009541451


In [267]:
#Run test of decision tree
decision_tree_model = DecisionTreeModel()
decision_tree_model.init_test()
decision_tree_model.run_model()
print decision_tree_model.test()

0.528431729546


In [268]:
#Run test of random forest
rf_model = RandomForestModel()
rf_model.init_test()
rf_model.run_model()
print rf_model.test()


0.516212022861


In [283]:
#Run test of EnsembleForest
ef_model = EnsembleForest()
ef_model.init_test()
ef_model.run_model()
print ef_model.test()

####### WHY THE HECK IS THIS PERFORMANCE EXACTLY THE SAME AS BASELINE MODEL. 
####### THis should totally be incorporating numerical features as well...

(3030L, 22L)
(3030L, 21L)
(3030L,)
(1010L, 22L)
(1010L, 21L)
(1010L,)
0.55160457863


In [285]:
#Run test of EnsembleNB
enb_model = EnsembleForest()
enb_model.init_test()
enb_model.run_model()
print enb_model.test()


(3030L, 22L)
(3030L, 21L)
(3030L,)
(1010L, 22L)
(1010L, 21L)
(1010L,)
0.550947548407


In [103]:
"""
EXPERIMENTS Cell
Cell for all experiments that don't fit neatly elsewhere.
"""

def timeVisualizer():
    '''
    Visualize time related trends. Does RAOP get more or less generous over time? 
    Certain times of day? Certain days of week?
    '''
    raise NotImplementedError
    
def colNamePrint():
    '''
    Just print out TEST column names w indices for ref.
    '''
    c = master_data.getTestColumnNames()
    for i,n in enumerate(c):
        print i, '_',n
    #END

    

0 _ giver_username_if_known
1 _ request_id
2 _ request_text_edit_aware
3 _ request_title
4 _ requester_account_age_in_days_at_request
5 _ requester_days_since_first_post_on_raop_at_request
6 _ requester_number_of_comments_at_request
7 _ requester_number_of_comments_in_raop_at_request
8 _ requester_number_of_posts_at_request
9 _ requester_number_of_posts_on_raop_at_request
10 _ requester_number_of_subreddits_at_request
11 _ requester_subreddits_at_request
12 _ requester_upvotes_minus_downvotes_at_request
13 _ requester_upvotes_plus_downvotes_at_request
14 _ requester_username
15 _ unix_timestamp_of_request
16 _ unix_timestamp_of_request_utc
