Notes:
*using transform and not fit_transform on dev data allows you to not have to make a whole new vectorizer with supplied vocab
*What's the idea behind makign train_data 3x the size of dev_data? I know train is usually bigger but just wondering if you have a specific idea   

In [429]:
'''
This cell's function:
Import all libraries that will be needed throughout document
'''

# General libraries.
import re
import numpy as np
import matplotlib.pyplot as plt
import json
from pprint import pprint
import datetime
import pandas as pd

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB
from sklearn.grid_search import GridSearchCV

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# SK-learn libraries for feature extraction from text.
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer



In [219]:
'''
This cell's function:
Load data into working space.
The data created here should never be edited - if you want to modify (e.g. trim features) make a new copy in later cells
'''

#file names of data (must be in same folder as this notebook)
train_dataset = 'train.json'
test_dataset = 'test.json'

#Load in data as panda dataframes
with open('train.json','r') as fp: 
    json_data = json.load(fp)
dfTrainRaw = pd.io.json.json_normalize(json_data)

with open('test.json','r') as fp: 
    json_data = json.load(fp)
dfTest = pd.io.json.json_normalize(json_data)

# Set np seed
np.random.seed(0)

#Shuffle train data and split into train and dev
dfTrainRaw.reindex(np.random.permutation(dfTrainRaw.index)) #shuffle
nTrain = dfTrainRaw.shape[0]
prop_train = 0.75 # proportion of train set to be used (remaining is dev set)
dfTrain = dfTrainRaw[:int(nTrain*prop_train)]
dfDev = dfTrainRaw[int(nTrain*prop_train):]

#Save column names for reference
trainColumnNames = dfTrain.columns.tolist()
testColumnNames = dfTest.columns.tolist() #Note test features is only a subset!

#Observe data
#pprint(trainColumnNames)
#dfTrain.describe() #summary statistics of numeric features

In [323]:
'''
TUTORIAL: This cell shows how the dataframes above get accessed and turned into usable numpy arrays
'''

###Task: Extracting message text AND title text into a feature vector:
#first find name of column by printing out the list of names
pprint(trainColumnNames) #looks like we want 'request_text_edit_aware' and 'request_title'
#find which number this is or manually type column name
print '\n'
print trainColumnNames[7]
print trainColumnNames[8]

#two ways to get data we want:
print '\n'
X_train = dfTrain[['request_text_edit_aware','request_title']] #method 1
X_train = dfTrain[[trainColumnNames[7],trainColumnNames[8]]]
print X_train.head() #.head() just prints the first 5 rows

#The above X_train is still a pandas dataframe. Converting to numpy array for sklearn is as simple as:
print '\n'
X_train = X_train.values
print type(X_train)
print X_train.shape

#In summary (quick way):
X_train = dfTrain[['request_text_edit_aware','request_title']].values 

###Task: Join 2 numpy arrays horizontally (e.g. merge train and dev for final submission)
train_data = dfTrain['request_text_edit_aware'].values
dev_data = dfDev['request_text_edit_aware'].values
merged_data = np.concatenate((train_data,dev_data),axis=0)
print '\n'
print train_data.shape,' ',dev_data.shape,' ',merged_data.shape

###Task: Join 2 numpy arrays vertically (e.g. add a bunch of features)
train_data1 = dfTrain['request_text_edit_aware'].values
#now we want more features... say from some feature engineering process
train_data2 = dfTrain['request_title'].values
train_data_merged = np.column_stack((train_data1,train_data2)) #<---- where the action is at!
print '\n'
print train_data1.shape,' ',train_data2.shape,' ',train_data_merged.shape



[u'giver_username_if_known',
 u'number_of_downvotes_of_request_at_retrieval',
 u'number_of_upvotes_of_request_at_retrieval',
 u'post_was_edited',
 u'request_id',
 u'request_number_of_comments_at_retrieval',
 u'request_text',
 u'request_text_edit_aware',
 u'request_title',
 u'requester_account_age_in_days_at_request',
 u'requester_account_age_in_days_at_retrieval',
 u'requester_days_since_first_post_on_raop_at_request',
 u'requester_days_since_first_post_on_raop_at_retrieval',
 u'requester_number_of_comments_at_request',
 u'requester_number_of_comments_at_retrieval',
 u'requester_number_of_comments_in_raop_at_request',
 u'requester_number_of_comments_in_raop_at_retrieval',
 u'requester_number_of_posts_at_request',
 u'requester_number_of_posts_at_retrieval',
 u'requester_number_of_posts_on_raop_at_request',
 u'requester_number_of_posts_on_raop_at_retrieval',
 u'requester_number_of_subreddits_at_request',
 u'requester_received_pizza',
 u'requester_subreddits_at_request',
 u'requester_upvo

In [281]:
'''
This cell is for editing the format of the data for specific classifiers or experiments. 
Always create a new copy rather than change dfTrain, dfDev or dfTest.
'''

def separateTimestamp(df):
    '''
    separates time stamp (UTC) into month, day, hour. If user's local time is of interest, use the non UTC data
    '''

    timeStamps = df['unix_timestamp_of_request_utc'].values #numpy array of timestamps
    timeStampsSeparate = [] #init new
    
    # Loop over timestamps
    for ts in timeStamps:
        # Pull out relevant time info
        month = datetime.datetime.fromtimestamp(ts).strftime("%m")
        day_of_month = datetime.datetime.fromtimestamp(ts).strftime("%d")
        hour = datetime.datetime.fromtimestamp(ts).strftime("%H")
        # Append to results
        timeStampsSeparate.append([int(month),int(day_of_month),int(hour)])
    
    #convert from python list to ndarray
    return np.asarray(timeStampsSeparate)
        
X_train = separateTimestamp(dfTrain)
print X_train.shape

[ 8 15  3 ...,  8  1 15]


In [463]:
'''
This cell has quick access functions for many scipy vectorizers and classifiers
'''

def vectorize(train_data,dev_data):
    # transform the train data
    vectorizer_train = CountVectorizer()
    #vectorizer_train = TfidfVectorizer()
    v_data_train = vectorizer_train.fit_transform(train_data)
    vocab_train = vectorizer_train.get_feature_names()
    # transform the dev data using the same vocab
    v_data_dev = vectorizer_train.transform(dev_data)     # 'transform' function will preserve previous vocab
    return v_data_train, v_data_dev, vocab_train

def vectorize_bigram(train_data,dev_data):
    # transform the train data
    vectorizer_train = CountVectorizer(ngram_range=(2,2))
    v_data_train = vectorizer_train.fit_transform(train_data)
    vocab_train = vectorizer_train.get_feature_names()
    # transform the dev data using the same vocab
    vectorizer_dev = CountVectorizer(ngram_range=(2,2),vocabulary=vocab_train)
    v_data_dev = vectorizer_dev.fit_transform(dev_data)
    return v_data_train, v_data_dev, vocab_train

def vectorizer_attrs(v_data):
    '''
    Get attributes using nnz and shape
    '''
    nonzero = v_data.nnz
    examples = v_data.shape[0]
    distinct_words = v_data.shape[1]
    avg_nonzero = float(nonzero)/examples
    total_entries = examples*distinct_words
    pct_nz_entries = float(nonzero)/total_entries*100
    return ("Vocabulary size: " + str(distinct_words) + 
            "\nAverage non-zero features per example: " + 
            str(round(avg_nonzero,1)) + "\nFraction of non-zero entries in the matrix is " + 
            str(nonzero) + "/" + str(total_entries) + " (" + str(round(pct_nz_entries,2)) + "%)")

def log_reg(train_data,train_label,dev_data):
    lor = LogisticRegression()
    lor.fit(train_data, train_label)
    lor_pred = lor.predict(dev_data)
    lor_pred_pr = lor.predict_proba(dev_data)
    allcoefs = lor.coef_.copy()
    # Return the prediction matrix, coefficients
    return lor_pred, lor_pred_pr, allcoefs

def get_topn(top_n,lorcoefs,vocab):
    allcoefs = lorcoefs.copy()
    lbls=allcoefs.shape[0]
    index=[]
    words=[]
    for num in range(top_n):
        mxindex = allcoefs.argmax(axis=1)
        for lbl in range(lbls):
            allcoefs[lbl][mxindex[lbl]] = 0
            index.append(mxindex[lbl])
            words.append(vocab[mxindex[lbl]])
    # With our new index of the top n words in each label, get the coefficient matrix of these words
    coefs=np.zeros((len(index),lbls))
    for lbl in range(lbls):
        for element in range(len(index)):
            coefs[element][lbl] = lorcoefs[lbl][index[element]]
    return words, coefs

def get_vectorized_logreg(train,train_labels,test):
    train_vdata, dev_vdata, vocab = vectorize(train,test)
    prediction, predict_pr, allcoefs = log_reg(train_vdata,train_labels,dev_vdata) #Where did train_target come from? I had to change funciton input to run on full data set
    words, coefs = get_topn(10,allcoefs, vocab)
    print vectorizer_attrs(train_vdata)
    
    return prediction #NOTE! I changed the return value to only return prediction. Sorry if this broke something!

In [497]:
'''
Benchmark Model cell
This is the first model we submitted to Kaggle
'''
def benchmarkModel():
    #Need msg text only along with labels
    train_data = dfTrain['request_text_edit_aware'].values
    train_labels = dfTrain['requester_received_pizza'].values
    dev_data = dfDev['request_text_edit_aware'].values
    dev_labels = dfDev['requester_received_pizza'].values
    test_data = dfTest['request_text_edit_aware'].values

    #Make prediction of dev data using msg text only. 
    dev_labels_pred = get_vectorized_logreg(train_data, train_labels, dev_data)

    #Make baseline model prediction that simply predicts the most common class (no pizza) at all times
    baseline = [0]*len(msg_pred)

    #Compare models
    print "\nBaseline model (always predicts no pizza)"
    print metrics.classification_report(dev_labels,baseline)
    print "ROC AUC: ", metrics.roc_auc_score(dev_labels,baseline)
    
    print "\nSimple Logistic Regression model w/ Count Vectorizer, no regularization"
    print metrics.classification_report(dev_labels,dev_labels_pred)
    print "ROC AUC: ", metrics.roc_auc_score(dev_labels,dev_labels_pred)

    #Now make predictions on full dataset
    test_labels_pred = get_vectorized_logreg(np.concatenate((train_data,dev_data),axis=0),
                                             np.concatenate((train_labels,dev_labels),axis=0),
                                             test_data)
    
    return (dev_labels_pred,test_labels_pred)

dev_labels_pred,test_labels_pred = benchmarkModel() 

Vocabulary size: 10709
Average non-zero features per example: 53.9
Fraction of non-zero entries in the matrix is 163228/32448270 (0.5%)

Baseline model (always predicts no pizza)
             precision    recall  f1-score   support

      False       0.75      1.00      0.86       761
       True       0.00      0.00      0.00       249

avg / total       0.57      0.75      0.65      1010

ROC AUC:  0.5

Simple Logistic Regression model w/ Count Vectorizer, no regularization
             precision    recall  f1-score   support

      False       0.78      0.85      0.81       761
       True       0.36      0.25      0.30       249

avg / total       0.67      0.70      0.68      1010

ROC AUC:  0.55160457863
Vocabulary size: 12317
Average non-zero features per example: 53.6
Fraction of non-zero entries in the matrix is 216394/49760680 (0.43%)


In [None]:
Simple Logistic Regression model w/ Count Vectorizer, no regularization
             precision    recall  f1-score   support

      False       0.77      0.86      0.81       761
       True       0.34      0.22      0.27       249

avg / total       0.67      0.70      0.68      1010



\Logistic Regression model of most of the numeric data, no regularization
             precision    recall  f1-score   support

      False       0.76      0.99      0.86       761
       True       0.52      0.04      0.08       249

avg / total       0.70      0.75      0.67      1010



In [502]:
'''
Simple models with numeric data
'''
def simpleNumericModel():
    #numeric data straight out of df
    #colNames = [testColumnNames[i] for i in [4,5,6,7,8,9,10]] #hand picked to be plausible 
    colNames = [testColumnNames[i] for i in [4,7,9]] #hand picked to be plausible 

    print colNames
    
    X_train_numeric = dfTrain[colNames].values
    X_dev_numeric = dfDev[colNames].values
    X_test_numeric = dfTest[colNames].values
    #split time stamp
    X_train_time = separateTimestamp(dfTrain)
    X_dev_time = separateTimestamp(dfDev)
    X_test_time = separateTimestamp(dfTest)
    #merge
    train_data = np.column_stack((X_train_numeric,X_train_time))
    dev_data = np.column_stack((X_dev_numeric,X_dev_time))
    test_data = np.column_stack((X_test_numeric,X_test_time))
    #labels
    train_labels = dfTrain['requester_received_pizza'].values
    dev_labels = dfDev['requester_received_pizza'].values

    #Simple model
    dev_labels_pred = log_reg(train_data, train_labels, dev_data)[0]

    #Results
    print "\Logistic Regression model of most of the numeric data, no regularization"
    print metrics.classification_report(dev_labels,dev_labels_pred)
    print "ROC AUC: ", metrics.roc_auc_score(dev_labels,dev_labels_pred)
    
    #Now make predictions on full dataset
    test_labels_pred = log_reg(np.concatenate((train_data,dev_data),axis=0), 
                                             np.concatenate((train_labels,dev_labels),axis=0),
                                             test_data)[0]
    return (dev_labels_pred,test_labels_pred)

simpleNumericModel()


[u'requester_account_age_in_days_at_request', u'requester_number_of_comments_in_raop_at_request', u'requester_number_of_posts_on_raop_at_request']
\Logistic Regression model of most of the numeric data, no regularization
             precision    recall  f1-score   support

      False       0.76      0.99      0.86       761
       True       0.65      0.04      0.08       249

avg / total       0.73      0.76      0.67      1010

ROC AUC:  0.518146172073


(array([False, False, False, ..., False, False, False], dtype=bool),
 array([False, False, False, ..., False, False, False], dtype=bool))

In [577]:
'''
Ensemble Cell
Creates a new Log Reg, combining outputs of other models
'''

def textModel(p='l2',c=1):
    '''
    Simple-ish text model with some regularization
    '''
    ###TRAINING
    train_data = dfTrain['request_text_edit_aware'].values
    train_labels = dfTrain['requester_received_pizza'].values
    dev_data = dfDev['request_text_edit_aware'].values
    dev_labels = dfDev['requester_received_pizza'].values
    test_data = dfTest['request_text_edit_aware'].values
    
    # transform the data
    cv = CountVectorizer()
    #cv = TfidfVectorizer()
    train_data_v = cv.fit_transform(train_data)
    # transform the dev data using the same vocab
    dev_data_v = cv.transform(dev_data)
    #fit classifier
    lr = LogisticRegression(penalty=p,C=c)
    lr.fit(train_data_v, train_labels)

    #predict labels and probabilities
    train_labels_pred = lr.predict(train_data_v)
    train_labels_pred_pr = lr.predict_proba(train_data_v)
    dev_labels_pred = lr.predict(dev_data_v)
    dev_labels_pred_pr = lr.predict_proba(dev_data_v)

    #Compare models
    print "\nText model w/ Count Vectorizer, LR w/ L1 regularization"
    print metrics.classification_report(dev_labels,dev_labels_pred)
    print "ROC AUC: ", metrics.roc_auc_score(dev_labels,dev_labels_pred)
    
    ###SUBMISSION (USE ALL DATA)
    all_data = np.concatenate((train_data,dev_data),axis=0)
    all_labels = np.concatenate((train_labels,dev_labels),axis=0)
    # transform the data
    all_data_v = cv.fit_transform(all_data)
    # transform the test data using the same vocab
    test_data_v = cv.transform(test_data)     # 'transform' function will preserve previous vocab
    #fit classifier
    lr.fit(all_data_v, all_labels)

    #predict labels and probabilities of alldata (for training ensemble model for test prediction
    all_labels_pred = lr.predict(all_data_v)
    all_labels_pred_pr = lr.predict_proba(all_data_v)
    test_labels_pred = lr.predict(test_data_v)
    test_labels_pred_pr = lr.predict_proba(test_data_v)    
    
    #return all this for ensemble processing (just predicted labels at this stage, not probabilities)
    return (train_labels_pred,dev_labels_pred,all_labels_pred,test_labels_pred)

def numericModel(p='l2',c=0.5):
    #numeric data straight out of df
    colNames = [testColumnNames[i] for i in [5,7,8,9]] #hand picked to be plausible 
    X_train_numeric = dfTrain[colNames].values
    X_dev_numeric = dfDev[colNames].values
    X_test_numeric = dfTest[colNames].values
    #split time stamp
    X_train_time = separateTimestamp(dfTrain)
    X_dev_time = separateTimestamp(dfDev)
    X_test_time = separateTimestamp(dfTest)
    #merge
    train_data = np.column_stack((X_train_numeric,X_train_time))
    dev_data = np.column_stack((X_dev_numeric,X_dev_time))
    test_data = np.column_stack((X_test_numeric,X_test_time))
    #labels
    train_labels = dfTrain['requester_received_pizza'].values
    dev_labels = dfDev['requester_received_pizza'].values

    ###TRAINING
    #fit classifier
    #lr = LogisticRegression(penalty=p,C=c)
    lr = GaussianNB()
    lr.fit(train_data, train_labels)

    #predict labels and probabilities
    train_labels_pred = lr.predict(train_data)
    train_labels_pred_pr = lr.predict_proba(train_data)
    dev_labels_pred = lr.predict(dev_data)
    dev_labels_pred_pr = lr.predict_proba(dev_data)

    #Compare models
    print "\Logistic Regression model of most of the numeric data, LR w/ L1 regularization"
    print metrics.classification_report(dev_labels,dev_labels_pred)
    print "ROC AUC: ", metrics.roc_auc_score(dev_labels,dev_labels_pred)
    
    ###SUBMISSION (USE ALL DATA)
    all_data = np.concatenate((train_data,dev_data),axis=0)
    all_labels = np.concatenate((train_labels,dev_labels),axis=0)
    #fit classifier
    lr.fit(all_data, all_labels)

    #predict labels and probabilities of alldata (for training ensemble model for test prediction
    all_labels_pred = lr.predict(all_data)
    all_labels_pred_pr = lr.predict_proba(all_data)
    test_labels_pred = lr.predict(test_data)
    test_labels_pred_pr = lr.predict_proba(test_data)    
    
    #return all this for ensemble processing (just predicted labels at this stage, not probabilities)
    return (train_labels_pred,dev_labels_pred,all_labels_pred,test_labels_pred)

def simpleEnsembleModel(models):
    train_labels_pred_list = []
    dev_labels_pred_list = []
    all_labels_pred_list = []
    test_labels_pred_list = [] #don't think this is needed
    
    #run all models provided and add to list of features
    for model in models:
        a,b,c,d = model()
        train_labels_pred_list.append(a)
        dev_labels_pred_list.append(b)
        all_labels_pred_list.append(c)
        test_labels_pred_list.append(d)
    #concatenate features into numpy arrays
    train_data_layer2 = np.column_stack(tuple(train_labels_pred_list))
    dev_data_layer2 = np.column_stack(tuple(dev_labels_pred_list))
    all_data_layer2 = np.column_stack(tuple(all_labels_pred_list))
    test_data_layer2 = np.column_stack(tuple(test_labels_pred_list))
    
    ###TRAINING
    #fit classifier
    #lr = BernoulliNB(alpha=0.1)
    lr = LogisticRegression()
    lr.fit(train_data_layer2, train_labels)

    #predict labels and probabilities
    dev_labels_pred_layer2 = lr.predict(dev_data_layer2)
    dev_labels_pred_layer2_pr = lr.predict_proba(dev_data_layer2)

    #Compare models
    print "\Ensemble model!"
    print metrics.classification_report(dev_labels,dev_labels_pred_layer2)
    print "ROC AUC: ", metrics.roc_auc_score(dev_labels,dev_labels_pred_layer2)
    
    ###SUBMISSION (USE ALL DATA)
    all_labels = np.concatenate((train_labels,dev_labels),axis=0)
    #fit classifier
    lr = LogisticRegression()
    lr.fit(all_data_layer2, all_labels)

    #predict test set labels
    test_labels_pred = lr.predict(test_data_layer2)
    test_labels_pred_pr = lr.predict_proba(test_data_layer2)        
    
    return test_labels_pred
    
test_labels_pred_ensemble = simpleEnsembleModel([textModel,numericModel])


Text model w/ Count Vectorizer, LR w/ L1 regularization
             precision    recall  f1-score   support

      False       0.78      0.85      0.81       761
       True       0.36      0.25      0.30       249

avg / total       0.67      0.70      0.68      1010

ROC AUC:  0.55160457863
\Logistic Regression model of most of the numeric data, LR w/ L1 regularization
             precision    recall  f1-score   support

      False       0.76      0.94      0.84       761
       True       0.39      0.12      0.18       249

avg / total       0.67      0.74      0.68      1010

ROC AUC:  0.528009541451
\Ensemble model!
             precision    recall  f1-score   support

      False       0.78      0.85      0.81       761
       True       0.36      0.25      0.30       249

avg / total       0.67      0.70      0.68      1010

ROC AUC:  0.55160457863



Text model w/ Count Vectorizer, LR w/ L1 regularization
             precision    recall  f1-score   support

      False       0.77      0.94      0.84       761
       True       0.42      0.14      0.21       249

avg / total       0.68      0.74      0.69      1010

\Logistic Regression model of most of the numeric data, LR w/ L1 regularization
             precision    recall  f1-score   support

      False       0.76      0.99      0.86       761
       True       0.52      0.04      0.08       249

avg / total       0.70      0.75      0.67      1010

\Ensemble model!
             precision    recall  f1-score   support

      False       0.77      0.92      0.84       761
       True       0.43      0.18      0.25       249

avg / total       0.69      0.74      0.70      1010

In [570]:
'''
This cell outputs test data to format for submission to Kaggle. 
Feed it the predicited test labels data to write in below!
'''

def writeSubmission(testLabelsPredict,fileName='submit_to_kaggle.csv'):
    '''
    Uses dfTest dataframe, so ensure the test data hasn't been shuffled or your labels won't match the request_id's.
    '''
    #extract request_id so we can match against predictions for submission to kaggle
    req = dfTest['request_id']

    #make prediction in previous cell into a pandas series
    test_pred_series = pd.Series(testLabelsPredict.astype(int),name="requester_received_pizza")

    #now join into data frame
    out = pd.concat([req,test_pred_series], axis=1)

    #write data frame to csv (using kaggles sample submission csv for correct format)
    out.to_csv(fileName,index=False)

writeSubmission(test_labels_pred_ensemble)

In [134]:
title_prob = get_vectorized_logreg(train_titletxt, dev_titletxt)

Vocabulary size: 3832
Average non-zero features per example: 11.3
Fraction of non-zero entries in the matrix is 34173/11610960 (0.29%)


1010


1010


[0.0, 0.0, 0, 0, 0, 0, 0, 0, 0]
['10', '05', '18']


In [171]:
feature_prediction, feature_predict_pr, feature_allcoefs = log_reg(train_features,train_target,dev_features)
ts_prediction, ts_predict_pr, ts_allcoefs = log_reg(train_ts,train_target,dev_ts)
print feature_predict_pr
print ts_predict_pr

[[ 0.79120797  0.20879203]
 [ 0.76458161  0.23541839]
 [ 0.79120797  0.20879203]
 ..., 
 [ 0.77608364  0.22391636]
 [ 0.78600322  0.21399678]
 [ 0.78588184  0.21411816]]
[[ 0.75388208  0.24611792]
 [ 0.78420871  0.21579129]
 [ 0.76309099  0.23690901]
 ..., 
 [ 0.75574299  0.24425701]
 [ 0.75759436  0.24240564]
 [ 0.75943616  0.24056384]]


In [221]:
prediction_array=[]
predict_score=[]
score_val=0.1
prediction=[]
for i in range(len(dev_features)):
    proba=[]
    proba.append(msg_prob[i])
    proba.append(title_prob[i])
    proba.append(feature_predict_pr[i,1])
    proba.append(ts_predict_pr[i,1])
    prediction_array.append(proba)
    predict_score.append(sum(proba))
    if sum(proba)>=score_val:
        prediction.append(1)
    else:
        prediction.append(0)

In [222]:
print "F1 score of final model is " + str(round(metrics.f1_score(dev_target,prediction),3))

F1 score of final model is 0.413
