In [8]:
import pandas as pd
from pandas import DataFrame as df
import seaborn as sns
import sys
import numpy as np
import time

%matplotlib inline
from matplotlib import pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss
from sklearn.externals import joblib
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, RobustScaler
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Apply seaborn's 'ticks' style to the plots drawn in this notebook.
sns.set_style('ticks')

In [44]:
def intersec(d1, d2):
    """
    Return the elements that occur in both `d1` and `d2` as a list.

    Both inputs are turned into sets first, so duplicates are dropped
    and the order of the returned list is unspecified.
    """
    common = set(d1) & set(d2)
    return list(common)

In [40]:
def union(d1,d2):
    """
    Return every distinct element of `d1` and `d2` as a list.

    Both inputs are turned into sets first, so duplicates are dropped
    and the order of the returned list is unspecified.
    """
    combined = set(d1) | set(d2)
    return list(combined)

In [47]:
def consistent(d1,d2,train,test):
    """
    Debug helper: print how the `device_id`s of two datasets overlap
    with each other and with reference train/test id collections.

    Arguments:
    d1, d2 - DataFrame-like objects exposing `device_id` and `isTrain`
             columns (`isTrain==1` marks train rows, `isTrain==0` test rows).
    train  - iterable of ids compared against the train ids found in d1/d2.
    test   - iterable of ids compared against the test ids found in d1/d2.

    Nothing is returned; all results are printed.
    """
    # Unique ids per dataset, computed once and reused by the prints below.
    ids_d1 = d1.device_id.unique()
    ids_d2 = d2.device_id.unique()

    # Ids that appear in the train (resp. test) part of either dataset.
    un_d12_train = union(d1[d1.isTrain==1].device_id.unique()
                         ,d2[d2.isTrain==1].device_id.unique())
    un_d12_test = union(d1[d1.isTrain==0].device_id.unique()
                        ,d2[d2.isTrain==0].device_id.unique())

    print("len uniques ids: d1", len(ids_d1), " d2: "
          , len(ids_d2))
    print("sum: ", len(ids_d1)+len(ids_d2))
    print("")
    print("intersection of d1 and d2: ", len(intersec(ids_d1, ids_d2)))
    print("")
    print("Unique train in d1 and d2: ", len(un_d12_train))
    print("Unique test in d1 and d2: ", len(un_d12_test))
    print("")
    print("intersec of train ids in both sets: ", len(intersec(un_d12_train,train)))
    print("intersec of test ids in both sets: ", len(intersec(un_d12_test,test)))

In [33]:
def xcheckData(data):
    """
    Debug helper: print the total number of rows in `data` and how many
    of them have `isTrain == 1` (train) and `isTrain == 0` (test).
    """
    print("len: ",len(data))
    is_train = data.isTrain
    for label, flag in (("len train: ", 1), ("len test: ", 0)):
        print(label, len(data[is_train == flag]))

In [48]:
# Print a short overview of the helpers this preamble provides.
# A single joined print emits exactly the same lines as separate prints would.
print("\n".join([
    "The preamble handles a few imports and defines the following functions:\n",
    "printLoss(y_tr,p_tr,y_te,p_te,loss=log_loss)",
    "createDataSet(predictions, group_encoder, device_ids)",
    "getBestPrediction(data,var='device_id')",
    "",
    "",
    "A few Debug functions",
]))

The preamble handles a few imports and defines the following functions:

printLoss(y_tr,p_tr,y_te,p_te,loss=log_loss)
createDataSet(predictions, group_encoder, device_ids)
getBestPrediction(data,var='device_id')


A few Debug functions


In [32]:
def createDataSet(predictions, group_encoder, device_ids):
    """
    Creates prediction dataset to save into csv from 
    multiclass predictions (ndarray).
    
    Arguments:
    predictions   - (ndarray) predictions from the MVA.
    group_encoder - (LabelEncoder) to transform column names.
    device_ids    - (pd.Series, array...) respective device_ids,
                    matched to the prediction rows by position.

    Returns:
    pd.DataFrame with one column per decoded class name plus a
    `device_id` column.
    """
    predictions = pd.DataFrame(predictions)
    predictions.columns = group_encoder.inverse_transform(predictions.columns)

    # Column assignment from a Series/Index aligns on index *labels*. The
    # frame built above has a fresh 0..n-1 index, so ids taken from a
    # filtered or shuffled frame would come out as NaN or on the wrong rows.
    # Strip the index so that the i-th id always lands on the i-th row.
    if isinstance(device_ids, (pd.Series, pd.Index)):
        device_ids = device_ids.values
    predictions["device_id"] = device_ids
    return predictions

In [23]:
def printLoss(y_tr,p_tr,y_te=None,p_te=None,loss=log_loss):
    """
    Function prints loss value for up to two datasets
    for a given loss function (default: log_loss)
    
    y_tr - (true) classes of training set (or single set).
    p_tr - predictions of training set (or single set).
    y_te - (true) classes of second set (optional).
    p_te - predictions of the second set (optional).
    loss - loss function, called as loss(y_true, predictions).

    Raises:
    ValueError - if only one of `y_te` / `p_te` is given.
    """
    # The second set is optional, but it must be given completely or not at all.
    if (y_te is None) != (p_te is None):
        raise ValueError("y_te and p_te must be given together")

    print("Test MVA predictions on test and training set:\n")
    print("Log loss on training set: ", loss(y_tr,p_tr))
    if y_te is not None:
        print("Log loss on test set: ", loss(y_te,p_te))

In [24]:
def getBestPrediction(data,var="device_id"):
    """
    Function that returns data set with 
    unique `var` entries that satisfy:
    highest prediction probability for 
    the class with highest mean prediction.
    
    Arguments:
    data  - (pd.DataFrame) The dataset to work with; every column
            other than `var` is treated as a class-probability column.
    var   - (str) Feature to find uniques

    Returns:
    pd.DataFrame holding one row of `data` per distinct `var` value,
    with the original index labels of the selected rows.
    """
    # Work on a view of the data with a clean 0..n-1 index: the row labels
    # found below are then also row *positions*, so the final positional
    # lookup is correct whatever index `data` carries (non-default,
    # shuffled or even duplicated labels).
    flat = data.reset_index(drop=True)
    gb = flat.groupby(var, as_index=False, sort=False)
    n_groups = len(gb.groups)

    indeces = list()
    for i, rows in enumerate(gb.groups.values()):
        # probability columns of the current group only
        probs = flat.loc[rows].drop(var, axis=1)

        # this searches class with highest mean prediction
        most_likely_class = probs.mean()\
                                 .sort_values(ascending=False)\
                                 .index[0]

        # this searches prediction with highest probability in most likely class
        index = probs[most_likely_class]\
                     .sort_values(ascending=False)\
                     .index[0]

        indeces.append(index)

        if i%3000==0 and i>0:
            print(i+1, "/", n_groups, " groups processed...")

    return data.iloc[indeces]

In [1]:
def averagePredictions(preds):
    """
    Average all prediction rows that share the same `device_id`.

    Arguments:
    preds - (pd.DataFrame) predictions with a `device_id` column.

    Returns:
    pd.DataFrame with one row per `device_id` (in order of first
    appearance), `device_id` kept as a regular column and every other
    column replaced by its per-device mean.
    """
    per_device = preds.groupby("device_id", sort=False, as_index=False)
    # Use the built-in groupby mean instead of `.agg(np.mean)`: passing the
    # numpy callable is deprecated in recent pandas (FutureWarning), and the
    # built-in does not depend on `np` being imported before this cell runs.
    return per_device.mean()