In [None]:
import numpy as np
import pandas as pd
from sklearn import *

In [None]:
#Define gini metric used by Kaggle Competition
def gini(y, pred):
    """Return the Gini coefficient of the scores ``pred`` against labels ``y``.

    The value is ``2 * AUC - 1``, where the ROC curve is built with label
    ``1`` treated as the positive class.
    """
    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(y, pred, pos_label=1)
    area = metrics.auc(false_pos_rate, true_pos_rate)
    return 2 * area - 1

def gini_xgb(pred, y):
    """Evaluation callback returning the pair ``('gini', score)``.

    ``pred`` holds the predictions; ``y`` is an object exposing
    ``get_label()`` (presumably an xgboost ``DMatrix`` -- confirm against the
    training call).
    """
    labels = y.get_label()
    return 'gini', gini(labels, pred)

def gini_lgb(preds, dtrain):
    """Evaluation callback returning ``('gini', score, True)``.

    ``preds`` holds the predictions; ``dtrain`` is an object exposing
    ``get_label()`` (presumably a LightGBM ``Dataset`` -- confirm against the
    training call). The trailing ``True`` is returned unchanged from the
    original; in LightGBM's custom-metric convention it marks the metric as
    "higher is better".

    ``gini`` is already ``2 * AUC - 1``, so the score of a perfect ranking
    (``gini(y, y)``) is 1.0 and dividing by it changes nothing. The former
    division, which cost a second ROC computation per evaluation, is dropped,
    and the label array is passed on as-is instead of being copied to a list.
    """
    labels = dtrain.get_label()
    return 'gini', gini(labels, preds), True

In [None]:
#Load the training and test sets.
#-1 stands for missing values: the training frame reads it as NaN,
#while the test frame keeps the raw -1 sentinel.
df_test = pd.read_csv('test.csv')
df_train = pd.read_csv('train.csv', na_values=-1)

In [None]:
#Feature columns: everything except the id, the target and the
#ps_calc_* columns (dropped as uncorrelated features)
col = [
    name for name in df_train.columns
    if name not in ('id', 'target') and not name.startswith('ps_calc_')
]

#Remove every training row whose feature values occur more than once
#(keep=False flags all copies, not just the repeats)
dups = df_train.loc[df_train.duplicated(subset=col, keep=False)]
df_train = df_train.loc[~df_train['id'].isin(dups['id'].values)]

#Separate the target and the test ids from the feature matrices
id_test = df_test['id'].values
df_test = df_test[col]
dfy_train = df_train['target']
df_train = df_train[col]

In [None]:
#Per-column statistics for the statistical encoding features.
#Taken before the fill below, while missing entries are still NaN.
d_mean = df_train.mean()
d_median = df_train.median()

#-1 seems to work better than NaN for xgboost
df_train = df_train.fillna(value=-1)


In [None]:
#Feature engineer onehot encoding and statistical encoding
def t_df(df, median=None, mean=None):
    """Return a feature-engineered, one-hot encoded copy of ``df``.

    Steps, in order:
      1. add the interaction column ``ps_car_13*ps_reg_03``;
      2. add ``missing``: the per-row count of original columns equal to -1;
      3. for every original column whose name does not contain ``_bin``, add
         boolean ``<col>_median_range`` / ``<col>_mean_range`` columns that
         flag values above the reference median / mean;
      4. cast every original column with 3 to 6 distinct values to an integer
         category and one-hot encode it with ``pd.get_dummies``.

    Parameters
    ----------
    df : DataFrame (or anything ``pd.DataFrame`` accepts)
        Must contain ``ps_car_13`` and ``ps_reg_03``.
    median, mean : mapping of column name -> value, optional
        Reference statistics for step 3. Default to the notebook-level
        ``d_median`` / ``d_mean``.

    Returns
    -------
    DataFrame
        The transformed frame; the input is left unmodified.
    """
    if median is None:
        median = d_median
    if mean is None:
        mean = d_mean

    # Work on an explicit copy so the caller's frame never gains columns.
    df = pd.DataFrame(df).copy()
    dcol = list(df.columns)

    #Get 2 way interaction of 2 most important features
    df['ps_car_13*ps_reg_03'] = df['ps_car_13'] * df['ps_reg_03']
    #Count missing entries (-1 is the missing-value sentinel)
    df['missing'] = np.sum((df[dcol] == -1).values, axis=1)

    for c in dcol:
        if '_bin' not in c:
            df[c + '_median_range'] = df[c].values > median[c]
            df[c + '_mean_range'] = df[c].values > mean[c]

        #Onehot encode variables with 3..6 distinct values
        n_unique = df[c].nunique(dropna=False)
        if 2 < n_unique < 7:
            # Builtin int replaces np.int, which was removed in NumPy 1.24.
            df[c] = df[c].astype(int).astype('category')

    return pd.get_dummies(df)

In [None]:
#Apply the feature engineering to both datasets
df_train, df_test = t_df(df_train), t_df(df_test)