# Reading data

In [26]:
import numpy as np
import pandas as pd

In [16]:
# Load the train and test tables. Rows are indexed by the 'id' column and
# every -1 found in the files is parsed as a missing value (NaN).
data_train = pd.read_csv(
    filepath_or_buffer='data/train.csv',
    na_values=-1,
    index_col='id',
)
data_test = pd.read_csv(
    filepath_or_buffer='data/test.csv',
    na_values=-1,
    index_col='id',
)

# Getting feature names

In [17]:
# Categorical and binary features are recognised by the 'cat' / 'bin'
# fragment in their column name.
cat_feat = data_test.filter(like='cat', axis=1).columns.values.tolist()
bin_feat = data_test.filter(like='bin', axis=1).columns.values.tolist()

# Everything else is treated as numeric. The columns are walked in frame
# order rather than subtracted as sets: iteration order of a set of strings
# can change from one interpreter session to the next (hash randomisation),
# which made the order of `num_feat` / `features_names` non-reproducible.
num_feat = [name for name in data_test.columns
            if name not in cat_feat and name not in bin_feat]

features_names = cat_feat + bin_feat + num_feat

# Transforming data

In [18]:
def Transform(train, fill_na=0, drop_first=False, drop_ps_car_11_cat=False):
    """Build the model feature frame from a raw data frame.

    The result is the column-wise concatenation, in this order, of:

    1. the numeric features, with missing values replaced by ``fill_na``;
    2. the binary (``*_bin``) features, cast to ``category`` dtype;
    3. the one-hot encoding of the categorical (``*_cat``) features, with
       dummy columns named ``<feature>#<value>``.

    The input frame is left untouched, and the function relies only on its
    own feature lists (no notebook-level globals are read).

    Parameters
    ----------
    train : pandas.DataFrame
        Frame containing every feature column listed below; any other
        column is ignored.
    fill_na : scalar, default 0
        Replacement for missing values in the numeric features.
    drop_first : bool, default False
        Forwarded to ``pandas.get_dummies``: drop the first level of each
        categorical feature.
    drop_ps_car_11_cat : bool, default False
        When true, ``ps_car_11_cat`` is left out of the one-hot encoding.

    Returns
    -------
    pandas.DataFrame
        The assembled feature frame, indexed like ``train``.

    Raises
    ------
    KeyError
        If ``train`` lacks one of the expected feature columns.

    Notes
    -----
    Dummy columns are created only for the values present in ``train``, so
    two frames holding different category values produce different columns.
    """
    cat_f = ['ps_ind_02_cat', 'ps_ind_04_cat', 'ps_ind_05_cat', 'ps_car_01_cat', 'ps_car_02_cat', 'ps_car_03_cat', 'ps_car_04_cat', 'ps_car_05_cat', 'ps_car_06_cat', 'ps_car_07_cat', 'ps_car_08_cat', 'ps_car_09_cat', 'ps_car_10_cat', 'ps_car_11_cat']
    bin_f = ['ps_ind_06_bin', 'ps_ind_07_bin', 'ps_ind_08_bin', 'ps_ind_09_bin', 'ps_ind_10_bin', 'ps_ind_11_bin', 'ps_ind_12_bin', 'ps_ind_13_bin', 'ps_ind_16_bin', 'ps_ind_17_bin', 'ps_ind_18_bin', 'ps_calc_15_bin', 'ps_calc_16_bin', 'ps_calc_17_bin', 'ps_calc_18_bin', 'ps_calc_19_bin', 'ps_calc_20_bin']
    num_f = ['ps_car_12', 'ps_calc_12', 'ps_calc_09', 'ps_calc_03', 'ps_calc_11', 'ps_calc_13', 'ps_reg_01', 'ps_calc_06', 'ps_calc_04', 'ps_car_14', 'ps_calc_08', 'ps_car_13', 'ps_calc_01', 'ps_reg_02', 'ps_calc_10', 'ps_calc_05', 'ps_reg_03', 'ps_ind_01', 'ps_calc_02', 'ps_car_15', 'ps_car_11', 'ps_calc_14', 'ps_ind_03', 'ps_ind_14', 'ps_ind_15', 'ps_calc_07']

    if drop_ps_car_11_cat:
        cat_f.remove('ps_car_11_cat')

    # Missing numeric values are replaced by `fill_na` (0 by default; the
    # original author noted that xgboost seemed to cope better with that).
    train_cnt = train[num_f].fillna(fill_na)

    # The category casts are applied to column subsets (which are copies), so
    # the caller's frame keeps its original dtypes.
    train_bin = train[bin_f].astype(dict.fromkeys(bin_f, 'category'))
    train_cat = pd.get_dummies(train[cat_f].astype(dict.fromkeys(cat_f, 'category')),
                               prefix_sep='#',
                               drop_first=drop_first)

    return pd.concat([train_cnt, train_bin, train_cat], axis=1)

In [27]:
# Encode both frames with identical settings.
Train_mod = Transform(data_train, fill_na=0, drop_first=False, drop_ps_car_11_cat=False)
Test_mod = Transform(data_test, fill_na=0, drop_first=False, drop_ps_car_11_cat=False)

# One-hot columns are created only for the values present in each frame, so
# the two frames are not guaranteed to share a layout. Project the test frame
# onto the training columns: a dummy column missing from the test data is
# added as all zeros, one unknown to the training data is dropped. When the
# layouts already match this is a no-op.
Test_mod = Test_mod.reindex(columns=Train_mod.columns, fill_value=0)

In [28]:
# `DataFrame.as_matrix()` is deprecated and was removed in pandas 1.0;
# `.values` returns the same array and is available in every pandas version.
X = Train_mod.values
X_test = Test_mod.values

# The label is kept as a pandas Series (a later cell calls `value_counts()`
# on its subsample).
y = data_train['target']

In [29]:
# Persist the full design matrices and the label vector as .npy files.
np.save(file='/tmp/X', arr=X)
np.save(file='/tmp/y', arr=y)
np.save(file='/tmp/X_test', arr=X_test)

#### We take a small stratified subsample (3% of the rows) for learning

In [22]:
from sklearn.model_selection import train_test_split

# Stratified split on the label: 97% of the rows go to the held-out part
# (X_t, y_t) and the remainder forms the small learning sample
# (X_train, y_train). The seed is fixed so the split is repeatable.
X_train, X_t, y_train, y_t = train_test_split(
    X,
    y,
    test_size=0.97,
    random_state=42,
    stratify=y,
)

In [23]:
# Sanity check: dimensions of the sampled feature matrix and of its labels.
np.shape(X_train), np.shape(y_train)

((17856, 114), (17856,))

In [24]:
# Fraction of rows in the subsample whose label equals 1.
y_train.value_counts().loc[1] / y_train.shape[0]

0.036458333333333336

In [25]:
# Persist the small learning sample (features and labels) as .npy files.
np.save(file='/tmp/X_small', arr=X_train)
np.save(file='/tmp/y_small', arr=y_train)