In [3]:
from sklearn.pipeline import Pipeline
from transformers import \
    ColumnTransformer, ColumnClipper, ColumnBinner, ColumnDropper, \
    CountEncoder, CountRankEncoder, TargetEncoder
import numpy as np

def calculate_reg_01(X):
    return (X.ps_reg_01 * 10).astype('int')

def calculate_reg_02(X):
    return (X.ps_reg_02 * 10).astype('int')

def calculate_reg_04(X):
    return (((X.ps_reg_03.replace(-1, np.NaN) * 40) ** 2) % 27).fillna(-1).astype('int')

def calculate_reg_05(X):
    return (((X.ps_reg_03.replace(-1, np.NaN) * 40) ** 2) // 27).fillna(-1).astype('int')

def calculate_car_12(X):
    return ((X.ps_car_12 ** 2) * 10000).round().astype('int')

def calculate_car_13(X):
    return ((X.ps_car_13 * 220) ** 2).round(2).astype('int')

def calculate_car_14(X):
    return ((X.ps_car_14.replace(-1, np.NaN) ** 2) * 1000).round().fillna(-1).astype('int')

def calculate_car_15(X):
    return (X.ps_car_15 ** 2).astype('int')

def calculate_06_09_cat(X):
    col = X.ps_ind_06_bin * 1 + X.ps_ind_07_bin * 2 + X.ps_ind_08_bin * 3 + X.ps_ind_09_bin * 4
    return col - col.min()

def calculate_16_18_cat(X):
    col = X.ps_ind_16_bin * 1 + X.ps_ind_17_bin * 2 + X.ps_ind_18_bin * 3
    return col - col.min()

def calculate_price_per_income(X):
    return X.ps_car_13 / X.ps_ind_01 + 1

def calculate_price_per_age(X):
    return X.ps_car_13 / X.ps_ind_03 + 1

def calculate_price_per_engine(X):
    return X.ps_car_13 / X.ps_car_12 + 1

pipeline = Pipeline([
    ('calc_reg_01', ColumnTransformer('ps_reg_01', calculate_reg_01)),
    ('calc_reg_02', ColumnTransformer('ps_reg_02', calculate_reg_02)),
    ('calc_reg_04', ColumnTransformer('ps_reg_04', calculate_reg_04)),
    ('calc_reg_05', ColumnTransformer('ps_reg_05', calculate_reg_05)),
    ('calc_car_12', ColumnTransformer('ps_car_12', calculate_car_12)),
    ('calc_car_13', ColumnTransformer('ps_car_13', calculate_car_13)),
    ('calc_car_14', ColumnTransformer('ps_car_14', calculate_car_14)),
    ('calc_car_15', ColumnTransformer('ps_car_15', calculate_car_15)),
    ('calc_car_06_09_cat', ColumnTransformer('ps_car_06_09_cat', calculate_06_09_cat)),
    ('calc_car_16_18_cat', ColumnTransformer('ps_car_16_18_cat', calculate_16_18_cat)),
    ('encode_ps_ind_01', TargetEncoder([
         'ps_ind_02_cat',
         'ps_ind_04_cat',
         'ps_ind_05_cat',
         'ps_car_01_cat',
         'ps_car_02_cat',
         'ps_car_03_cat',
         'ps_car_04_cat',
         'ps_car_05_cat',
         'ps_car_06_cat',
         'ps_car_07_cat',
         'ps_car_08_cat',
         'ps_car_09_cat',
         'ps_car_10_cat',
         'ps_car_11_cat'
    ], 'target', 200, 0.05, 20)),
#     ('clip_car_12', ColumnClipper('ps_car_12', 1, 99.9)),
#     ('clip_car_13', ColumnClipper('ps_car_13', 1, 99.9)),
#     ('bin_car_12', ColumnBinner('ps_car_12')),
#     ('bin_car_13', ColumnBinner('ps_car_13')),
    ('drop', ColumnDropper([
        'id', 'target', 'ps_ind_14', 'ps_reg_03',
        'ps_ind_06_bin', 'ps_ind_07_bin', 'ps_ind_08_bin', 'ps_ind_09_bin',
        'ps_ind_16_bin', 'ps_ind_17_bin', 'ps_ind_18_bin'
    ]))
])

In [4]:
import pandas as pd
train = pd.read_csv('./data/train.csv')
pipeline.fit(train)

ps_ind_02_cat
-1    0.858149
 1    0.000000
 2    0.000000
 3    0.000000
 4    0.000000
Name: count, dtype: float64
ps_ind_04_cat
-1    0.99883
 0    0.00000
 1    0.00000
Name: count, dtype: float64
ps_ind_05_cat
-1    0.0
 0    0.0
 1    0.0
 2    0.0
 3    0.0
 4    0.0
 5    0.0
 6    0.0
Name: count, dtype: float64
ps_car_01_cat
-1     0.996495
 0     0.000000
 1     0.000000
 2     0.000000
 3     0.000000
 4     0.000000
 5     0.000000
 6     0.000000
 7     0.000000
 8     0.000000
 9     0.000000
 10    0.000000
 11    0.000000
Name: count, dtype: float64
ps_car_02_cat
-1    0.999947
 0    0.000000
 1    0.000000
Name: count, dtype: float64
ps_car_03_cat
-1    0.0
 0    0.0
 1    0.0
Name: count, dtype: float64
ps_car_04_cat
0    0.000000e+00
1    0.000000e+00
2    0.000000e+00
3    3.059022e-07
4    6.224593e-01
5    5.030430e-06
6    0.000000e+00
7    9.836975e-01
8    0.000000e+00
9    0.000000e+00
Name: count, dtype: float64
ps_car_05_cat
-1    0.0
 0    0.0
 1    0.0
Na

Pipeline(memory=None,
     steps=[('calc_reg_01', ColumnTransformer(column='ps_reg_01',
         func=<function calculate_reg_01 at 0x10b558158>)), ('calc_reg_02', ColumnTransformer(column='ps_reg_02',
         func=<function calculate_reg_02 at 0x10b558d90>)), ('calc_reg_04', ColumnTransformer(column='ps_reg_04',
         fu...ind_07_bin', 'ps_ind_08_bin', 'ps_ind_09_bin', 'ps_ind_16_bin', 'ps_ind_17_bin', 'ps_ind_18_bin']))])

In [5]:
from lightgbm.sklearn import LGBMClassifier
from sklearn.model_selection import cross_val_score

X = pipeline.transform(train)
y = train.target
model = LGBMClassifier(n_estimators=40)
scores = cross_val_score(model, X, y, scoring='roc_auc')
print(scores.mean(), scores.std(), scores)

0.63053843391 0.00201840852621 [ 0.62801531  0.63064398  0.632956  ]


In [40]:
from sklearn.metrics import roc_auc_score
holdout = pd.read_csv('./data/holdout.csv')
X_holdout = pipeline.transform(holdout)
y_holdout = holdout.target

model = LGBMClassifier(n_estimators=40)
model.fit(X, y)
pred = model.predict_proba(X_holdout)[:, 1]
score = roc_auc_score(y_holdout, pred)
print(score)

0.62433661271
