In [1]:
import gc
import math

import numpy as np
import pandas as pd
import xgboost as xgb
import lightgbm as lgb

import matplotlib.pyplot as plt

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.decomposition import PCA

from sklearn.metrics import classification_report



In [2]:
def gini(y, pred):
    assert(len(y) == len(pred))
    g = np.asarray(np.c_[y, pred, np.arange(len(y)) ], dtype=np.float)
    g = g[np.lexsort((g[:,2], -1*g[:,1]))]
    gs = g[:,0].cumsum().sum() / g[:,0].sum()
    gs -= (len(y) + 1) / 2.
    return gs / len(y)

def gini_xgb(pred, y):
    y = y.get_label()
    return 'gini', gini(y, pred) / gini(y, y)

def gini_lgb(preds, dtrain):
    y = list(dtrain.get_label())
    score = gini(y, preds) / gini(y, y)
    return 'gini', score, True

In [3]:
df_train = pd.read_csv('C:/Users/pedro.castanha/Downloads/porto/train.csv')
df_test = pd.read_csv('C:/Users/pedro.castanha/Downloads/porto/test.csv')

In [4]:
#train = df_train.copy()
#test = df_test.copy()
#col = [c for c in train.columns if c not in ['id','target']]

def feature_selection(X, cols):
    # Feature Selection by Regularization
    print('Running Lasso..')
    
    scaler = StandardScaler()
    std_data = scaler.fit_transform(X[cols].values)
    clf = LogisticRegression(penalty='l1', C=0.1, random_state=42, solver='liblinear', n_jobs=-1)
    clf.fit(std_data, X['target'].values.reshape((-1,)))
    imp_feats_ind = np.nonzero(clf.coef_[0])[0]
    final_feats = np.array(cols)[imp_feats_ind]
    
    print('Completed!')
    print('Total features selected are:', len(final_feats))
    print('Features Selected:', final_feats)
    
    return final_feats

#final_feats = feature_selection(train, col)

In [39]:
df_balanced = pd.concat([train[train.target == 0].sample(n=train[train.target == 1].shape[0]), train[train.target == 1]])

In [41]:
clf = GradientBoostingClassifier(n_estimators=100, random_state=99)

In [31]:
clf.fit(df_balanced[final_feats], df_balanced['target'])
importances = list(zip(final_feats, list(clf.feature_importances_)))
sorted(importances, key=lambda x: x[1], reverse=True)

[('ps_car_13', 0.13274210883565402),
 ('ps_ind_03', 0.10824961817053291),
 ('ps_reg_03', 0.069123093278157952),
 ('ps_ind_15', 0.057073047561333495),
 ('ps_ind_05_cat', 0.053819868493581197),
 ('ps_car_01_cat', 0.048150222823425626),
 ('ps_reg_01', 0.038437535662641527),
 ('ps_car_09_cat', 0.03252902868402105),
 ('ps_ind_17_bin', 0.030144527809466679),
 ('ps_ind_01', 0.028311268890030373),
 ('ps_car_07_cat', 0.025796314801762629),
 ('ps_ind_02_cat', 0.025420607040958679),
 ('ps_car_03_cat', 0.024746832978277226),
 ('ps_car_11', 0.02231672436447367),
 ('ps_car_14', 0.022199596112318823),
 ('ps_reg_02', 0.021141252986491756),
 ('ps_ind_08_bin', 0.021138129296915335),
 ('ps_ind_07_bin', 0.020770138623998929),
 ('ps_car_11_cat', 0.019257944005454237),
 ('ps_car_15', 0.017876105553081292),
 ('ps_calc_14', 0.017007156081678677),
 ('ps_calc_10', 0.015104753950231786),
 ('ps_calc_01', 0.014764205073792381),
 ('ps_ind_16_bin', 0.014582997433233021),
 ('ps_ind_04_cat', 0.012818803524558567),
 ('

In [32]:
meta = ['ps_car_13','ps_ind_03','ps_reg_03','ps_ind_15']
meta

['ps_car_13', 'ps_ind_03', 'ps_reg_03', 'ps_ind_15']

In [33]:
train['ps_car_13_x_ps_ind_03'] = train['ps_car_13'] * train['ps_ind_03']
train['ps_car_13_x_ps_reg_03'] = train['ps_car_13'] * train['ps_reg_03']
train['ps_car_13_x_ps_ind_15'] = train['ps_car_13'] * train['ps_ind_15']
train['ps_ind_03_x_ps_reg_03'] = train['ps_ind_03'] * train['ps_reg_03']
train['ps_ind_03_x_ps_ind_15'] = train['ps_ind_03'] * train['ps_ind_15']
train['ps_reg_03_x_ps_ind_15'] = train['ps_reg_03'] * train['ps_ind_15']
train['ps_car_13_x_ps_reg_03_x_ps_ind_03'] = train['ps_car_13'] * train['ps_reg_03'] * train['ps_ind_03']
train['ps_ind_03_x_ps_reg_03_x_ps_ind_15'] = train['ps_ind_03'] * train['ps_reg_03'] * train['ps_ind_15']

In [None]:
final_feats = np.append(final_feats, ['ps_car_13_x_ps_ind_03', 'ps_car_13_x_ps_reg_03', 'ps_car_13_x_ps_ind_15', 'ps_ind_03_x_ps_reg_03',
                        'ps_ind_03_x_ps_ind_15', 'ps_reg_03_x_ps_ind_15','ps_car_13_x_ps_reg_03_x_ps_ind_03',
                        'ps_ind_03_x_ps_reg_03_x_ps_ind_15'])

In [None]:
clf.fit(df_balanced[final_feats], df_balanced['target'])
importances = list(zip(final_feats, list(clf.feature_importances_)))
sorted(importances, key=lambda x: x[1], reverse=True)

## Model ensembling

In [5]:
categorical = [c for c in df_train.columns if '_cat' in c]
len(categorical)

14

In [6]:
print(df_train.shape)
print(df_test.shape)

(595212, 59)
(892816, 58)


In [70]:
join = pd.concat([df_train, df_test], ignore_index=True)

encoded = pd.get_dummies(join, columns=categorical)
encoded.shape

train_encoded = encoded.loc[encoded.id.isin(df_train.id), ]
test_encoded = encoded.loc[encoded.id.isin(df_test.id), ].drop(['target'], axis=1)

print(train_encoded.shape)
print(test_encoded.shape)

(595212, 229)
(892816, 228)


In [72]:
train_encoded.target = train_encoded.target.astype(np.int8)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [75]:
train = train_encoded.copy()
test = test_encoded.copy()

train['ps_car_13_x_ps_ind_03'] = train['ps_car_13'] * train['ps_ind_03']
train['ps_car_13_x_ps_reg_03'] = train['ps_car_13'] * train['ps_reg_03']

test['ps_car_13_x_ps_ind_03'] = test['ps_car_13'] * test['ps_ind_03']
test['ps_car_13_x_ps_reg_03'] = test['ps_car_13'] * test['ps_reg_03']

col_to_drop = train.columns[train.columns.str.startswith('ps_calc_')]
train = train.drop(col_to_drop, axis=1)
test = test.drop(col_to_drop, axis=1)

for c in train.select_dtypes(include=['float64']).columns:
    train[c]=train[c].astype(np.float32)
    test[c]=test[c].astype(np.float32)
    
for c in train.select_dtypes(include=['int64']).columns[2:]:
    train[c]=train[c].astype(np.int8)
    test[c]=test[c].astype(np.int8)  

print(train.shape, test.shape)

# xgb
params = {'eta': 0.02, 'max_depth': 4, 'subsample': 0.9, 'colsample_bytree': 0.9, 
          'objective': 'binary:logistic', 'eval_metric': 'auc', 'silent': True}

X = train.drop(['id', 'target'], axis=1)
features = X.columns
X = X.values
y = train['target'].values
sub = test['id'].to_frame()
sub['target'] = 0

nrounds=2000
kfold = 5
skf = StratifiedKFold(n_splits=kfold, random_state=0)
for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    print(' xgb kfold: {}  of  {} : '.format(i+1, kfold))
    X_train, X_valid = X[train_index], X[test_index]
    y_train, y_valid = y[train_index], y[test_index]
    d_train = xgb.DMatrix(X_train, y_train) 
    d_valid = xgb.DMatrix(X_valid, y_valid) 
    watchlist = [(d_train, 'train'), (d_valid, 'valid')]
    xgb_model = xgb.train(params, d_train, nrounds, watchlist, early_stopping_rounds=100, 
                          feval=gini_xgb, maximize=True, verbose_eval=100)
    sub['target'] += xgb_model.predict(xgb.DMatrix(test[features].values), 
                        ntree_limit=xgb_model.best_ntree_limit+50) / (2*kfold)
gc.collect()
sub.head(2)

# lgb
params = {'metric': 'auc', 'learning_rate' : 0.01, 'max_depth':10, 'max_bin':10,  'objective': 'binary', 
          'feature_fraction': 0.8,'bagging_fraction':0.9,'bagging_freq':10,  'min_data': 500}

skf = StratifiedKFold(n_splits=kfold, random_state=1)
for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    print(' lgb kfold: {}  of  {} : '.format(i+1, kfold))
    X_train, X_eval = X[train_index], X[test_index]
    y_train, y_eval = y[train_index], y[test_index]
    lgb_model = lgb.train(params, lgb.Dataset(X_train, label=y_train), nrounds, 
                  lgb.Dataset(X_eval, label=y_eval), verbose_eval=100, 
                  feval=gini_lgb, early_stopping_rounds=100)
    sub['target'] += lgb_model.predict(test[features].values, 
                        num_iteration=lgb_model.best_iteration) / (2*kfold)
    
sub.to_csv('~/Downloads/porto/result_xgb_meta_lgb_meta_features_onehot.csv', index=False, float_format='%.5f') 
gc.collect()
sub.head(2)

(595212, 211) (892816, 210)
 xgb kfold: 1  of  5 : 
[0]	train-gini:0.190821	valid-gini:0.189363
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gini hasn't improved in 100 rounds.
[100]	train-gini:0.25153	valid-gini:0.245314
[200]	train-gini:0.276886	valid-gini:0.260749
[300]	train-gini:0.296899	valid-gini:0.270299
[400]	train-gini:0.310255	valid-gini:0.275256
[500]	train-gini:0.321217	valid-gini:0.2782
[600]	train-gini:0.330218	valid-gini:0.279536
[700]	train-gini:0.338469	valid-gini:0.280641
[800]	train-gini:0.34553	valid-gini:0.280856
[900]	train-gini:0.352803	valid-gini:0.281242
[1000]	train-gini:0.359483	valid-gini:0.281857
[1100]	train-gini:0.366354	valid-gini:0.281988
Stopping. Best iteration:
[1073]	train-gini:0.364598	valid-gini:0.282129

 xgb kfold: 2  of  5 : 
[0]	train-gini:0.189524	valid-gini:0.184896
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gi

Early stopping, best iteration is:
[1309]	valid_0's auc: 0.639536	valid_0's gini: 0.279072


Unnamed: 0,id,target
595212,0,0.028572
595213,1,0.024826


In [7]:
pfeatures = PolynomialFeatures(degree=2, interaction_only=True, include_bias=True)
join = pd.concat([df_train, df_test], ignore_index=True)
join.drop(join.columns[join.columns.str.startswith('ps_calc_')], axis=1, inplace=True)

x_categorical = join[categorical]
x_non_categorical = join[join.columns.difference(categorical)]

new_features = x_non_categorical.drop(['id', 'target'], axis=1)
new_matrix = pfeatures.fit_transform(new_features)
res = pd.DataFrame(data=new_matrix, columns=pfeatures.get_feature_names())
res['target'] = x_non_categorical['target']
res['id'] = x_non_categorical['id']

gc.collect()
encoded = pd.get_dummies(x_categorical, columns=categorical)

gc.collect()
final = pd.concat([res, encoded], axis=1)

train_encoded = final.loc[final.id.isin(df_train.id), ]
gc.collect()
test_encoded = final.loc[final.id.isin(df_test.id), ].drop(['target'], axis=1)

print(train_encoded.shape)
print(test_encoded.shape)

(595212, 463)
(892816, 462)


In [8]:
train_encoded.target = train_encoded.target.astype(np.int8)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [95]:
x_tmp = df_train.drop(categorical, axis=1)
x_tmp.drop(x_tmp.columns[x_tmp.columns.str.startswith('ps_calc_')], axis=1, inplace=True)
x_tmp.drop(['id', 'target'], axis=1, inplace=True)

new_matrix = pfeatures.fit_transform(x_tmp)
new_matrix.shape

(595212, 277)

In [124]:
res = pd.DataFrame(data=new_matrix, columns=pfeatures.get_feature_names())
res['target'] = df_train['target']
res['id'] = df_train['id']

In [10]:
res_cols = [c for c in train_encoded.columns if c not in ['id','target']]
feats = feature_selection(train_encoded, res_cols)

Running Lasso..
Completed!
Total features selected are: 360
Features Selected: ['x0' 'x2' 'x5' 'x7' 'x10' 'x11' 'x12' 'x14' 'x15' 'x18' 'x20' 'x21' 'x22'
 'x0 x2' 'x0 x3' 'x0 x4' 'x0 x5' 'x0 x6' 'x0 x8' 'x0 x11' 'x0 x12' 'x0 x14'
 'x0 x15' 'x0 x16' 'x0 x18' 'x0 x19' 'x0 x20' 'x0 x21' 'x0 x22' 'x1 x2'
 'x1 x3' 'x1 x4' 'x1 x5' 'x1 x6' 'x1 x7' 'x1 x8' 'x1 x11' 'x1 x12' 'x1 x13'
 'x1 x14' 'x1 x16' 'x1 x19' 'x1 x20' 'x2 x3' 'x2 x4' 'x2 x5' 'x2 x6'
 'x2 x7' 'x2 x8' 'x2 x10' 'x2 x11' 'x2 x12' 'x2 x13' 'x2 x14' 'x2 x15'
 'x2 x16' 'x2 x18' 'x2 x19' 'x2 x21' 'x2 x22' 'x3 x4' 'x3 x5' 'x3 x6'
 'x3 x7' 'x3 x8' 'x3 x11' 'x3 x12' 'x3 x13' 'x3 x14' 'x3 x16' 'x3 x17'
 'x3 x18' 'x3 x19' 'x3 x21' 'x3 x22' 'x4 x5' 'x4 x6' 'x4 x7' 'x4 x8'
 'x4 x11' 'x4 x12' 'x4 x13' 'x4 x14' 'x4 x17' 'x4 x18' 'x4 x19' 'x4 x20'
 'x4 x21' 'x4 x22' 'x5 x6' 'x5 x7' 'x5 x9' 'x5 x10' 'x5 x11' 'x5 x12'
 'x5 x13' 'x5 x14' 'x5 x16' 'x5 x17' 'x5 x18' 'x5 x19' 'x5 x20' 'x5 x21'
 'x5 x22' 'x6 x8' 'x6 x9' 'x6 x11' 'x6 x12' 'x6 x13' 'x6

In [13]:
feats = ['x0', 'x2', 'x5', 'x7', 'x10', 'x11', 'x12', 'x14', 'x15', 'x18', 'x20', 'x21', 'x22',
 'x0 x2', 'x0 x3', 'x0 x4', 'x0 x5', 'x0 x6', 'x0 x8', 'x0 x11', 'x0 x12', 'x0 x14',
 'x0 x15', 'x0 x16', 'x0 x18', 'x0 x19', 'x0 x20', 'x0 x21', 'x0 x22', 'x1 x2',
 'x1 x3', 'x1 x4', 'x1 x5', 'x1 x6', 'x1 x7', 'x1 x8', 'x1 x11', 'x1 x12', 'x1 x13',
 'x1 x14', 'x1 x16', 'x1 x19', 'x1 x20', 'x2 x3', 'x2 x4', 'x2 x5', 'x2 x6',
 'x2 x7', 'x2 x8', 'x2 x10', 'x2 x11', 'x2 x12', 'x2 x13', 'x2 x14', 'x2 x15',
 'x2 x16', 'x2 x18', 'x2 x19', 'x2 x21', 'x2 x22', 'x3 x4', 'x3 x5', 'x3 x6',
 'x3 x7', 'x3 x8', 'x3 x11', 'x3 x12', 'x3 x13', 'x3 x14', 'x3 x16', 'x3 x17',
 'x3 x18', 'x3 x19', 'x3 x21', 'x3 x22', 'x4 x5', 'x4 x6', 'x4 x7', 'x4 x8',
 'x4 x11', 'x4 x12', 'x4 x13', 'x4 x14', 'x4 x17', 'x4 x18', 'x4 x19', 'x4 x20',
 'x4 x21', 'x4 x22', 'x5 x6', 'x5 x7', 'x5 x9', 'x5 x10', 'x5 x11', 'x5 x12',
 'x5 x13', 'x5 x14', 'x5 x16', 'x5 x17', 'x5 x18', 'x5 x19', 'x5 x20', 'x5 x21',
 'x5 x22', 'x6 x8', 'x6 x9', 'x6 x11', 'x6 x12', 'x6 x13', 'x6 x14', 'x6 x16',
 'x6 x17', 'x6 x18', 'x6 x19', 'x6 x20', 'x6 x21', 'x6 x22', 'x7 x11', 'x7 x12',
 'x7 x15', 'x7 x16', 'x7 x17', 'x7 x18', 'x7 x19', 'x7 x20', 'x7 x21', 'x7 x22',
 'x8 x11', 'x8 x12', 'x8 x13', 'x8 x16', 'x8 x18', 'x8 x19', 'x8 x21', 'x8 x22',
 'x9 x13', 'x9 x14', 'x9 x16', 'x9 x17', 'x9 x19', 'x9 x20', 'x9 x21', 'x10 x11',
 'x10 x12', 'x10 x16', 'x10 x18', 'x10 x19', 'x10 x20', 'x10 x22', 'x11 x12',
 'x11 x13', 'x11 x14', 'x11 x16', 'x11 x17', 'x11 x18', 'x11 x19', 'x11 x20',
 'x11 x21', 'x12 x13', 'x12 x14', 'x12 x16', 'x12 x17', 'x12 x18', 'x12 x19',
 'x12 x21', 'x12 x22', 'x13 x14', 'x13 x16', 'x13 x18', 'x13 x19', 'x13 x20',
 'x13 x21', 'x14 x16', 'x14 x17', 'x14 x18', 'x14 x21', 'x14 x22', 'x15 x17',
 'x15 x20', 'x15 x21', 'x15 x22', 'x16 x17', 'x16 x18', 'x16 x19', 'x16 x20',
 'x16 x21', 'x16 x22', 'x17 x20', 'x17 x21', 'x17 x22', 'x18 x20', 'x18 x21',
 'x18 x22', 'x19 x20', 'x19 x21', 'x19 x22', 'x20 x21', 'x20 x22', 'x21 x22',
 'ps_ind_02_cat_-1', 'ps_ind_02_cat_2', 'ps_ind_02_cat_3', 'ps_ind_02_cat_4',
 'ps_ind_04_cat_-1', 'ps_ind_04_cat_0', 'ps_ind_05_cat_-1', 'ps_ind_05_cat_0',
 'ps_ind_05_cat_2','ps_ind_05_cat_3', 'ps_ind_05_cat_4', 'ps_ind_05_cat_6',
 'ps_car_01_cat_-1', 'ps_car_01_cat_0', 'ps_car_01_cat_1', 'ps_car_01_cat_2',
 'ps_car_01_cat_3', 'ps_car_01_cat_4', 'ps_car_01_cat_5', 'ps_car_01_cat_7',
 'ps_car_01_cat_8', 'ps_car_01_cat_9', 'ps_car_01_cat_10', 'ps_car_01_cat_11',
 'ps_car_02_cat_-1', 'ps_car_02_cat_0', 'ps_car_03_cat_-1', 'ps_car_03_cat_1',
 'ps_car_04_cat_0', 'ps_car_04_cat_2', 'ps_car_04_cat_3', 'ps_car_04_cat_4',
 'ps_car_04_cat_5', 'ps_car_04_cat_6', 'ps_car_04_cat_7', 'ps_car_04_cat_8',
 'ps_car_04_cat_9', 'ps_car_05_cat_-1', 'ps_car_05_cat_0', 'ps_car_06_cat_0',
 'ps_car_06_cat_1', 'ps_car_06_cat_2', 'ps_car_06_cat_3', 'ps_car_06_cat_4',
 'ps_car_06_cat_5', 'ps_car_06_cat_6', 'ps_car_06_cat_7', 'ps_car_06_cat_8',
 'ps_car_06_cat_10', 'ps_car_06_cat_11', 'ps_car_06_cat_12',
 'ps_car_06_cat_13', 'ps_car_06_cat_14', 'ps_car_06_cat_15',
 'ps_car_06_cat_16', 'ps_car_06_cat_17', 'ps_car_07_cat_-1', 'ps_car_07_cat_0',
 'ps_car_08_cat_0', 'ps_car_08_cat_1', 'ps_car_09_cat_-1', 'ps_car_09_cat_0',
 'ps_car_09_cat_1', 'ps_car_09_cat_3', 'ps_car_09_cat_4', 'ps_car_10_cat_0',
 'ps_car_10_cat_2', 'ps_car_11_cat_1', 'ps_car_11_cat_2', 'ps_car_11_cat_3',
 'ps_car_11_cat_5', 'ps_car_11_cat_6', 'ps_car_11_cat_7', 'ps_car_11_cat_8',
 'ps_car_11_cat_9', 'ps_car_11_cat_10', 'ps_car_11_cat_12', 'ps_car_11_cat_13',
 'ps_car_11_cat_14', 'ps_car_11_cat_15', 'ps_car_11_cat_16',
 'ps_car_11_cat_17', 'ps_car_11_cat_18', 'ps_car_11_cat_20',
 'ps_car_11_cat_21', 'ps_car_11_cat_22', 'ps_car_11_cat_23',
 'ps_car_11_cat_24', 'ps_car_11_cat_25', 'ps_car_11_cat_26',
 'ps_car_11_cat_27', 'ps_car_11_cat_28', 'ps_car_11_cat_29',
 'ps_car_11_cat_30', 'ps_car_11_cat_31', 'ps_car_11_cat_33',
 'ps_car_11_cat_34', 'ps_car_11_cat_35', 'ps_car_11_cat_36',
 'ps_car_11_cat_37', 'ps_car_11_cat_38', 'ps_car_11_cat_39',
 'ps_car_11_cat_40', 'ps_car_11_cat_41', 'ps_car_11_cat_42',
 'ps_car_11_cat_43', 'ps_car_11_cat_44', 'ps_car_11_cat_45',
 'ps_car_11_cat_46', 'ps_car_11_cat_47', 'ps_car_11_cat_48',
 'ps_car_11_cat_49', 'ps_car_11_cat_50', 'ps_car_11_cat_52',
 'ps_car_11_cat_53', 'ps_car_11_cat_54', 'ps_car_11_cat_56',
 'ps_car_11_cat_57', 'ps_car_11_cat_58', 'ps_car_11_cat_59',
 'ps_car_11_cat_60', 'ps_car_11_cat_61', 'ps_car_11_cat_62',
 'ps_car_11_cat_63', 'ps_car_11_cat_64', 'ps_car_11_cat_65',
 'ps_car_11_cat_66', 'ps_car_11_cat_67', 'ps_car_11_cat_69',
 'ps_car_11_cat_71', 'ps_car_11_cat_72', 'ps_car_11_cat_73',
 'ps_car_11_cat_74', 'ps_car_11_cat_75', 'ps_car_11_cat_76',
 'ps_car_11_cat_77', 'ps_car_11_cat_78', 'ps_car_11_cat_79',
 'ps_car_11_cat_80', 'ps_car_11_cat_81', 'ps_car_11_cat_82',
 'ps_car_11_cat_83', 'ps_car_11_cat_85', 'ps_car_11_cat_86',
 'ps_car_11_cat_87', 'ps_car_11_cat_88', 'ps_car_11_cat_89',
 'ps_car_11_cat_90', 'ps_car_11_cat_91', 'ps_car_11_cat_92',
 'ps_car_11_cat_93', 'ps_car_11_cat_94', 'ps_car_11_cat_95',
 'ps_car_11_cat_97', 'ps_car_11_cat_98', 'ps_car_11_cat_99',
 'ps_car_11_cat_100', 'ps_car_11_cat_101', 'ps_car_11_cat_102',
 'ps_car_11_cat_103', 'ps_car_11_cat_104']

In [131]:
x_test = df_test.drop(categorical, axis=1)
x_test.drop(x_test.columns[x_test.columns.str.startswith('ps_calc_')], axis=1, inplace=True)
x_test.drop(['id'], axis=1, inplace=True)

new_matrix_test = pfeatures.transform(x_test)
new_matrix_test.shape

(892816, 277)

In [132]:
res_test = pd.DataFrame(data=new_matrix_test, columns=pfeatures.get_feature_names())
res_test['id'] = df_test['id']

In [14]:
train = train_encoded[feats].copy()
test = test_encoded[feats].copy()

train['id'] = train_encoded['id']
train['target'] = train_encoded['target']
test['id'] = test_encoded['id']

print(train.shape)
print(test.shape)

(595212, 362)
(892816, 361)


In [15]:
#col_to_drop = train.columns[train.columns.str.startswith('ps_calc_')]
#train = train.drop(col_to_drop, axis=1)
#test = test.drop(col_to_drop, axis=1)

print('Changing dtypes...')
for c in train.select_dtypes(include=['float64']).columns:
    train[c]=train[c].astype(np.float32)
    test[c]=test[c].astype(np.float32)
    
for c in train.select_dtypes(include=['int64']).columns[2:]:
    train[c]=train[c].astype(np.int8)
    test[c]=test[c].astype(np.int8)  

print(train.shape, test.shape)

# xgb
params = {'eta': 0.02, 'max_depth': 4, 'subsample': 0.9, 'colsample_bytree': 0.9, 
          'objective': 'binary:logistic', 'eval_metric': 'auc', 'silent': True}

X = train.drop(['id', 'target'], axis=1)
features = X.columns
X = X.values
y = train['target'].values
sub = test['id'].to_frame()
sub['target'] = 0

nrounds=2000
kfold = 5
skf = StratifiedKFold(n_splits=kfold, random_state=0)
for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    print(' xgb kfold: {}  of  {} : '.format(i+1, kfold))
    X_train, X_valid = X[train_index], X[test_index]
    y_train, y_valid = y[train_index], y[test_index]
    d_train = xgb.DMatrix(X_train, y_train) 
    d_valid = xgb.DMatrix(X_valid, y_valid) 
    watchlist = [(d_train, 'train'), (d_valid, 'valid')]
    xgb_model = xgb.train(params, d_train, nrounds, watchlist, early_stopping_rounds=100, 
                          feval=gini_xgb, maximize=True, verbose_eval=100)
    sub['target'] += xgb_model.predict(xgb.DMatrix(test[feats].values), 
                        ntree_limit=xgb_model.best_ntree_limit+50) / (2*kfold)
gc.collect()
sub.head(2)

# lgb
params = {'metric': 'auc', 'learning_rate' : 0.01, 'max_depth':10, 'max_bin':10,  'objective': 'binary', 
          'feature_fraction': 0.8,'bagging_fraction':0.9,'bagging_freq':10,  'min_data': 500}

skf = StratifiedKFold(n_splits=kfold, random_state=1)
for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    print(' lgb kfold: {}  of  {} : '.format(i+1, kfold))
    X_train, X_eval = X[train_index], X[test_index]
    y_train, y_eval = y[train_index], y[test_index]
    lgb_model = lgb.train(params, lgb.Dataset(X_train, label=y_train), nrounds, 
                  lgb.Dataset(X_eval, label=y_eval), verbose_eval=100, 
                  feval=gini_lgb, early_stopping_rounds=100)
    sub['target'] += lgb_model.predict(test[feats].values, 
                        num_iteration=lgb_model.best_iteration) / (2*kfold)
    
sub.to_csv('~/Downloads/porto/result_xgb_meta_lgb_2polynomial_onehot.csv', index=False, float_format='%.5f') 
gc.collect()
sub.head(2)

Changing dtypes...
(595212, 362) (892816, 361)
 xgb kfold: 1  of  5 : 
[0]	train-gini:0.188512	valid-gini:0.188631
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gini hasn't improved in 100 rounds.
[100]	train-gini:0.258476	valid-gini:0.247011
[200]	train-gini:0.287071	valid-gini:0.263548
[300]	train-gini:0.307588	valid-gini:0.272301
[400]	train-gini:0.322812	valid-gini:0.277016
[500]	train-gini:0.335193	valid-gini:0.279021
[600]	train-gini:0.34612	valid-gini:0.280801
[700]	train-gini:0.356296	valid-gini:0.281567
[800]	train-gini:0.36488	valid-gini:0.282048
[900]	train-gini:0.373225	valid-gini:0.282125
Stopping. Best iteration:
[820]	train-gini:0.366648	valid-gini:0.282316

 xgb kfold: 2  of  5 : 
[0]	train-gini:0.190324	valid-gini:0.181326
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gini hasn't improved in 100 rounds.
[100]	train-gini:0.25826	valid-gini:0.24

Unnamed: 0,id,target
595212,0,0.028299
595213,1,0.02626


# Average

In [25]:
df1 = pd.read_csv('C:/Users/pedro.castanha/Downloads/porto/result_xgb_meta_lgb_meta_features_onehot.csv')
df2 = pd.read_csv('C:/Users/pedro.castanha/Downloads/porto/result_xgb_meta_lgb_meta_features.csv')
df3 = pd.read_csv('C:/Users/pedro.castanha/Downloads/porto/result_xgb_meta_lgb.csv')

In [20]:
df1['target'] = df1['target'] * 0.4
df2['target'] = df2['target'] * 0.3
df3['target'] = df3['target'] * 0.3

In [27]:
m1 = pd.merge(df1, df2, on=['id'])
m2 = pd.merge(m1, df3, on=['id'])

m2['avg'] = m2[['target_x', 'target_y', 'target']].mean(axis=1)

m2.rename(columns={'target': 'target_z', 'avg': 'target'}, inplace=True)
m2.head(3)

Unnamed: 0,id,target_x,target_y,target_z,target
0,0,0.02857,0.02567,0.02631,0.02685
1,1,0.02483,0.0257,0.02755,0.026027
2,2,0.02557,0.0258,0.02639,0.02592


In [28]:
m2[['id', 'target']].to_csv('~/Downloads/porto/result_avg_meta_onehot_base_features.csv', index=False, float_format='%.5f')