In [221]:
import gc
import math

import numpy as np
import pandas as pd
import xgboost as xgb
import lightgbm as lgb

import matplotlib.pyplot as plt

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.decomposition import PCA

from sklearn.metrics import classification_report

In [222]:
def gini(y, pred):
    assert(len(y) == len(pred))
    g = np.asarray(np.c_[y, pred, np.arange(len(y)) ], dtype=np.float)
    g = g[np.lexsort((g[:,2], -1*g[:,1]))]
    gs = g[:,0].cumsum().sum() / g[:,0].sum()
    gs -= (len(y) + 1) / 2.
    return gs / len(y)

def gini_xgb(pred, y):
    y = y.get_label()
    return 'gini', gini(y, pred) / gini(y, y)

def gini_lgb(preds, dtrain):
    y = list(dtrain.get_label())
    score = gini(y, preds) / gini(y, y)
    return 'gini', score, True

In [223]:
df = pd.read_csv('C:/Users/pedro.castanha/Downloads/porto/train.csv')

In [None]:
features = ['ps_ind_01', 'ps_ind_05_cat', 'ps_ind_06_bin', 'ps_ind_07_bin', 'ps_ind_08_bin', 'ps_ind_15', 'ps_ind_16_bin',
            'ps_ind_17_bin', 'ps_reg_01', 'ps_reg_02', 'ps_car_01_cat', 'ps_car_02_cat',
            'ps_car_04_cat', 'ps_car_06_cat', 'ps_car_07_cat', 'ps_car_08_cat', 'ps_car_12', 'ps_car_13',
            'ps_car_15', 'target']

'''features = ['ps_ind_01', 'ps_ind_05_cat', 'ps_ind_06_bin', 'ps_ind_07_bin', 'ps_ind_08_bin', 'ps_ind_15', 'ps_ind_16_bin',
            'ps_ind_17_bin', 'ps_reg_01', 'ps_reg_02', 'ps_car_01_cat', 'ps_car_02_cat',
            'ps_car_04_cat', 'ps_car_06_cat', 'ps_car_08_cat', 'ps_car_12', 'ps_car_13',
            'ps_car_15', 'target']'''

In [153]:
for col in df.columns:
    df[col] = df[col].apply(lambda x: np.nan if x == -1 else x)

In [155]:
categorical = []
for col in df.columns:
    if '_cat' in col:
        categorical.append(col)

In [162]:
X = df.drop(['ps_reg_03', 'ps_car_03_cat','ps_car_05_cat'], axis=1)

In [99]:
dropped = X.dropna()

In [5]:
res = []
for col in df.columns:
    if col not in features:
         res.append(col)

In [28]:
X = df.drop(res, axis=1)

In [62]:
for col in X.columns:
    X[col] = X[col].apply(lambda x: np.nan if x == -1 else x)
    if X[col].isnull().any():
        if 'cat' in col:
            print('cat')
            X[col].fillna(value=X[col].value_counts()[0], inplace=True)
        elif 'bin' in col:
            print('bin')
            X[col].fillna(value=0, inplace=True)
        else:
            print('number')
            X[col].fillna(value=X[col].mean(), inplace=True)

cat
cat
cat
cat
number


In [None]:
X.fillna(method='ffill', inplace=True)
X.info()

In [65]:
X_clean = X.copy()

In [10]:
categorical = []
for col in X_clean.columns:
    if 'cat' in col:
        categorical.append(col)

In [11]:
for val in categorical:
    print(X_clean[val].value_counts())

0.0    533216
6.0     20854
4.0     18548
1.0      8397
3.0      8307
2.0      4225
5.0      1665
Name: ps_ind_05_cat, dtype: int64
11.0    207618
7.0     179272
6.0      62403
10.0     50104
4.0      26179
9.0      20324
5.0      18143
8.0      15094
3.0       6660
0.0       5904
2.0       2144
1.0       1367
Name: ps_car_01_cat, dtype: int64
1.0    493992
0.0    101220
Name: ps_car_02_cat, dtype: int64
0    496581
1     32115
2     23770
8     20598
9     19034
6      1560
3       640
5       545
4       230
7       139
Name: ps_car_04_cat, dtype: int64
11    131527
1     118386
0     110420
14     59253
10     33466
4      31136
15     21732
6      20951
9      17617
7      16158
3      11997
13      6246
17      4935
16      4582
12      2386
2       1607
8       1412
5       1401
Name: ps_car_06_cat, dtype: int64
1.0    564047
0.0     31165
Name: ps_car_07_cat, dtype: int64
1    495264
0     99948
Name: ps_car_08_cat, dtype: int64


In [359]:
X_clean.loc[:, 'ps_car_04_cat'] = X_clean.loc[:, 'ps_car_04_cat'].apply(lambda x: 1 if x > 0 else x)
X_clean.loc[:, 'ps_ind_05_cat'] = X_clean.loc[:, 'ps_ind_05_cat'].apply(lambda x: 1 if x > 0 else x)

In [258]:
# Split into 2 models
X_clean_cat = X_clean.drop([x for x in X_clean.columns if x not in categorical], axis=1)
X_clean_cont = X_clean.drop([x for x in X_clean.columns if x in categorical], axis=1)

X_clean_cont['target'] = X_clean.target
X_clean_cat['target'] = X_clean.target

X_clean_cont_one = X_clean_cont[X_clean_cont.target == 1]
X_clean_cont_zero = X_clean_cont[X_clean_cont.target == 0]
X_clean_cat_one = X_clean_cat[X_clean_cat.target == 1]
X_clean_cat_zero = X_clean_cat[X_clean_cat.target == 0]

In [259]:
new_x_cont = pd.concat([X_clean_cont_zero.sample(n = math.floor(1.15 * X_clean_cont_one.shape[0]), 
                                                 random_state=10), X_clean_cont_one])
new_x_cat = pd.concat([X_clean_cat_zero.sample(n = math.floor(1.15 * X_clean_cat_one.shape[0]), 
                                               random_state=10), X_clean_cat_one])

y_cont = new_x_cont['target']
y_cat = new_x_cat['target']
X_cont = new_x_cont.drop(['target'], axis=1)
X_cat = new_x_cat.drop(['target'], axis=1)

# End split

In [12]:
X_one = X_clean[X_clean.target == 1]
X_zero = X_clean[X_clean.target == 0]

In [None]:
X_zero.reset_index(drop=True, inplace=True)
X_zero.head()

In [50]:
'''start = 0
end = X_one.shape[0]

model = GradientBoostingClassifier(warm_start=True)
model_rfc = RandomForestClassifier(warm_start=True, n_estimators=100)

for i in range(math.floor(X_zero.shape[0] / X_one.shape[0])):
    df = X_zero.iloc[start:end]
    new_x = pd.concat([df, X_one])
    y = new_x['target']
    X_clean = new_x.drop(['target'], axis=1)
    
    #X_train, X_test, y_train, y_test = train_test_split(X_clean, y, test_size=0.3, random_state=10)
    
    #model_rfc.fit(X_train, y_train)
    model_rfc.fit(X_clean, y)
    #y_pred = model_rfc.predict(X_test)
    
    #print(classification_report(y_test, y_pred))
    
    start = end
    end += X_one.shape[0]
    model_rfc.n_estimators += 10'''

In [78]:
new_x = pd.concat([X_zero.sample(n = math.floor(1.15 * X_one.shape[0]), random_state=10), X_one])

y = new_x['target']
X_clean = new_x.drop(['target'], axis=1)

In [66]:
# Using all samples without balancing the dataframe

y = X_clean['target']
X_clean = X_clean.drop(['target'], axis=1)

In [87]:
transpose = pd.get_dummies(X_clean, columns=[
'ps_car_01_cat',
'ps_car_02_cat',
'ps_car_04_cat',
'ps_car_06_cat',
'ps_car_07_cat',
'ps_car_08_cat',
'ps_ind_05_cat',])

In [111]:
pca = PCA(n_components=40)
pca.fit(transpose)
X_pca = pca.transform(transpose)

In [325]:
X_train, X_test, y_train, y_test = train_test_split(X_clean, y, test_size=0.30, random_state=10)
#X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.20, random_state=10)

# 2 Models training
#X_train_cont, X_test_cont, y_train_cont, y_test_cont = train_test_split(X_cont, y_cont, test_size=0.3, random_state=10)
#X_train_cat, X_test_cat, y_train_cat, y_test_cat = train_test_split(X_cat, y_cat, test_size=0.3, random_state=10)

In [196]:
model = GradientBoostingClassifier(n_estimators=400)
model_rfc = RandomForestClassifier(n_estimators=250, class_weight='balanced')
model_xgb = xgb.XGBClassifier(n_estimators=200)

# 2 Models
xgb_cont = xgb.XGBClassifier(n_estimators=200)
xgb_cat = xgb.XGBClassifier(n_estimators=200)

In [362]:
# Testing model class balance
# model_xgb = xgb.XGBClassifier(n_estimators=800, scale_pos_weight=26.4367106112289, base_score=0.5)
model_xgb = xgb.XGBClassifier(n_estimators=200, scale_pos_weight=1.15)

In [15]:
#model.fit(X_train, y_train)
#model.fit(X_clean, y)
#model_xgb.fit(X_train, y_train)
model_xgb.fit(X_clean, y)
#model_xgb.fit(X_pca, y)
#model_rfc.fit(X_train, y_train)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=200, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [286]:
xgb_cat.fit(X_train_cat, y_train_cat)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=200, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [287]:
xgb_cont.fit(X_train_cont, y_train_cont)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=200, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [328]:
#y_pred = model.predict(X_test)
y_pred_xgb = model_xgb.predict(X_test)
#y_pred_rfc = model_rfc.predict(X_test)

In [329]:
print(classification_report(y_test, y_pred_xgb))

             precision    recall  f1-score   support

          0       0.97      0.64      0.77    171887
          1       0.05      0.53      0.10      6677

avg / total       0.94      0.63      0.75    178564



In [116]:
print(classification_report(y_test, y_pred_xgb))

             precision    recall  f1-score   support

          0       0.59      0.73      0.65      4940
          1       0.58      0.43      0.50      4389

avg / total       0.59      0.59      0.58      9329



In [288]:
y_pred_cat = xgb_cat.predict(X_test_cat)
y_pred_cont = xgb_cont.predict(X_test_cont)

In [289]:
# 2 Models f1
print('Continuos model')
print(classification_report(y_test_cont, y_pred_cont))
print('Categorical model')
print(classification_report(y_test_cat, y_pred_cat))

Continuos model
             precision    recall  f1-score   support

          0       0.59      0.71      0.64      7440
          1       0.57      0.43      0.49      6553

avg / total       0.58      0.58      0.57     13993

Categorical model
             precision    recall  f1-score   support

          0       0.58      0.74      0.65      7440
          1       0.56      0.39      0.46      6553

avg / total       0.57      0.57      0.56     13993



In [191]:
# 2 Models f1
print('Continuos model')
print(classification_report(y_test_cont, y_pred_cont))
print('Categorical model')
print(classification_report(y_test_cat, y_pred_cat))

Continuos model
             precision    recall  f1-score   support

          0       0.59      0.71      0.65      6183
          1       0.58      0.45      0.51      5478

avg / total       0.59      0.59      0.58     11661

Categorical model
             precision    recall  f1-score   support

          0       0.58      0.74      0.65      6183
          1       0.57      0.39      0.46      5478

avg / total       0.58      0.58      0.56     11661



In [194]:
X_clean_cat.loc[X_test_cat.index, 'pred'] = y_pred_cat

In [201]:
miss = X_clean_cat[(~X_clean_cat.pred.isnull()) & (X_clean_cat.target != X_clean_cat.pred)]

In [211]:
g = miss.groupby(['ps_ind_05_cat','ps_car_01_cat','ps_car_02_cat','ps_car_04_cat',
                  'ps_car_06_cat','ps_car_07_cat','ps_car_08_cat']).count()

# Test

In [224]:
df_test = pd.read_csv('C:/Users/pedro.castanha/Downloads/porto/test.csv')

In [164]:
X_test = df_test.copy()

In [117]:
for col in X_test.columns:
    if 'cat' in col:
        X_test[col] = X_test[col].astype('int32')
        print(col)

ps_ind_02_cat
ps_ind_04_cat
ps_ind_05_cat
ps_car_01_cat
ps_car_02_cat
ps_car_03_cat
ps_car_04_cat
ps_car_05_cat
ps_car_06_cat
ps_car_07_cat
ps_car_08_cat
ps_car_09_cat
ps_car_10_cat
ps_car_11_cat


In [158]:
for col in X_test.columns:
    X_test[col] = X_test[col].apply(lambda x: np.nan if x == -1 else x)

In [165]:
X_test = X_test.drop(['ps_reg_03', 'ps_car_03_cat','ps_car_05_cat'], axis=1)

In [138]:
for col in X_test.columns:
    if X_test[col].isnull().any():
        if 'cat' in col:
            print(col)
            X_test[col].fillna(value=X_test[col].value_counts().index[0], inplace=True)
        elif 'bin' in col:
            print(col)
            X_test[col].fillna(value=0, inplace=True)
        else:
            print(col)
            X_test[col].fillna(value=X_test[col].mean(), inplace=True)

In [72]:
result = df_test.drop(res, axis=1)

In [120]:
transpose = pd.get_dummies(df_test, columns=[
'ps_car_01_cat',
'ps_car_02_cat',
'ps_car_04_cat',
'ps_car_06_cat',
'ps_car_07_cat',
'ps_car_08_cat',
'ps_ind_05_cat',])

In [122]:
pca_test = PCA(n_components=40)
X_test_pca = pca_test.fit_transform(transpose)

In [22]:
#y_pred = model.predict_proba(df_test)
y_pred = model_xgb.predict_proba(result)
#y_pred = model_xgb.predict_proba(X_test_pca)

In [None]:
response = pd.DataFrame()
response['id'] = df_test['id']
# response['target'] = pd.DataFrame(y_pred)[1]
response['target'] = pd.DataFrame(y_pred)
response

In [84]:
response.to_csv('C:/Users/pedro.castanha/Downloads/porto/result_xgb_dmatrix.csv', index=False)

# Experiments

In [80]:
dtrain = xgb.DMatrix(X_clean, label=y, missing=np.nan)

In [69]:
param = {}
# use logistic regression loss, use raw prediction before logistic transformation
# since we only need the rank
param['base_score'] = 0.5
param['colsample_bylevel'] = 1
param['colsample_bytree'] = 1
param['gamma'] = 0
param['min_child_weight'] = 1
param['n_estimators'] = 400,
param['reg_alpha'] = 0
param['reg_lambda'] = 1
param['subsample'] = 1
param['objective'] = 'binary:logistic'

# scale weight of positive examples
param['scale_pos_weight'] = 1
param['eta'] = 0.1
param['max_depth'] = 6
param['eval_metric'] = 'auc'
param['silent'] = 1

# you can directly throw param in, though we want to watch multiple metrics here
plst = list(param.items())+[('eval_metric', 'ams@0.15')]

In [81]:
bst = xgb.train(plst, dtrain)

In [82]:
y_pred = bst.predict(xgb.DMatrix(result, missing=-1))

## XGB Imp

In [145]:
train = df.copy()
test = df_test.copy()
col = [c for c in train.columns if c not in ['id','target']]
    
# Feature Selection by Lasso
print('Running Lasso..')
scaler = StandardScaler()
std_data = scaler.fit_transform(train[col].values)
clf = LogisticRegression(penalty='l1', C=0.1, random_state=42, solver='liblinear', n_jobs=1)
clf.fit(std_data, train['target'].values.reshape((-1,)))
imp_feats_ind = np.nonzero(clf.coef_[0])[0]
final_feats = np.array(col)[imp_feats_ind]
print('Lasso Completed!')
print('Total features selected are:', len(final_feats))
print('Features Selected:', final_feats)

Running Lasso..
Lasso Completed!
Total features selected are: 52
Features Selected: ['ps_ind_01' 'ps_ind_02_cat' 'ps_ind_03' 'ps_ind_04_cat' 'ps_ind_05_cat'
 'ps_ind_06_bin' 'ps_ind_07_bin' 'ps_ind_08_bin' 'ps_ind_10_bin'
 'ps_ind_11_bin' 'ps_ind_13_bin' 'ps_ind_14' 'ps_ind_15' 'ps_ind_16_bin'
 'ps_ind_17_bin' 'ps_ind_18_bin' 'ps_reg_01' 'ps_reg_02' 'ps_reg_03'
 'ps_car_01_cat' 'ps_car_02_cat' 'ps_car_03_cat' 'ps_car_04_cat'
 'ps_car_05_cat' 'ps_car_06_cat' 'ps_car_07_cat' 'ps_car_08_cat'
 'ps_car_09_cat' 'ps_car_10_cat' 'ps_car_11_cat' 'ps_car_11' 'ps_car_12'
 'ps_car_13' 'ps_car_14' 'ps_car_15' 'ps_calc_01' 'ps_calc_02' 'ps_calc_03'
 'ps_calc_05' 'ps_calc_08' 'ps_calc_09' 'ps_calc_10' 'ps_calc_11'
 'ps_calc_12' 'ps_calc_13' 'ps_calc_14' 'ps_calc_15_bin' 'ps_calc_16_bin'
 'ps_calc_17_bin' 'ps_calc_18_bin' 'ps_calc_19_bin' 'ps_calc_20_bin']


In [89]:
# XGBoost
params = {'eta': 0.02, 'max_depth': 4, 'subsample': 0.9, 'colsample_bytree': 0.9, 'objective': 'binary:logistic', 'eval_metric': 'auc', 'seed': 99, 'silent': True}
x1, x2, y1, y2 = train_test_split(train[final_feats], train['target'], test_size=0.25, random_state=99)
watchlist = [(xgb.DMatrix(x1, y1), 'train'), (xgb.DMatrix(x2, y2), 'valid')]
model = xgb.train(params, xgb.DMatrix(x1, y1), 5000,  watchlist, feval=gini_xgb, maximize=True, verbose_eval=10, early_stopping_rounds=100)
test['target'] = model.predict(xgb.DMatrix(test[final_feats]), ntree_limit=model.best_ntree_limit+50)

test[['id','target']].to_csv('submission.csv', index=False, float_format='%.5f')

[0]	train-gini:0.196449	valid-gini:0.185446
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gini hasn't improved in 100 rounds.
[10]	train-gini:0.233551	valid-gini:0.226048
[20]	train-gini:0.238338	valid-gini:0.229735
[30]	train-gini:0.239304	valid-gini:0.230396
[40]	train-gini:0.241244	valid-gini:0.232346
[50]	train-gini:0.242293	valid-gini:0.232407
[60]	train-gini:0.243121	valid-gini:0.232558
[70]	train-gini:0.24504	valid-gini:0.233833
[80]	train-gini:0.246582	valid-gini:0.235691
[90]	train-gini:0.247918	valid-gini:0.23659
[100]	train-gini:0.250004	valid-gini:0.237688
[110]	train-gini:0.252667	valid-gini:0.239756
[120]	train-gini:0.254918	valid-gini:0.24082
[130]	train-gini:0.257898	valid-gini:0.242319
[140]	train-gini:0.261359	valid-gini:0.245083
[150]	train-gini:0.264161	valid-gini:0.246618
[160]	train-gini:0.267826	valid-gini:0.249279
[170]	train-gini:0.27102	valid-gini:0.251058
[180]	train-gini:0.273552	valid-gini:0.25

### Testing with holes filled

In [170]:
complete = pd.concat([df, df_test])
complete.shape

(1488028, 59)

In [174]:
categories = [c for c in complete.columns if '_cat' in c]

In [176]:
indicators = pd.get_dummies(complete, columns=categories)

In [181]:
X_indicator = indicators.iloc[df.index]
test_indicator = indicators.iloc[df_test.index]

In [182]:
print(X_indicator.shape)
print(test_indicator.shape)

(595212, 229)
(892816, 229)


In [197]:
xgb_meta = xgb.XGBClassifier(max_depth=4, learning_rate=0.2, subsample=0.9, colsample_bytree=0.9, n_estimators=250, seed=99)

In [212]:
df_one = X_indicator[X_indicator.target == 1]
df_zero = X_indicator[X_indicator.target == 0]
meta_x = pd.concat([df_zero.sample(n = math.floor(1.15 * df_one.shape[0]), random_state=10), X_one])
meta_x.shape

(46642, 237)

In [214]:
xgb_meta.fit(meta_x[final_feats], meta_x['target'])

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.9,
       gamma=0, learning_rate=0.2, max_delta_step=0, max_depth=4,
       min_child_weight=1, missing=None, n_estimators=250, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=99, silent=True, subsample=0.9)

In [218]:
xgb_meta.predict_proba(indicators[final_feats])

array([[  9.99957383e-01,   4.26362458e-05],
       [  9.99957383e-01,   4.26362458e-05],
       [  9.99957383e-01,   4.26362458e-05],
       ..., 
       [  9.99957383e-01,   4.26362458e-05],
       [  9.99957383e-01,   4.26362458e-05],
       [  9.99957383e-01,   4.26362458e-05]], dtype=float32)

In [199]:
indicators['meta'] = xgb_meta.predict_proba(indicators[final_feats])

In [183]:
train = X_indicator.copy()
test = test_indicator.copy()
col = [c for c in train.columns if c not in ['id','target']]
    
# Feature Selection by Lasso
print('Running Lasso..')
scaler = StandardScaler()
std_data = scaler.fit_transform(train[col].values)
clf = LogisticRegression(penalty='l1', C=0.1, random_state=42, solver='liblinear', n_jobs=1)
clf.fit(std_data, train['target'].values.reshape((-1,)))
imp_feats_ind = np.nonzero(clf.coef_[0])[0]
final_feats = np.array(col)[imp_feats_ind]
print('Lasso Completed!')
print('Total features selected are:', len(final_feats))
print('Features Selected:', final_feats)

Running Lasso..
Lasso Completed!
Total features selected are: 198
Features Selected: ['ps_calc_01' 'ps_calc_02' 'ps_calc_03' 'ps_calc_05' 'ps_calc_08'
 'ps_calc_09' 'ps_calc_10' 'ps_calc_11' 'ps_calc_12' 'ps_calc_13'
 'ps_calc_14' 'ps_calc_15_bin' 'ps_calc_16_bin' 'ps_calc_17_bin'
 'ps_calc_18_bin' 'ps_calc_19_bin' 'ps_calc_20_bin' 'ps_car_11' 'ps_car_12'
 'ps_car_13' 'ps_car_15' 'ps_ind_01' 'ps_ind_03' 'ps_ind_06_bin'
 'ps_ind_07_bin' 'ps_ind_08_bin' 'ps_ind_10_bin' 'ps_ind_11_bin'
 'ps_ind_13_bin' 'ps_ind_14' 'ps_ind_15' 'ps_ind_16_bin' 'ps_ind_17_bin'
 'ps_ind_18_bin' 'ps_reg_01' 'ps_reg_02' 'ps_reg_03' 'ps_car_01_cat_-1'
 'ps_car_01_cat_0' 'ps_car_01_cat_1' 'ps_car_01_cat_2' 'ps_car_01_cat_3'
 'ps_car_01_cat_4' 'ps_car_01_cat_5' 'ps_car_01_cat_7' 'ps_car_01_cat_8'
 'ps_car_01_cat_9' 'ps_car_01_cat_10' 'ps_car_01_cat_11' 'ps_car_02_cat_-1'
 'ps_car_02_cat_0' 'ps_car_03_cat_-1' 'ps_car_03_cat_1' 'ps_car_04_cat_0'
 'ps_car_04_cat_1' 'ps_car_04_cat_2' 'ps_car_04_cat_3' 'ps_car_04_cat_4

In [194]:
pred = pd.read_csv('C:/Users/pedro.castanha/Downloads/porto/result_xgb_baseline_onehot.csv', usecols=['target'])

In [195]:
pred.shape

(892816, 1)

In [204]:
final_feats = np.append(final_feats, ['meta'])

In [206]:
X_indicator = indicators.iloc[df.index]
train = X_indicator.copy()
train.shape

(595212, 230)

In [207]:
# XGBoost
params = {'eta': 0.02, 'max_depth': 4, 'subsample': 0.9, 'colsample_bytree': 0.9, 'objective': 'binary:logistic', 
          'eval_metric': 'auc', 'seed': 99, 'silent': True}
x1, x2, y1, y2 = train_test_split(train[final_feats], train['target'], test_size=0.25, random_state=99)
watchlist = [(xgb.DMatrix(x1, y1), 'train'), (xgb.DMatrix(x2, y2), 'valid')]
model = xgb.train(params, xgb.DMatrix(x1, y1), 5000, watchlist, feval=gini_xgb, maximize=True, verbose_eval=10, early_stopping_rounds=100)

[0]	train-gini:0.200884	valid-gini:0.189385
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gini hasn't improved in 100 rounds.
[10]	train-gini:0.234085	valid-gini:0.22408
[20]	train-gini:0.237555	valid-gini:0.225962
[30]	train-gini:0.237636	valid-gini:0.225954
[40]	train-gini:0.238949	valid-gini:0.227963
[50]	train-gini:0.24259	valid-gini:0.23132
[60]	train-gini:0.242859	valid-gini:0.231644
[70]	train-gini:0.24303	valid-gini:0.231788
[80]	train-gini:0.24429	valid-gini:0.232913
[90]	train-gini:0.245171	valid-gini:0.23324
[100]	train-gini:0.246817	valid-gini:0.234797
[110]	train-gini:0.248734	valid-gini:0.236091
[120]	train-gini:0.252005	valid-gini:0.238499
[130]	train-gini:0.256489	valid-gini:0.241196
[140]	train-gini:0.260282	valid-gini:0.243891
[150]	train-gini:0.264443	valid-gini:0.247113
[160]	train-gini:0.266908	valid-gini:0.247986
[170]	train-gini:0.269786	valid-gini:0.250389
[180]	train-gini:0.272973	valid-gini:0.2524

In [208]:
predict_tmp = indicators[indicators.id.isin(df_test.id)]

In [185]:
test['target'] = model.predict(xgb.DMatrix(test[final_feats]), ntree_limit=model.best_ntree_limit+50)
test[['id','target']].to_csv('~/Downloads/porto/result_xgb_baseline_onehot.csv', index=False, float_format='%.5f')

In [211]:
predict_tmp['target'] = model.predict(xgb.DMatrix(predict_tmp[final_feats]), ntree_limit=model.best_ntree_limit+50)
predict_tmp[['id','target']].to_csv('~/Downloads/porto/result_xgb_baseline_onehot_meta.csv', index=False, float_format='%.5f')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


## Meta ensembling with XGB and LGB

In [226]:
train = df.copy()
test = df_test.copy()

col_to_drop = train.columns[train.columns.str.startswith('ps_calc_')]
train = train.drop(col_to_drop, axis=1)
test = test.drop(col_to_drop, axis=1)

for c in train.select_dtypes(include=['float64']).columns:
    train[c]=train[c].astype(np.float32)
    test[c]=test[c].astype(np.float32)
for c in train.select_dtypes(include=['int64']).columns[2:]:
    train[c]=train[c].astype(np.int8)
    test[c]=test[c].astype(np.int8)  

print(train.shape, test.shape)

# xgb
params = {'eta': 0.02, 'max_depth': 4, 'subsample': 0.9, 'colsample_bytree': 0.9, 
          'objective': 'binary:logistic', 'eval_metric': 'auc', 'silent': True}

X = train.drop(['id', 'target'], axis=1)
features = X.columns
X = X.values
y = train['target'].values
sub=test['id'].to_frame()
sub['target']=0

nrounds=2000  # need to change to 2000
kfold = 5  # need to change to 5
skf = StratifiedKFold(n_splits=kfold, random_state=0)
for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    print(' xgb kfold: {}  of  {} : '.format(i+1, kfold))
    X_train, X_valid = X[train_index], X[test_index]
    y_train, y_valid = y[train_index], y[test_index]
    d_train = xgb.DMatrix(X_train, y_train) 
    d_valid = xgb.DMatrix(X_valid, y_valid) 
    watchlist = [(d_train, 'train'), (d_valid, 'valid')]
    xgb_model = xgb.train(params, d_train, nrounds, watchlist, early_stopping_rounds=100, 
                          feval=gini_xgb, maximize=True, verbose_eval=100)
    sub['target'] += xgb_model.predict(xgb.DMatrix(test[features].values), 
                        ntree_limit=xgb_model.best_ntree_limit+50) / (2*kfold)
gc.collect()
sub.head(2)

# lgb
params = {'metric': 'auc', 'learning_rate' : 0.01, 'max_depth':10, 'max_bin':10,  'objective': 'binary', 
          'feature_fraction': 0.8,'bagging_fraction':0.9,'bagging_freq':10,  'min_data': 500}

skf = StratifiedKFold(n_splits=kfold, random_state=1)
for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    print(' lgb kfold: {}  of  {} : '.format(i+1, kfold))
    X_train, X_eval = X[train_index], X[test_index]
    y_train, y_eval = y[train_index], y[test_index]
    lgb_model = lgb.train(params, lgb.Dataset(X_train, label=y_train), nrounds, 
                  lgb.Dataset(X_eval, label=y_eval), verbose_eval=100, 
                  feval=gini_lgb, early_stopping_rounds=100)
    sub['target'] += lgb_model.predict(test[features].values, 
                        num_iteration=lgb_model.best_iteration) / (2*kfold)
    
sub.to_csv('~/Downloads/porto/result_xgb_meta_lgb.csv', index=False, float_format='%.5f') 
gc.collect()
sub.head(2)

(595212, 39) (892816, 38)
 xgb kfold: 1  of  5 : 
[0]	train-gini:0.19047	valid-gini:0.197622
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gini hasn't improved in 100 rounds.
[100]	train-gini:0.250015	valid-gini:0.243101
[200]	train-gini:0.27618	valid-gini:0.2585
[300]	train-gini:0.296144	valid-gini:0.270284
[400]	train-gini:0.309669	valid-gini:0.276664
[500]	train-gini:0.319834	valid-gini:0.279849
[600]	train-gini:0.32752	valid-gini:0.281409
[700]	train-gini:0.334553	valid-gini:0.282087
[800]	train-gini:0.340891	valid-gini:0.281927
Stopping. Best iteration:
[707]	train-gini:0.334978	valid-gini:0.282232

 xgb kfold: 2  of  5 : 
[0]	train-gini:0.192166	valid-gini:0.181254
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gini hasn't improved in 100 rounds.
[100]	train-gini:0.249937	valid-gini:0.232574
[200]	train-gini:0.275353	valid-gini:0.256817
[300]	train-gini:0

Unnamed: 0,id,target
0,0,0.02631
1,1,0.027545


### Averaging

In [228]:
df1 = pd.read_csv('C:/Users/pedro.castanha/Downloads/porto/result_xgb_baseline.csv')
df2 = pd.read_csv('C:/Users/pedro.castanha/Downloads/porto/result_xgb_baseline_onehot.csv')
df3 = pd.read_csv('C:/Users/pedro.castanha/Downloads/porto/result_xgb_meta_lgb.csv')

In [231]:
m1 = pd.merge(df1, df2, on=['id'])

In [232]:
m2 = pd.merge(m1, df3, on=['id'])
m2.head(3)

Unnamed: 0,id,target_x,target_y,target
0,0,0.02813,0.02975,0.02631
1,1,0.02827,0.02507,0.02755
2,2,0.02659,0.02842,0.02639


In [236]:
m2['avg'] = m2[['target_x', 'target_y', 'target']].mean(axis=1)

In [238]:
m2.rename(columns={'target': 'target_z', 'avg': 'target'}, inplace=True)
m2.head(3)

Unnamed: 0,id,target_x,target_y,target_z,target
0,0,0.02813,0.02975,0.02631,0.028063
1,1,0.02827,0.02507,0.02755,0.026963
2,2,0.02659,0.02842,0.02639,0.027133


In [239]:
m2[['id', 'target']].to_csv('C:/Users/pedro.castanha/Downloads/porto/result_xgb_avg.csv', index=False, float_format='%.5f')