In [1]:
import gc
import math

import numpy as np
import pandas as pd
import xgboost as xgb
import lightgbm as lgb

import matplotlib.pyplot as plt

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.decomposition import PCA

from sklearn.metrics import classification_report



In [88]:
def gini(y, pred):
    assert(len(y) == len(pred))
    g = np.asarray(np.c_[y, pred, np.arange(len(y)) ], dtype=np.float)
    g = g[np.lexsort((g[:,2], -1*g[:,1]))]
    gs = g[:,0].cumsum().sum() / g[:,0].sum()
    gs -= (len(y) + 1) / 2.
    return gs / len(y)

def gini_xgb(pred, y):
    y = y.get_label()
    return 'gini', gini(y, pred) / gini(y, y)

def gini_lgb(preds, dtrain):
    y = list(dtrain.get_label())
    score = gini(y, preds) / gini(y, y)
    return 'gini', score, True

In [77]:
def bucketize(df, cols):
    for col in cols:
        print('Processing ' + col)
        stats = df[col].describe()
        df[col + '_bkt'] = df.loc[:, col].apply(lambda x: 'G1' if x <= stats[-4] else 'G2' 
                                                if x <= stats[-3] else 'G3' if x <= stats[-2] else 'G4').values

In [87]:
#train = df_train.copy()
#test = df_test.copy()
#col = [c for c in train.columns if c not in ['id','target']]

def feature_selection(X, cols):
    # Feature Selection by Regularization
    print('Running Lasso..')
    
    scaler = StandardScaler()
    std_data = scaler.fit_transform(X[cols].values)
    clf = LogisticRegression(penalty='l1', C=0.1, random_state=42, solver='liblinear', n_jobs=-1)
    clf.fit(std_data, X['target'].values.reshape((-1,)))
    imp_feats_ind = np.nonzero(clf.coef_[0])[0]
    final_feats = np.array(cols)[imp_feats_ind]
    
    print('Completed!')
    print('Total features selected are:', len(final_feats))
    print('Features Selected:', final_feats)
    
    return final_feats

#final_feats = feature_selection(train, col)

In [75]:
df_train = pd.read_csv('C:/Users/pedro.castanha/Downloads/porto/train.csv')
df_test = pd.read_csv('C:/Users/pedro.castanha/Downloads/porto/test.csv')

## Model ensembling

In [82]:
categorical = [c for c in df_train.columns if '_cat' in c or '_bkt' in c]
len(categorical)

26

In [72]:
train_encoded.target = train_encoded.target.astype(np.int8)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [None]:
'''pfeatures = PolynomialFeatures(degree=2, interaction_only=True, include_bias=True)
join = pd.concat([df_train, df_test], ignore_index=True)
join.drop(join.columns[join.columns.str.startswith('ps_calc_')], axis=1, inplace=True)

x_categorical = join[categorical]
x_non_categorical = join[join.columns.difference(categorical)]

new_features = x_non_categorical.drop(['id', 'target'], axis=1)
new_matrix = pfeatures.fit_transform(new_features)
res = pd.DataFrame(data=new_matrix, columns=pfeatures.get_feature_names())
res['target'] = x_non_categorical['target']
res['id'] = x_non_categorical['id']

gc.collect()
encoded = pd.get_dummies(x_categorical, columns=categorical)

gc.collect()
final = pd.concat([res, encoded], axis=1)

train_encoded = final.loc[final.id.isin(df_train.id), ]
gc.collect()
test_encoded = final.loc[final.id.isin(df_test.id), ].drop(['target'], axis=1)

print(train_encoded.shape)
print(test_encoded.shape)'''

In [85]:
join = pd.concat([df_train, df_test], ignore_index=True)
join.drop(join.columns[join.columns.str.startswith('ps_calc_')], axis=1, inplace=True)
bucketize(join, cols)

encoded = pd.get_dummies(join, columns=categorical)
encoded.target = encoded.target.astype(np.int8, errors='ignore')
gc.collect()

train_encoded = encoded.loc[encoded.id.isin(df_train.id), ]
test_encoded = encoded.loc[encoded.id.isin(df_test.id), ].drop(['target'], axis=1)

print(train_encoded.shape)
print(test_encoded.shape)

Processing ps_ind_01
Processing ps_ind_03
Processing ps_ind_14
Processing ps_ind_15
Processing ps_reg_01
Processing ps_reg_02
Processing ps_reg_03
Processing ps_car_11
Processing ps_car_12
Processing ps_car_13
Processing ps_car_14
Processing ps_car_15
(595212, 252)
(892816, 251)


In [89]:
res_cols = [c for c in train_encoded.columns if c not in ['id','target']]
feats = feature_selection(train_encoded, res_cols)
feats = np.append(feats, ['id', 'target'])

Running Lasso..
Completed!
Total features selected are: 213
Features Selected: ['ps_car_11' 'ps_car_12' 'ps_car_13' 'ps_car_15' 'ps_ind_01'
 'ps_ind_06_bin' 'ps_ind_07_bin' 'ps_ind_08_bin' 'ps_ind_10_bin'
 'ps_ind_12_bin' 'ps_ind_13_bin' 'ps_ind_15' 'ps_ind_16_bin'
 'ps_ind_17_bin' 'ps_ind_18_bin' 'ps_reg_01' 'ps_reg_02' 'ps_reg_03'
 'ps_ind_02_cat_-1' 'ps_ind_02_cat_2' 'ps_ind_02_cat_3' 'ps_ind_02_cat_4'
 'ps_ind_04_cat_-1' 'ps_ind_04_cat_0' 'ps_ind_05_cat_-1' 'ps_ind_05_cat_0'
 'ps_ind_05_cat_2' 'ps_ind_05_cat_3' 'ps_ind_05_cat_4' 'ps_ind_05_cat_5'
 'ps_ind_05_cat_6' 'ps_car_01_cat_-1' 'ps_car_01_cat_0' 'ps_car_01_cat_1'
 'ps_car_01_cat_2' 'ps_car_01_cat_3' 'ps_car_01_cat_4' 'ps_car_01_cat_5'
 'ps_car_01_cat_7' 'ps_car_01_cat_8' 'ps_car_01_cat_9' 'ps_car_01_cat_10'
 'ps_car_01_cat_11' 'ps_car_02_cat_-1' 'ps_car_02_cat_0' 'ps_car_02_cat_1'
 'ps_car_03_cat_-1' 'ps_car_03_cat_1' 'ps_car_04_cat_0' 'ps_car_04_cat_1'
 'ps_car_04_cat_2' 'ps_car_04_cat_3' 'ps_car_04_cat_4' 'ps_car_04_cat_5'


In [None]:
train = train_encoded[feats].copy()
test = test_encoded.copy()

#train['ps_car_13_x_ps_ind_03'] = train['ps_car_13'] * train['ps_ind_03']
train['ps_car_13_x_ps_reg_03'] = train['ps_car_13'] * train['ps_reg_03']

#test['ps_car_13_x_ps_ind_03'] = test['ps_car_13'] * test['ps_ind_03']
test['ps_car_13_x_ps_reg_03'] = test['ps_car_13'] * test['ps_reg_03']

#X = train.drop(['id', 'target'], axis=1)
X = train[final]
features = X.columns
X = X.values
y = train['target'].values
sub = test['id'].to_frame()
sub['target'] = 0

print(X.shape)
print(len(features))

kfold = 4
nrounds = 2000

# lgb
params = {'metric': 'auc', 'learning_rate' : 0.01, 'max_depth':7, 'max_bin':10,  'objective': 'binary', 
          'feature_fraction': 0.8,'bagging_fraction':0.9,'bagging_freq':10,  'min_data': 500}

results = list()

skf = StratifiedKFold(n_splits=kfold, random_state=1)
for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    print(' lgb kfold: {}  of  {} : '.format(i + 1, kfold))
    X_train, X_eval = X[train_index], X[test_index]
    y_train, y_eval = y[train_index], y[test_index]
    lgb_model = lgb.train(params, lgb.Dataset(X_train, label=y_train), nrounds, 
                  lgb.Dataset(X_eval, label=y_eval), verbose_eval=100, 
                  feval=gini_lgb, early_stopping_rounds=100)
    
    results.append({'iter': i + 1, 'prediction': lgb_model.predict(test[features].values, 
                                                                   num_iteration=lgb_model.best_iteration)})
    
gc.collect()

In [168]:
dict(list(lgb_model.best_score.values())[0])['gini']

0.27398256580659952

In [146]:
w = [0.28982, 0.28377, 0.289779, 0.274796]

In [107]:
results[0]['prediction'] * w[0] / sum(w)

array([ 0.00775416,  0.00673922,  0.00654757, ...,  0.01083775,
        0.00569593,  0.00862744])

In [155]:
sub['target'] = 0
for i in range(len(w)):
    sub['target'] += results[i]['prediction'] * w[i] / sum(w)

In [157]:
sub.to_csv('~/Downloads/porto/result_lgb_4fold_ensemble_feature_selection_001.csv', index=False, float_format='%.5f')

In [156]:
sub.head(3)

Unnamed: 0,id,target
595212,0,0.02908
595213,1,0.025286
595214,2,0.024894


# Feature analysis

In [113]:
train.head(5)

Unnamed: 0,ps_car_11,ps_car_12,ps_car_13,ps_car_15,ps_ind_01,ps_ind_03,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_10_bin,...,ps_car_11_cat_96,ps_car_11_cat_97,ps_car_11_cat_98,ps_car_11_cat_99,ps_car_11_cat_101,ps_car_11_cat_103,ps_car_11_cat_104,id,target,ps_car_13_x_ps_reg_03
0,2,0.4,0.883679,3.605551,2,5,0,1,0,0,...,0,0,0,0,0,0,0,7,0.0,0.634544
1,3,0.316228,0.618817,2.44949,1,7,0,0,1,0,...,0,0,0,0,0,0,0,9,0.0,0.474062
2,1,0.316228,0.641586,3.316625,5,9,0,0,1,0,...,0,0,0,0,0,0,0,13,0.0,-0.641586
3,1,0.374166,0.542949,2.0,0,2,1,0,0,0,...,0,0,0,0,0,0,1,16,0.0,0.315425
4,3,0.31607,0.565832,2.0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,17,0.0,0.475728


In [90]:
rfc = RandomForestClassifier(n_estimators=100, class_weight='balanced')

In [117]:
x_tr, x_te, y_tr, y_te = train_test_split(train.drop(['id', 'target'], axis=1), train['target'], test_size=0.25, random_state=10)

In [92]:
x_tr, x_te, y_tr, y_te = train_test_split(train_encoded[feats].drop(['id', 'target'], axis=1), train_encoded['target'], test_size=0.25, random_state=10)

In [93]:
rfc.fit(x_tr, y_tr)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_split=1e-07,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [97]:
tmp = rfc.feature_importances_
tmp_2 = list(zip(list(train_encoded[feats].drop(['id', 'target'], axis=1).columns), list(tmp)))
tmp_2

[('ps_car_11', 0.012648171052023267),
 ('ps_car_12', 0.026946682713697729),
 ('ps_car_13', 0.076785732142168106),
 ('ps_car_15', 0.03609475990634612),
 ('ps_ind_01', 0.032203239768109328),
 ('ps_ind_06_bin', 0.010120815572846047),
 ('ps_ind_07_bin', 0.0094729029192570507),
 ('ps_ind_08_bin', 0.0084548305313338606),
 ('ps_ind_10_bin', 3.8876685972299775e-05),
 ('ps_ind_12_bin', 0.00089665796675703448),
 ('ps_ind_13_bin', 9.0753195701608215e-05),
 ('ps_ind_15', 0.047844831397844081),
 ('ps_ind_16_bin', 0.01084191903192846),
 ('ps_ind_17_bin', 0.0078925660494098115),
 ('ps_ind_18_bin', 0.00741524641843932),
 ('ps_reg_01', 0.034372454134408037),
 ('ps_reg_02', 0.041054193752387294),
 ('ps_reg_03', 0.067357406127983535),
 ('ps_ind_02_cat_-1', 0.00017130647934937075),
 ('ps_ind_02_cat_2', 0.011519352922207841),
 ('ps_ind_02_cat_3', 0.0048229298269371204),
 ('ps_ind_02_cat_4', 0.0022961932184484423),
 ('ps_ind_04_cat_-1', 8.0525551723661175e-05),
 ('ps_ind_04_cat_0', 0.014107738937208238),
 (

In [151]:
res = []
for e in tmp_2:
    if e[1] >= .001:
        res.append(e)
        
print(len(res))

107


In [152]:
final = [f[0] for f in res]

In [153]:
final

['ps_car_11',
 'ps_car_12',
 'ps_car_13',
 'ps_car_15',
 'ps_ind_01',
 'ps_ind_03',
 'ps_ind_06_bin',
 'ps_ind_07_bin',
 'ps_ind_08_bin',
 'ps_ind_14',
 'ps_ind_15',
 'ps_ind_16_bin',
 'ps_ind_17_bin',
 'ps_ind_18_bin',
 'ps_reg_01',
 'ps_reg_02',
 'ps_reg_03',
 'ps_ind_02_cat_2',
 'ps_ind_02_cat_3',
 'ps_ind_02_cat_4',
 'ps_ind_04_cat_0',
 'ps_ind_04_cat_1',
 'ps_ind_05_cat_0',
 'ps_ind_05_cat_2',
 'ps_ind_05_cat_3',
 'ps_ind_05_cat_4',
 'ps_ind_05_cat_6',
 'ps_car_01_cat_0',
 'ps_car_01_cat_3',
 'ps_car_01_cat_4',
 'ps_car_01_cat_5',
 'ps_car_01_cat_7',
 'ps_car_01_cat_8',
 'ps_car_01_cat_9',
 'ps_car_01_cat_10',
 'ps_car_01_cat_11',
 'ps_car_02_cat_0',
 'ps_car_02_cat_1',
 'ps_car_03_cat_-1',
 'ps_car_03_cat_1',
 'ps_car_04_cat_0',
 'ps_car_04_cat_1',
 'ps_car_04_cat_2',
 'ps_car_04_cat_9',
 'ps_car_05_cat_-1',
 'ps_car_05_cat_0',
 'ps_car_06_cat_0',
 'ps_car_06_cat_1',
 'ps_car_06_cat_3',
 'ps_car_06_cat_4',
 'ps_car_06_cat_6',
 'ps_car_06_cat_7',
 'ps_car_06_cat_9',
 'ps_car_06_ca

In [171]:
pca = PCA(n_components='mle', svd_solver='full')

In [173]:
pca.fit(train[final])

PCA(copy=True, iterated_power='auto', n_components='mle', random_state=None,
  svd_solver='full', tol=0.0, whiten=False)

In [174]:
pca.n_components_

106

In [181]:
print(pca.noise_variance_)

2.61133278506e-30


In [12]:
cols = [c for c in df_train.columns if 'calc' not in c if 'cat' not in c if 'bin' not in c and c not in ['id', 'target']]
cols

['ps_ind_01',
 'ps_ind_03',
 'ps_ind_14',
 'ps_ind_15',
 'ps_reg_01',
 'ps_reg_02',
 'ps_reg_03',
 'ps_car_11',
 'ps_car_12',
 'ps_car_13',
 'ps_car_14',
 'ps_car_15']

In [14]:
num = df_train[cols]
num

Unnamed: 0,ps_ind_01,ps_ind_03,ps_ind_14,ps_ind_15,ps_reg_01,ps_reg_02,ps_reg_03,ps_car_11,ps_car_12,ps_car_13,ps_car_14,ps_car_15
0,2,5,0,11,0.7,0.2,0.718070,2,0.400000,0.883679,0.370810,3.605551
1,1,7,0,3,0.8,0.4,0.766078,3,0.316228,0.618817,0.388716,2.449490
2,5,9,0,12,0.0,0.0,-1.000000,1,0.316228,0.641586,0.347275,3.316625
3,0,2,0,8,0.9,0.2,0.580948,1,0.374166,0.542949,0.294958,2.000000
4,0,0,0,9,0.7,0.6,0.840759,3,0.316070,0.565832,0.365103,2.000000
5,5,4,0,6,0.9,1.8,2.332649,2,0.445982,0.879049,0.406202,3.000000
6,2,3,0,8,0.6,0.1,0.617454,2,0.316228,0.639683,0.368782,3.162278
7,5,4,0,13,0.7,0.4,0.607248,3,0.446990,0.900574,0.374833,3.316625
8,5,3,0,6,0.9,0.7,0.901388,3,0.400000,0.780641,0.407431,2.828427
9,1,2,0,4,0.9,1.4,2.316652,2,0.447214,1.458184,0.390256,3.605551


In [19]:
for c in num.columns:
    print('Column: ' + c + ' | Max: ' + str(num[c].max()) + ' Min: ' + str(num[c].min()))

Column: ps_ind_01 | Max: 7 Min: 0
Column: ps_ind_03 | Max: 11 Min: 0
Column: ps_ind_14 | Max: 4 Min: 0
Column: ps_ind_15 | Max: 13 Min: 0
Column: ps_reg_01 | Max: 0.9 Min: 0.0
Column: ps_reg_02 | Max: 1.8 Min: 0.0
Column: ps_reg_03 | Max: 4.0379450219 Min: -1.0
Column: ps_car_11 | Max: 3 Min: -1
Column: ps_car_12 | Max: 1.2649110641 Min: -1.0
Column: ps_car_13 | Max: 3.7206260026 Min: 0.2506190682
Column: ps_car_14 | Max: 0.6363961031 Min: -1.0
Column: ps_car_15 | Max: 3.7416573868 Min: 0.0


In [23]:
num.describe()

Unnamed: 0,ps_ind_01,ps_ind_03,ps_ind_14,ps_ind_15,ps_reg_01,ps_reg_02,ps_reg_03,ps_car_11,ps_car_12,ps_car_13,ps_car_14,ps_car_15
count,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0
mean,1.900378,4.423318,0.012451,7.299922,0.610991,0.439184,0.551102,2.346072,0.379945,0.813265,0.276256,3.065899
std,1.983789,2.699902,0.127545,3.546042,0.287643,0.404264,0.793506,0.832548,0.058327,0.224588,0.357154,0.731366
min,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-1.0,-1.0,0.250619,-1.0,0.0
25%,0.0,2.0,0.0,5.0,0.4,0.2,0.525,2.0,0.316228,0.670867,0.333167,2.828427
50%,1.0,4.0,0.0,7.0,0.7,0.3,0.720677,3.0,0.374166,0.765811,0.368782,3.316625
75%,3.0,6.0,0.0,10.0,0.9,0.6,1.0,3.0,0.4,0.90619,0.396485,3.605551
max,7.0,11.0,4.0,13.0,0.9,1.8,4.037945,3.0,1.264911,3.720626,0.636396,3.741657


In [22]:
d = num['ps_ind_01'].describe()
d

count    595212.000000
mean          1.900378
std           1.983789
min           0.000000
25%           0.000000
50%           1.000000
75%           3.000000
max           7.000000
Name: ps_ind_01, dtype: float64

In [76]:
bucketize(df_train, cols)

In [79]:
df_train[[c + '_bkt' for c in cols]]

Unnamed: 0,ps_ind_01_bkt,ps_ind_03_bkt,ps_ind_14_bkt,ps_ind_15_bkt,ps_reg_01_bkt,ps_reg_02_bkt,ps_reg_03_bkt,ps_car_11_bkt,ps_car_12_bkt,ps_car_13_bkt,ps_car_14_bkt,ps_car_15_bkt
0,G3,G3,G1,G4,G2,G1,G2,G1,G3,G3,G3,G3
1,G2,G4,G1,G1,G3,G3,G3,G2,G1,G1,G3,G1
2,G4,G4,G1,G4,G1,G1,G1,G1,G1,G1,G2,G2
3,G1,G1,G1,G3,G3,G1,G2,G1,G2,G1,G1,G1
4,G1,G1,G1,G3,G2,G3,G3,G2,G1,G1,G2,G1
5,G4,G2,G1,G2,G3,G4,G4,G1,G4,G3,G4,G2
6,G3,G2,G1,G3,G2,G1,G2,G1,G1,G1,G2,G2
7,G4,G2,G1,G4,G2,G3,G2,G2,G4,G3,G3,G2
8,G4,G2,G1,G2,G3,G4,G3,G2,G3,G3,G4,G1
9,G2,G1,G1,G1,G3,G4,G4,G1,G4,G4,G3,G3
