In [1]:
import lightgbm as lgbm
from scipy import sparse as ssp
from sklearn.model_selection import StratifiedKFold
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

import warnings
warnings.filterwarnings('ignore')

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
def Gini(y_true, y_pred):
    
    # check and get number of samples
    assert y_true.shape == y_pred.shape
    n_samples = y_true.shape[0]
    
    # sort rows on prediction column
    # (from largest to smallest)
    arr = np.array([y_true, y_pred]).T
    true_order = arr[arr[:, 0].argsort()][::-1, 0]
    pred_order = arr[arr[:, 0].argsort()][::-1, 0]
    
    # get Lorenz curves
    L_true = np.cumsum(true_order) * 1. / np.sum(true_order)
    L_pred = np.cumsum(pred_order) * 1. / np.sum(pred_order)
    L_ones = np.linspace(1/n_samples, 1, n_samples)
    
    # get Gini coeff (area between curves)
    G_true = np.sum(L_ones - L_true)
    G_pred = np.sum(L_ones - L_pred)
    
    # normalize to true Gini coeff
    return G_pred * 1. / G_true

In [3]:
cv_only = True
save_cv = True
full_train = False

In [4]:
def evalerror(preds, dtrain):
    labels = dtrain.get_labels()
    return 'gini', Gini(labels, preds), True

In [5]:
%%time
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
sample_submission = pd.read_csv('data/sample_submission.csv')

Wall time: 5.19 s


In [6]:
train_label = train['target']
train_id = train['id']
test_id = test['id']

In [7]:
NFOLDS = 5
kfold = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=42)

In [8]:
y = train['target'].values

drop_feature = ['id', 'target']
X = train.drop(drop_feature, axis=1)

feature_names = X.columns.tolist()

In [9]:
cat_feature_names = [c for c in feature_names if ('cat' in c and 'count' not in c)]
cat_feature_names

['ps_ind_02_cat',
 'ps_ind_04_cat',
 'ps_ind_05_cat',
 'ps_car_01_cat',
 'ps_car_02_cat',
 'ps_car_03_cat',
 'ps_car_04_cat',
 'ps_car_05_cat',
 'ps_car_06_cat',
 'ps_car_07_cat',
 'ps_car_08_cat',
 'ps_car_09_cat',
 'ps_car_10_cat',
 'ps_car_11_cat']

In [10]:
num_features = [c for c in feature_names if ('cat' not in c and 'calc' not in c)]
num_features

['ps_ind_01',
 'ps_ind_03',
 'ps_ind_06_bin',
 'ps_ind_07_bin',
 'ps_ind_08_bin',
 'ps_ind_09_bin',
 'ps_ind_10_bin',
 'ps_ind_11_bin',
 'ps_ind_12_bin',
 'ps_ind_13_bin',
 'ps_ind_14',
 'ps_ind_15',
 'ps_ind_16_bin',
 'ps_ind_17_bin',
 'ps_ind_18_bin',
 'ps_reg_01',
 'ps_reg_02',
 'ps_reg_03',
 'ps_car_11',
 'ps_car_12',
 'ps_car_13',
 'ps_car_14',
 'ps_car_15']

In [11]:
train['missing'] = (train==-1).sum(axis=1).astype(float)
test['missing'] = (test==-1).sum(axis=1).astype(float)

In [12]:
num_features.append('missing')
num_features

['ps_ind_01',
 'ps_ind_03',
 'ps_ind_06_bin',
 'ps_ind_07_bin',
 'ps_ind_08_bin',
 'ps_ind_09_bin',
 'ps_ind_10_bin',
 'ps_ind_11_bin',
 'ps_ind_12_bin',
 'ps_ind_13_bin',
 'ps_ind_14',
 'ps_ind_15',
 'ps_ind_16_bin',
 'ps_ind_17_bin',
 'ps_ind_18_bin',
 'ps_reg_01',
 'ps_reg_02',
 'ps_reg_03',
 'ps_car_11',
 'ps_car_12',
 'ps_car_13',
 'ps_car_14',
 'ps_car_15',
 'missing']

In [13]:
for c in cat_feature_names:
    le = LabelEncoder()
    le.fit(train[c])
    train[c] = le.transform(train[c])
    test[c] = le.transform(test[c])

LabelEncoder()

LabelEncoder()

LabelEncoder()

LabelEncoder()

LabelEncoder()

LabelEncoder()

LabelEncoder()

LabelEncoder()

LabelEncoder()

LabelEncoder()

LabelEncoder()

LabelEncoder()

LabelEncoder()

LabelEncoder()

In [14]:
enc = OneHotEncoder()
enc.fit(train[cat_feature_names])
X_cat = enc.transform(train[cat_feature_names])
X_t_cat = enc.transform(test[cat_feature_names])

OneHotEncoder(categories='auto', drop=None, dtype=<class 'numpy.float64'>,
              handle_unknown='error', sparse=True)

In [15]:
ind_feature = [c for c in feature_names if 'ind' in c]
ind_feature

['ps_ind_01',
 'ps_ind_02_cat',
 'ps_ind_03',
 'ps_ind_04_cat',
 'ps_ind_05_cat',
 'ps_ind_06_bin',
 'ps_ind_07_bin',
 'ps_ind_08_bin',
 'ps_ind_09_bin',
 'ps_ind_10_bin',
 'ps_ind_11_bin',
 'ps_ind_12_bin',
 'ps_ind_13_bin',
 'ps_ind_14',
 'ps_ind_15',
 'ps_ind_16_bin',
 'ps_ind_17_bin',
 'ps_ind_18_bin']

In [16]:
count = 0

In [17]:
for c in ind_feature:
    if count == 0:
        train['new_ind'] = train[c].astype(str)+'_'
        test['new_ind'] = test[c].astype(str)+'_'
    else:
        train['new_ind'] += train[c].astype(str)+'_'
        test['new_ind'] += test[c].astype(str)+'_'

In [19]:
cat_count_features = []
for c in cat_feature_names+['new_ind']:
    d = pd.concat([train[c], test[c]]).value_counts().to_dict()
    train['%s_count'%c] = train[c].apply(lambda x:d.get(x,0))
    train['%s_count'%c] = train[c].apply(lambda x:d.get(x,0))
    cat_count_features.append('%s_count'%c)

In [22]:
train_list = [train[num_features+cat_count_features].values, X_cat, ]
test_list = [test[num_features+cat_count_features].values, X_t_cat, ]

KeyError: "['ps_car_03_cat_count', 'ps_ind_04_cat_count', 'ps_car_04_cat_count', 'ps_car_10_cat_count', 'ps_ind_05_cat_count', 'ps_car_06_cat_count', 'ps_car_08_cat_count', 'ps_ind_02_cat_count', 'new_ind_count', 'ps_car_05_cat_count', 'ps_car_02_cat_count', 'ps_car_09_cat_count', 'ps_car_11_cat_count', 'ps_car_07_cat_count', 'ps_car_01_cat_count'] not in index"