In [1]:
%matplotlib inline

import matplotlib
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

from datetime import datetime, timedelta, date
import calendar
from sys import getsizeof
import math, collections, io, re

import pandas as pd
import numpy as np
import scipy
from scipy.sparse import *
from scipy.spatial.distance import euclidean, seuclidean
from sklearn.cluster import KMeans, DBSCAN
from sklearn.preprocessing import normalize

from scipy.sparse import lil_matrix, csc_matrix, csr_matrix, coo_matrix
from scipy.sparse import hstack, vstack

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from scipy.stats import skew

import pickle
import yaml
import json
import sys, getopt, fnmatch, gc

import lightgbm as lgb
import xgboost as xgb
from sklearn.model_selection import train_test_split

import collections

# Raise pandas' display limits so frames up to 200 rows / 200 columns print untruncated.
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 200)
# Silence NumPy's divide-by-zero and invalid-operation floating-point warnings.
# Kept as the cell's final expression, so the previous error settings are echoed.
np.seterr(divide='ignore', invalid='ignore')



{'divide': 'warn', 'invalid': 'warn', 'over': 'warn', 'under': 'ignore'}

# Load data

In [3]:
# Read the sample submission, the training set and the test set, in that order.
submission, origin_train, origin_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        './input/sample_submission_sLex1ul.csv',
        './input/train_ZoGVYWq.csv',
        './input/test_66516Ee.csv',
    )
)

# Data preparation

In [4]:
# One-hot encode `sourcing_channel` and append the indicator columns to each frame.
# `pd.concat(..., join_axes=...)` was deprecated in pandas 0.25 and removed in 1.0.
# The dummies are derived from the frame itself, so they already share its index and
# a plain column-wise concat aligns the rows without re-indexing.
# NOTE: this cell is not idempotent - re-running it appends the columns again.
origin_train = pd.concat([origin_train, pd.get_dummies(origin_train['sourcing_channel'])], axis=1)
origin_test = pd.concat([origin_test, pd.get_dummies(origin_test['sourcing_channel'])], axis=1)

In [5]:
# One-hot encode `residence_area_type` and append the indicator columns to each frame.
# `pd.concat(..., join_axes=...)` was deprecated in pandas 0.25 and removed in 1.0.
# The dummies are derived from the frame itself, so they already share its index and
# a plain column-wise concat aligns the rows without re-indexing.
# NOTE: this cell is not idempotent - re-running it appends the columns again.
origin_train = pd.concat([origin_train, pd.get_dummies(origin_train['residence_area_type'])], axis=1)
origin_test = pd.concat([origin_test, pd.get_dummies(origin_test['residence_area_type'])], axis=1)

In [6]:
def _total_late_count(frame):
    """Return the row-wise sum of the three late-payment count columns of `frame`.

    Plain `+` is used on purpose: a missing value in any of the three columns
    makes the total missing as well.
    """
    late_3_to_6 = frame['Count_3-6_months_late']
    late_6_to_12 = frame['Count_6-12_months_late']
    late_over_12 = frame['Count_more_than_12_months_late']
    return late_3_to_6 + late_6_to_12 + late_over_12


origin_train['total_count'] = _total_late_count(origin_train)
origin_test['total_count'] = _total_late_count(origin_test)

In [7]:
# Impute missing underwriting scores in BOTH frames with the mean of the training
# column, so the test frame is filled with a statistic computed from training data.
application_underwriting_score_mean = origin_train['application_underwriting_score'].mean()

# Assign the filled column back instead of calling `fillna(inplace=True)` on the
# attribute-accessed column: the in-place form mutates an intermediate Series and is
# not guaranteed to write through to the parent frame (it does not under pandas
# copy-on-write), which would silently leave the NaNs in place.
origin_train['application_underwriting_score'] = (
    origin_train['application_underwriting_score'].fillna(application_underwriting_score_mean)
)
origin_test['application_underwriting_score'] = (
    origin_test['application_underwriting_score'].fillna(application_underwriting_score_mean)
)

# Create matrices

In [9]:
# Columns fed to both models. The order matters: it defines the column order of the
# sparse matrices and the feature names handed to XGBoost.
full_feature_names = (
    ['perc_premium_paid_by_cash_credit', 'age_in_days', 'Income']
    + ['Count_3-6_months_late', 'Count_6-12_months_late', 'Count_more_than_12_months_late']
    + ['application_underwriting_score', 'no_of_premiums_paid']
    + ['A', 'B', 'C', 'D', 'E']  # presumably the sourcing_channel indicator columns - confirm
    + ['Rural', 'Urban']  # presumably the residence_area_type indicator columns - confirm
    + ['total_count']
    + ['premium']
)

In [10]:
# Pack the selected feature columns of each frame into a SciPy CSR matrix.
# Cast to float64 first: newer pandas versions emit boolean indicator columns from
# `get_dummies`, and a frame mixing bool with numeric columns converts to an
# object-dtype array, which `csr_matrix` cannot store. With all-numeric columns the
# cast is a no-op in effect, because the float score column already upcasts the block.
full_train_sparse = csr_matrix(origin_train[full_feature_names].astype(np.float64))
full_test_sparse = csr_matrix(origin_test[full_feature_names].astype(np.float64))

# lightgbm

In [11]:
def fit_predict(data, y, test):
    """Train LightGBM with 7-fold cross-validation; return out-of-fold and test predictions.

    Parameters
    ----------
    data : row-sliceable 2-D feature matrix used for training (must support
        ``data[idx, :]``; the notebook passes a SciPy CSR matrix).
    y : binary target with one value per row of ``data``.
    test : feature matrix to score, with the same columns as ``data``.

    Returns
    -------
    oof_preds : numpy.ndarray
        For every training row, the prediction of the fold model that did not train on it.
    sub_preds : numpy.ndarray
        Average over the fold models of their predictions on ``test``.
    """
    # Keep the raw data alive so per-fold subsets can be cut from one constructed Dataset.
    dtrain = lgb.Dataset(data=data, label=y, free_raw_data=False)
    dtrain.construct()

    oof_preds = np.zeros(data.shape[0])
    sub_preds = np.zeros(test.shape[0])

    # Tried and left disabled: "metric": "auc", 'max_depth': 3, 'lambda_l1': 10, 'lambda_l2': 10.
    lgb_params = {
        "objective" : "binary",
        "metric" : "binary_logloss",

        "num_leaves": 10,
        "min_data_in_leaf": 10,
        "learning_rate": 0.01,

        "feature_fraction": 0.3,
        "feature_fraction_seed": 10,

        "bagging_fraction": 0.8,
        "bagging_freq" : 10,
        "bagging_seed" : 42, #2018

        "verbosity" : 1,
        'max_bin' : 50
    }

    num_boost_round = 10000
    stopping_rounds = 100
    log_period = 50

    # LightGBM 4.0 removed the `early_stopping_rounds` / `verbose_eval` keyword arguments
    # of `lgb.train` in favour of callbacks. `log_evaluation` only exists in releases that
    # ship the callback API, so its presence tells us which form to use.
    use_callbacks = hasattr(lgb, 'log_evaluation')

    folds = KFold(n_splits=7, shuffle=True, random_state=2)

    for fold_no, (trn_idx, val_idx) in enumerate(folds.split(data), start=1):
        print('----------------------------')
        print('Fold: %d' % fold_no)

        trn_d = dtrain.subset(trn_idx)
        val_d = dtrain.subset(val_idx)

        if use_callbacks:
            # Build fresh callbacks for every fold: the early-stopping callback keeps
            # per-run state and must not be shared between training runs.
            stopping_kwargs = {
                'callbacks': [
                    lgb.early_stopping(stopping_rounds=stopping_rounds),
                    lgb.log_evaluation(period=log_period),
                ]
            }
        else:
            stopping_kwargs = {
                'early_stopping_rounds': stopping_rounds,
                'verbose_eval': log_period,
            }

        clf = lgb.train(
            params=lgb_params,
            train_set=trn_d,
            valid_sets=[trn_d, val_d],
            num_boost_round=num_boost_round,
            **stopping_kwargs
        )

        # Slice the caller's matrix directly instead of reaching into the Dataset object.
        oof_preds[val_idx] = clf.predict(data[val_idx, :])
        # Each fold contributes an equal share of the final test prediction.
        sub_preds += clf.predict(test) / folds.n_splits

    # The reported "score" is the RMSE between the labels and the out-of-fold probabilities.
    print('Full Out-Of-Fold score : %9.6f' % (mean_squared_error(y, oof_preds)**0.5))

    return oof_preds, sub_preds

In [12]:
# Cross-validate LightGBM on the training matrix and score the test matrix.
train_renewal_preds, test_renewal_preds = fit_predict(
    data=full_train_sparse,
    y=origin_train['renewal'],
    test=full_test_sparse,
)

----------------------------
Fold: 1
Training until validation scores don't improve for 100 rounds.
[50]	training's binary_logloss: 0.4321	valid_1's binary_logloss: 0.430339
[100]	training's binary_logloss: 0.312031	valid_1's binary_logloss: 0.309161
[150]	training's binary_logloss: 0.251775	valid_1's binary_logloss: 0.248142
[200]	training's binary_logloss: 0.219619	valid_1's binary_logloss: 0.215519
[250]	training's binary_logloss: 0.202765	valid_1's binary_logloss: 0.19838
[300]	training's binary_logloss: 0.192905	valid_1's binary_logloss: 0.188443
[350]	training's binary_logloss: 0.187307	valid_1's binary_logloss: 0.182879
[400]	training's binary_logloss: 0.184005	valid_1's binary_logloss: 0.179667
[450]	training's binary_logloss: 0.181669	valid_1's binary_logloss: 0.17748
[500]	training's binary_logloss: 0.180279	valid_1's binary_logloss: 0.176312
[550]	training's binary_logloss: 0.179179	valid_1's binary_logloss: 0.175434
[600]	training's binary_logloss: 0.178414	valid_1's binary

----------------------------
Fold: 4
Training until validation scores don't improve for 100 rounds.
[50]	training's binary_logloss: 0.432014	valid_1's binary_logloss: 0.430743
[100]	training's binary_logloss: 0.311925	valid_1's binary_logloss: 0.309838
[150]	training's binary_logloss: 0.251748	valid_1's binary_logloss: 0.248872
[200]	training's binary_logloss: 0.21971	valid_1's binary_logloss: 0.216266
[250]	training's binary_logloss: 0.202864	valid_1's binary_logloss: 0.19896
[300]	training's binary_logloss: 0.193082	valid_1's binary_logloss: 0.188829
[350]	training's binary_logloss: 0.1875	valid_1's binary_logloss: 0.183104
[400]	training's binary_logloss: 0.18418	valid_1's binary_logloss: 0.179735
[450]	training's binary_logloss: 0.18184	valid_1's binary_logloss: 0.177424
[500]	training's binary_logloss: 0.180456	valid_1's binary_logloss: 0.176185
[550]	training's binary_logloss: 0.17932	valid_1's binary_logloss: 0.175206
[600]	training's binary_logloss: 0.178545	valid_1's binary_lo

[1100]	training's binary_logloss: 0.172488	valid_1's binary_logloss: 0.18236
[1150]	training's binary_logloss: 0.172165	valid_1's binary_logloss: 0.182322
[1200]	training's binary_logloss: 0.171868	valid_1's binary_logloss: 0.182328
[1250]	training's binary_logloss: 0.171604	valid_1's binary_logloss: 0.182339
Early stopping, best iteration is:
[1180]	training's binary_logloss: 0.171977	valid_1's binary_logloss: 0.1823
Full Out-Of-Fold score :  0.219779


# xgboost

In [13]:
# Wrap the test matrix in a DMatrix once, so every XGBoost fold model can score it.
dtest = xgb.DMatrix(
    data=full_test_sparse,
    feature_names=full_feature_names,
)

In [14]:
def fit_predict_xgboost(data, y, dtest):
    """Train XGBoost with 7-fold cross-validation; return out-of-fold and test predictions.

    Parameters
    ----------
    data : row-sliceable 2-D feature matrix used for training; its columns must match
        the module-level ``full_feature_names`` list.
    y : binary target with one value per row of ``data`` (array-like or pandas Series).
    dtest : xgboost.DMatrix holding the rows to score.

    Returns
    -------
    oof_preds : numpy.ndarray
        For every training row, the prediction of the fold model that did not train on it.
    sub_preds : numpy.ndarray
        Average over the fold models of their predictions on ``dtest``.
    """
    # KFold yields row POSITIONS. Indexing a pandas Series with them is label-based and
    # only lines up when the Series has a default 0..n-1 index, so index a plain array.
    labels = np.asarray(y)

    # Loop-invariant sanity check: one feature name per column.
    n_features = data.shape[1]
    assert n_features == len(full_feature_names)

    oof_preds = np.zeros(data.shape[0])
    sub_preds = np.zeros(dtest.num_row())

    # Tried and left disabled: 'min_child_weight': 5, 'lambda': 10, 'alpha': 10, 'silent': 1.
    params = {
        'booster': 'gbtree',
        'objective': 'binary:logistic',
        'max_depth': 3,
        'eta': 0.01,
        'eval_metric': 'logloss',
        'subsample': 0.8,
        'colsample_bytree': 0.3,
        'nthread': 4
    }

    num_round = 10000
    folds = KFold(n_splits=7, shuffle=True, random_state=1)

    for fold_no, (trn_idx, val_idx) in enumerate(folds.split(data), start=1):
        print('----------------------------')
        print('Fold: %d' % fold_no)

        dtrain_sample = xgb.DMatrix(data[trn_idx], label=labels[trn_idx], feature_names=full_feature_names)
        dvalid_sample = xgb.DMatrix(data[val_idx], label=labels[val_idx], feature_names=full_feature_names)
        assert dtrain_sample.num_col() == n_features
        assert dvalid_sample.num_col() == n_features

        # The last watchlist entry ('valid') is the one early stopping monitors.
        watchlist = [(dtrain_sample, 'train'), (dvalid_sample, 'valid')]
        xgb_model = xgb.train(params, dtrain_sample, num_round, watchlist,
                              verbose_eval=50,
                              early_stopping_rounds=100)

        # NOTE(review): with the native API, Booster.predict appears to score with every
        # boosted round, including those after the best iteration - confirm this is intended.
        oof_preds[val_idx] = xgb_model.predict(dvalid_sample)
        # Each fold contributes an equal share of the final test prediction.
        sub_preds += xgb_model.predict(dtest) / folds.n_splits

    # The reported "score" is the RMSE between the labels and the out-of-fold probabilities.
    print('Full Out-Of-Fold score : %9.6f'
          % (mean_squared_error(y, oof_preds)**0.5))

    return oof_preds, sub_preds

In [15]:
# Cross-validate XGBoost on the training matrix and score the prepared test DMatrix.
train_renewal_preds_xg, test_renewal_preds_xg = fit_predict_xgboost(
    data=full_train_sparse,
    y=origin_train['renewal'],
    dtest=dtest,
)

----------------------------
Fold: 1
[0]	train-logloss:0.685219	valid-logloss:0.685253
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 100 rounds.
[50]	train-logloss:0.428939	valid-logloss:0.430028
[100]	train-logloss:0.31108	valid-logloss:0.312966
[150]	train-logloss:0.252593	valid-logloss:0.25521
[200]	train-logloss:0.219996	valid-logloss:0.223113
[250]	train-logloss:0.202617	valid-logloss:0.206247
[300]	train-logloss:0.19295	valid-logloss:0.196842
[350]	train-logloss:0.187761	valid-logloss:0.19199
[400]	train-logloss:0.184176	valid-logloss:0.188634
[450]	train-logloss:0.181984	valid-logloss:0.186545
[500]	train-logloss:0.180511	valid-logloss:0.185189
[550]	train-logloss:0.179455	valid-logloss:0.184238
[600]	train-logloss:0.178712	valid-logloss:0.183604
[650]	train-logloss:0.178191	valid-logloss:0.183265
[700]	train-logloss:0.177731	valid-logloss:0.18294
[750]	train-logloss:0.177316	valid-logl

[1000]	train-logloss:0.177439	valid-logloss:0.175628
[1050]	train-logloss:0.177212	valid-logloss:0.175537
[1100]	train-logloss:0.177003	valid-logloss:0.175498
[1150]	train-logloss:0.176852	valid-logloss:0.175447
[1200]	train-logloss:0.176729	valid-logloss:0.175438
[1250]	train-logloss:0.176567	valid-logloss:0.17543
[1300]	train-logloss:0.1764	valid-logloss:0.175414
[1350]	train-logloss:0.176302	valid-logloss:0.175383
[1400]	train-logloss:0.176201	valid-logloss:0.175397
[1450]	train-logloss:0.176076	valid-logloss:0.175379
[1500]	train-logloss:0.175989	valid-logloss:0.175357
[1550]	train-logloss:0.17588	valid-logloss:0.175344
[1600]	train-logloss:0.175771	valid-logloss:0.175327
[1650]	train-logloss:0.175713	valid-logloss:0.175335
[1700]	train-logloss:0.175624	valid-logloss:0.175341
Stopping. Best iteration:
[1601]	train-logloss:0.175769	valid-logloss:0.175326

----------------------------
Fold: 5
[0]	train-logloss:0.685224	valid-logloss:0.685191
Multiple eval metrics have been passed: 'v

## Blending

In [16]:
# Equal-weight average of the LightGBM and XGBoost test predictions, stored on the
# test frame for the incentive calculation.
origin_test['renewal_preds'] = 0.5 * (test_renewal_preds + test_renewal_preds_xg)

In [116]:
# submission['renewal'] = (test_renewal_preds + test_renewal_preds_xg) / 2
# submission['incentives'] = 350
# submission.to_csv('./output/10_mix_kfold7__350.csv', index=False)

## incentives function

In [59]:
# The blended renewal probability goes into the submission unchanged.
submission['renewal'] = origin_test['renewal_preds']

# incentive = 2.8 * sqrt(premium) * 2 ** (1 - p)
# The second factor falls from 2 at p = 0 to 1 at p = 1, so rows with a lower
# predicted renewal probability are assigned a larger incentive.
premium_scale = np.sqrt(origin_test['premium']) * 2.8
renewal_factor = np.exp2(1 - origin_test['renewal_preds'])
submission['incentives'] = premium_scale * renewal_factor

submission.to_csv('./output/17_mix__sqrt2.8_exp2.csv', index=False)