# Configurations

In [100]:
from sklearn import preprocessing
from IPython.display import display, HTML
import numpy as np
import pandas as pd
#from sklearn.cross_validation import StratifiedKFold
from sklearn import metrics, linear_model, svm
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

Training_file = './Dataset/train_ZoGVYWq.csv'
Test_file     = './Dataset/test_66516Ee.csv'

Training_dtype = {'id':np.str, 'perc_premium_paid_by_cash_credit':np.float32, 
'age_in_days':np.float32, 
'Income':np.float32,
'Count_3-6_months_late':np.float32, 
'Count_6-12_months_late':np.float32,
'Count_more_than_12_months_late':np.float32, 
'application_underwriting_score':np.float32,
'no_of_premiums_paid':np.float32, 
'sourcing_channel':np.str, 
'residence_area_type':np.str,
'premium':np.int, 
'renewal':np.int}

Test_dtype = {'id':np.str, 'perc_premium_paid_by_cash_credit':np.float32, 
'age_in_days':np.float32, 
'Income':np.float32,
'Count_3-6_months_late':np.float32, 
'Count_6-12_months_late':np.float32,
'Count_more_than_12_months_late':np.float32, 
'application_underwriting_score':np.float32,
'no_of_premiums_paid':np.float32, 
'sourcing_channel':np.str, 
'residence_area_type':np.str,
'premium':np.int}


## 1) Loading Dataset - Train and test

In [101]:
# Loading dataset
trn_origin = pd.read_csv(Training_file, dtype=Training_dtype, na_values='')
tst_origin = pd.read_csv(Test_file, dtype=Test_dtype, na_values='')

display(trn_origin.head())
display(tst_origin.head())

#Check the shape of each dataset
print(trn_origin.shape)
print(tst_origin.shape)

Unnamed: 0,id,perc_premium_paid_by_cash_credit,age_in_days,Income,Count_3-6_months_late,Count_6-12_months_late,Count_more_than_12_months_late,application_underwriting_score,no_of_premiums_paid,sourcing_channel,residence_area_type,premium,renewal
0,110936,0.429,12058.0,355060.0,0.0,0.0,0.0,99.019997,13.0,C,Urban,3300,1
1,41492,0.01,21546.0,315150.0,0.0,0.0,0.0,99.889999,21.0,A,Urban,18000,1
2,31300,0.917,17531.0,84140.0,2.0,3.0,1.0,98.690002,7.0,C,Rural,3300,0
3,19415,0.049,15341.0,250510.0,0.0,0.0,0.0,99.57,9.0,A,Urban,9600,1
4,99379,0.052,31400.0,198680.0,0.0,0.0,0.0,99.870003,12.0,B,Urban,9600,1


Unnamed: 0,id,perc_premium_paid_by_cash_credit,age_in_days,Income,Count_3-6_months_late,Count_6-12_months_late,Count_more_than_12_months_late,application_underwriting_score,no_of_premiums_paid,sourcing_channel,residence_area_type,premium
0,649,0.001,27384.0,51150.0,0.0,0.0,0.0,99.889999,7.0,A,Rural,3300
1,81136,0.124,23735.0,285140.0,0.0,0.0,0.0,98.93,19.0,A,Urban,11700
2,70762,1.0,17170.0,186030.0,0.0,0.0,0.0,,2.0,B,Urban,11700
3,53935,0.198,16068.0,123540.0,0.0,0.0,0.0,99.0,11.0,B,Rural,5400
4,15476,0.041,10591.0,200020.0,1.0,0.0,0.0,99.169998,14.0,A,Rural,9600


(79853, 13)
(34224, 12)


## 1-2) Check the descriptive statistics for each dataset
- We can see that some columns have 'missing' values in it as counts of each column are different.

In [102]:
display(trn_origin.describe())
display(tst_origin.describe())

# age_in_days, Income, Count_3-6_months_late, Count_6-12_months_late	Count_more_than_12_months_late	application_underwriting_score	no_of_premiums_paid	premium

Unnamed: 0,perc_premium_paid_by_cash_credit,age_in_days,Income,Count_3-6_months_late,Count_6-12_months_late,Count_more_than_12_months_late,application_underwriting_score,no_of_premiums_paid,premium,renewal
count,79853.0,79853.0,79853.0,79756.0,79756.0,79756.0,76879.0,79853.0,79853.0,79853.0
mean,0.314291,18846.738281,208848.6,0.248671,0.078188,0.060008,99.071289,10.863888,10924.507533,0.93741
std,0.334902,5208.866699,496577.8,0.691676,0.436455,0.311902,0.739792,5.170848,9401.676542,0.242226
min,0.0,7670.0,24030.0,0.0,0.0,0.0,91.900002,2.0,1200.0,0.0
25%,0.034,14974.0,108010.0,0.0,0.0,0.0,98.809998,7.0,5400.0,1.0
50%,0.167,18625.0,166560.0,0.0,0.0,0.0,99.209999,10.0,7500.0,1.0
75%,0.538,22636.0,252090.0,0.0,0.0,0.0,99.540001,14.0,13800.0,1.0
max,1.0,37602.0,90262600.0,13.0,17.0,11.0,99.889999,60.0,60000.0,1.0


Unnamed: 0,perc_premium_paid_by_cash_credit,age_in_days,Income,Count_3-6_months_late,Count_6-12_months_late,Count_more_than_12_months_late,application_underwriting_score,no_of_premiums_paid,premium
count,34224.0,34224.0,34224.0,34193.0,34193.0,34193.0,32901.0,34224.0,34224.0
mean,0.314454,18824.125,202820.2,0.238733,0.080718,0.058111,99.062012,10.890428,10834.721248
std,0.334058,5246.466797,270253.2,0.686056,0.454576,0.307093,0.742933,5.216792,9263.074506
min,0.0,7671.0,24030.0,0.0,0.0,0.0,91.900002,2.0,1200.0
25%,0.034,14972.0,106397.5,0.0,0.0,0.0,98.800003,7.0,5400.0
50%,0.169,18623.0,165070.0,0.0,0.0,0.0,99.209999,10.0,7500.0
75%,0.54,22636.0,250020.0,0.0,0.0,0.0,99.529999,14.0,13800.0
max,1.0,35785.0,21914550.0,12.0,10.0,7.0,99.889999,59.0,60000.0


## 2) Data preprocessing


### 2-0) Split X and Y dataset

In [103]:
trn_origin_X = trn_origin.loc[:,trn_origin.columns != 'renewal'].copy()
trn_origin_Y = trn_origin.loc[:,'renewal'].copy()
trn_origin_Y = pd.DataFrame(trn_origin_Y, columns=['renewal'])

tst_origin_X = tst_origin.copy()

## 2-1) Convert string values into dummy variables

In [104]:
from numpy import array
from numpy import argmax
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

cat_cols = ['sourcing_channel', 'residence_area_type']

# Concatenate trn and test X dataset
n_row_trn, ncol_trn = trn_origin_X.shape
n_row_tst, ncol_tst = tst_origin_X.shape

agg_data_X = pd.concat([trn_origin_X, tst_origin_X], axis=0)

for cat_col in cat_cols:
    print(' Convert string values from col : ', cat_col)

    just_dummy = pd.get_dummies(agg_data_X[cat_col], prefix=cat_col)
    #print(just_dummy)
    
    # Concatenate dummy columns into dataset
    agg_data_X = pd.concat([agg_data_X, just_dummy], axis=1)
    # Drop the origin column
    agg_data_X.drop(cat_col, axis=1, inplace=True)


# Split trn and test X dataset
trn_dum_X = agg_data_X.iloc[0:n_row_trn, :].copy()
tst_dum_X = agg_data_X.iloc[n_row_trn:, :].copy()

print(trn_dum_X.shape)
print(tst_dum_X.shape)

    #display(agg_data_X)
#     label_encoder = LabelEncoder()
#     onehot_encoder = OneHotEncoder(sparse=False)

#     trn_col_arr = trn_imputed[cat_col]
#     tst_col_arr = tst_imputed[cat_col]
    
#     ## Str to Integer encoding
#     label_encoder.fit(trn_col_arr)
#     trn_int_encoded = label_encoder.transform(trn_col_arr)
#     tst_int_encoded = label_encoder.transform(tst_col_arr)
#     #print(integer_encoded)

#     ## Integer to Binary encoding
#     trn_int_encoded = trn_int_encoded.reshape(len(trn_int_encoded), 1)
#     tst_int_encoded = tst_int_encoded.reshape(len(tst_int_encoded), 1)
#     onehot_encoder.fit(trn_int_encoded)
#     trn_onehot_encoded = onehot_encoder.transform(trn_int_encoded)
#     tst_onehot_encoded = onehot_encoder.transform(tst_int_encoded)
    
#     print(trn_onehot_encoded)
#     print(tst_onehot_encoded)
    
    ## Merge to the dataset
    

 Convert string values from col :  sourcing_channel
 Convert string values from col :  residence_area_type
(79853, 17)
(34224, 17)


## 2-2) Imputating Missing values
- We can figure out columns containing missing values

In [107]:
# Check the missing values
display(trn_dum_X.isna().sum())
display(tst_dum_X.isna().sum())

id                                     0
perc_premium_paid_by_cash_credit       0
age_in_days                            0
Income                                 0
Count_3-6_months_late                 97
Count_6-12_months_late                97
Count_more_than_12_months_late        97
application_underwriting_score      2974
no_of_premiums_paid                    0
premium                                0
sourcing_channel_A                     0
sourcing_channel_B                     0
sourcing_channel_C                     0
sourcing_channel_D                     0
sourcing_channel_E                     0
residence_area_type_Rural              0
residence_area_type_Urban              0
dtype: int64

id                                     0
perc_premium_paid_by_cash_credit       0
age_in_days                            0
Income                                 0
Count_3-6_months_late                 31
Count_6-12_months_late                31
Count_more_than_12_months_late        31
application_underwriting_score      1323
no_of_premiums_paid                    0
premium                                0
sourcing_channel_A                     0
sourcing_channel_B                     0
sourcing_channel_C                     0
sourcing_channel_D                     0
sourcing_channel_E                     0
residence_area_type_Rural              0
residence_area_type_Urban              0
dtype: int64

### Imputation Strategy 1 : Mean

In [41]:
from sklearn.preprocessing import Imputer

# From .isnull().sum()
na_in_cols = ['Count_3-6_months_late', 'Count_6-12_months_late', 
            'Count_more_than_12_months_late', 'application_underwriting_score']

trn_imputed_X = trn_origin_X.copy()
tst_imputed_X = tst_origin_X.copy()

imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
imp.fit(trn_imputed_X[na_in_cols])

trn_imputed_X[na_in_cols] = imp.transform(trn_imputed_X[na_in_cols])
tst_imputed_X[na_in_cols] = imp.transform(tst_imputed_X[na_in_cols])


### Imputation Strategy 2 : K-nearest Neighbors

In [111]:
from sklearn.neighbors import KNeighborsRegressor

Y_cols = ['Count_3-6_months_late',
          'Count_6-12_months_late',
          'Count_more_than_12_months_late',
          'application_underwriting_score']

X_cols = ['perc_premium_paid_by_cash_credit',
          'age_in_days',
          'Income',
          'no_of_premiums_paid',
          'premium',
          'sourcing_channel_A',
          'sourcing_channel_B',
          'sourcing_channel_C',
          'sourcing_channel_D',
          'sourcing_channel_E',
          'residence_area_type_Rural',
          'residence_area_type_Urban']

trn_dum_imp_X = trn_dum_X.copy()
tst_dum_imp_X = tst_dum_X.copy()

ALL_dum_imp_X = pd.concat([trn_dum_imp_X,tst_dum_imp_X], axis=0).copy()


for Y_col in Y_cols:
    
    ALL_nan_idx = ALL_dum_imp_X[Y_col].isna().nonzero()[0]
    ALL_nan_TF = ALL_dum_imp_X[Y_col].isna()
    
    neigh = KNeighborsRegressor(n_neighbors=5, weights='distance', algorithm='brute',  metric='seuclidean')
    neigh.fit(ALL_dum_imp_X[X_cols][~ALL_nan_TF], ALL_dum_imp_X[Y_col][~ALL_nan_TF]) 
    
    print('Target Col : ', Y_col, ' Nan rows :', len(nan_idx))
    
    # For training dataset
    nan_idx = trn_dum_imp_X[Y_col].isna().nonzero()[0]
    nan_TF = trn_dum_imp_X[Y_col].isna()
    
    for X_idx in nan_idx:
        #print(np.array(trn_dum_X[X_cols].iloc[X_idx])
        y = neigh.predict(trn_dum_imp_X[X_cols].iloc[X_idx].values.reshape(1,-1))
        trn_dum_imp_X[Y_col].iloc[X_idx] = y[0]
        #print(X_idx, y, trn_dum_imp_X[Y_col].iloc[X_idx])
    
    # For test dataset
    nan_idx = tst_dum_imp_X[Y_col].isna().nonzero()[0]
    nan_TF = tst_dum_imp_X[Y_col].isna()    
    
    for X_idx in nan_idx:
        #print(np.array(tst_dum_X[X_cols].iloc[X_idx])
        y = neigh.predict(tst_dum_imp_X[X_cols].iloc[X_idx].values.reshape(1,-1))
        tst_dum_imp_X[Y_col].iloc[X_idx] = y[0]
        #print(X_idx, y, tst_dum_imp_X[Y_col].iloc[X_idx])


Target Col :  Count_3-6_months_late  Nan rows : 1323
158 [0.] 0.0
1216 [0.] 0.0
1471 [0.] 0.0


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


3092 [0.] 0.0
4177 [0.] 0.0
6384 [0.14086985] 0.14086986
6773 [0.20570383] 0.20570384
7373 [0.] 0.0
7378 [0.] 0.0
7494 [0.] 0.0
10336 [0.] 0.0
10464 [0.0956559] 0.095655896
11236 [0.50144681] 0.50144684
11358 [0.29184728] 0.2918473
11946 [0.70707609] 0.7070761
13639 [0.] 0.0
14557 [0.27030607] 0.27030608
14743 [0.1674218] 0.1674218
15018 [0.] 0.0
15116 [0.] 0.0
15968 [0.10979671] 0.10979671
16135 [0.] 0.0
17090 [0.25655737] 0.25655738
17899 [0.] 0.0
18047 [0.36143866] 0.36143866
20416 [0.26186437] 0.26186436
20753 [0.166119] 0.166119
20803 [0.] 0.0
22327 [0.] 0.0
24053 [0.] 0.0
24169 [0.18032519] 0.1803252
24211 [0.] 0.0
25447 [0.] 0.0
25693 [0.] 0.0
26010 [0.19887359] 0.19887358
26790 [0.11137405] 0.11137405
28330 [0.] 0.0
30213 [0.65247223] 0.6524722
31159 [0.] 0.0
32773 [0.] 0.0
33461 [0.] 0.0
35158 [0.18349352] 0.18349352
37655 [0.43172972] 0.43172973
38640 [0.] 0.0
39721 [0.] 0.0
39931 [0.] 0.0
39991 [0.14901639] 0.14901638
41300 [0.] 0.0
42894 [0.] 0.0
43813 [0.] 0.0
43922 [0.070

73733 [0.] 0.0
75205 [0.] 0.0
75360 [0.] 0.0
76274 [0.17524599] 0.17524599
77321 [0.] 0.0
77401 [0.] 0.0
661 [0.] 0.0
693 [0.14354688] 0.14354688
1994 [0.] 0.0
2041 [0.] 0.0
2182 [0.] 0.0
3162 [0.] 0.0
4235 [0.] 0.0
4591 [0.] 0.0
6699 [0.16184918] 0.16184919
8104 [0.23445712] 0.23445712
11339 [0.] 0.0
11992 [0.] 0.0
13962 [0.1415551] 0.1415551
14890 [0.20265877] 0.20265877
14891 [0.] 0.0
14918 [0.] 0.0
16024 [0.] 0.0
16580 [0.20587668] 0.20587668
18279 [0.18742169] 0.18742168
21207 [0.17973593] 0.17973593
25901 [0.] 0.0
26528 [0.] 0.0
27911 [0.26490416] 0.26490417
28604 [0.] 0.0
28867 [0.34299692] 0.34299693
29411 [0.19833304] 0.19833304
30130 [0.] 0.0
30350 [0.] 0.0
31766 [0.] 0.0
32372 [0.] 0.0
33609 [0.] 0.0
Target Col :  application_underwriting_score  Nan rows : 31
18 [99.61780459] 99.617805
26 [99.52061353] 99.520615
39 [99.21279454] 99.21279
60 [99.47997618] 99.47997
61 [98.94975932] 98.94976
76 [99.70008183] 99.70008
77 [99.52434811] 99.524345
84 [97.26753975] 97.26754
140 [99.

6694 [99.30899744] 99.309
6710 [99.79429636] 99.7943
6724 [99.84703284] 99.84703
6742 [99.5927819] 99.59278
6752 [99.04692122] 99.04692
6757 [99.61112496] 99.61112
6759 [99.5064705] 99.50647
6770 [99.71763631] 99.717636
6773 [99.84051822] 99.840515
6785 [99.28856428] 99.28857
6809 [99.43391744] 99.433914
6890 [99.56088739] 99.56089
6927 [99.82511963] 99.82512
6963 [99.81759643] 99.8176
6971 [99.8885047] 99.888504
6986 [99.82332679] 99.823326
7000 [99.6566214] 99.656624
7012 [99.40088367] 99.40089
7028 [99.01531609] 99.01532
7038 [99.58692781] 99.58693
7065 [99.76294241] 99.76294
7066 [99.17250424] 99.1725
7077 [99.01415654] 99.01416
7086 [97.39973687] 97.399734
7087 [98.15878452] 98.15878
7089 [99.10832011] 99.10832
7093 [98.98021737] 98.98022
7125 [99.61165958] 99.61166
7147 [99.3549873] 99.35499
7183 [99.25471836] 99.254715
7225 [99.73285052] 99.73285
7238 [99.52982913] 99.52983
7249 [99.61294415] 99.612946
7261 [99.18813864] 99.18814
7275 [99.8142662] 99.81426
7300 [99.61769024] 99.

14194 [99.51443029] 99.51443
14230 [99.82653125] 99.82653
14258 [99.8678441] 99.86784
14281 [99.74332901] 99.74333
14344 [99.64619752] 99.646194
14415 [99.6453579] 99.645355
14450 [99.80550779] 99.80551
14556 [99.35108805] 99.35109
14557 [99.66731962] 99.66732
14578 [99.71130609] 99.7113
14588 [99.84263476] 99.842636
14612 [99.62501169] 99.625015
14707 [99.61075621] 99.610756
14819 [99.64382401] 99.64382
14821 [99.76553284] 99.76553
14855 [99.59274707] 99.59275
14860 [99.64758253] 99.64758
14892 [99.77417163] 99.77417
14942 [98.90036958] 98.90037
14954 [99.42526537] 99.42526
15028 [99.75963167] 99.75963
15030 [99.4050214] 99.40502
15038 [99.63492878] 99.634926
15061 [99.50387111] 99.50387
15072 [99.53644587] 99.536446
15078 [99.58940046] 99.5894
15081 [99.58655354] 99.586555
15098 [99.39231713] 99.39232
15101 [99.56361539] 99.563614
15116 [98.99792533] 98.997925
15150 [99.71384259] 99.713844
15161 [99.7042773] 99.70428
15175 [99.64929972] 99.6493
15207 [99.76855665] 99.768555
15232 [99

21834 [99.63082087] 99.63082
21846 [99.83419857] 99.8342
21868 [99.44247214] 99.442474
21936 [99.57522652] 99.575226
21949 [99.77110955] 99.77111
21968 [99.46685228] 99.46685
21998 [99.42758349] 99.42758
22007 [99.77104007] 99.77104
22013 [99.02223597] 99.02224
22019 [99.47406721] 99.47407
22033 [99.67520465] 99.6752
22252 [99.07759317] 99.07759
22305 [99.79693278] 99.796936
22309 [99.75996362] 99.759964
22314 [99.75346699] 99.75346
22323 [99.45141517] 99.451416
22327 [99.50882573] 99.50883
22433 [99.68077301] 99.68077
22453 [99.76665566] 99.766655
22460 [99.37895556] 99.37895
22464 [99.80587209] 99.80587
22504 [99.54965486] 99.54965
22546 [99.55785884] 99.55786
22559 [99.37313678] 99.37314
22628 [99.79878403] 99.79878
22643 [99.56846667] 99.56847
22701 [99.81636703] 99.81637
22773 [99.85516833] 99.85517
22799 [99.5789237] 99.578926
22811 [99.73821023] 99.73821
22812 [99.33024901] 99.330246
22816 [99.38431381] 99.384315
22872 [99.43362873] 99.43363
22914 [99.28775679] 99.28776
22921 [9

29646 [99.03748679] 99.03748
29683 [98.98827211] 98.98827
29696 [99.43236007] 99.43236
29700 [98.58454357] 98.58454
29762 [99.26939021] 99.269394
29774 [99.71963004] 99.71963
29862 [99.3763362] 99.376335
29873 [99.60519854] 99.6052
29932 [99.77909727] 99.7791
29935 [99.51706527] 99.51707
29954 [99.60195246] 99.60195
29993 [99.75129331] 99.7513
29998 [99.81163033] 99.81163
30015 [99.77714885] 99.777145
30017 [99.3770481] 99.377045
30067 [99.72080675] 99.72081
30145 [99.08137043] 99.08137
30165 [99.64179818] 99.6418
30213 [99.74819595] 99.7482
30258 [99.55027834] 99.55028
30322 [99.80501837] 99.805016
30343 [99.62331359] 99.623314
30371 [99.81452491] 99.81452
30382 [99.42895283] 99.428955
30411 [99.66429187] 99.66429
30470 [95.724999] 95.725
30504 [99.27137553] 99.27138
30510 [99.39978596] 99.39979
30516 [99.60151708] 99.60152
30556 [99.88444529] 99.884445
30582 [99.82075668] 99.820755
30588 [99.66205772] 99.662056
30625 [99.75029479] 99.7503
30631 [99.68802168] 99.68802
30661 [99.864518

37655 [99.71325634] 99.71326
37663 [99.50484209] 99.504845
37713 [99.79720721] 99.79721
37738 [99.21769295] 99.21769
37756 [99.61003096] 99.61003
37777 [99.54825862] 99.548256
37791 [99.164656] 99.16466
37802 [99.69991842] 99.69992
37805 [99.55313368] 99.55313
37834 [99.66319634] 99.66319
37837 [99.70912328] 99.70912
37838 [99.52870764] 99.52871
37904 [99.67101445] 99.67101
37926 [99.73268458] 99.73268
37947 [96.23169186] 96.23169
37985 [99.33013379] 99.33013
38011 [99.57672338] 99.57672
38034 [99.85339664] 99.85339
38107 [99.6621677] 99.66217
38108 [99.76587518] 99.76588
38114 [99.45945214] 99.45945
38163 [99.689067] 99.689064
38166 [99.41371082] 99.41371
38170 [99.67447352] 99.67448
38179 [99.43227355] 99.432274
38195 [99.77824825] 99.77825
38252 [99.81559659] 99.8156
38263 [99.14876196] 99.148766
38331 [99.7890919] 99.78909
38377 [99.77005816] 99.77006
38390 [98.90146208] 98.90146
38392 [99.47121379] 99.471214
38401 [99.79663452] 99.79663
38421 [99.56519154] 99.56519
38436 [99.56005

45071 [99.80606825] 99.80607
45122 [99.83191366] 99.83192
45151 [99.70497333] 99.70497
45153 [99.21467918] 99.214676
45155 [99.64578981] 99.64579
45207 [99.8135666] 99.81357
45235 [99.87394121] 99.87394
45256 [99.73549788] 99.7355
45357 [98.7886954] 98.7887
45366 [99.57329708] 99.573296
45390 [99.76275365] 99.76276
45421 [99.82611348] 99.82611
45434 [99.69900977] 99.69901
45453 [99.66109151] 99.661095
45465 [99.64966306] 99.649666
45471 [99.65363395] 99.65363
45473 [99.60447763] 99.60448
45474 [99.65149199] 99.65149
45490 [98.72758508] 98.727585
45492 [99.26878111] 99.26878
45495 [99.51929546] 99.519295
45519 [99.58046265] 99.58046
45529 [99.68456306] 99.68456
45613 [99.56215872] 99.56216
45616 [99.3567791] 99.35678
45647 [99.15077831] 99.15078
45671 [99.83001263] 99.83001
45673 [98.98856227] 98.98856
45683 [99.21295064] 99.21295
45704 [99.69452792] 99.69453
45760 [99.52254624] 99.522545
45784 [99.25386399] 99.25386
45849 [99.55193147] 99.55193
45873 [99.82718279] 99.82719
45894 [98.69

52486 [98.43371295] 98.433716
52513 [99.53701195] 99.53701
52520 [99.32466538] 99.32467
52556 [99.61811912] 99.61812
52581 [99.34698915] 99.34699
52603 [99.74501059] 99.74501
52627 [99.70261035] 99.702614
52652 [99.73098559] 99.73099
52654 [99.80595276] 99.805954
52668 [99.68789787] 99.6879
52686 [99.38803771] 99.38804
52734 [98.56581176] 98.56581
52785 [99.45523065] 99.45523
52825 [99.68451153] 99.68451
52838 [99.76051908] 99.76052
52911 [99.8803106] 99.88031
52915 [99.77412071] 99.774124
52950 [99.7742221] 99.77422
52960 [99.69026191] 99.69026
53008 [99.59688072] 99.59688
53038 [99.74029232] 99.740295
53068 [99.88870858] 99.88871
53078 [99.81870409] 99.8187
53096 [99.79331879] 99.79332
53103 [99.76488752] 99.764885
53112 [99.33621426] 99.33621
53145 [99.80764626] 99.80765
53179 [99.87043676] 99.87044
53184 [99.46949494] 99.4695
53269 [99.54072382] 99.540726
53284 [96.80147298] 96.801476
53300 [99.07083619] 99.07084
53371 [98.7374343] 98.737434
53409 [99.65796684] 99.65797
53427 [99.0

60783 [99.46474204] 99.464745
60808 [99.61181669] 99.61182
60832 [99.76592754] 99.76593
60841 [99.62696678] 99.62697
60847 [99.06525852] 99.06526
60934 [99.69930794] 99.69931
60986 [99.25257361] 99.25257
61071 [99.81001776] 99.81002
61089 [98.76539066] 98.76539
61106 [98.98452312] 98.98452
61131 [99.68116299] 99.68116
61143 [99.61639914] 99.6164
61173 [99.79690195] 99.796906
61177 [99.59271702] 99.59272
61217 [99.71524106] 99.71524
61280 [99.78463727] 99.78464
61310 [99.53395281] 99.53395
61329 [99.73129375] 99.73129
61382 [99.34975348] 99.349754
61405 [99.51150907] 99.51151
61432 [99.81101773] 99.81102
61450 [99.20749249] 99.20749
61488 [99.66632737] 99.66633
61541 [99.39205469] 99.39205
61606 [99.53795508] 99.53796
61613 [99.40559103] 99.405594
61625 [99.33595104] 99.33595
61657 [99.37787105] 99.37787
61664 [99.77182647] 99.77183
61712 [99.40376683] 99.40377
61739 [98.81440963] 98.81441
61791 [99.83478025] 99.83478
61816 [99.8582192] 99.85822
61847 [99.85509844] 99.855095
61852 [99.3

67762 [99.19769661] 99.19769
67865 [98.79582644] 98.79583
67877 [99.3560733] 99.35607
67926 [99.54478407] 99.544785
67929 [99.2532743] 99.25327
67947 [99.55463284] 99.554634
68010 [99.64006722] 99.64007
68014 [99.61397425] 99.613976
68017 [99.65702061] 99.65702
68042 [99.87234913] 99.87235
68044 [99.85478269] 99.85478
68083 [99.71536433] 99.71536
68091 [99.65384929] 99.65385
68105 [99.87309216] 99.87309
68118 [99.50626693] 99.50626
68125 [99.66633841] 99.666336
68140 [99.76503366] 99.76503
68165 [99.26075775] 99.26076
68214 [99.08573626] 99.08574
68219 [99.61749914] 99.6175
68236 [99.45709578] 99.45709
68245 [99.76380856] 99.76381
68285 [99.65316042] 99.65316
68293 [99.48599287] 99.48599
68342 [99.49695316] 99.496956
68376 [99.63796306] 99.63796
68396 [99.61867414] 99.618675
68422 [99.74924777] 99.749245
68436 [99.58422121] 99.58422
68478 [99.36271756] 99.36272
68497 [99.19934373] 99.19934
68528 [99.80977216] 99.80977
68552 [99.3142597] 99.31426
68555 [99.65891526] 99.65891
68613 [99.6

75729 [99.59139293] 99.59139
75736 [99.53299567] 99.533
75803 [99.56772556] 99.567726
75854 [99.71955393] 99.71955
75864 [99.7895245] 99.78953
75865 [99.5729002] 99.5729
75939 [99.76928453] 99.76929
75944 [99.64740108] 99.6474
75947 [99.62837803] 99.62838
75955 [99.36288634] 99.362885
75964 [99.80593885] 99.80594
76018 [99.67466129] 99.67466
76100 [99.54844016] 99.54844
76106 [97.79766296] 97.79766
76149 [99.73961255] 99.73961
76221 [99.84371236] 99.84371
76239 [99.47643229] 99.47643
76244 [99.748999] 99.749
76250 [99.54636692] 99.546364
76298 [99.77715633] 99.77715
76300 [99.69154666] 99.69154
76358 [99.5255457] 99.52554
76363 [99.10772462] 99.10773
76365 [99.31531105] 99.31531
76372 [99.62006418] 99.62006
76408 [99.44039231] 99.44039
76428 [99.8122233] 99.812225
76436 [99.04055635] 99.04056
76470 [99.88422979] 99.88423
76480 [98.47951146] 98.479515
76609 [99.46175841] 99.46176
76647 [99.10974412] 99.10974
76683 [99.33422675] 99.33423
76689 [99.769678] 99.76968
76735 [99.28982253] 99.

3342 [99.7745082] 99.774506
3344 [99.42365639] 99.42365
3444 [99.71043695] 99.710434
3446 [99.25215] 99.25215
3455 [99.26985536] 99.26985
3478 [99.78435939] 99.78436
3527 [99.73403638] 99.73404
3556 [99.56061681] 99.560616
3562 [99.86867414] 99.868675
3592 [99.65817983] 99.65818
3612 [99.73571703] 99.73572
3629 [99.73332095] 99.73332
3630 [99.79032841] 99.79033
3656 [99.56352801] 99.56353
3705 [99.59830592] 99.598305
3766 [99.75744265] 99.75745
3772 [99.67243659] 99.67244
3817 [99.77119689] 99.771194
3870 [99.52663994] 99.52664
3890 [99.61852516] 99.61852
3926 [99.62149758] 99.6215
3927 [99.54612475] 99.54613
3945 [98.81734575] 98.817345
3974 [99.26729376] 99.267296
4042 [99.70132608] 99.701324
4078 [99.30941874] 99.30942
4107 [99.4439387] 99.44394
4137 [99.32186385] 99.32186
4201 [99.87032826] 99.87033
4229 [99.30512344] 99.30512
4235 [99.54765599] 99.54765
4249 [97.58393499] 97.58394
4386 [99.57185118] 99.57185
4394 [99.56945213] 99.56945
4415 [99.65243018] 99.65243
4418 [99.70473724

10721 [98.8023726] 98.802376
10735 [99.42914198] 99.42914
10737 [99.50883013] 99.50883
10768 [99.49618334] 99.496185
10773 [99.69646611] 99.696465
10781 [99.78829679] 99.7883
10813 [99.77837767] 99.77838
10817 [99.21914599] 99.21915
10833 [99.4881562] 99.48816
10853 [99.72682733] 99.72683
10927 [98.92196639] 98.92197
10939 [99.62312549] 99.62312
10946 [99.57189318] 99.57189
10983 [99.3651731] 99.36517
10986 [99.72748421] 99.727486
11028 [99.56684414] 99.56684
11110 [99.85397936] 99.85398
11156 [99.71738866] 99.71739
11171 [99.517878] 99.517876
11241 [99.41952276] 99.419525
11278 [99.61172636] 99.611725
11294 [99.16531732] 99.16531
11305 [99.81152259] 99.81152
11315 [99.43824686] 99.43825
11339 [99.50692958] 99.50693
11344 [99.63673616] 99.636734
11361 [99.05474208] 99.05474
11363 [98.34097956] 98.34098
11366 [99.82372229] 99.82372
11407 [99.54000614] 99.54001
11437 [99.80251106] 99.80251
11442 [99.54640068] 99.5464
11478 [99.68878592] 99.68879
11488 [99.51682] 99.51682
11550 [97.944019

18278 [99.18999174] 99.189995
18279 [99.86638486] 99.86639
18314 [99.86554548] 99.86555
18326 [99.67035433] 99.67036
18331 [99.80000655] 99.8
18353 [99.80143371] 99.80144
18374 [98.39464114] 98.39464
18436 [99.82485203] 99.82485
18466 [99.82372867] 99.82373
18500 [99.60151213] 99.60151
18507 [99.66171397] 99.66171
18591 [99.63270334] 99.632706
18620 [99.68124529] 99.681244
18624 [98.6282956] 98.628296
18626 [99.55381233] 99.55381
18703 [99.85049065] 99.850494
18734 [99.39392065] 99.39392
18764 [99.43059606] 99.430595
18772 [99.5957766] 99.59578
18810 [99.40247987] 99.40248
18884 [99.64287269] 99.642876
18908 [98.15718259] 98.15718
18963 [99.46317114] 99.46317
19001 [99.41694447] 99.41695
19010 [99.41999817] 99.42
19011 [99.3840247] 99.384026
19016 [98.55280192] 98.5528
19099 [99.33186267] 99.33186
19133 [99.00738431] 99.007385
19170 [99.47403932] 99.47404
19208 [99.8291432] 99.82914
19235 [99.61437138] 99.61437
19259 [99.67505395] 99.67506
19281 [98.69860222] 98.6986
19376 [99.73174588

25996 [99.56343596] 99.56344
26008 [99.5984987] 99.598495
26015 [99.54715863] 99.54716
26106 [99.7701485] 99.77015
26171 [99.80177204] 99.80177
26179 [99.49145225] 99.491455
26235 [99.5232436] 99.52325
26240 [99.13091757] 99.13092
26285 [99.1706552] 99.170654
26332 [99.85161292] 99.851616
26337 [99.65002177] 99.650024
26400 [99.68995034] 99.68995
26406 [99.78570511] 99.785706
26414 [98.76400448] 98.76401
26453 [99.30550532] 99.305504
26471 [99.51274832] 99.51275
26474 [99.41759445] 99.417595
26519 [99.63897682] 99.63898
26528 [99.76609684] 99.7661
26543 [99.46553306] 99.46553
26551 [99.86773824] 99.86774
26571 [99.32616967] 99.32617
26621 [99.4000821] 99.400085
26702 [99.8149117] 99.81491
26709 [99.37375335] 99.37376
26716 [99.6317353] 99.63174
26779 [99.88541919] 99.88542
26817 [99.35438607] 99.354385
26873 [99.79916233] 99.799164
26879 [98.96658488] 98.96658
26907 [99.62666575] 99.62666
26973 [99.81312852] 99.813126
26977 [99.14571401] 99.14571
26990 [99.67596836] 99.67597
26993 [99.

33509 [98.62337064] 98.62337
33576 [99.88401179] 99.88401
33604 [99.47464403] 99.47465
33607 [99.81408835] 99.81409
33610 [98.92559188] 98.92559
33640 [99.80250459] 99.802505
33660 [99.63752985] 99.63753
33685 [99.4266731] 99.426674
33694 [99.57817695] 99.57818
33714 [99.88815837] 99.88816
33825 [99.22300395] 99.22301
33834 [99.59302353] 99.593025
33897 [99.71688678] 99.71689
33924 [99.58985952] 99.58986
33970 [99.56407407] 99.56407
34018 [98.31808895] 98.31809
34021 [99.14192785] 99.14193
34103 [98.58143064] 98.58143
34119 [99.6251735] 99.625175
34154 [99.63489879] 99.634895
34170 [99.86429874] 99.864296
34178 [99.55898467] 99.55898
34206 [99.19305816] 99.19306


In [112]:
# Check the missing values AGAIN
display(trn_dum_imp_X.isna().sum())
display(tst_dum_imp_X.isna().sum())

id                                  0
perc_premium_paid_by_cash_credit    0
age_in_days                         0
Income                              0
Count_3-6_months_late               0
Count_6-12_months_late              0
Count_more_than_12_months_late      0
application_underwriting_score      0
no_of_premiums_paid                 0
premium                             0
sourcing_channel_A                  0
sourcing_channel_B                  0
sourcing_channel_C                  0
sourcing_channel_D                  0
sourcing_channel_E                  0
residence_area_type_Rural           0
residence_area_type_Urban           0
dtype: int64

id                                  0
perc_premium_paid_by_cash_credit    0
age_in_days                         0
Income                              0
Count_3-6_months_late               0
Count_6-12_months_late              0
Count_more_than_12_months_late      0
application_underwriting_score      0
no_of_premiums_paid                 0
premium                             0
sourcing_channel_A                  0
sourcing_channel_B                  0
sourcing_channel_C                  0
sourcing_channel_D                  0
sourcing_channel_E                  0
residence_area_type_Rural           0
residence_area_type_Urban           0
dtype: int64

## 2-3) Data standardization

In [113]:
cols_for_std = ['age_in_days', 'Income', 'Count_3-6_months_late', 
                'Count_6-12_months_late', 'Count_more_than_12_months_late', 'application_underwriting_score',
                'no_of_premiums_paid', 'premium']

scaler = preprocessing.StandardScaler().fit(trn_dum_imp_X[cols_for_std])
trn_dum_imp_X[cols_for_std] = scaler.transform(trn_dum_imp_X[cols_for_std])
tst_dum_imp_X[cols_for_std] = scaler.transform(tst_dum_imp_X[cols_for_std])

## 2-4) Data Export

In [114]:
#trn_imputed_dum_X, trn_origin_Y, tst_origin_X
trn_dum_imp_X.to_csv('./Dataset/trn_dum_imp_X.csv', index=False)
tst_dum_imp_X.to_csv('./Dataset/tst_dum_imp_X.csv', index=False)
trn_origin_Y.to_csv('./Dataset/trn_origin_Y.csv', index=False)
tst_origin_X.to_csv('./Dataset/tst_origin_X.csv', index=False)

In [115]:
print(trn_dum_imp_X.shape)
print(trn_origin_Y.shape)

trn_origin_Y.head()

(79853, 17)
(79853, 1)


Unnamed: 0,renewal
0,1
1,1
2,0
3,1
4,1
