In [1]:
# importing lib 

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
from sklearn.metrics import accuracy_score

# Reading the data
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
test_original=test.copy()
train_original = train.copy()

# getting the shapes of the datasets
print("Shape of Train :", train.shape)
print("Shape of Test :", test.shape)

Shape of Train : (233154, 41)
Shape of Test : (112392, 40)


In [2]:
# filling the missing values in the Employment.Type attribute of train and test sets

# Employement Type has two types of Employment i.e., self employed and salaried
# but the empty values must be the people who don't  work at all that's why it is empty
# let's fill unemployed in the place of Null values

train['Employment.Type'].fillna('Unemployed', inplace = True)
test['Employment.Type'].fillna('Unemployed', inplace = True)

# let's check if there is any null values still left or not
print("Null values left in the train set:", train.isnull().sum().sum())
print("Null values left in the test set:", test.isnull().sum().sum())

Null values left in the train set: 0
Null values left in the test set: 0


In [3]:
# let's save the unique id of the test set and labels set

unique_id = test['UniqueID']
y_train = train.iloc[:, -1]

# let's delete the last column from the dataset to  concat train and test
train = train.drop(['loan_default'], axis = 1)

# shape of train
train.shape

(233154, 40)

In [4]:
# lets concat the train and test sets for preprocessing and visualizations

data = pd.concat([train, test], axis = 0)

# let's check the shape
data.shape

(345546, 40)

In [5]:
# encodings for type of employments

data['Employment.Type'] = data['Employment.Type'].replace(('Self employed', 'Salaried', 'Unemployed'), (2, 1, 0))

# checking the values  of employement type
data['Employment.Type'].value_counts()

2    187429
1    147013
0     11104
Name: Employment.Type, dtype: int64

In [6]:
#performing log transformations on disbursed amount, ltv, and asset cost

data['disbursed_amount'] = np.log1p(data['disbursed_amount'])
data['ltv'] = np.log1p(data['ltv'])
data['asset_cost'] = np.log1p(data['asset_cost'])

In [7]:
import re
def cal_age(file,col_name):
    date=[]
    for each in file[col_name]:   # 'Date.of.Birth'
        w=re.findall(r'\d+',each)
        old_age='19'+w[2]
        age=2018-int(old_age)
        date.append(age)
    file['Age']=date
cal_age(data,'Date.of.Birth')

In [8]:
def cal_year(file,col_name):
    clean=[]
    for each in file[col_name]:  # 'AVERAGE.ACCT.AGE'
        w=re.findall(r'\d+',each)
        month=int(w[0])+int(w[1])/12
        clean.append(month)
    file[col_name]=clean

cal_year(data,'AVERAGE.ACCT.AGE')
cal_year(data,'CREDIT.HISTORY.LENGTH')

In [9]:
data.head(10)

Unnamed: 0,UniqueID,disbursed_amount,asset_cost,ltv,branch_id,supplier_id,manufacturer_id,Current_pincode_ID,Date.of.Birth,Employment.Type,...,SEC.SANCTIONED.AMOUNT,SEC.DISBURSED.AMOUNT,PRIMARY.INSTAL.AMT,SEC.INSTAL.AMT,NEW.ACCTS.IN.LAST.SIX.MONTHS,DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS,AVERAGE.ACCT.AGE,CREDIT.HISTORY.LENGTH,NO.OF_INQUIRIES,Age
0,420825,10.831292,10.975088,4.505902,67,22807,45,1441,01-01-84,1,...,0,0,0,0,0,0,0.0,0.0,0,34
1,537409,10.761004,11.090584,4.307168,67,22807,45,1502,31-07-85,2,...,0,0,1991,0,0,1,1.916667,1.916667,0,33
2,417566,10.883298,11.02453,4.506785,67,22807,45,1497,24-08-85,2,...,0,0,0,0,0,0,0.0,0.0,0,33
3,624493,10.959784,11.099136,4.494015,67,22807,45,1501,30-12-93,2,...,0,0,31,0,0,0,0.666667,1.25,1,25
4,539055,10.866261,11.007104,4.493009,67,22807,45,1495,09-12-77,2,...,0,0,0,0,0,0,0.0,0.0,1,41
5,518279,10.906213,11.033292,4.507116,67,22807,45,1501,08-09-90,2,...,0,0,1347,0,0,0,1.75,2.0,0,28
6,529269,10.743977,11.026809,4.349245,67,22807,45,1502,01-06-88,1,...,0,0,0,0,0,0,0.0,0.0,0,30
7,510278,10.689556,11.033292,4.288951,67,22807,45,1501,04-10-89,1,...,0,0,0,0,0,0,0.166667,0.166667,0,29
8,490213,10.891429,11.03447,4.506013,67,22807,45,1497,15-11-91,2,...,0,0,0,0,0,0,4.666667,4.666667,1,27
9,510980,10.870547,11.023551,4.476768,67,22807,45,1492,01-06-68,1,...,0,0,2608,0,0,0,1.583333,1.583333,0,50


In [10]:
data['PRIMARY.INSTAL.AMT'] = np.log1p(data['PRIMARY.INSTAL.AMT'])
data['SEC.INSTAL.AMT'] = np.log1p(data['SEC.INSTAL.AMT'])

In [11]:
# applying log transformation to all these attributes

data['SEC.NO.OF.ACCTS'] = np.log1p(data['SEC.NO.OF.ACCTS'])
data['SEC.ACTIVE.ACCTS'] = np.log1p(data['SEC.ACTIVE.ACCTS'])
data['SEC.OVERDUE.ACCTS'] = np.log1p(data['SEC.OVERDUE.ACCTS'])
#data['SEC.CURRENT.BALANCE'] = np.log1p(data['SEC.CURRENT.BALANCE'])
data['SEC.SANCTIONED.AMOUNT'] = np.log1p(data['SEC.SANCTIONED.AMOUNT'])
data['SEC.DISBURSED.AMOUNT'] = np.log1p(data['SEC.DISBURSED.AMOUNT'])

#  filling  missing values in sec.current.balance
data['SEC.CURRENT.BALANCE'].fillna(data['SEC.CURRENT.BALANCE'].mean(), inplace = True)

In [12]:
#  applying log transformations to the primary account attributes

data['PRI.NO.OF.ACCTS'] = np.log1p(data['PRI.NO.OF.ACCTS'])
data['PRI.ACTIVE.ACCTS'] = np.log1p(data['PRI.ACTIVE.ACCTS'])
data['PRI.OVERDUE.ACCTS'] = np.log1p(data['PRI.OVERDUE.ACCTS'])
#data['PRI.CURRENT.BALANCE'] = np.log1p(data['PRI.CURRENT.BALANCE'])
#data['PRI.SANCTIONED.AMOUNT'] = np.log1p(data['PRI.SANCTIONED.AMOUNT'])
data['PRI.DISBURSED.AMOUNT'] = np.log1p(data['PRI.DISBURSED.AMOUNT'])


#  filling  missing values in sec.current.balance
data['PRI.CURRENT.BALANCE'].fillna(data['PRI.CURRENT.BALANCE'].mean(), inplace = True)
data['PRI.SANCTIONED.AMOUNT'].fillna(data['PRI.SANCTIONED.AMOUNT'].mean(), inplace = True)

In [13]:
# encodings for bureau score(perform cns score distribution)

data['PERFORM_CNS.SCORE.DESCRIPTION'] = data['PERFORM_CNS.SCORE.DESCRIPTION'].replace('No Bureau History Available', 0)
data['PERFORM_CNS.SCORE.DESCRIPTION'] = data['PERFORM_CNS.SCORE.DESCRIPTION'].replace('Not Scored: Sufficient History Not Available', 0)
data['PERFORM_CNS.SCORE.DESCRIPTION'] = data['PERFORM_CNS.SCORE.DESCRIPTION'].replace('Not Scored: Not Enough Info available on the customer', 0)
data['PERFORM_CNS.SCORE.DESCRIPTION'] = data['PERFORM_CNS.SCORE.DESCRIPTION'].replace('Not Scored: No Activity seen on the customer (Inactive)',0)
data['PERFORM_CNS.SCORE.DESCRIPTION'] = data['PERFORM_CNS.SCORE.DESCRIPTION'].replace('Not Scored: No Updates available in last 36 months', 0)
data['PERFORM_CNS.SCORE.DESCRIPTION'] = data['PERFORM_CNS.SCORE.DESCRIPTION'].replace('Not Scored: Only a Guarantor', 0)
data['PERFORM_CNS.SCORE.DESCRIPTION'] = data['PERFORM_CNS.SCORE.DESCRIPTION'].replace('Not Scored: More than 50 active Accounts found',0)
data['PERFORM_CNS.SCORE.DESCRIPTION'] = data['PERFORM_CNS.SCORE.DESCRIPTION'].replace('M-Very High Risk', 1)
data['PERFORM_CNS.SCORE.DESCRIPTION'] = data['PERFORM_CNS.SCORE.DESCRIPTION'].replace('L-Very High Risk', 1)
data['PERFORM_CNS.SCORE.DESCRIPTION'] = data['PERFORM_CNS.SCORE.DESCRIPTION'].replace('K-High Risk', 2)
data['PERFORM_CNS.SCORE.DESCRIPTION'] = data['PERFORM_CNS.SCORE.DESCRIPTION'].replace('J-High Risk', 2)
data['PERFORM_CNS.SCORE.DESCRIPTION'] = data['PERFORM_CNS.SCORE.DESCRIPTION'].replace('I-Medium Risk', 3)
data['PERFORM_CNS.SCORE.DESCRIPTION'] = data['PERFORM_CNS.SCORE.DESCRIPTION'].replace('H-Medium Risk', 3)
data['PERFORM_CNS.SCORE.DESCRIPTION'] = data['PERFORM_CNS.SCORE.DESCRIPTION'].replace('G-Low Risk', 4)
data['PERFORM_CNS.SCORE.DESCRIPTION'] = data['PERFORM_CNS.SCORE.DESCRIPTION'].replace('F-Low Risk', 4)
data['PERFORM_CNS.SCORE.DESCRIPTION'] = data['PERFORM_CNS.SCORE.DESCRIPTION'].replace('E-Low Risk', 4)
data['PERFORM_CNS.SCORE.DESCRIPTION'] = data['PERFORM_CNS.SCORE.DESCRIPTION'].replace('D-Very Low Risk', 5)
data['PERFORM_CNS.SCORE.DESCRIPTION'] = data['PERFORM_CNS.SCORE.DESCRIPTION'].replace('C-Very Low Risk', 5)
data['PERFORM_CNS.SCORE.DESCRIPTION'] = data['PERFORM_CNS.SCORE.DESCRIPTION'].replace('B-Very Low Risk', 5)
data['PERFORM_CNS.SCORE.DESCRIPTION'] = data['PERFORM_CNS.SCORE.DESCRIPTION'].replace('A-Very Low Risk', 5)

In [14]:
data['PERFORM_CNS.SCORE'] = np.log1p(data['PERFORM_CNS.SCORE'])

In [15]:
# some attributes are categorical but they are in integer so let's convert them into category

data['branch_id'] = data['branch_id'].astype('category')
data['manufacturer_id'] = data['manufacturer_id'].astype('category')
data['State_ID'] = data['State_ID'].astype('category')

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
data['branch_id'] = le.fit_transform(data['branch_id'])
data['manufacturer_id'] = le.fit_transform(data['manufacturer_id'])
data['State_ID'] = le.fit_transform(data['State_ID'])

# # checking the values in these attributes
# print(data['branch_id'].value_counts())
# print(data['manufacturer_id'].value_counts())
# print(data['State_ID'].value_counts())

1     20527
30    15962
2     14881
20    11520
56    10917
18    10510
3     10276
12     8864
15     8777
0      8337
57     7870
24     7852
14     7723
59     7585
60     7258
61     7180
23     6577
36     6350
16     6103
47     5898
8      5873
7      5685
65     5594
55     5468
52     5292
40     5220
74     5201
11     4806
28     4630
5      4472
      ...  
76     2186
17     2175
77     1882
67     1624
54     1461
13     1427
41     1414
72     1366
37     1305
25     1285
69     1284
62     1215
53     1211
32     1188
19      978
51      916
78      907
22      845
79      818
45      766
58      708
44      664
46      659
70      613
29      543
80      535
81      487
42      394
50      145
63      101
Name: branch_id, Length: 82, dtype: int64
5     161203
0      87053
3      40927
1      22964
2      14812
6      14049
4       3364
7       1138
9         25
8          9
11         1
10         1
Name: manufacturer_id, dtype: int64
3     70438
5     48903
2     4786

In [16]:
# lets extract features from disbursal dates
# as all  the disbursement dates are of year 2018 so we can extract the months

data['DisbursalDate'] = pd.to_datetime(data['DisbursalDate'], errors = 'coerce')

# extracting the month of the disbursement
data['DisbursalMonth'] = data['DisbursalDate'].dt.month

data['DisbursalMonth'].value_counts()

11    99420
10    89440
8     58586
9     57939
12    10659
3      7601
6      7024
4      4627
7      4339
5      4178
1      1708
2        25
Name: DisbursalMonth, dtype: int64

In [17]:
# removing unnecassary columns

data = data.drop(['UniqueID', 'supplier_id', 'Current_pincode_ID', 'Date.of.Birth', 'DisbursalDate', 'Employee_code_ID'], axis = 1)

# checking the new columns of data
data.columns

Index(['disbursed_amount', 'asset_cost', 'ltv', 'branch_id', 'manufacturer_id',
       'Employment.Type', 'State_ID', 'MobileNo_Avl_Flag', 'Aadhar_flag',
       'PAN_flag', 'VoterID_flag', 'Driving_flag', 'Passport_flag',
       'PERFORM_CNS.SCORE', 'PERFORM_CNS.SCORE.DESCRIPTION', 'PRI.NO.OF.ACCTS',
       'PRI.ACTIVE.ACCTS', 'PRI.OVERDUE.ACCTS', 'PRI.CURRENT.BALANCE',
       'PRI.SANCTIONED.AMOUNT', 'PRI.DISBURSED.AMOUNT', 'SEC.NO.OF.ACCTS',
       'SEC.ACTIVE.ACCTS', 'SEC.OVERDUE.ACCTS', 'SEC.CURRENT.BALANCE',
       'SEC.SANCTIONED.AMOUNT', 'SEC.DISBURSED.AMOUNT', 'PRIMARY.INSTAL.AMT',
       'SEC.INSTAL.AMT', 'NEW.ACCTS.IN.LAST.SIX.MONTHS',
       'DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS', 'AVERAGE.ACCT.AGE',
       'CREDIT.HISTORY.LENGTH', 'NO.OF_INQUIRIES', 'Age', 'DisbursalMonth'],
      dtype='object')

In [18]:
# separating train and test datasets from data

x_train = data.iloc[:233154,:]
x_test = data.iloc[233154:,:]

# checking the shape of train and test
print("Shape of train :", x_train.shape)
print("Shape of test :", x_test.shape)

Shape of train : (233154, 36)
Shape of test : (112392, 36)


In [19]:
# applying SMOTE

from imblearn.over_sampling import SMOTE

x_resample, y_resample = SMOTE().fit_sample(x_train, y_train.values.ravel()) 

# checking the shape of x_resample and y_resample
print("Shape of x:", x_resample.shape)
print("Shape of y:", y_resample.shape)

Shape of x: (365086, 36)
Shape of y: (365086,)


In [20]:
# train and valid sets from train
from sklearn.model_selection import train_test_split

x_train, x_valid, y_train, y_valid = train_test_split(x_resample, y_resample, test_size = 0.2, random_state = 0)

# checking the shapes
print(x_train.shape)
print(y_train.shape)
print(x_valid.shape)
print(y_valid.shape)

(292068, 36)
(292068,)
(73018, 36)
(73018,)


In [21]:
# applying standardization

# standardization

from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_valid = sc.transform(x_valid)
x_test = sc.transform(x_test)


In [27]:
# catboot classifier
import catboost as cb
cb_model = cb.CatBoostClassifier()
cb_model.fit(x_train, y_train)

y_pred = cb_model.predict(x_valid)

cm = accuracy_score(y_valid, y_pred)
print(cm)

Learning rate set to 0.083533
0:	learn: 0.6477171	total: 240ms	remaining: 4m
1:	learn: 0.6190999	total: 391ms	remaining: 3m 15s
2:	learn: 0.5968139	total: 570ms	remaining: 3m 9s
3:	learn: 0.5902663	total: 739ms	remaining: 3m 4s
4:	learn: 0.5741180	total: 906ms	remaining: 3m
5:	learn: 0.5556893	total: 1.07s	remaining: 2m 57s
6:	learn: 0.5450022	total: 1.23s	remaining: 2m 54s
7:	learn: 0.5407230	total: 1.4s	remaining: 2m 53s
8:	learn: 0.5334057	total: 1.58s	remaining: 2m 54s
9:	learn: 0.5255529	total: 1.74s	remaining: 2m 52s
10:	learn: 0.5193818	total: 1.91s	remaining: 2m 51s
11:	learn: 0.5146998	total: 2.1s	remaining: 2m 53s
12:	learn: 0.5098098	total: 2.3s	remaining: 2m 54s
13:	learn: 0.5060478	total: 2.51s	remaining: 2m 56s
14:	learn: 0.4912030	total: 2.7s	remaining: 2m 57s
15:	learn: 0.4863476	total: 2.88s	remaining: 2m 57s
16:	learn: 0.4832278	total: 3.08s	remaining: 2m 58s
17:	learn: 0.4767848	total: 3.29s	remaining: 2m 59s
18:	learn: 0.4753501	total: 3.47s	remaining: 2m 59s
19:	le

158:	learn: 0.3930621	total: 28.7s	remaining: 2m 31s
159:	learn: 0.3929106	total: 28.9s	remaining: 2m 31s
160:	learn: 0.3927493	total: 29s	remaining: 2m 31s
161:	learn: 0.3922126	total: 29.2s	remaining: 2m 31s
162:	learn: 0.3916832	total: 29.4s	remaining: 2m 30s
163:	learn: 0.3916248	total: 29.5s	remaining: 2m 30s
164:	learn: 0.3914238	total: 29.7s	remaining: 2m 30s
165:	learn: 0.3913597	total: 30s	remaining: 2m 30s
166:	learn: 0.3911446	total: 30.3s	remaining: 2m 31s
167:	learn: 0.3911078	total: 30.5s	remaining: 2m 30s
168:	learn: 0.3902960	total: 30.7s	remaining: 2m 30s
169:	learn: 0.3896829	total: 30.8s	remaining: 2m 30s
170:	learn: 0.3894186	total: 31s	remaining: 2m 30s
171:	learn: 0.3893371	total: 31.1s	remaining: 2m 29s
172:	learn: 0.3892751	total: 31.4s	remaining: 2m 30s
173:	learn: 0.3891440	total: 31.7s	remaining: 2m 30s
174:	learn: 0.3890858	total: 31.9s	remaining: 2m 30s
175:	learn: 0.3888446	total: 32s	remaining: 2m 30s
176:	learn: 0.3887610	total: 32.2s	remaining: 2m 29s
1

315:	learn: 0.3726195	total: 58.2s	remaining: 2m 6s
316:	learn: 0.3725839	total: 58.4s	remaining: 2m 5s
317:	learn: 0.3720264	total: 58.6s	remaining: 2m 5s
318:	learn: 0.3719986	total: 58.7s	remaining: 2m 5s
319:	learn: 0.3717475	total: 58.9s	remaining: 2m 5s
320:	learn: 0.3714418	total: 59.1s	remaining: 2m 4s
321:	learn: 0.3713245	total: 59.2s	remaining: 2m 4s
322:	learn: 0.3712877	total: 59.4s	remaining: 2m 4s
323:	learn: 0.3712585	total: 59.6s	remaining: 2m 4s
324:	learn: 0.3706428	total: 59.8s	remaining: 2m 4s
325:	learn: 0.3705998	total: 1m	remaining: 2m 4s
326:	learn: 0.3705748	total: 1m	remaining: 2m 3s
327:	learn: 0.3705513	total: 1m	remaining: 2m 3s
328:	learn: 0.3704791	total: 1m	remaining: 2m 3s
329:	learn: 0.3702814	total: 1m	remaining: 2m 3s
330:	learn: 0.3702638	total: 1m	remaining: 2m 3s
331:	learn: 0.3702427	total: 1m 1s	remaining: 2m 2s
332:	learn: 0.3702186	total: 1m 1s	remaining: 2m 2s
333:	learn: 0.3700333	total: 1m 1s	remaining: 2m 2s
334:	learn: 0.3700105	total: 1

471:	learn: 0.3634852	total: 1m 25s	remaining: 1m 35s
472:	learn: 0.3634632	total: 1m 25s	remaining: 1m 35s
473:	learn: 0.3634473	total: 1m 25s	remaining: 1m 34s
474:	learn: 0.3634255	total: 1m 25s	remaining: 1m 34s
475:	learn: 0.3634125	total: 1m 25s	remaining: 1m 34s
476:	learn: 0.3633962	total: 1m 26s	remaining: 1m 34s
477:	learn: 0.3633634	total: 1m 26s	remaining: 1m 34s
478:	learn: 0.3633392	total: 1m 26s	remaining: 1m 33s
479:	learn: 0.3633286	total: 1m 26s	remaining: 1m 33s
480:	learn: 0.3633067	total: 1m 26s	remaining: 1m 33s
481:	learn: 0.3632852	total: 1m 26s	remaining: 1m 33s
482:	learn: 0.3632640	total: 1m 27s	remaining: 1m 33s
483:	learn: 0.3632377	total: 1m 27s	remaining: 1m 32s
484:	learn: 0.3631297	total: 1m 27s	remaining: 1m 32s
485:	learn: 0.3631003	total: 1m 27s	remaining: 1m 32s
486:	learn: 0.3628355	total: 1m 27s	remaining: 1m 32s
487:	learn: 0.3628046	total: 1m 27s	remaining: 1m 32s
488:	learn: 0.3627856	total: 1m 27s	remaining: 1m 31s
489:	learn: 0.3627618	total:

625:	learn: 0.3587180	total: 1m 50s	remaining: 1m 6s
626:	learn: 0.3586946	total: 1m 50s	remaining: 1m 5s
627:	learn: 0.3586791	total: 1m 50s	remaining: 1m 5s
628:	learn: 0.3586681	total: 1m 51s	remaining: 1m 5s
629:	learn: 0.3586499	total: 1m 51s	remaining: 1m 5s
630:	learn: 0.3586300	total: 1m 51s	remaining: 1m 5s
631:	learn: 0.3586123	total: 1m 51s	remaining: 1m 4s
632:	learn: 0.3585858	total: 1m 51s	remaining: 1m 4s
633:	learn: 0.3585722	total: 1m 51s	remaining: 1m 4s
634:	learn: 0.3585342	total: 1m 52s	remaining: 1m 4s
635:	learn: 0.3584998	total: 1m 52s	remaining: 1m 4s
636:	learn: 0.3584802	total: 1m 52s	remaining: 1m 4s
637:	learn: 0.3584692	total: 1m 52s	remaining: 1m 3s
638:	learn: 0.3584545	total: 1m 52s	remaining: 1m 3s
639:	learn: 0.3584450	total: 1m 52s	remaining: 1m 3s
640:	learn: 0.3584172	total: 1m 52s	remaining: 1m 3s
641:	learn: 0.3584007	total: 1m 53s	remaining: 1m 3s
642:	learn: 0.3583891	total: 1m 53s	remaining: 1m 2s
643:	learn: 0.3583548	total: 1m 53s	remaining:

783:	learn: 0.3546733	total: 2m 16s	remaining: 37.7s
784:	learn: 0.3546588	total: 2m 16s	remaining: 37.5s
785:	learn: 0.3546433	total: 2m 17s	remaining: 37.3s
786:	learn: 0.3546269	total: 2m 17s	remaining: 37.2s
787:	learn: 0.3546092	total: 2m 17s	remaining: 37s
788:	learn: 0.3545883	total: 2m 17s	remaining: 36.8s
789:	learn: 0.3545714	total: 2m 17s	remaining: 36.6s
790:	learn: 0.3545565	total: 2m 17s	remaining: 36.4s
791:	learn: 0.3545326	total: 2m 18s	remaining: 36.3s
792:	learn: 0.3544497	total: 2m 18s	remaining: 36.1s
793:	learn: 0.3544350	total: 2m 18s	remaining: 35.9s
794:	learn: 0.3544192	total: 2m 18s	remaining: 35.7s
795:	learn: 0.3544004	total: 2m 18s	remaining: 35.5s
796:	learn: 0.3543884	total: 2m 18s	remaining: 35.4s
797:	learn: 0.3543718	total: 2m 19s	remaining: 35.2s
798:	learn: 0.3543537	total: 2m 19s	remaining: 35s
799:	learn: 0.3543352	total: 2m 19s	remaining: 34.8s
800:	learn: 0.3543130	total: 2m 19s	remaining: 34.7s
801:	learn: 0.3542927	total: 2m 19s	remaining: 34.

939:	learn: 0.3511778	total: 2m 44s	remaining: 10.5s
940:	learn: 0.3511625	total: 2m 44s	remaining: 10.3s
941:	learn: 0.3510727	total: 2m 45s	remaining: 10.2s
942:	learn: 0.3510516	total: 2m 45s	remaining: 9.98s
943:	learn: 0.3510336	total: 2m 45s	remaining: 9.81s
944:	learn: 0.3510205	total: 2m 45s	remaining: 9.63s
945:	learn: 0.3510107	total: 2m 45s	remaining: 9.45s
946:	learn: 0.3510042	total: 2m 45s	remaining: 9.28s
947:	learn: 0.3509912	total: 2m 45s	remaining: 9.1s
948:	learn: 0.3509751	total: 2m 46s	remaining: 8.92s
949:	learn: 0.3509603	total: 2m 46s	remaining: 8.75s
950:	learn: 0.3509494	total: 2m 46s	remaining: 8.57s
951:	learn: 0.3509423	total: 2m 46s	remaining: 8.4s
952:	learn: 0.3509221	total: 2m 46s	remaining: 8.22s
953:	learn: 0.3509040	total: 2m 46s	remaining: 8.04s
954:	learn: 0.3508910	total: 2m 47s	remaining: 7.87s
955:	learn: 0.3508753	total: 2m 47s	remaining: 7.69s
956:	learn: 0.3508512	total: 2m 47s	remaining: 7.52s
957:	learn: 0.3508268	total: 2m 47s	remaining: 7

In [1]:
params = {'depth':[3,1,2,6,4,5,7,8,9,10],
          'iterations':[250,100,500,1000],
          'learning_rate':[0.03,0.001,0.01,0.1,0.2,0.3], 
          'l2_leaf_reg':[3,1,5,10,100],
          'border_count':[32,5,10,20,50,100,200],
          'ctr_border_count':[50,5,10,20,100,200],
          'thread_count':4}

In [23]:
# this function does 3-fold crossvalidation with catboostclassifier          
def crossvaltest(params,train_set,train_label,cat_dims,n_splits=3):
    kf = KFold(n_splits=n_splits,shuffle=True) 
    res = []
    for train_index, test_index in kf.split(train_set):
        train = train_set.iloc[train_index,:]
        test = train_set.iloc[test_index,:]

        labels = train_label.ix[train_index]
        test_labels = train_label.ix[test_index]

        clf = cb.CatBoostClassifier(**params)
        clf.fit(train, np.ravel(labels), cat_features=cat_dims)

        res.append(np.mean(clf.predict(test)==np.ravel(test_labels)))
    return np.mean(res)


Average Precision Score : 0.8090646225869282


In [31]:
# let's predict for the tests set

y_pred_lgb = cb_model.predict(x_test)

# lets look at the submission file

submission = pd.read_csv('sample_submission_24jSKY6.csv')

submission.head()

Unnamed: 0,UniqueID,loan_default
0,655269,0
1,723482,0
2,758529,0
3,763449,0
4,708663,0


In [32]:
#  let's create a submission file

#Create a  DataFrame with the passengers ids and our prediction regarding whether they survived or not
submission = pd.DataFrame({'UniqueID': unique_id,'loan_default': y_pred_lgb})

#Visualize the first 5 rows
submission.head()

Unnamed: 0,UniqueID,loan_default
0,655269,0.0
1,723482,0.0
2,758529,0.0
3,763449,0.0
4,708663,0.0


In [33]:
#Convert DataFrame to a csv file that can be uploaded
#This is saved in the same directory as your notebook
filename = 'catboost.csv'

submission.to_csv(filename,index=False)

print('Saved file: ' + filename)

Saved file: catboost.csv
