In [1]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import math
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier, Pool, cv
from sklearn import metrics


%matplotlib inline

In [2]:
data = pd.read_csv('train.csv')

In [3]:
def preprocessing_data(data):
    data = data.drop([ 'issue_d', 'earliest_cr_line',  'zip_code', 'addr_state', 'record_id','policy_code'], axis=1)

    data['term'] = data['term'].map({' 36 months': 0, ' 60 months': 1})    
    data['initial_list_status'] = data['initial_list_status'].map({'f': 0, 'w': 1})
    data['application_type'] = data['application_type'].map({'INDIVIDUAL': 0, 'JOINT': 1})
    data['emp_length'] = data['emp_length'].map({'< 1 year': 1, '1 year': 2, '2 years': 3,  '3 years': 4,  '4 years': 5,  '5 years': 6,  '6 years': 7,  '7 years': 8,  '8 years': 9,  '9 years': 10,  '10+ years': 11})

    data.emp_length.fillna(value=0,inplace=True)
    data.revol_util.fillna(value=0,inplace=True)  
    data['emp_title'] = data.emp_title.map(lambda x: 0 if x == 'n/a' else 1)

    data.collections_12_mths_ex_med.fillna(value=data.collections_12_mths_ex_med.mean(),inplace=True)
    data.mths_since_last_delinq.fillna(value=data.mths_since_last_delinq.mean(),inplace=True)
    data.tot_coll_amt.fillna(value=data.tot_coll_amt.mean(),inplace=True)
    data.tot_cur_bal.fillna(value=data.tot_coll_amt.mean(),inplace=True)
    data.total_rev_hi_lim.fillna(value=data.total_rev_hi_lim.mean(),inplace=True)
    return data


In [4]:
train, test = train_test_split(data, test_size=0.3, random_state=42)

train = preprocessing_data(train)
test = preprocessing_data(test)
X_train = train.drop(['loan_status'], axis=1)
y_train = train['loan_status']

X_test = test.drop(['loan_status'], axis=1)
y_test = test['loan_status']

In [11]:
categorical_columns = [c for c in X_train.columns if X_train[c].dtype.name == 'object']
cat_features = list(map(lambda x: list(X_train.columns).index(x), categorical_columns))

# print(cat_features)

In [9]:
model = CatBoostClassifier(iterations = 500)
model.fit(X_train, y_train, cat_features=cat_features)
preds = model.predict(X_test)

Learning rate set to 0.125838
0:	learn: 0.6415308	total: 269ms	remaining: 2m 14s
1:	learn: 0.6031088	total: 480ms	remaining: 1m 59s
2:	learn: 0.5745506	total: 706ms	remaining: 1m 56s
3:	learn: 0.5535024	total: 930ms	remaining: 1m 55s
4:	learn: 0.5375163	total: 1.12s	remaining: 1m 50s
5:	learn: 0.5249922	total: 1.32s	remaining: 1m 48s
6:	learn: 0.5165059	total: 1.53s	remaining: 1m 47s
7:	learn: 0.5093694	total: 1.74s	remaining: 1m 46s
8:	learn: 0.5037844	total: 1.93s	remaining: 1m 45s
9:	learn: 0.4995950	total: 2.11s	remaining: 1m 43s
10:	learn: 0.4960499	total: 2.31s	remaining: 1m 42s
11:	learn: 0.4934510	total: 2.5s	remaining: 1m 41s
12:	learn: 0.4916802	total: 2.72s	remaining: 1m 41s
13:	learn: 0.4903010	total: 2.9s	remaining: 1m 40s
14:	learn: 0.4888257	total: 3.1s	remaining: 1m 40s
15:	learn: 0.4876775	total: 3.31s	remaining: 1m 40s
16:	learn: 0.4866612	total: 3.5s	remaining: 1m 39s
17:	learn: 0.4859006	total: 3.7s	remaining: 1m 39s
18:	learn: 0.4852149	total: 3.91s	remaining: 1m 3

158:	learn: 0.4685141	total: 33s	remaining: 1m 10s
159:	learn: 0.4684436	total: 33.2s	remaining: 1m 10s
160:	learn: 0.4684090	total: 33.4s	remaining: 1m 10s
161:	learn: 0.4683596	total: 33.6s	remaining: 1m 10s
162:	learn: 0.4682962	total: 33.8s	remaining: 1m 9s
163:	learn: 0.4682172	total: 34s	remaining: 1m 9s
164:	learn: 0.4681243	total: 34.2s	remaining: 1m 9s
165:	learn: 0.4680906	total: 34.4s	remaining: 1m 9s
166:	learn: 0.4680248	total: 34.6s	remaining: 1m 8s
167:	learn: 0.4679388	total: 34.8s	remaining: 1m 8s
168:	learn: 0.4678438	total: 35s	remaining: 1m 8s
169:	learn: 0.4677319	total: 35.2s	remaining: 1m 8s
170:	learn: 0.4676690	total: 35.4s	remaining: 1m 8s
171:	learn: 0.4676020	total: 35.6s	remaining: 1m 7s
172:	learn: 0.4675102	total: 35.8s	remaining: 1m 7s
173:	learn: 0.4674471	total: 36s	remaining: 1m 7s
174:	learn: 0.4673649	total: 36.2s	remaining: 1m 7s
175:	learn: 0.4672802	total: 36.4s	remaining: 1m 6s
176:	learn: 0.4671960	total: 36.6s	remaining: 1m 6s
177:	learn: 0.46

318:	learn: 0.4593951	total: 1m 6s	remaining: 37.5s
319:	learn: 0.4593378	total: 1m 6s	remaining: 37.3s
320:	learn: 0.4592573	total: 1m 6s	remaining: 37.1s
321:	learn: 0.4591994	total: 1m 6s	remaining: 36.9s
322:	learn: 0.4591564	total: 1m 7s	remaining: 36.7s
323:	learn: 0.4591095	total: 1m 7s	remaining: 36.5s
324:	learn: 0.4590859	total: 1m 7s	remaining: 36.3s
325:	learn: 0.4590116	total: 1m 7s	remaining: 36.1s
326:	learn: 0.4589782	total: 1m 7s	remaining: 35.9s
327:	learn: 0.4589302	total: 1m 8s	remaining: 35.7s
328:	learn: 0.4588619	total: 1m 8s	remaining: 35.5s
329:	learn: 0.4587783	total: 1m 8s	remaining: 35.3s
330:	learn: 0.4587206	total: 1m 8s	remaining: 35.2s
331:	learn: 0.4586563	total: 1m 9s	remaining: 35s
332:	learn: 0.4586175	total: 1m 9s	remaining: 34.8s
333:	learn: 0.4585754	total: 1m 9s	remaining: 34.6s
334:	learn: 0.4585245	total: 1m 9s	remaining: 34.4s
335:	learn: 0.4584687	total: 1m 9s	remaining: 34.2s
336:	learn: 0.4584186	total: 1m 10s	remaining: 34s
337:	learn: 0.4

474:	learn: 0.4517341	total: 1m 41s	remaining: 5.32s
475:	learn: 0.4516773	total: 1m 41s	remaining: 5.11s
476:	learn: 0.4516385	total: 1m 41s	remaining: 4.89s
477:	learn: 0.4515749	total: 1m 41s	remaining: 4.68s
478:	learn: 0.4515331	total: 1m 41s	remaining: 4.47s
479:	learn: 0.4514991	total: 1m 42s	remaining: 4.25s
480:	learn: 0.4514475	total: 1m 42s	remaining: 4.04s
481:	learn: 0.4514021	total: 1m 42s	remaining: 3.83s
482:	learn: 0.4513488	total: 1m 42s	remaining: 3.62s
483:	learn: 0.4513102	total: 1m 43s	remaining: 3.4s
484:	learn: 0.4512805	total: 1m 43s	remaining: 3.19s
485:	learn: 0.4512304	total: 1m 43s	remaining: 2.98s
486:	learn: 0.4511954	total: 1m 43s	remaining: 2.77s
487:	learn: 0.4511341	total: 1m 43s	remaining: 2.55s
488:	learn: 0.4510777	total: 1m 44s	remaining: 2.34s
489:	learn: 0.4510190	total: 1m 44s	remaining: 2.13s
490:	learn: 0.4510064	total: 1m 44s	remaining: 1.91s
491:	learn: 0.4509653	total: 1m 44s	remaining: 1.7s
492:	learn: 0.4508918	total: 1m 44s	remaining: 1

In [10]:
print('CatBoost: ')
print('Accuracy score: ' + str(metrics.accuracy_score(y_test, preds)))

print('Recall score: ' + str(metrics.recall_score(y_test, preds)))

print('Precision score: ' + str(metrics.precision_score(y_test, preds)))

print('Auc score: ' + str(metrics.roc_auc_score(y_test, preds)))

CatBoost: 
Accuracy score: 0.7833391611302596
Recall score: 0.9718719274996794
Precision score: 0.7954028618409544
Auc score: 0.5452758778482497
