# Zindi Competition- Loan Default Prediction
### By: Majid Lagzian

Link to the competition: https://zindi.africa/competitions/data-science-nigeria-challenge-1-loan-default-prediction

Metric of interest: Accuracy

In [1]:
import pandas as pd
import os.path as osp
import numpy as np
import datetime
from datetime import timedelta
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# First, we import our data from the appropriate files
trainperf_path = osp.join(osp.curdir,'Zindi','trainperf.csv')
trainperf_data = pd.read_csv(trainperf_path, parse_dates=['approveddate','creationdate'])
testperf_path = osp.join(osp.curdir,'Zindi','testperf.csv')
testperf_data = pd.read_csv(testperf_path, parse_dates=['approveddate','creationdate'])
trainperf_data.head()

Unnamed: 0,customerid,systemloanid,loannumber,approveddate,creationdate,loanamount,totaldue,termdays,referredby,good_bad_flag
0,8a2a81a74ce8c05d014cfb32a0da1049,301994762,12,2017-07-25 08:22:56,2017-07-25 07:22:47,30000.0,34500.0,30,,Good
1,8a85886e54beabf90154c0a29ae757c0,301965204,2,2017-07-05 17:04:41,2017-07-05 16:04:18,15000.0,17250.0,30,,Good
2,8a8588f35438fe12015444567666018e,301966580,7,2017-07-06 14:52:57,2017-07-06 13:52:51,20000.0,22250.0,15,,Good
3,8a85890754145ace015429211b513e16,301999343,3,2017-07-27 19:00:41,2017-07-27 18:00:35,10000.0,11500.0,15,,Good
4,8a858970548359cc0154883481981866,301962360,9,2017-07-03 23:42:45,2017-07-03 22:42:39,40000.0,44000.0,30,,Good


In [3]:
traindemo_path = osp.join(osp.curdir,'Zindi','traindemographics.csv')
traindemo_data = pd.read_csv(traindemo_path, parse_dates=['birthdate'])
testdemo_path = osp.join(osp.curdir,'Zindi','testdemographics.csv')
testdemo_data = pd.read_csv(testdemo_path, parse_dates=['birthdate'])
traindemo_data.head()

Unnamed: 0,customerid,birthdate,bank_account_type,longitude_gps,latitude_gps,bank_name_clients,bank_branch_clients,employment_status_clients,level_of_education_clients
0,8a858e135cb22031015cbafc76964ebd,1973-10-10,Savings,3.319219,6.528604,GT Bank,,,
1,8a858e275c7ea5ec015c82482d7c3996,1986-01-21,Savings,3.325598,7.119403,Sterling Bank,,Permanent,
2,8a858e5b5bd99460015bdc95cd485634,1987-04-01,Savings,5.7461,5.563174,Fidelity Bank,,,
3,8a858efd5ca70688015cabd1f1e94b55,1991-07-19,Savings,3.36285,6.642485,GT Bank,,Permanent,
4,8a858e785acd3412015acd48f4920d04,1982-11-22,Savings,8.455332,11.97141,GT Bank,,Permanent,


In [4]:
trainprev_path = osp.join(osp.curdir,'Zindi','trainprevloans.csv')
trainprev_data = pd.read_csv(trainprev_path, parse_dates=['approveddate','creationdate','closeddate',
                                                                           'firstduedate','firstrepaiddate'])
testprev_path = osp.join(osp.curdir,'Zindi','testprevloans.csv')
testprev_data = pd.read_csv(testprev_path, parse_dates=['approveddate','creationdate','closeddate',
                                                                           'firstduedate','firstrepaiddate'])
trainprev_data.head()

Unnamed: 0,customerid,systemloanid,loannumber,approveddate,creationdate,loanamount,totaldue,termdays,closeddate,referredby,firstduedate,firstrepaiddate
0,8a2a81a74ce8c05d014cfb32a0da1049,301682320,2,2016-08-15 18:22:40,2016-08-15 17:22:32,10000.0,13000.0,30,2016-09-01 16:06:48,,2016-09-14,2016-09-01 15:51:43
1,8a2a81a74ce8c05d014cfb32a0da1049,301883808,9,2017-04-28 18:39:07,2017-04-28 17:38:53,10000.0,13000.0,30,2017-05-28 14:44:49,,2017-05-30,2017-05-26 00:00:00
2,8a2a81a74ce8c05d014cfb32a0da1049,301831714,8,2017-03-05 10:56:25,2017-03-05 09:56:19,20000.0,23800.0,30,2017-04-26 22:18:56,,2017-04-04,2017-04-26 22:03:47
3,8a8588f35438fe12015444567666018e,301861541,5,2017-04-09 18:25:55,2017-04-09 17:25:42,10000.0,11500.0,15,2017-04-24 01:35:52,,2017-04-24,2017-04-24 00:48:43
4,8a85890754145ace015429211b513e16,301941754,2,2017-06-17 09:29:57,2017-06-17 08:29:50,10000.0,11500.0,15,2017-07-14 21:18:43,,2017-07-03,2017-07-14 21:08:35


##Dataset sizes and information

In [5]:
print('Train Demographic: {} Rows, {} Columns'.format(*traindemo_data.shape))
print('Train Performance: {} Rows, {} Columns'.format(*trainperf_data.shape))
print('Train Previous Loan: {} Rows, {} Columns'.format(*trainprev_data.shape))
print('Test Demographic: {} Rows, {} Columns'.format(*testdemo_data.shape))
print('Test Performance: {} Rows, {} Columns'.format(*testperf_data.shape))
print('Test Previous Loan: {} Rows, {} Columns'.format(*testprev_data.shape))

Train Demographic: 4346 Rows, 9 Columns
Train Performance: 4368 Rows, 10 Columns
Train Previous Loan: 18183 Rows, 12 Columns
Test Demographic: 1487 Rows, 9 Columns
Test Performance: 1450 Rows, 9 Columns
Test Previous Loan: 5907 Rows, 12 Columns


##Number of performance rows existing in other datasets

In [6]:
print('Train Performance has {} IDs in common with Train Demographic'.format(trainperf_data['customerid'].isin(traindemo_data['customerid']).sum()))
print('Train Performance has {} IDs in common with Test Demographic'.format(trainperf_data['customerid'].isin(testdemo_data['customerid']).sum()))
print('Train Performance has {} IDs in common with Train Previous Loan'.format(trainperf_data['customerid'].isin(trainprev_data['customerid']).sum()))
print('Train Performance has {} IDs in common with Test Previous Loan'.format(trainperf_data['customerid'].isin(testprev_data['customerid']).sum()))
print('-'*100)
print('Test Performance has {} IDs in common with Train Demographic'.format(testperf_data['customerid'].isin(traindemo_data['customerid']).sum()))
print('Test Performance has {} IDs in common with Test Demographic'.format(testperf_data['customerid'].isin(testdemo_data['customerid']).sum()))
print('Test Performance has {} IDs in common with Train Previous Loan'.format(testperf_data['customerid'].isin(trainprev_data['customerid']).sum()))
print('Test Performance has {} IDs in common with Test Previous Loan'.format(testperf_data['customerid'].isin(testprev_data['customerid']).sum()))

Train Performance has 3269 IDs in common with Train Demographic
Train Performance has 1099 IDs in common with Test Demographic
Train Performance has 4359 IDs in common with Train Previous Loan
Train Performance has 0 IDs in common with Test Previous Loan
----------------------------------------------------------------------------------------------------
Test Performance has 1065 IDs in common with Train Demographic
Test Performance has 385 IDs in common with Test Demographic
Test Performance has 0 IDs in common with Train Previous Loan
Test Performance has 1442 IDs in common with Test Previous Loan


Conclusion: We need to merge train and test demographic datasets, but we won't merge test and train performance datasets to prevent data leakage.

##Cleaning previous loans dataset

In [7]:
trainprev_data.drop_duplicates(keep='last',inplace=True)
trainprev_data.describe(include='all')

  trainprev_data.describe(include='all')
  trainprev_data.describe(include='all')
  trainprev_data.describe(include='all')
  trainprev_data.describe(include='all')
  trainprev_data.describe(include='all')


Unnamed: 0,customerid,systemloanid,loannumber,approveddate,creationdate,loanamount,totaldue,termdays,closeddate,referredby,firstduedate,firstrepaiddate
count,18183,18183.0,18183.0,18183,18183,18183.0,18183.0,18183.0,18183,1026,18183,18183
unique,4359,,,18172,18173,,,,17955,521,363,18011
top,8a858f7d5578012a01557ea194d94948,,,2017-04-20 14:00:44,2017-06-12 10:38:36,,,,2016-11-07 00:53:42,8a858fc55b2548dd015b286e452c678c,2017-07-03 00:00:00,2016-08-04 00:00:00
freq,26,,,2,2,,,,5,14,398,7
first,,,,2016-01-15 08:53:28,2016-01-15 07:53:17,,,,2016-02-02 08:18:15,,2016-02-15 00:00:00,2016-02-02 08:13:55
last,,,,2017-07-28 10:47:43,2017-07-28 09:46:34,,,,2017-07-30 22:09:11,,2017-08-22 00:00:00,2017-07-30 21:59:01
mean,,301839500.0,4.189353,,,16501.23742,19573.202931,26.69279,,,,
std,,93677.67,3.24949,,,9320.547516,10454.245277,10.946556,,,,
min,,301600100.0,1.0,,,3000.0,3450.0,15.0,,,,
25%,,301776600.0,2.0,,,10000.0,11500.0,15.0,,,,


In [8]:
#Interest amount
trainprev_data['interest'] = trainprev_data['totaldue']-trainprev_data['loanamount']

# Days before due
trainprev_data['days_remained'] = trainprev_data['termdays']- (trainprev_data['closeddate'] - trainprev_data['approveddate']).dt.days

# Default or not
trainprev_data['goodflags'] = trainprev_data.apply(lambda x: 0 if (x['days_remained']<0) else 1, axis=1)

In [9]:
new_trainprev=trainprev_data.groupby('customerid').agg({'loanamount':['sum','mean'],'interest':['sum','mean'],'termdays':'mean',
                                                        'days_remained':['mean','max','min'],'goodflags':'sum'})

new_trainprev.columns = ['_'.join(col).strip() for col in new_trainprev.columns.values]

In [10]:
new_trainprev.head()

Unnamed: 0_level_0,loanamount_sum,loanamount_mean,interest_sum,interest_mean,termdays_mean,days_remained_mean,days_remained_max,days_remained_min,goodflags_sum
customerid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
8a1088a0484472eb01484669e3ce4e0b,10000.0,10000.0,1500.0,1500.0,15.0,-7.0,-7,-7,0
8a1a1e7e4f707f8b014f797718316cad,70000.0,17500.0,19500.0,4875.0,37.5,6.25,26,-2,3
8a1a32fc49b632520149c3b8fdf85139,90000.0,12857.142857,16500.0,2357.142857,19.285714,1.0,3,-1,6
8a1eb5ba49a682300149c3c068b806c7,130000.0,16250.0,32400.0,4050.0,33.75,5.25,12,0,8
8a1edbf14734127f0147356fdb1b1eb2,20000.0,10000.0,4500.0,2250.0,22.5,4.5,8,1,2


##Converting the test previous loan dataset to the same format

In [11]:
testprev_data.drop_duplicates(keep='last',inplace=True)
testprev_data['interest'] = testprev_data['totaldue']-testprev_data['loanamount']
testprev_data['days_remained'] = (testprev_data['approveddate'] - testprev_data['closeddate']).dt.days +testprev_data['termdays']
testprev_data['goodflags'] = testprev_data.apply(lambda x: 0 if (x['days_remained']<0) else 1, axis=1)
new_testprev=testprev_data.groupby('customerid').agg({'loanamount':['sum','mean'],'interest':['sum','mean'],'termdays':'mean',
                                                        'days_remained':['mean','max','min'],'goodflags':'sum'})
new_testprev.columns = ['_'.join(col).strip() for col in new_testprev.columns.values]

## Merging and cleaning demographic data

In [12]:
demographic_data=pd.concat([traindemo_data,testdemo_data])
demographic_data.drop_duplicates(keep='last',inplace=True)
demographic_data.describe(include='all')

  demographic_data.describe(include='all')


Unnamed: 0,customerid,birthdate,bank_account_type,longitude_gps,latitude_gps,bank_name_clients,bank_branch_clients,employment_status_clients,level_of_education_clients
count,5818,5818,5818,5818.0,5818.0,5818,65,4953,795
unique,5818,4078,3,,,18,55,6,4
top,8a858e135cb22031015cbafc76964ebd,1980-09-22 00:00:00,Savings,,,GT Bank,APAPA,Permanent,Graduate
freq,1,6,4570,,,2128,3,4205,576
first,,1961-10-13 00:00:00,,,,,,,
last,,1996-03-28 00:00:00,,,,,,,
mean,,,,4.576024,7.220867,,,,
std,,,,6.98348,3.047189,,,,
min,,,,-149.0337,-74.005974,,,,
25%,,,,3.354953,6.473605,,,,


In [13]:
# Age
demographic_data['Age'] = datetime.datetime.now().year - demographic_data['birthdate'].dt.year

In [14]:
# Bank account types
demographic_data["bank_account_type"].value_counts()

Savings    4570
Other      1168
Current      80
Name: bank_account_type, dtype: int64

In [15]:
# Bank names
larger_50=demographic_data["bank_name_clients"].value_counts()[demographic_data["bank_name_clients"].value_counts()>=50]
otherbanks=demographic_data["bank_name_clients"].value_counts()[demographic_data["bank_name_clients"].value_counts()<50]
print(larger_50)
print('\nTotal number of other banks is',otherbanks.sum())

GT Bank          2128
First Bank        790
Access Bank       567
UBA               461
Zenith Bank       412
Diamond Bank      392
EcoBank           202
Stanbic IBTC      193
FCMB              168
Skye Bank         146
Fidelity Bank     139
Sterling Bank      67
Name: bank_name_clients, dtype: int64

Total number of other banks is 153


In [16]:
# Employment status
demographic_data["employment_status_clients"].value_counts()

Permanent        4205
Self-Employed     470
Student           189
Unemployed         79
Retired             8
Contract            2
Name: employment_status_clients, dtype: int64

In [17]:
# Level of education
demographic_data["level_of_education_clients"].value_counts()

Graduate         576
Secondary        118
Post-Graduate     90
Primary           11
Name: level_of_education_clients, dtype: int64

In [18]:
demographic_data.drop(['birthdate','bank_branch_clients'],axis=1,inplace=True)
demographic_data["bank_account_type"] = demographic_data["bank_account_type"].replace("Current", "Other")

demographic_data["bank_name_clients"] = demographic_data["bank_name_clients"].replace(otherbanks.index.tolist(),'Other')

job_replacement={'Retired':'Other','Contract':'Other',np.nan:'Missing'}
demographic_data["employment_status_clients"] = demographic_data["employment_status_clients"].replace(job_replacement)

edu_replacement={'Secondary':'Secondary and lower','Primary':'Secondary and lower',np.nan:'Missing'}
demographic_data["level_of_education_clients"] = demographic_data["level_of_education_clients"].replace(edu_replacement)

demographic_data.describe(include='all')

Unnamed: 0,customerid,bank_account_type,longitude_gps,latitude_gps,bank_name_clients,employment_status_clients,level_of_education_clients,Age
count,5818,5818,5818.0,5818.0,5818,5818,5818,5818.0
unique,5818,2,,,13,6,4,
top,8a858e135cb22031015cbafc76964ebd,Savings,,,GT Bank,Permanent,Missing,
freq,1,4570,,,2128,4205,5023,
mean,,,4.576024,7.220867,,,,38.960639
std,,,6.98348,3.047189,,,,6.174445
min,,,-149.0337,-74.005974,,,,27.0
25%,,,3.354953,6.473605,,,,35.0
50%,,,3.58848,6.621319,,,,38.0
75%,,,6.556336,7.42657,,,,43.0


##Cleaning performance data

In [19]:
trainperf_data.drop_duplicates(inplace=True)
#Converting the labels to binary
labels = {'Good':1,'Bad':0}
trainperf_data['good_bad_flag'] = trainperf_data['good_bad_flag'].map(labels)
trainperf_data.describe(include='all')

  trainperf_data.describe(include='all')
  trainperf_data.describe(include='all')


Unnamed: 0,customerid,systemloanid,loannumber,approveddate,creationdate,loanamount,totaldue,termdays,referredby,good_bad_flag
count,4368,4368.0,4368.0,4368,4368,4368.0,4368.0,4368.0,587,4368.0
unique,4368,,,4362,4364,,,,521,
top,8a2a81a74ce8c05d014cfb32a0da1049,,,2017-07-24 15:06:11,2017-07-05 13:28:44,,,,8a858fc55b2548dd015b286e452c678c,
freq,1,,,2,2,,,,8,
first,,,,2017-07-01 01:35:26,2017-07-01 00:35:20,,,,,
last,,,,2017-07-30 22:55:51,2017-07-30 21:55:43,,,,,
mean,,301981000.0,5.17239,,,17809.065934,21257.377679,29.261676,,0.782051
std,,13431.15,3.653569,,,10749.694571,11943.510416,11.512519,,0.4129
min,,301958500.0,2.0,,,10000.0,10000.0,15.0,,0.0
25%,,301969100.0,2.0,,,10000.0,13000.0,30.0,,1.0


In [20]:
trainperf_data = trainperf_data.merge(new_trainprev[['loanamount_sum','goodflags_sum']], left_on='referredby', right_on='customerid', how='left')
trainperf_data.rename(columns={'loanamount_sum':'refloans','goodflags_sum':'refgoodflags'},inplace=True)
trainperf_data['referredby'] = trainperf_data.apply(lambda x: 0 if pd.isna(x['referredby'])  else 1, axis=1)

#Interest amount
trainperf_data['loaninterest'] = trainperf_data['totaldue']-trainperf_data['loanamount']

trainperf_data.drop(columns=['systemloanid','creationdate','approveddate','totaldue'],inplace=True)

In [21]:
trainperf_data['good_bad_flag'].value_counts()

1    3416
0     952
Name: good_bad_flag, dtype: int64

Doing the same cleaning on the test performance dataset:

In [22]:
testperf_data.drop_duplicates(inplace=True)
testperf_data = testperf_data.merge(new_testprev[['loanamount_sum','goodflags_sum']], left_on='referredby', right_on='customerid', how='left')
testperf_data.rename(columns={'loanamount_sum':'refloans','goodflags_sum':'refgoodflags'},inplace=True)
testperf_data['referredby'] = testperf_data.apply(lambda x: 0 if pd.isna(x['referredby'])  else 1, axis=1)
#Interest amount
testperf_data['loaninterest'] = testperf_data['totaldue']-testperf_data['loanamount']
testperf_data.drop(columns=['systemloanid','creationdate','approveddate','totaldue'],inplace=True)
testperf_data.describe(include='all')

Unnamed: 0,customerid,loannumber,loanamount,termdays,referredby,refloans,refgoodflags,loaninterest
count,1450,1450.0,1450.0,1450.0,1450.0,22.0,22.0,1450.0
unique,1450,,,,,,,
top,8a858899538ddb8e015390510b321f08,,,,,,,
freq,1,,,,,,,
mean,,5.088966,17482.758621,28.810345,0.126897,57954.545455,2.5,3391.905448
std,,3.66521,10585.420034,11.074839,0.332972,78719.802841,2.154729,1604.997104
min,,2.0,10000.0,15.0,0.0,10000.0,1.0,500.0
25%,,2.0,10000.0,30.0,0.0,20000.0,1.0,2500.0
50%,,4.0,10000.0,30.0,0.0,40000.0,2.0,3000.0
75%,,7.0,20000.0,30.0,0.0,50000.0,3.0,4500.0


##Final merging and feature engineering

In [23]:
#left merge with previous loans dataset
train_df = trainperf_data.merge(new_trainprev, how='left', on='customerid')
test_df=testperf_data.merge(new_testprev, how='left', on='customerid')
train_df=train_df.fillna(0)
test_df=test_df.fillna(0)

#left merge with demographic dataset
train_df = train_df.merge(demographic_data, how='left', on='customerid')
test_df=test_df.merge(demographic_data, how='left', on='customerid')

In [24]:
train_df.describe(include='all')

Unnamed: 0,customerid,loannumber,loanamount,termdays,referredby,good_bad_flag,refloans,refgoodflags,loaninterest,loanamount_sum,...,days_remained_max,days_remained_min,goodflags_sum,bank_account_type,longitude_gps,latitude_gps,bank_name_clients,employment_status_clients,level_of_education_clients,Age
count,4368,4368.0,4368.0,4368.0,4368.0,4368.0,4368.0,4368.0,4368.0,4368.0,...,4368.0,4368.0,4368.0,4368,4368.0,4368.0,4368,4368,4368,4368.0
unique,4368,,,,,,,,,,...,,,,2,,,13,6,4,
top,8a2a81a74ce8c05d014cfb32a0da1049,,,,,,,,,,...,,,,Savings,,,GT Bank,Permanent,Missing,
freq,1,,,,,,,,,,...,,,,3417,,,1604,3130,3763,
mean,,5.17239,17809.065934,29.261676,0.134386,0.782051,2998.626374,0.162317,3448.311745,68690.934066,...,8.292811,-2.964973,3.298764,,4.539944,7.253575,,,,38.973214
std,,3.653569,10749.694571,11.512519,0.341106,0.4129,21670.989409,0.989285,1692.694555,88941.791429,...,10.711023,15.149085,3.250235,,7.422612,2.988238,,,,6.165971
min,,2.0,10000.0,15.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-31.0,-350.0,0.0,,-149.0337,-33.868818,,,,27.0
25%,,2.0,10000.0,30.0,0.0,1.0,0.0,0.0,3000.0,10000.0,...,2.0,-4.0,1.0,,3.354974,6.474631,,,,35.0
50%,,4.0,10000.0,30.0,0.0,1.0,0.0,0.0,3000.0,30000.0,...,6.0,0.0,2.0,,3.58342,6.622692,,,,38.0
75%,,7.0,20000.0,30.0,0.0,1.0,0.0,0.0,4500.0,90000.0,...,13.0,2.0,5.0,,6.533464,7.429028,,,,43.0


##Model Assessment Metrics

In [25]:
from sklearn.metrics import confusion_matrix,accuracy_score,recall_score,roc_auc_score,classification_report,roc_auc_score,roc_curve,auc

def model(algorithm,dtrain_X,dtrain_Y,dtest_X,dtest_Y):

    algorithm.fit(dtrain_X,dtrain_Y)
    predictions = algorithm.predict(dtest_X)
    print (algorithm)

    print ("Accuracy score : ", accuracy_score(predictions,dtest_Y))
    print ("Recall score   : ", recall_score(predictions,dtest_Y))
    print ("classification report :\n",classification_report(predictions,dtest_Y))

    fig = plt.figure(figsize=(6,4))
    ax  = fig.add_subplot(111)
    prediction_probabilities = algorithm.predict_proba(dtest_X)[:,1]
    fpr , tpr , thresholds   = roc_curve(dtest_Y,prediction_probabilities)
    ax.plot(fpr,tpr,label   = ["Area under curve : ",auc(fpr,tpr)],linewidth=2,linestyle="dotted")
    ax.plot([0,1],[0,1],linewidth=2,linestyle="dashed")
    plt.legend(loc="best")
    plt.title("ROC-CURVE & AREA UNDER CURVE")

##Classifiers

In [26]:
X=train_df.drop(['customerid','good_bad_flag'],axis=1)
y=train_df['good_bad_flag']

In [27]:
X.shape

(4368, 23)

In [29]:
pip install catboost

Collecting catboost
  Downloading catboost-1.2-cp310-cp310-manylinux2014_x86_64.whl (98.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.6/98.6 MB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2


In [30]:
pip install lightgbm



In [28]:
pip install optuna

Collecting optuna
  Downloading optuna-3.2.0-py3-none-any.whl (390 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m390.6/390.6 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.11.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting cmaes>=0.9.1 (from optuna)
  Downloading cmaes-0.10.0-py3-none-any.whl (29 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.2.4-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.7/78.7 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Mako, colorlog, cmaes, alembic, optuna
Successfully installed Mako-1.2.4 alembic-1.11.1 cmaes-0.10.0 colorlog-6.7.0 optuna-3.2.0


In [29]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
#from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
#from catboost import CatBoostClassifier
#from lightgbm import LGBMClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import KNNImputer
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import f1_score, make_scorer, precision_score
from sklearn.ensemble import VotingClassifier
from sklearn.feature_selection import VarianceThreshold
import optuna

In [30]:
cat_cols = ['bank_account_type', 'bank_name_clients', 'employment_status_clients', 'level_of_education_clients']
cols_to_scale= ['loannumber', 'loanamount', 'termdays', 'refloans',
       'refgoodflags', 'loaninterest', 'loanamount_sum', 'loanamount_mean',
       'interest_sum', 'interest_mean', 'termdays_mean', 'days_remained_mean',
       'days_remained_max', 'days_remained_min', 'goodflags_sum',
       'longitude_gps', 'latitude_gps', 'Age']
X[cat_cols]=X[cat_cols].astype('category')
X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y, test_size=0.3, random_state=42)

## Gradient Boosting with Optuna

In [None]:
def objective_dt(trial, X, y):

  selector_params = {
      'threshold': trial.suggest_float('threshold', 0.05, 0.2),
      }

  hyper_params = {
        'learning_rate': trial.suggest_float('learning_rate', 0.0001, 0.1, log=True),
        "max_depth": trial.suggest_int("max_depth", 2, 8),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 50, 110, step=2),
        "random_state": 77,
        "n_estimators": trial.suggest_int("n_estimators", 200, 400, step=50)
        }

  column_transformer = make_column_transformer(
    (OneHotEncoder(), cat_cols),
    (StandardScaler(), cols_to_scale),
    remainder="passthrough"
    )

  pipeline = Pipeline([
    ('preprocessor', column_transformer),
    ('feature_selection', VarianceThreshold(**selector_params)),
    ('classifier', GradientBoostingClassifier(**hyper_params))
    ])

  scorer = make_scorer(accuracy_score)
  cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
  cv_scores = cross_val_score(pipeline, X, y, cv=cv, scoring=scorer)
  score = np.mean(cv_scores)

  return score

In [None]:
study = optuna.create_study(direction="maximize")

[I 2023-07-23 17:24:05,527] A new study created in memory with name: no-name-076be921-d1c9-4d05-a4da-98b8751a851c


In [None]:
study.optimize(lambda trial: objective_dt(trial, X_train, y_train), n_trials=200,  gc_after_trial=True)

[I 2023-07-23 17:25:12,231] Trial 0 finished with value: 0.7981624343726562 and parameters: {'threshold': 0.16057292333879397, 'learning_rate': 0.009385132529798211, 'max_depth': 8, 'min_samples_leaf': 64, 'n_estimators': 350}. Best is trial 0 with value: 0.7981624343726562.
[I 2023-07-23 17:25:26,870] Trial 1 finished with value: 0.8017550626808101 and parameters: {'threshold': 0.12307247198780973, 'learning_rate': 0.022403166289507257, 'max_depth': 2, 'min_samples_leaf': 58, 'n_estimators': 250}. Best is trial 1 with value: 0.8017550626808101.
[I 2023-07-23 17:26:24,421] Trial 2 finished with value: 0.7958727097396336 and parameters: {'threshold': 0.13604494438503212, 'learning_rate': 0.007693486519728955, 'max_depth': 7, 'min_samples_leaf': 56, 'n_estimators': 350}. Best is trial 1 with value: 0.8017550626808101.
[I 2023-07-23 17:27:26,605] Trial 3 finished with value: 0.7867159541412194 and parameters: {'threshold': 0.18287990896054274, 'learning_rate': 0.03621158134038267, 'max_de

##Adaboost with Optuna

In [None]:
def objective_dt(trial, X, y):

  selector_params = {
      'threshold': trial.suggest_float('threshold', 0.05, 0.15),
      }

  hyper_params = {

        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
        "random_state": 42,
        'n_estimators':trial.suggest_int('n_estimators', 200, 300, step=50),
        }

  column_transformer = make_column_transformer(
    (OneHotEncoder(), cat_cols),
    (StandardScaler(), cols_to_scale),
    remainder="passthrough"
    )

  pipeline = Pipeline([
    ('preprocessor', column_transformer),
    ('feature_selection', VarianceThreshold(**selector_params)),
    ('classifier', AdaBoostClassifier(**hyper_params))
    ])

  scorer = make_scorer(accuracy_score)
  cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
  cv_scores = cross_val_score(pipeline, X, y, cv=cv, scoring=scorer)
  score = np.mean(cv_scores)

  # Whatever we return here tells optuna how well these parameters did
  return score

In [None]:
study_adaboost = optuna.create_study(direction="maximize")

[I 2023-07-23 18:46:02,139] A new study created in memory with name: no-name-8f7bf8fe-74eb-45e3-903d-b2719d5c8367


In [None]:
study_adaboost.optimize(lambda trial: objective_dt(trial, X_train, y_train), n_trials=200,  gc_after_trial=True)

[I 2023-07-23 18:46:36,945] Trial 0 finished with value: 0.7958716382727955 and parameters: {'threshold': 0.12070386136780384, 'learning_rate': 0.003050572079031314, 'n_estimators': 300}. Best is trial 0 with value: 0.7958716382727955.
[I 2023-07-23 18:46:48,254] Trial 1 finished with value: 0.8001221472195436 and parameters: {'threshold': 0.10595045743937176, 'learning_rate': 0.029779660260181105, 'n_estimators': 200}. Best is trial 1 with value: 0.8001221472195436.
[I 2023-07-23 18:47:02,291] Trial 2 finished with value: 0.7952201864352298 and parameters: {'threshold': 0.06062060889346748, 'learning_rate': 0.004496770982402542, 'n_estimators': 250}. Best is trial 1 with value: 0.8001221472195436.
[I 2023-07-23 18:47:13,527] Trial 3 finished with value: 0.7984860173577628 and parameters: {'threshold': 0.09450500320916641, 'learning_rate': 0.07495925697581367, 'n_estimators': 200}. Best is trial 1 with value: 0.8001221472195436.
[I 2023-07-23 18:47:25,677] Trial 4 finished with value: 

## Catboost with Optuna

In [None]:
X_train[cat_cols] = X_train[cat_cols].astype('category')
X_val[cat_cols] = X_val[cat_cols].astype('category')

def objective_dt(trial, X, y):

  hyper_params = {

        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
        "max_depth": trial.suggest_int("max_depth", 2, 8),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-8, 100.0, log=True),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 50, 100, step=2),
        'verbose':0,
        'cat_features': cat_cols,
        "random_state": 77,
        "n_estimators": trial.suggest_int("n_estimators", 200, 400, step=100)
  }

  clf = CatBoostClassifier(**hyper_params)
  scorer = make_scorer(accuracy_score)
  cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
  cv_scores = cross_val_score(clf, X, y, cv=cv, scoring=scorer)
  score = np.mean(cv_scores)

  # Whatever we return here tells optuna how well these parameters did
  return score

In [None]:
study_catboost = optuna.create_study(direction="maximize")

[I 2023-07-23 19:40:03,810] A new study created in memory with name: no-name-91765a0c-7c1f-43a9-b1e7-0902b8b5d202


In [None]:
study_catboost.optimize(lambda trial: objective_dt(trial, X_train, y_train), n_trials=200,  gc_after_trial=True)

[I 2023-07-23 19:40:14,211] Trial 0 finished with value: 0.7912943319404265 and parameters: {'learning_rate': 0.0027922921878884522, 'max_depth': 3, 'l2_leaf_reg': 1.6207227960907107, 'min_data_in_leaf': 88, 'n_estimators': 200}. Best is trial 0 with value: 0.7912943319404265.
[I 2023-07-23 19:40:36,872] Trial 1 finished with value: 0.7955469838208508 and parameters: {'learning_rate': 0.0028173865550765826, 'max_depth': 6, 'l2_leaf_reg': 87.32346181200286, 'min_data_in_leaf': 70, 'n_estimators': 300}. Best is trial 1 with value: 0.7955469838208508.
[I 2023-07-23 19:40:48,402] Trial 2 finished with value: 0.79391085395907 and parameters: {'learning_rate': 0.0020069186643767237, 'max_depth': 4, 'l2_leaf_reg': 4.629960327787597, 'min_data_in_leaf': 72, 'n_estimators': 200}. Best is trial 1 with value: 0.7955469838208508.
[I 2023-07-23 19:41:17,865] Trial 3 finished with value: 0.7981613629058181 and parameters: {'learning_rate': 0.0027710950427296244, 'max_depth': 6, 'l2_leaf_reg': 52.304

## LightGBM with Optuna

In [None]:
def objective_dt(trial, X, y):

  selector_params = {
      'threshold': trial.suggest_float('threshold', 0.05, 0.15),
      }

  hyper_params = {

        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 10, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 100),
        'n_estimators':trial.suggest_int('n_estimators', 200, 300, step=50),
        }

  column_transformer = make_column_transformer(
    (OneHotEncoder(), cat_cols),
    (StandardScaler(), cols_to_scale),
    remainder="passthrough"
    )

  pipeline = Pipeline([
    ('preprocessor', column_transformer),
    ('feature_selection', VarianceThreshold(**selector_params)),
    ('classifier', LGBMClassifier(**hyper_params))
    ])

  scorer = make_scorer(accuracy_score)
  cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
  cv_scores = cross_val_score(pipeline, X, y, cv=cv, scoring=scorer)
  score = np.mean(cv_scores)

  # Whatever we return here tells optuna how well these parameters did
  return score

In [None]:
study_lgbm = optuna.create_study(direction="maximize")

[I 2023-07-23 21:30:07,800] A new study created in memory with name: no-name-9cdc3e47-d690-40c9-8c3d-fd2073ddb2ed


In [None]:
study_lgbm.optimize(lambda trial: objective_dt(trial, X_train, y_train), n_trials=200,  gc_after_trial=True)

[I 2023-07-23 21:30:09,825] Trial 0 finished with value: 0.7899871423979429 and parameters: {'threshold': 0.10581724817863575, 'learning_rate': 0.06163671489480557, 'num_leaves': 195, 'max_depth': 4, 'min_child_samples': 76, 'n_estimators': 300}. Best is trial 0 with value: 0.7899871423979429.
[I 2023-07-23 21:30:14,053] Trial 1 finished with value: 0.7821407907425264 and parameters: {'threshold': 0.14434966617706763, 'learning_rate': 0.0018418541760100908, 'num_leaves': 296, 'max_depth': 7, 'min_child_samples': 56, 'n_estimators': 300}. Best is trial 0 with value: 0.7899871423979429.
[I 2023-07-23 21:30:25,782] Trial 2 finished with value: 0.7801767920282867 and parameters: {'threshold': 0.09598756163488124, 'learning_rate': 0.05931987769198833, 'num_leaves': 188, 'max_depth': 10, 'min_child_samples': 7, 'n_estimators': 300}. Best is trial 0 with value: 0.7899871423979429.
[I 2023-07-23 21:30:28,067] Trial 3 finished with value: 0.7899903567984572 and parameters: {'threshold': 0.13664

## SVM with Optuna

In [None]:
def objective_dt(trial, X, y):

  selector_params = {
      'threshold': trial.suggest_float('threshold', 0.05, 0.15),
      }

  hyper_params = {

        'kernel':trial.suggest_categorical("kernel", ['linear', 'poly','rbf']),
        'C': trial.suggest_float('C', 0.01, 10),
        'degree':trial.suggest_int("degree", 1, 3),
        'gamma':trial.suggest_categorical("gamma", ["scale","auto"])
        }

  column_transformer = make_column_transformer(
    (OneHotEncoder(), cat_cols),
    (StandardScaler(), cols_to_scale),
    remainder="passthrough"
    )

  pipeline = Pipeline([
    ('preprocessor', column_transformer),
    ('feature_selection', VarianceThreshold(**selector_params)),
    ('classifier', SVC(**hyper_params))
    ])

  scorer = make_scorer(accuracy_score)
  cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
  cv_scores = cross_val_score(pipeline, X, y, cv=cv, scoring=scorer)
  score = np.mean(cv_scores)

  # Whatever we return here tells optuna how well these parameters did
  return score

In [None]:
study_svc = optuna.create_study(direction="maximize")
study_svc.optimize(lambda trial: objective_dt(trial, X_train, y_train), n_trials=200,  gc_after_trial=True)

[I 2023-07-24 17:35:31,351] A new study created in memory with name: no-name-b4720dec-c408-4eef-87d0-b89e1b1ec6a9
[I 2023-07-24 17:35:33,842] Trial 0 finished with value: 0.7821407907425264 and parameters: {'threshold': 0.09787666183939808, 'kernel': 'linear', 'C': 0.9242338264298048, 'degree': 1, 'gamma': 'auto'}. Best is trial 0 with value: 0.7821407907425264.
[I 2023-07-24 17:35:39,154] Trial 1 finished with value: 0.7821407907425264 and parameters: {'threshold': 0.13255466539775243, 'kernel': 'linear', 'C': 6.953051716096226, 'degree': 2, 'gamma': 'scale'}. Best is trial 0 with value: 0.7821407907425264.
[I 2023-07-24 17:35:40,857] Trial 2 finished with value: 0.7821407907425264 and parameters: {'threshold': 0.11401435956335064, 'kernel': 'poly', 'C': 3.6663994507938122, 'degree': 1, 'gamma': 'auto'}. Best is trial 0 with value: 0.7821407907425264.
[I 2023-07-24 17:35:46,773] Trial 3 finished with value: 0.7821407907425264 and parameters: {'threshold': 0.053992530249161376, 'kernel

## Random Forest with Optuna

In [None]:
def objective_dt(trial, X, y):

  selector_params = {
      'threshold': trial.suggest_float('threshold', 0.05, 0.15),
      }

  hyper_params = {
            'max_depth': trial.suggest_int('max_depth', 4, 20, step=2),
            'min_samples_split': trial.suggest_int('min_samples_split', 5, 100, step=5),
            "n_estimators": trial.suggest_int('n_estimators', 200, 500, step=100),
            "random_state": 77
        }

  column_transformer = make_column_transformer(
    (OneHotEncoder(), cat_cols),
    (StandardScaler(), cols_to_scale),
    remainder="passthrough"
    )

  pipeline = Pipeline([
    ('preprocessor', column_transformer),
    ('feature_selection', VarianceThreshold(**selector_params)),
    ('classifier', RandomForestClassifier(**hyper_params))
    ])

  scorer = make_scorer(accuracy_score)
  cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
  cv_scores = cross_val_score(pipeline, X, y, cv=cv, scoring=scorer)
  score = np.mean(cv_scores)

  # Whatever we return here tells optuna how well these parameters did
  return score

In [None]:
study_rf = optuna.create_study(direction="maximize")
study_rf.optimize(lambda trial: objective_dt(trial, X_train, y_train), n_trials=200,  gc_after_trial=True)

[I 2023-07-24 18:47:21,569] A new study created in memory with name: no-name-483b2c99-1709-4929-8ef4-d6b64bf2e8e4
[I 2023-07-24 18:47:27,084] Trial 0 finished with value: 0.7942355084110148 and parameters: {'threshold': 0.14816695839156077, 'max_depth': 14, 'min_samples_split': 75, 'n_estimators': 200}. Best is trial 0 with value: 0.7942355084110148.
[I 2023-07-24 18:47:34,258] Trial 1 finished with value: 0.8011025393764063 and parameters: {'threshold': 0.1372178619435666, 'max_depth': 20, 'min_samples_split': 10, 'n_estimators': 200}. Best is trial 1 with value: 0.8011025393764063.
[I 2023-07-24 18:47:39,943] Trial 2 finished with value: 0.8001242901532197 and parameters: {'threshold': 0.1353023824396083, 'max_depth': 10, 'min_samples_split': 55, 'n_estimators': 200}. Best is trial 1 with value: 0.8011025393764063.
[I 2023-07-24 18:47:50,572] Trial 3 finished with value: 0.797832422586521 and parameters: {'threshold': 0.1355456350470815, 'max_depth': 6, 'min_samples_split': 35, 'n_es

# Assessing the Best Models:

## Best Gradient Boosting


In [34]:

selector_params = {
      'threshold': 0.15777420250541063
      }

hyper_params = {
        'learning_rate': 0.011262350504551067,
        "max_depth": 2,
        "min_samples_leaf": 56,
        "random_state": 77,
        "n_estimators": 300
        }

column_transformer = make_column_transformer(
    (OneHotEncoder(), cat_cols),
    (StandardScaler(), cols_to_scale),
    remainder="passthrough"
    )

model_gbc1 = Pipeline([
    ('preprocessor', column_transformer),
    ('feature_selection', VarianceThreshold(**selector_params)),
    ('classifier', GradientBoostingClassifier(**hyper_params))
    ])

model_gbc1.fit(X_train, y_train)
y_predict_gbc1=model_gbc1.predict(X_val)
print ("Accuracy score : ", accuracy_score(y_predict_gbc1,y_val))
y_hat_gbc1=pd.DataFrame(y_predict_gbc1)

cm = confusion_matrix(y_val, y_predict_gbc1)
print("Confusion Matrix:")
print(cm)

Accuracy score :  0.7879481311975591
Confusion Matrix:
[[ 40 246]
 [ 32 993]]


## Best AdaBoost

In [35]:

selector_params = {
      'threshold': 0.05934401277221134,
      }

hyper_params = {

        'learning_rate': 0.01642041319774052,
        "random_state": 42,
        'n_estimators':250,
        }

column_transformer = make_column_transformer(
    (OneHotEncoder(), cat_cols),
    (StandardScaler(), cols_to_scale),
    remainder="passthrough"
    )

model_adaboost = Pipeline([
    ('preprocessor', column_transformer),
    ('feature_selection', VarianceThreshold(**selector_params)),
    ('classifier', AdaBoostClassifier(**hyper_params))
    ])

model_adaboost.fit(X_train, y_train)
y_predict_adaboost=model_adaboost.predict(X_val)
print ("Accuracy score : ", accuracy_score(y_predict_adaboost,y_val))
y_hat_adaboost=pd.DataFrame(y_predict_adaboost)

cm = confusion_matrix(y_val, y_predict_adaboost)
print("Confusion Matrix:")
print(cm)

Accuracy score :  0.7879481311975591
Confusion Matrix:
[[  33  253]
 [  25 1000]]


## Best Catboost

In [36]:

hyper_params = {

        'learning_rate': 0.004473304394329983,
        "max_depth": 8,
        "l2_leaf_reg": 7.432011106828359e-06,
        "min_data_in_leaf": 74,
        'verbose':0,
        'cat_features': cat_cols,
        "random_state": 77,
        "n_estimators": 200
  }

model_catboost = CatBoostClassifier(**hyper_params)
model_catboost.fit(X_train, y_train)
y_predict_catboost=model_catboost.predict(X_val)
print ("Accuracy score : ", accuracy_score(y_predict_catboost,y_val))
y_hat_catboost=pd.DataFrame(y_predict_catboost)

cm = confusion_matrix(y_val, y_predict_catboost)
print("Confusion Matrix:")
print(cm)

Accuracy score :  0.7879481311975591
Confusion Matrix:
[[  31  255]
 [  23 1002]]


## Best LightGBM

In [37]:

selector_params = {
      'threshold': 0.09915031809885629,
      }

hyper_params = {

        'learning_rate': 0.026503435132685858,
        'num_leaves': 612,
        'max_depth': 3,
        'min_child_samples': 84,
        'n_estimators':200,
        }

column_transformer = make_column_transformer(
    (OneHotEncoder(), cat_cols),
    (StandardScaler(), cols_to_scale),
    remainder="passthrough"
    )

model_lgbm= Pipeline([
    ('preprocessor', column_transformer),
    ('feature_selection', VarianceThreshold(**selector_params)),
    ('classifier', LGBMClassifier(**hyper_params))
    ])

model_lgbm.fit(X_train,y_train)

y_predict_lgbm=model_lgbm.predict(X_val)
print ("Accuracy score : ", accuracy_score(y_predict_lgbm,y_val))
y_hat_lgbm=pd.DataFrame(y_predict_lgbm)

cm = confusion_matrix(y_val, y_predict_lgbm)
print("Confusion Matrix:")
print(cm)

Accuracy score :  0.7871853546910755
Confusion Matrix:
[[ 42 244]
 [ 35 990]]


## Best SVC

In [38]:
selector_params = {
      'threshold': 0.08865050141384695,
      }

hyper_params = {

        'kernel':'rbf',
        'C': 5.241334470405655,
        'degree':2,
        'gamma':"auto"
        }

column_transformer = make_column_transformer(
    (OneHotEncoder(), cat_cols),
    (StandardScaler(), cols_to_scale),
    remainder="passthrough"
    )

model_svc= Pipeline([
    ('preprocessor', column_transformer),
    ('feature_selection', VarianceThreshold(**selector_params)),
    ('classifier', SVC(**hyper_params))
    ])

model_svc.fit(X_train,y_train)

y_predict_svc=model_svc.predict(X_val)
print ("Accuracy score : ", accuracy_score(y_predict_svc,y_val))
y_hat_svc=pd.DataFrame(y_predict_svc)

cm = confusion_matrix(y_val, y_predict_svc)
print("Confusion Matrix:")
print(cm)

Accuracy score :  0.7871853546910755
Confusion Matrix:
[[  24  262]
 [  17 1008]]


## Best Random Forest

In [39]:
selector_params = {
      'threshold': 0.06569354737862629,
      }

hyper_params = {
            'max_depth': 16,
            'min_samples_split': 5,
            "n_estimators": 200,
            "random_state": 77
        }


column_transformer = make_column_transformer(
    (OneHotEncoder(), cat_cols),
    (StandardScaler(), cols_to_scale),
    remainder="passthrough"
    )

model_rf= Pipeline([
    ('preprocessor', column_transformer),
    ('feature_selection', VarianceThreshold(**selector_params)),
    ('classifier', RandomForestClassifier(**hyper_params))
    ])

model_rf.fit(X_train,y_train)

y_predict_rf=model_rf.predict(X_val)
print ("Accuracy score : ", accuracy_score(y_predict_rf,y_val))
y_hat_rf=pd.DataFrame(y_predict_rf)

cm = confusion_matrix(y_val, y_predict_rf)
print("Confusion Matrix:")
print(cm)

Accuracy score :  0.7841342486651411
Confusion Matrix:
[[ 39 247]
 [ 36 989]]


## Meta Classifier

In [45]:
def model_pred(model,X):
  y_predict=model.predict(X)
  y_hat=pd.DataFrame(y_predict)
  return y_hat

In [46]:
# Merging the predictions of all the models
X_features=pd.concat([y_hat_gbc1,y_hat_adaboost,y_hat_catboost,y_hat_lgbm,
                      y_hat_svc, y_hat_rf], axis=1)
columns=['gbc','adaboost','catboost','lgbm','svc','rf']
X_features.columns=columns
X_features.head()

Unnamed: 0,gbc,adaboost,catboost,lgbm,svc,rf
0,1,1,1,1,1,1
1,1,1,1,1,1,1
2,0,0,1,0,1,1
3,1,1,1,1,1,1
4,1,1,1,1,1,1


In [47]:
X_features.shape

(1311, 6)

In [None]:
def objective_dt(trial, X, y):

  selector_params = {
      'k': trial.suggest_int("k", 3, 9)
      }

  hyper_params = {
        'learning_rate': trial.suggest_float('learning_rate', 0.0001, 0.1, log=True),
        "max_depth": trial.suggest_int("max_depth", 2, 8),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 50, 110, step=2),
        "random_state": 77,
        "n_estimators": trial.suggest_int("n_estimators", 200, 400, step=50)
        }

  stacking_gbc = Pipeline([
    ('feature_selection', SelectKBest(**selector_params)),
    ('classifier', GradientBoostingClassifier(**hyper_params))
    ])


  scorer = make_scorer(accuracy_score)
  cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
  cv_scores = cross_val_score(stacking_gbc, X, y, cv=cv, scoring=scorer)
  score = np.mean(cv_scores)

  return score

In [None]:
study_all = optuna.create_study(direction="maximize")
study_all.optimize(lambda trial: objective_dt(trial, X_features, y_val), n_trials=200,  gc_after_trial=True)

[I 2023-07-24 18:08:58,348] A new study created in memory with name: no-name-ca0edeb8-bf0e-411c-b590-d4057b83f4ab
[I 2023-07-24 18:08:59,101] Trial 0 finished with value: 0.7818477345949554 and parameters: {'k': 3, 'learning_rate': 0.0004837518951600616, 'max_depth': 8, 'min_samples_leaf': 84, 'n_estimators': 350}. Best is trial 0 with value: 0.7818477345949554.
[I 2023-07-24 18:08:59,765] Trial 1 finished with value: 0.7818477345949554 and parameters: {'k': 9, 'learning_rate': 0.00040310700725436864, 'max_depth': 4, 'min_samples_leaf': 92, 'n_estimators': 250}. Best is trial 0 with value: 0.7818477345949554.
[I 2023-07-24 18:09:00,725] Trial 2 finished with value: 0.7818477345949554 and parameters: {'k': 8, 'learning_rate': 0.00105542301842524, 'max_depth': 5, 'min_samples_leaf': 52, 'n_estimators': 350}. Best is trial 0 with value: 0.7818477345949554.
[I 2023-07-24 18:09:01,776] Trial 3 finished with value: 0.7864249847618494 and parameters: {'k': 5, 'learning_rate': 0.00538516245268

In [None]:
def objective_dt(trial, X, y):

  selector_params = {
      'k': trial.suggest_int("k", 3, 6)
      }


  hyper_params = {
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 8),
        'min_child_samples': trial.suggest_int('min_child_samples', 20, 100),
        'n_estimators':trial.suggest_int('n_estimators', 200, 300, step=50),
        }


  stacking_lgbm = Pipeline([
    ('feature_selection', SelectKBest(**selector_params)),
    ('classifier', LGBMClassifier(**hyper_params))
    ])


  scorer = make_scorer(accuracy_score)
  cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
  cv_scores = cross_val_score(stacking_lgbm, X, y, cv=cv, scoring=scorer)
  score = np.mean(cv_scores)

  return score

In [None]:
study_all2 = optuna.create_study(direction="maximize")
study_all2.optimize(lambda trial: objective_dt(trial, X_features, y_val), n_trials=500,  gc_after_trial=True)

[I 2023-07-24 19:59:31,999] A new study created in memory with name: no-name-535827f6-8155-4a29-9d97-4d3d3be294f5
[I 2023-07-24 19:59:32,175] Trial 0 finished with value: 0.7818477345949554 and parameters: {'k': 6, 'learning_rate': 0.002169828989611962, 'num_leaves': 167, 'max_depth': 6, 'min_child_samples': 44, 'n_estimators': 250}. Best is trial 0 with value: 0.7818477345949554.
[I 2023-07-24 19:59:32,427] Trial 1 finished with value: 0.788717963602589 and parameters: {'k': 6, 'learning_rate': 0.01671493683001545, 'num_leaves': 263, 'max_depth': 5, 'min_child_samples': 53, 'n_estimators': 200}. Best is trial 1 with value: 0.788717963602589.
[I 2023-07-24 19:59:32,607] Trial 2 finished with value: 0.7818477345949554 and parameters: {'k': 4, 'learning_rate': 0.018893092729019484, 'num_leaves': 663, 'max_depth': 3, 'min_child_samples': 85, 'n_estimators': 250}. Best is trial 1 with value: 0.788717963602589.
[I 2023-07-24 19:59:32,861] Trial 3 finished with value: 0.7887208661074507 and 

In [55]:
from sklearn.ensemble import StackingClassifier
from sklearn.neighbors import KNeighborsClassifier

# Create the stacking classifier
stacking_classifier = StackingClassifier(
    estimators=[('Gradient Boosting', model_gbc1), ('AdaBoost', model_adaboost), ('CatBoost', model_catboost),
                ('LightGBM', model_lgbm), ('Random Forest', model_rf), ('SVC', model_svc)],
    final_estimator=KNeighborsClassifier(p=2,n_neighbors=16),
    stack_method='auto'  # Method used to train the meta-model ('auto', 'predict_proba', 'decision_function')
)

# Train the stacking classifier on your training data
stacking_classifier.fit(X_train, y_train)

# Evaluate the stacking classifier on your validation or test data
accuracy = stacking_classifier.score(X_val, y_val)
print("Stacking Classifier Accuracy:", accuracy)


Stacking Classifier Accuracy: 0.7917620137299771
