In [1]:
from sklearn.model_selection import train_test_split, cross_val_score,KFold, RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score,recall_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import SGDClassifier,LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from imblearn.over_sampling import ADASYN
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from collections import Counter
from scipy.stats import randint
from sklearn.svm import SVC
import pandas as pd
import numpy as np
import warnings

In [2]:
pd.set_option('display.max_columns',None)
warnings.filterwarnings('ignore')

In [3]:
df = pd.read_csv(r"C:\Users\Lenovo\PycharmProjects\GermanBankCreditCard\SouthGermanCredit\Final_Model.csv")
df.drop(['Unnamed: 0'],axis=1,inplace=True)
df.head()

Unnamed: 0,credit_risk,status,duration,credit_history,purpose,amount,savings,employment_duration,installment_rate,personal_status_sex,present_residence,property,age,number_credits,telephone
0,1,-1.254566,-0.21744,1.344014,-0.234588,-1.103856,-0.740466,-1.145978,0.918477,-0.96365,1.046987,-0.341055,-1.304413,-0.736573,-0.823318
1,1,-1.254566,-1.065709,1.344014,-1.230716,0.231565,-0.740466,-0.317959,-0.870183,0.449326,-0.765977,-1.293723,0.049696,1.138296,-0.823318
2,1,-0.459026,-0.782952,-0.503428,1.757667,-1.404433,0.168067,0.51006,-0.870183,-0.96365,1.046987,-1.293723,-1.123865,-0.736573,-0.823318
3,1,-1.254566,-0.782952,1.344014,-1.230716,-0.14529,-0.740466,-0.317959,0.024147,0.449326,-0.765977,-1.293723,0.320518,1.138296,-0.823318
4,1,-1.254566,-0.782952,1.344014,-1.230716,-0.114223,-0.740466,-0.317959,0.918477,0.449326,1.046987,-0.341055,0.230244,1.138296,-0.823318


In [4]:
X=df.drop(['credit_risk'],axis=1)
y=df['credit_risk']

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
def model(X=X_train, y=y_train):
    
    models=[]
    models.append(('GradientBoostingClassifier',GradientBoostingClassifier()))
    models.append(('XGBClassifier',XGBClassifier(eval_metric='mlogloss')))
    models.append(('DecisionTreeClassifier',DecisionTreeClassifier()))
    models.append(('RandomForestClassifier',RandomForestClassifier()))
    models.append(('AdaBoostClassifier',AdaBoostClassifier()))
    models.append(('LogisticRegression',LogisticRegression()))
    models.append(('BaggingClassifier',BaggingClassifier()))
    models.append(('LGBMClassifier',LGBMClassifier()))
    models.append(('SGDClassifier',SGDClassifier()))
    models.append(('SVC',SVC()))
    
    results =[]
    names   =[]
    scoring ='accuracy'
    
    for name, model in models:
        #kfold = KFold(n_splits=10, random_state=42)
        kfold = RepeatedStratifiedKFold(n_splits=10, random_state=42, n_repeats=3)
        cross = cross_val_score(model, X, y, cv=kfold, scoring=scoring)
        results.append(cross)
        names.append(name)
        print (f'Model:{name},Mean: {cross.mean()},Std Dev: {cross.std()}')
        print('---'*25)

In [7]:
model(X_train,y_train)

Model:GradientBoostingClassifier,Mean: 0.7662499999999999,Std Dev: 0.047450632240255766
---------------------------------------------------------------------------
Model:XGBClassifier,Mean: 0.75,Std Dev: 0.0484122918275927
---------------------------------------------------------------------------
Model:DecisionTreeClassifier,Mean: 0.6979166666666667,Std Dev: 0.05774254401123047
---------------------------------------------------------------------------
Model:RandomForestClassifier,Mean: 0.7608333333333335,Std Dev: 0.04433928531474343
---------------------------------------------------------------------------
Model:AdaBoostClassifier,Mean: 0.7595833333333333,Std Dev: 0.03773307199556142
---------------------------------------------------------------------------
Model:LogisticRegression,Mean: 0.7604166666666667,Std Dev: 0.040192678299522284
---------------------------------------------------------------------------
Model:BaggingClassifier,Mean: 0.73875,Std Dev: 0.04568346710426723
-----

In [8]:
ada = ADASYN(sampling_strategy='minority',random_state=42,n_neighbors=7)
X_res,y_res = ada.fit_resample(X_train,y_train)
Counter(y_res)

Counter({1: 562, 0: 575})

In [9]:
model(X_res,y_res)

Model:GradientBoostingClassifier,Mean: 0.8238213527920095,Std Dev: 0.03443809259041476
---------------------------------------------------------------------------
Model:XGBClassifier,Mean: 0.8311494074419085,Std Dev: 0.031994446383129
---------------------------------------------------------------------------
Model:DecisionTreeClassifier,Mean: 0.7639910986906796,Std Dev: 0.03712163712236569
---------------------------------------------------------------------------
Model:RandomForestClassifier,Mean: 0.8446436888681881,Std Dev: 0.031539578166877254
---------------------------------------------------------------------------
Model:AdaBoostClassifier,Mean: 0.7938984629715883,Std Dev: 0.03896871758233318
---------------------------------------------------------------------------
Model:LogisticRegression,Mean: 0.7183020234953165,Std Dev: 0.0314257596972927
---------------------------------------------------------------------------
Model:BaggingClassifier,Mean: 0.8044739429695184,Std Dev: 0.0

In [10]:
param_distribs = {'n_estimators': randint(low=1, high=500),
                  'max_depth': randint(low=1, high=10),
                  'max_features':randint(low=1,high=10),
                 }

rnd_RF = RandomizedSearchCV(RandomForestClassifier(),param_distributions=param_distribs,n_iter=10,cv=5,scoring='accuracy', 
                            random_state=42)

rnd_RF.fit(X_res,y_res)

RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(),
                   param_distributions={'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001A5EC049BB0>,
                                        'max_features': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001A5EC30E5B0>,
                                        'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001A5EC2E9AF0>},
                   random_state=42, scoring='accuracy')

In [11]:
rnd_RF.best_params_

{'max_depth': 8, 'max_features': 6, 'n_estimators': 386}

In [12]:
RFClassifier = RandomForestClassifier(max_depth=8, max_features=5, n_estimators=103, random_state=42)
RFClassifier.fit(X_res,y_res)

RandomForestClassifier(max_depth=8, max_features=5, n_estimators=103,
                       random_state=42)

In [13]:
param_grid = [{'n_estimators': [3, 10, 30], 'max_depth': [2, 4, 6, 8],'booster': ['gbtree','dart'],
              'learning_rate':[0.3,0.5,0.01,0.1]}]

grid_xgb = GridSearchCV(XGBClassifier(eval_metric='mlogloss'), param_grid=param_grid, cv=5, scoring='accuracy', 
                        return_train_score=True)

grid_xgb.fit(X_res,y_res)

GridSearchCV(cv=5,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None,
                                     eval_metric='mlogloss', gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs=None,
                                     num_parallel_tree=None, random_state=None,
                                     reg_alpha=None, reg_lambda=None,
                                     scale_pos_weight=None, subsample

In [14]:
grid_xgb.best_params_

{'booster': 'gbtree', 'learning_rate': 0.5, 'max_depth': 8, 'n_estimators': 30}

In [15]:
XGClassifier=XGBClassifier(booster='gbtree', learning_rate=0.5, max_depth=8, n_estimators=30, eval_metric='mlogloss')
XGClassifier.fit(X_res,y_res)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='mlogloss',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.5, max_delta_step=0,
              max_depth=8, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=30, n_jobs=8,
              num_parallel_tree=1, random_state=0, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1, subsample=1, tree_method='exact',
              validate_parameters=1, verbosity=None)

In [16]:
BClassifier = BaggingClassifier(base_estimator=XGBClassifier(eval_metric='mlogloss'),random_state=42)
BClassifier.fit(X_res,y_res)

BaggingClassifier(base_estimator=XGBClassifier(base_score=None, booster=None,
                                               colsample_bylevel=None,
                                               colsample_bynode=None,
                                               colsample_bytree=None,
                                               eval_metric='mlogloss',
                                               gamma=None, gpu_id=None,
                                               importance_type='gain',
                                               interaction_constraints=None,
                                               learning_rate=None,
                                               max_delta_step=None,
                                               max_depth=None,
                                               min_child_weight=None,
                                               missing=nan,
                                               monotone_constraints=None,
                  

In [17]:
LGBClassifier=LGBMClassifier(random_state=42)
LGBClassifier.fit(X_res,y_res)

LGBMClassifier(random_state=42)

In [18]:
GBClassifier=GradientBoostingClassifier(random_state=42)
GBClassifier.fit(X_res,y_res)

GradientBoostingClassifier(random_state=42)

In [19]:
prediction_RF = RFClassifier.predict(X_test)

In [20]:
prediction_XGB = XGClassifier.predict(X_test)

In [21]:
prediction_Bagg = BClassifier.predict(X_test)

In [22]:
prediction_LGBM = LGBClassifier.predict(X_test)

In [23]:
prediction_GB = GBClassifier.predict(X_test)

In [24]:
print('Accuracy RForest...{}'.format(accuracy_score(y_test,prediction_RF)))
print('---'*25)
print('Accuracy XGBoost...{}'.format(accuracy_score(y_test,prediction_XGB)))
print('---'*25)
print('Accuracy Bagging...{}'.format(accuracy_score(y_test,prediction_Bagg)))
print('---'*25)
print('Accuracy LightGBM...{}'.format(accuracy_score(y_test,prediction_LGBM)))
print('---'*25)
print('Accuracy Gradient...{}'.format(accuracy_score(y_test,prediction_GB)))
print('---'*25)

Accuracy RForest...0.76
---------------------------------------------------------------------------
Accuracy XGBoost...0.75
---------------------------------------------------------------------------
Accuracy Bagging...0.745
---------------------------------------------------------------------------
Accuracy LightGBM...0.745
---------------------------------------------------------------------------
Accuracy Gradient...0.755
---------------------------------------------------------------------------


In [25]:
print('Precision RForest...{}'.format(precision_score(y_test,prediction_RF)))
print('---'*25)
print('Precision XGBoost...{}'.format(precision_score(y_test,prediction_XGB)))
print('---'*25)
print('Precision Bagging...{}'.format(precision_score(y_test,prediction_Bagg)))
print('---'*25)
print('Precision LightGBM...{}'.format(precision_score(y_test,prediction_LGBM)))
print('---'*25)
print('Precision Gradient...{}'.format(precision_score(y_test,prediction_GB)))
print('---'*25)

Precision RForest...0.835820895522388
---------------------------------------------------------------------------
Precision XGBoost...0.8188405797101449
---------------------------------------------------------------------------
Precision Bagging...0.8041958041958042
---------------------------------------------------------------------------
Precision LightGBM...0.8
---------------------------------------------------------------------------
Precision Gradient...0.8201438848920863
---------------------------------------------------------------------------


In [26]:
print('Recall RForest...{}'.format(recall_score(y_test,prediction_RF)))
print('---'*25)
print('Recall XGBoost...{}'.format(recall_score(y_test,prediction_XGB)))
print('---'*25)
print('Recall Bagging...{}'.format(recall_score(y_test,prediction_Bagg)))
print('---'*25)
print('Recall LightGBM...{}'.format(recall_score(y_test,prediction_LGBM)))
print('---'*25)
print('Recall Gradient...{}'.format(recall_score(y_test,prediction_GB)))
print('---'*25)

Recall RForest...0.8115942028985508
---------------------------------------------------------------------------
Recall XGBoost...0.8188405797101449
---------------------------------------------------------------------------
Recall Bagging...0.8333333333333334
---------------------------------------------------------------------------
Recall LightGBM...0.8405797101449275
---------------------------------------------------------------------------
Recall Gradient...0.8260869565217391
---------------------------------------------------------------------------


In [27]:
print('Confusion Matrix RForest... \n {}'.format(confusion_matrix(y_test,prediction_RF)))
print('---'*25)
print('Confusion Matrix XGBoost... \n {}'.format(confusion_matrix(y_test,prediction_XGB)))
print('---'*25)
print('Confusion Matrix Bagging... \n {}'.format(confusion_matrix(y_test,prediction_Bagg)))
print('---'*25)
print('Confusion Matrix LightGBM... \n {}'.format(confusion_matrix(y_test,prediction_LGBM)))
print('---'*25)
print('Confusion Matrix Gradient... \n {}'.format(confusion_matrix(y_test,prediction_GB)))
print('---'*25)

Confusion Matrix RForest... 
 [[ 40  22]
 [ 26 112]]
---------------------------------------------------------------------------
Confusion Matrix XGBoost... 
 [[ 37  25]
 [ 25 113]]
---------------------------------------------------------------------------
Confusion Matrix Bagging... 
 [[ 34  28]
 [ 23 115]]
---------------------------------------------------------------------------
Confusion Matrix LightGBM... 
 [[ 33  29]
 [ 22 116]]
---------------------------------------------------------------------------
Confusion Matrix Gradient... 
 [[ 37  25]
 [ 24 114]]
---------------------------------------------------------------------------


In [28]:
import pickle
file = 'Credit_Data_RF.pkl'

pickle.dump(RFClassifier,open(file,'wb'))