In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use("ggplot")

In [2]:
# Load the raw dataset from a path relative to the notebook's working directory.
df = pd.read_csv(r'data/data.csv')

In [3]:
# Drop the row-identifier column. Rebinding with errors='ignore' (instead of
# an in-place drop) keeps this cell safe to re-run once 'ID' is already gone.
df = df.drop(columns='ID', errors='ignore')

In [4]:
# Preview the first rows to sanity-check the load and the column layout.
df.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,20000.0,2,2,1,24,2,2,-1,-1,-2,...,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,120000.0,2,2,2,26,-1,2,0,0,0,...,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1
2,90000.0,2,2,2,34,0,0,0,0,0,...,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
3,50000.0,2,2,1,37,0,0,0,0,0,...,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0
4,50000.0,1,2,1,57,-1,0,-1,0,0,...,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0


In [5]:
#Feature Engineering
# NOTE: this cell rewrites `df` in place, so running it twice shifts SEX a
# second time; re-run the load cell first if it has to be executed again.

# Shift SEX down by one (the rows shown by df.head() hold 1/2, giving 0/1).
df['SEX'] = df['SEX'] - 1

# Collapse every EDUCATION code of 4 or above into the single code 4.
df['EDUCATION'] = df['EDUCATION'].clip(upper=4)

# Floor the repayment-status columns at 0: any non-positive code becomes 0,
# positive codes are kept unchanged. One vectorized call replaces six
# copy-pasted per-column `.apply(lambda ...)` lines.
pay_status_cols = ['PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']
df[pay_status_cols] = df[pay_status_cols].clip(lower=0)

In [6]:
# Split the frame into the label column (y) and the remaining feature columns (X).
target = 'default.payment.next.month'
y = df[target]
X = df.drop(columns=[target])

In [7]:
#rectifying target imbalance
# NOTE(review): the resampling here runs on the full X / y, before the
# train/test split performed further down. The held-out set is therefore
# drawn from resampled rows too, which likely makes the reported test metrics
# optimistic — consider resampling only the training portion. TODO confirm.
from imblearn.combine import SMOTETomek
resampler = SMOTETomek(random_state=42)  # fixed seed for the resampler
X , y = resampler.fit_resample(X, y)  # X and y are rebound to the resampled data

In [8]:
# Shape of the resampled feature rows for each class (label 1 first, then label 0).
X[y==1].shape , X[y==0].shape

((22746, 23), (22746, 23))

In [9]:
# Columns that are treated as categorical features.
cat_cols = [
    'SEX',
    'EDUCATION',
    'MARRIAGE',
]

In [10]:
# Every column that is neither categorical nor the label is treated as numerical.
numerical_cols = [col for col in df.columns if col not in cat_cols and col != target]

# The former `df[numerical_cols] = np.log(df[numerical_cols] + 1e-10)` step was
# removed: X and y were already extracted from `df` (and resampled) in earlier
# cells, so the transform never reached the modelling data. It is also unsafe
# for these columns — zeros (e.g. the floored PAY_* codes) become
# log(1e-10) ~ -23 and any negative value would become NaN — and re-running
# the cell compounded the log. If a log-style transform is wanted, add it as a
# step of the numerical pipeline so it is applied to the data that is modelled.

In [11]:
# Display the derived list of numerical column names.
numerical_cols

['LIMIT_BAL',
 'AGE',
 'PAY_0',
 'PAY_2',
 'PAY_3',
 'PAY_4',
 'PAY_5',
 'PAY_6',
 'BILL_AMT1',
 'BILL_AMT2',
 'BILL_AMT3',
 'BILL_AMT4',
 'BILL_AMT5',
 'BILL_AMT6',
 'PAY_AMT1',
 'PAY_AMT2',
 'PAY_AMT3',
 'PAY_AMT4',
 'PAY_AMT5',
 'PAY_AMT6']

In [48]:
from sklearn.impute import SimpleImputer ## HAndling Missing Values
from sklearn.preprocessing import (StandardScaler , OneHotEncoder,)# (FunctionTransformer, PowerTransformer) # Handling Feature Scaling
## pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [49]:
# Numeric features: fill gaps with the column median, then standardise.
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical features: fill gaps with the most frequent value, one-hot encode
# (dropping the first level of each feature), then scale without centring.
# The encoder step keeps the name 'ordinalencoder' even though it holds a
# OneHotEncoder, so any parameter path that refers to it stays valid.
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinalencoder', OneHotEncoder(drop='first', handle_unknown='ignore')),
    ('scaler', StandardScaler(with_mean=False)),
])

# Route each column group through its own pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ('numerical_pipeline', numerical_pipeline, numerical_cols),
        ('categorical_pipeline', cat_pipeline, cat_cols),
    ]
)

In [50]:
## Train test split
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=30,
)

In [51]:
# Fit the preprocessor on the training split only, then apply the fitted
# transform to the test split. Feature names are read once and reused, and
# each frame keeps its original row index so it stays aligned with
# y_train / y_test for any index-based pandas operation (a bare
# pd.DataFrame(...) would silently reset the index to 0..n-1).
X_train_values = preprocessor.fit_transform(X_train)
feature_names = preprocessor.get_feature_names_out()

X_train = pd.DataFrame(X_train_values, columns=feature_names, index=X_train.index)
X_test = pd.DataFrame(preprocessor.transform(X_test), columns=feature_names, index=X_test.index)

In [52]:
# Shapes of the transformed train and test feature frames.
X_train.shape , X_test.shape

((31844, 28), (13648, 28))

In [53]:
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import  accuracy_score , roc_auc_score , f1_score , recall_score

In [54]:
# a function to evaluate the model
def evaluate_model(true, predicted):
    """Score a set of predictions against the ground-truth labels.

    Args:
        true: Ground-truth labels.
        predicted: Predictions to score. The same values are handed to every
            metric, including ``roc_auc_score``.

    Returns:
        tuple: ``(accuracy, roc_auc, f1, recall)``, in that order.
    """
    scorers = (accuracy_score, roc_auc_score, f1_score, recall_score)
    return tuple(scorer(true, predicted) for scorer in scorers)

In [55]:
# Candidate classifiers, keyed by the name used in the report. Estimators that
# draw random numbers while fitting are seeded so that a fresh run reproduces
# the same comparison table.
models = {
    'LogisticRegression': LogisticRegression(),
    'RidgeClassifier': RidgeClassifier(),
    'BernoulliNB': BernoulliNB(),
    'DecisionTreeClassifier': DecisionTreeClassifier(random_state=42),
    'KNeighborsClassifier': KNeighborsClassifier(),
    'AdaBoostClassifier': AdaBoostClassifier(random_state=42),
    'GradientBoostingClassifier': GradientBoostingClassifier(random_state=42),
    'BaggingClassifier': BaggingClassifier(random_state=42),
    'RandomForestClassifier': RandomForestClassifier(random_state=42),
    'SVC': SVC(),
    'XGBClassifier': XGBClassifier()
}

# Per-model results, collected in the order the models are fitted.
model_list = []
roc = []
acc = []
f1 = []
rec = []
performance = []  # (model name, accuracy) pairs used to pick the best model

# Iterate the dict directly instead of rebuilding list(models.keys()) /
# list(models.values()) several times per pass.
for name, model in models.items():
    model.fit(X_train, y_train)

    # Make predictions on the held-out split
    y_pred = model.predict(X_test)

    accuracy, roc_score, f_1_score, recall = evaluate_model(y_test, y_pred)

    print(name)
    model_list.append(name)
    roc.append(roc_score)
    acc.append(accuracy)
    f1.append(f_1_score)
    rec.append(recall)
    performance.append((name, accuracy))

    # NOTE: the heading below is kept as originally printed, but the figures
    # are computed on X_test / y_test, i.e. they are held-out metrics.
    print('Model Training Performance')
    print("ROC:", roc_score)
    print("Accuracy:", accuracy)

    print('='*35)
    print('\n')

# Rank once by accuracy; the last entry of the ascending sort is the winner.
best = sorted(performance, key=lambda x: x[1])[-1]
print("Best Model", best)
model = models[best[0]]
print(model)

# Comparison table, best accuracy first.
metrics = pd.DataFrame({
    "models": model_list,
    "accuracy": acc,
    "roc_auc_score": roc,
    "f1_score": f1,
    "recall_score": rec,
}).sort_values('accuracy', ascending=False)
print(metrics)


LogisticRegression
Model Training Performance
ROC: 0.7357311347395462
Accuracy: 0.7356389214536928


RidgeClassifier
Model Training Performance
ROC: 0.7319370761276155
Accuracy: 0.7318288393903869


BernoulliNB
Model Training Performance
ROC: 0.7138479316844395
Accuracy: 0.7136576787807737


DecisionTreeClassifier
Model Training Performance
ROC: 0.7418746939322808
Accuracy: 0.7419402110199297


KNeighborsClassifier
Model Training Performance
ROC: 0.7424438585582097
Accuracy: 0.7425263774912075


AdaBoostClassifier
Model Training Performance
ROC: 0.7546803611489565
Accuracy: 0.7546160609613131


GradientBoostingClassifier
Model Training Performance
ROC: 0.7728221400057362
Accuracy: 0.7727139507620164


BaggingClassifier
Model Training Performance
ROC: 0.8042672185828386
Accuracy: 0.8041471277842908


RandomForestClassifier
Model Training Performance
ROC: 0.8313044710346659
Accuracy: 0.831257327080891


SVC
Model Training Performance
ROC: 0.7543879405818243
Accuracy: 0.7542497069167644



In [56]:
# Render the comparison table (sorted by accuracy, best first).
metrics

Unnamed: 0,models,accuracy,roc_auc_score,f1_score,recall_score
8,RandomForestClassifier,0.831257,0.831304,0.829218,0.816681
10,XGBClassifier,0.813892,0.813954,0.810758,0.794771
7,BaggingClassifier,0.804147,0.804267,0.797116,0.767017
6,GradientBoostingClassifier,0.772714,0.772822,0.765426,0.739264
5,AdaBoostClassifier,0.754616,0.75468,0.750242,0.734736
9,SVC,0.75425,0.754388,0.743891,0.71151
4,KNeighborsClassifier,0.742526,0.742444,0.749537,0.76804
3,DecisionTreeClassifier,0.74194,0.741875,0.747672,0.762197
0,LogisticRegression,0.735639,0.735731,0.728518,0.707128
1,RidgeClassifier,0.731829,0.731937,0.723189,0.698364


In [57]:
from sklearn.model_selection import RandomizedSearchCV

In [58]:
# Search space for the random-forest tuning. Only the number of trees varies;
# the split criterion and max_features are pinned to a single value each
# (other options such as 'entropy' / 'log_loss' and 'log2' / None were
# previously noted as alternatives but are not searched).
params = dict(
    n_estimators=[200, 300, 400],
    criterion=['gini'],
    max_features=['sqrt'],
)

In [61]:
# Seed the forest itself: the search's random_state only governs how candidate
# parameter settings are sampled, so an unseeded estimator would produce
# different cross-validation scores on every run.
classifier = RandomForestClassifier(random_state=42)

# Randomized search over `params`, scored by accuracy with 5-fold CV.
clf = RandomizedSearchCV(
    classifier,
    param_distributions=params,
    scoring='accuracy',
    verbose=3,
    cv=5,
    random_state=42,
)

In [62]:
# Run the cross-validated search on the preprocessed training split.
# NOTE(review): X_train comes from data that was resampled before splitting,
# so the CV folds presumably share resampled rows as well — TODO confirm.
clf.fit(X_train,y_train)

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV 1/5] END criterion=gini, max_features=sqrt, n_estimators=200;, score=0.832 total time=  16.8s
[CV 2/5] END criterion=gini, max_features=sqrt, n_estimators=200;, score=0.830 total time=  16.8s
[CV 3/5] END criterion=gini, max_features=sqrt, n_estimators=200;, score=0.827 total time=  16.6s
[CV 4/5] END criterion=gini, max_features=sqrt, n_estimators=200;, score=0.831 total time=  16.7s
[CV 5/5] END criterion=gini, max_features=sqrt, n_estimators=200;, score=0.830 total time=  16.8s
[CV 1/5] END criterion=gini, max_features=sqrt, n_estimators=300;, score=0.834 total time=  25.3s
[CV 2/5] END criterion=gini, max_features=sqrt, n_estimators=300;, score=0.831 total time=  24.9s
[CV 3/5] END criterion=gini, max_features=sqrt, n_estimators=300;, score=0.827 total time=  25.2s
[CV 4/5] END criterion=gini, max_features=sqrt, n_estimators=300;, score=0.833 total time=  25.7s
[CV 5/5] END criterion=gini, max_features=sqrt, n_estimato

In [63]:
# Cross-validated score (accuracy, per the search's `scoring`) of the best candidate.
clf.best_score_

0.8312398071374301

In [64]:
# Parameter combination that achieved the best cross-validated score.
clf.best_params_

{'n_estimators': 300, 'max_features': 'sqrt', 'criterion': 'gini'}