In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
df = pd.read_csv("https://raw.githubusercontent.com/mrandika/CII3C3_ML_Ensemble-Learning/main/german_credit.csv")

In [None]:
# Dataset Info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 21 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   status                   1000 non-null   object
 1   duration                 1000 non-null   int64 
 2   credit_history           1000 non-null   object
 3   purpose                  1000 non-null   object
 4   amount                   1000 non-null   int64 
 5   savings                  1000 non-null   object
 6   employment_duration      1000 non-null   object
 7   installment_rate         1000 non-null   object
 8   personal_status_sex      1000 non-null   object
 9   other_debtors            1000 non-null   object
 10  present_residence        1000 non-null   object
 11  property                 1000 non-null   object
 12  age                      1000 non-null   int64 
 13  other_installment_plans  1000 non-null   object
 14  housing                  1000 non-null   

In [None]:
# Get dataset sample
df.head()

Unnamed: 0,status,duration,credit_history,purpose,amount,savings,employment_duration,installment_rate,personal_status_sex,other_debtors,...,property,age,other_installment_plans,housing,number_credits,job,people_liable,telephone,foreign_worker,credit_risk
0,no checking account,18,all credits at this bank paid back duly,car (used),1049,unknown/no savings account,< 1 yr,< 20,female : non-single or male : single,none,...,car or other,21,none,for free,1,skilled employee/official,0 to 2,no,no,good
1,no checking account,9,all credits at this bank paid back duly,others,2799,unknown/no savings account,1 <= ... < 4 yrs,25 <= ... < 35,male : married/widowed,none,...,unknown / no property,36,none,for free,3-Feb,skilled employee/official,3 or more,no,no,good
2,... < 0 DM,12,no credits taken/all credits paid back duly,retraining,841,... < 100 DM,4 <= ... < 7 yrs,25 <= ... < 35,female : non-single or male : single,none,...,unknown / no property,23,none,for free,1,unskilled - resident,0 to 2,no,no,good
3,no checking account,12,all credits at this bank paid back duly,others,2122,unknown/no savings account,1 <= ... < 4 yrs,20 <= ... < 25,male : married/widowed,none,...,unknown / no property,39,none,for free,3-Feb,unskilled - resident,3 or more,no,yes,good
4,no checking account,12,all credits at this bank paid back duly,others,2171,unknown/no savings account,1 <= ... < 4 yrs,< 20,male : married/widowed,none,...,car or other,38,bank,rent,3-Feb,unskilled - resident,0 to 2,no,yes,good


In [None]:
# Check if dataset contain any null
df.isnull().sum()

status                     0
duration                   0
credit_history             0
purpose                    0
amount                     0
savings                    0
employment_duration        0
installment_rate           0
personal_status_sex        0
other_debtors              0
present_residence          0
property                   0
age                        0
other_installment_plans    0
housing                    0
number_credits             0
job                        0
people_liable              0
telephone                  0
foreign_worker             0
credit_risk                0
dtype: int64

In [None]:
# Describe the dataset
df.describe()

Unnamed: 0,duration,amount,age
count,1000.0,1000.0,1000.0
mean,20.903,3271.248,35.542
std,12.058814,2822.75176,11.35267
min,4.0,250.0,19.0
25%,12.0,1365.5,27.0
50%,18.0,2319.5,33.0
75%,24.0,3972.25,42.0
max,72.0,18424.0,75.0


In [None]:
# Feature selection, drop the unwanted column
df.drop(['savings', 'employment_duration', 'personal_status_sex', 'other_debtors', 'present_residence', 'other_installment_plans', 'housing', 'people_liable', 'telephone'], axis=1, inplace=True)

In [None]:
# Encode each column
for col in df.columns:
    encoder = LabelEncoder()
    encoder.fit(df[col])
    df[col] = encoder.transform(df[col])

In [None]:
df.head()

Unnamed: 0,status,duration,credit_history,purpose,amount,installment_rate,property,age,number_credits,job,foreign_worker,credit_risk
0,3,13,0,2,116,2,1,2,0,1,0,1
1,3,5,0,5,528,1,3,17,1,1,0,1
2,0,8,4,8,72,1,3,4,0,3,0,1
3,3,8,0,5,398,0,3,20,1,3,1,1
4,3,8,0,5,411,2,1,19,1,3,1,1


In [None]:
# Normalize the column
columns = ['duration', 'amount', 'age']

# Normalize each column
for col in columns:
    X = df[col].values.reshape(-1, 1)

    scaler = MinMaxScaler()
    scaler.fit(X)
    df[col] = scaler.transform(X)

In [None]:
# Select the features
X = df.drop('credit_risk', axis=1, inplace=False)

# Select the label
y = df['credit_risk']

In [None]:
# Split the dataset - training, and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.01, random_state=42)

In [None]:
# Create base model using DecisionTree
base_model = RandomForestClassifier()

In [None]:
# Modelling
model = BaggingClassifier(base_estimator=base_model, n_estimators=10)

# Define the hyperparameter grid
param_grid = {'max_samples': [0.5, 0.7, 1.0],
              'max_features': [0.5, 0.7, 1.0]}

# Create an instance of the GridSearchCV
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=-1)

# Fit the model to the training data
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=BaggingClassifier(base_estimator=RandomForestClassifier()),
             n_jobs=-1,
             param_grid={'max_features': [0.5, 0.7, 1.0],
                         'max_samples': [0.5, 0.7, 1.0]})

In [None]:
# Set the hyperparameter
model.set_params(max_features=grid_search.best_params_['max_features'], max_samples=grid_search.best_params_['max_samples'])

BaggingClassifier(base_estimator=RandomForestClassifier())

In [None]:
# Train the model
model.fit(X_train, y_train)

BaggingClassifier(base_estimator=RandomForestClassifier())

In [None]:
# Run on test set
y_pred = model.predict(X_test)

# Compute the training accuracy
train_accuracy = model.score(X_train, y_train)

In [None]:
print(f'Training accuracy: {train_accuracy:.3f}')

Training accuracy: 0.959


In [None]:
print(confusion_matrix(y_test.to_numpy(), y_pred))

[[2 0]
 [1 7]]


In [None]:
print(classification_report(y_test.to_numpy(), y_pred))

              precision    recall  f1-score   support

           0       0.67      1.00      0.80         2
           1       1.00      0.88      0.93         8

    accuracy                           0.90        10
   macro avg       0.83      0.94      0.87        10
weighted avg       0.93      0.90      0.91        10

