<a href="https://colab.research.google.com/github/soton-study/credit-risk-final-report/blob/main/Ada_boosting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd

# Load the uploaded dataset
credit_data = pd.read_csv('Credit data.csv')

# Display the first few rows of the dataset to understand its structure
credit_data.head()
# Separate features and target variable
X = credit_data.drop(columns=["ID", "Default"])  # Drop ID and target columns
y = credit_data["Default"]

In [5]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
# Apply SMOTE
smote = SMOTE(random_state=36621943)  # Random seed for reproducibility
X_resampled, y_resampled = smote.fit_resample(X, y)


In [6]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled,
                                                    test_size=0.3, random_state=36621943, stratify=y_resampled)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((12738, 10), (5460, 10), (12738,), (5460,))

In [16]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Initialize the base model and AdaBoost
rf_model = RandomForestClassifier(n_estimators=100, random_state=36621943)
ada_rf_model = AdaBoostClassifier(estimator=rf_model, n_estimators=50, random_state=36621943)

# Fit the RF model

rf_model.fit(X_train, y_train)

# Fit the AdaBoost model
ada_rf_model.fit(X_train, y_train)

# Make predictions RF
y_pred_rf = rf_model.predict(X_test)

# Evaluate the model
conf_matrix_rf = confusion_matrix(y_test, y_pred_rf)
class_report_rf = classification_report(y_test, y_pred_rf)
accuracy_rf = accuracy_score(y_test, y_pred_rf)




In [17]:
# Make predictions ADRF
y_pred_adrf = ada_rf_model.predict(X_test)

# Evaluate the model ADRF
conf_matrix_adrf = confusion_matrix(y_test, y_pred_adrf)
class_report_adrf = classification_report(y_test, y_pred_adrf)
accuracy_adrf = accuracy_score(y_test, y_pred_adrf)

In [18]:
print(conf_matrix_rf)
print(class_report_rf)
print(accuracy_rf)

[[2505  225]
 [ 176 2554]]
              precision    recall  f1-score   support

           0       0.93      0.92      0.93      2730
           1       0.92      0.94      0.93      2730

    accuracy                           0.93      5460
   macro avg       0.93      0.93      0.93      5460
weighted avg       0.93      0.93      0.93      5460

0.9265567765567766


In [19]:
print(conf_matrix_adrf)
print(class_report_adrf)
print(accuracy_adrf)

[[2515  215]
 [ 178 2552]]
              precision    recall  f1-score   support

           0       0.93      0.92      0.93      2730
           1       0.92      0.93      0.93      2730

    accuracy                           0.93      5460
   macro avg       0.93      0.93      0.93      5460
weighted avg       0.93      0.93      0.93      5460

0.9280219780219781


In [20]:
from sklearn.linear_model import LogisticRegression
lr_model = LogisticRegression(penalty='l2', # Type of penalization l1 = lasso, l2 = ridge
                                     tol=0.0001, # Tolerance for parameters
                                     C=0.1, # Penalty constant, see below
                                     fit_intercept=True, # Use constant?
                                     class_weight='balanced', # Weights, see below
                                     random_state=36621943, # Random seed student ID
                                     max_iter=300, # Maximum iterations
                                     verbose=1, # Show process. 1 is yes.
                                     solver = 'liblinear',
                                     warm_start=False # Train anew or start from previous weights. For repeated training.
                                    )
ada_lr_model = AdaBoostClassifier(estimator=lr_model, n_estimators=50, random_state=36621943)

# Fit LR model
lr_model.fit(X_train, y_train)

# Make predictions
y_pred_lr = lr_model.predict(X_test)

# Evaluate the model
conf_matrix_lr = confusion_matrix(y_test, y_pred_lr)
class_report_lr = classification_report(y_test, y_pred_lr)
accuracy_lr = accuracy_score(y_test, y_pred_lr)

print(conf_matrix_lr)
print(class_report_lr)
print(accuracy_lr)




[LibLinear][[2111  619]
 [ 694 2036]]
              precision    recall  f1-score   support

           0       0.75      0.77      0.76      2730
           1       0.77      0.75      0.76      2730

    accuracy                           0.76      5460
   macro avg       0.76      0.76      0.76      5460
weighted avg       0.76      0.76      0.76      5460

0.7595238095238095


In [23]:
# Fit the AdaBoost model
ada_lr_model.fit(X_train, y_train)

# Make predictions
y_pred_adlr = ada_rf_model.predict(X_test)

# Evaluate the model
conf_matrix_adlr = confusion_matrix(y_test, y_pred_lr)
class_report_adlr = classification_report(y_test, y_pred_lr)
accuracy_adlr = accuracy_score(y_test, y_pred_lr)

[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear]

In [24]:
print(conf_matrix_adlr)
print(class_report_adlr)
print(accuracy_adlr)

[[2515  215]
 [ 178 2552]]
              precision    recall  f1-score   support

           0       0.93      0.92      0.93      2730
           1       0.92      0.93      0.93      2730

    accuracy                           0.93      5460
   macro avg       0.93      0.93      0.93      5460
weighted avg       0.93      0.93      0.93      5460

0.9280219780219781
