In [4]:
import zipfile
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline as pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, make_scorer
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler

In [5]:
Data = pd.read_csv('/content/Loan_default.csv')

Data.head()

Unnamed: 0,LoanID,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner,Default
0,I38PQUQS96,56,85994,50587,520,80,4,15.23,36,0.44,Bachelor's,Full-time,Divorced,Yes,Yes,Other,Yes,0.0
1,HPSK72WA7R,69,50432,124440,458,15,1,4.81,60,0.68,Master's,Full-time,Married,No,No,Other,Yes,0.0
2,C1OZ6DPJ8Y,46,84208,129188,451,26,3,21.17,24,0.31,Master's,Unemployed,Divorced,Yes,Yes,Auto,No,1.0
3,V2KKSFM3UN,32,31713,44799,743,0,3,7.07,24,0.23,High School,Full-time,Married,No,No,Business,No,0.0
4,EY08JDHTZP,60,20437,9139,633,8,4,6.51,48,0.73,Bachelor's,Unemployed,Divorced,No,Yes,Auto,No,0.0


In [6]:
Data = Data.drop(columns='LoanID')

print("Updated Data Shape:", Data.shape)

Updated Data Shape: (150936, 17)


In [7]:
num = Data.select_dtypes(include=['int', 'float'])

print("Selected Numerical Features:", num.columns.tolist())

Selected Numerical Features: ['Age', 'Income', 'LoanAmount', 'CreditScore', 'MonthsEmployed', 'NumCreditLines', 'InterestRate', 'LoanTerm', 'DTIRatio', 'Default']


In [8]:
def ColumnTrans(cat):
    for column in cat.columns:
        # Get unique values for the column
        unique_values = cat[column].unique()

        # Create a mapping of each unique value to a corresponding integer
        value_map = {value: index for index, value in enumerate(unique_values)}

        # Map the column's categorical values to their integer indices
        cat[column] = cat[column].map(value_map)

    return cat

In [9]:
cat = Data.select_dtypes(include='object')

# Apply the custom ColumnTrans function to transform categorical columns into numerical format.
cat = ColumnTrans(cat)

# Verify the transformation by displaying the first few rows of the transformed categorical data.
print("Transformed Categorical Columns (first few rows):")
print(cat.head())

Transformed Categorical Columns (first few rows):
   Education  EmploymentType  MaritalStatus  HasMortgage  HasDependents  \
0          0               0              0            0              0   
1          1               0              1            1              1   
2          1               1              0            0              0   
3          2               0              1            1              1   
4          0               1              0            1              0   

   LoanPurpose  HasCoSigner  
0            0            0  
1            0            0  
2            1            1  
3            2            1  
4            1            1  


In [10]:
df = pd.concat([num, cat], axis=1)

# Verify the combined DataFrame by displaying the first few rows.
print("Combined DataFrame (first few rows):")
print(df.head())

Combined DataFrame (first few rows):
   Age  Income  LoanAmount  CreditScore  MonthsEmployed  NumCreditLines  \
0   56   85994       50587          520              80               4   
1   69   50432      124440          458              15               1   
2   46   84208      129188          451              26               3   
3   32   31713       44799          743               0               3   
4   60   20437        9139          633               8               4   

   InterestRate  LoanTerm  DTIRatio  Default  Education  EmploymentType  \
0         15.23        36      0.44      0.0          0               0   
1          4.81        60      0.68      0.0          1               0   
2         21.17        24      0.31      1.0          1               1   
3          7.07        24      0.23      0.0          2               0   
4          6.51        48      0.73      0.0          0               1   

   MaritalStatus  HasMortgage  HasDependents  LoanPurpose  Ha

In [11]:
x1 = df.drop(columns='Default')  # Features: All columns except 'Default'
y1 = df['Default']  # Target: 'Default' column

# Verify the separation by displaying the shapes of the features and target.
print("Shape of Features (X):", x1.shape)
print("Shape of Target (y):", y1.shape)

Shape of Features (X): (150936, 16)
Shape of Target (y): (150936,)


In [12]:
ros = RandomOverSampler()  # Random Over-Sampling to balance the dataset by increasing the minority class.
rus = RandomUnderSampler()  # Random Under-Sampling to balance the dataset by decreasing the majority class.
smote = SMOTE()  # SMOTE (Synthetic Minority Over-sampling Technique) to generate synthetic examples for the minority class.

# Verify that the resampling methods are correctly initialized
print("Resampling techniques initialized:")
print("RandomOverSampler:", ros)
print("RandomUnderSampler:", rus)
print("SMOTE:", smote)

Resampling techniques initialized:
RandomOverSampler: RandomOverSampler()
RandomUnderSampler: RandomUnderSampler()
SMOTE: SMOTE()


In [15]:
# Remove rows with NaN values in the target variable
df_cleaned = df.dropna(subset=['Default'])

x1 = df_cleaned.drop(columns='Default')  # Features: All columns except 'Default'
y1 = df_cleaned['Default']  # Target: 'Default' column

ros = RandomOverSampler()  # Random Over-Sampling to balance the dataset by increasing the minority class.
rus = RandomUnderSampler()  # Random Under-Sampling to balance the dataset by decreasing the majority class.
smote = SMOTE()  # SMOTE (Synthetic Minority Over-sampling Technique) to generate synthetic examples for the minority class.

x2, y2 = ros.fit_resample(x1, y1)
print("Shape after RandomOverSampler (ROS):", x2.shape, y2.shape)

# Apply SMOTE to generate synthetic samples for the minority class
x3, y3 = smote.fit_resample(x2, y2)
print("Shape after SMOTE:", x3.shape, y3.shape)

# Apply Random Under-Sampling to decrease the majority class
x, y = rus.fit_resample(x3, y3)
print("Shape after RandomUnderSampler (RUS):", x.shape, y.shape)

# Final balanced dataset
print("Final Balanced Dataset Shape (X):", x.shape)
print("Final Balanced Dataset Shape (Y):", y.shape)

Shape after RandomOverSampler (ROS): (267064, 16) (267064,)
Shape after SMOTE: (267064, 16) (267064,)
Shape after RandomUnderSampler (RUS): (267064, 16) (267064,)
Final Balanced Dataset Shape (X): (267064, 16)
Final Balanced Dataset Shape (Y): (267064,)


In [16]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Display the shapes of the resulting datasets to verify the split
print("Shape of Training Features (X_train):", x_train.shape)
print("Shape of Testing Features (X_test):", x_test.shape)
print("Shape of Training Target (y_train):", y_train.shape)
print("Shape of Testing Target (y_test):", y_test.shape)

Shape of Training Features (X_train): (213651, 16)
Shape of Testing Features (X_test): (53413, 16)
Shape of Training Target (y_train): (213651,)
Shape of Testing Target (y_test): (53413,)


In [17]:
model = RandomForestClassifier(n_estimators=2000, random_state=42)

# Display the model's parameters to verify the configuration
print("Random Forest Classifier initialized with parameters:")
print(model)

Random Forest Classifier initialized with parameters:
RandomForestClassifier(n_estimators=2000, random_state=42)


In [18]:
print("Model training complete with Random Forest Classifier.")
model.fit(x_train, y_train)

# Display a message to indicate that training is complete
print("Model training complete with Random Forest Classifier.")

Model training complete with Random Forest Classifier.
Model training complete with Random Forest Classifier.


In [19]:
prediction = model.predict(x_test)

# Display the shape of the predictions to verify
print("Shape of predictions:", prediction.shape)


Shape of predictions: (53413,)


In [20]:
(prediction == 0).sum()

np.int64(26181)

In [21]:
(prediction == 1).sum()

np.int64(27232)

In [22]:
# Import the evaluation metrics from sklearn
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# Calculate accuracy, precision, recall, and F1 score
accuracy = accuracy_score(y_test, prediction)
precision = precision_score(y_test, prediction)
recall = recall_score(y_test, prediction)
f1 = f1_score(y_test, prediction)

# Display the metrics
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Accuracy: 0.9909
Precision: 0.9833
Recall: 0.9988
F1 Score: 0.9910
