In [1]:
# Loan Default Prediction
# Step 1: Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


In [2]:
# Step 2: Load Dataset
df = pd.read_csv(r'D:\Github_Share\Dataset\ameen_dataset.csv')
print("\nDataset Loaded Successfully!\n")
df.head()


Dataset Loaded Successfully!



Unnamed: 0,Customer_ID,Age,Gender,Employment_Type,Monthly_Income,Loan_Amount,Loan_Term_Months,Credit_Score,Existing_Loans_Count,Previous_Default,EMI_Burden,Default,Loan_Date,Loan_Year
0,CUST1000,59,Male,Non-Government,65393,348512,12,462,1,1,18376,1,2022-11-17 04:54:13.089070,2022
1,CUST1001,49,Male,Non-Government,148013,712143,12,823,3,0,8588,0,2024-01-06 04:54:13.089070,2024
2,CUST1002,35,Female,Non-Government,35934,616202,48,570,4,0,27000,0,2022-06-07 04:54:13.089070,2022
3,CUST1003,63,Male,Government,85745,251579,36,791,4,0,8304,0,2022-12-24 04:54:13.089070,2022
4,CUST1004,28,Female,Non-Government,104456,702026,60,507,1,0,9345,0,2023-08-31 04:54:13.089070,2023


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Customer_ID           300 non-null    object
 1   Age                   300 non-null    int64 
 2   Gender                300 non-null    object
 3   Employment_Type       300 non-null    object
 4   Monthly_Income        300 non-null    int64 
 5   Loan_Amount           300 non-null    int64 
 6   Loan_Term_Months      300 non-null    int64 
 7   Credit_Score          300 non-null    int64 
 8   Existing_Loans_Count  300 non-null    int64 
 9   Previous_Default      300 non-null    int64 
 10  EMI_Burden            300 non-null    int64 
 11  Default               300 non-null    int64 
 12  Loan_Date             300 non-null    object
 13  Loan_Year             300 non-null    int64 
dtypes: int64(10), object(4)
memory usage: 32.9+ KB


In [4]:
from df_find_highly_correlated_columns import find_highly_correlated_columns

# Prepare data
numeric_df = df.select_dtypes(include='number')

# Find correlated columns
to_drop, high_corr_pairs = find_highly_correlated_columns(numeric_df,0.80)

# Output results
print("Highly correlated pairs (above threshold):")
for pair in high_corr_pairs:
    print(pair)

print("\nRecommended columns to drop:")
print(to_drop)


Highly correlated pairs (above threshold):

Recommended columns to drop:
set()


In [5]:
from df_find_categorical_columns_to_drop import find_categorical_columns_to_drop

drop_suggestions = find_categorical_columns_to_drop(df, target_col='Default')

# Print the results
for reason, cols in drop_suggestions.items():
    print(f"\nColumns to drop due to {reason.replace('_', ' ')}:")
    print(cols)



Columns to drop due to low variance:
[]

Columns to drop due to high cardinality:
['Customer_ID']

Columns to drop due to redundant:
[]

Columns to drop due to low predictive value:
['Gender', 'Employment_Type']


In [6]:
# Step 4: Data Preprocessing
# # Define features, target column and categories
target = 'Default'
yes_no_columns = []
cat_columns = []
num_columns = []
columns_to_drop = ['Customer_ID','Gender', 'Employment_Type','Loan_Date']

# Classify columns
for column in df.columns:
    if column == target:
        continue
    elif column in columns_to_drop:
        continue
    elif df[column].nunique() == 2:  # Check if the column has exactly 2 unique values
        yes_no_columns.append(column)
    elif df[column].nunique() <= 4:  # Check if the column has 4 or fewer unique values
        cat_columns.append(column)
    else:
        num_columns.append(column)

# Print the results
print("Yes/No Columns:", yes_no_columns)
print("Categorical Columns:", cat_columns)
print("Numerical Columns:", num_columns)

Yes/No Columns: ['Previous_Default']
Categorical Columns: ['Loan_Year']
Numerical Columns: ['Age', 'Monthly_Income', 'Loan_Amount', 'Loan_Term_Months', 'Credit_Score', 'Existing_Loans_Count', 'EMI_Burden']


In [7]:

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, FunctionTransformer

# Numeric pipeline: Impute missing values and scale
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')), 
    ('scaler', StandardScaler())                
])
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Replace missing with mode
    ('encoder', OneHotEncoder())    # One-hot encode categories
])
yes_no_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Replace missing with mode
    ('Ordinal', OrdinalEncoder())    # Ordinal categories
])


In [8]:
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, num_columns),
    ('cat', categorical_transformer, cat_columns),
    ('yes_no',yes_no_transformer,yes_no_columns)
])

In [9]:
def drop_columns(X):
    return X.drop(columns =columns_to_drop)

In [10]:
# Feature Scaling
x=df.drop(columns=target)
y=df[target]
x_transformed = preprocessor.fit_transform(x)

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
# Split dataset
x_train, x_test, y_train, y_test = train_test_split(x_transformed, y, test_size=0.2, random_state=42)

# Logistic Regression Model

In [13]:
from sklearn.linear_model import LogisticRegression
# --- Logistic Regression ---
log_model = LogisticRegression()
log_model.fit(x_train, y_train)
y_pred_log = log_model.predict(x_test)

print("\n🔹 Logistic Regression Report:")
print("Accuracy:", accuracy_score(y_test, y_pred_log))
print("\nClassification Report:\n", classification_report(y_test, y_pred_log))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_log))



🔹 Logistic Regression Report:
Accuracy: 0.9

Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.94      0.94        49
           1       0.73      0.73      0.73        11

    accuracy                           0.90        60
   macro avg       0.83      0.83      0.83        60
weighted avg       0.90      0.90      0.90        60

Confusion Matrix:
 [[46  3]
 [ 3  8]]


In [14]:
# Save model to file
import joblib
filename = r'models\logistic_model.pkl'
joblib.dump(log_model, filename)
print(f"✅ Model saved to '{filename}'")

✅ Model saved to 'models\logistic_model.pkl'


# Support Vector Machine (SVM) Model

In [15]:
from sklearn.svm import SVC

# --- Support Vector Machine (SVM) ---
svm_model = SVC(kernel='rbf', C=1.0, gamma='scale')  # You can adjust kernel/C/gamma
svm_model.fit(x_train, y_train)
y_pred_svm = svm_model.predict(x_test)

print("\n🔹 SVM Report:")
print("Accuracy:", accuracy_score(y_test, y_pred_svm))
print("\nClassification Report:\n", classification_report(y_test, y_pred_svm))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_svm))



🔹 SVM Report:
Accuracy: 0.9

Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.96      0.94        49
           1       0.78      0.64      0.70        11

    accuracy                           0.90        60
   macro avg       0.85      0.80      0.82        60
weighted avg       0.90      0.90      0.90        60

Confusion Matrix:
 [[47  2]
 [ 4  7]]


In [16]:
# Save model to file
import joblib
filename = r'models\svm_model.pkl'
joblib.dump(svm_model, filename)
print(f"✅ Model saved to '{filename}'")

✅ Model saved to 'models\svm_model.pkl'


# Random Forests Model

In [17]:
# Import necessary libraries
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Initialize and train the Random Forest model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(x_train, y_train)

# Make predictions
y_pred_rf = rf_model.predict(x_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred_rf)
mse = mean_squared_error(y_test, y_pred_rf)
rmse = np.sqrt(mse)
score=r2_score(y_test, y_pred_rf)

print("\n🌲 Random Forest Report:")
print(f'Mean Absolute Error: {mae}')
print(f'Mean Squared Error: {mse}')
print(f'Root Mean Squared Error: {rmse}')
print(f'R² Score: {score}')


🌲 Random Forest Report:
Mean Absolute Error: 0.01966666666666667
Mean Squared Error: 0.0026833333333333336
Root Mean Squared Error: 0.05180090089306685
R² Score: 0.9820779220779221


In [18]:
# Save model to file
import joblib
filename = r'models\rf_model.pkl'
joblib.dump(rf_model, filename)
print(f"✅ Model saved to '{filename}'")

✅ Model saved to 'models\rf_model.pkl'


# Run All 3 Models with Logging

In [21]:
import mlflow
import mlflow.sklearn
import joblib
import threading

# Set MLflow tracking URI and experiment
MLFLOW_TRACKING_URI = "http://127.0.0.1:5000"
EXPERIMENT_NAME = "Pretrained_Model_Logging"

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

# List of models to log
models_info = [
    {"file": r"models\logistic_model.pkl", "name": "Logistic Regression"},
    {"file": r"models\svm_model.pkl", "name": "SVM"},
    {"file": r"models\rf_model.pkl", "name": "Random Forest"}
]

# Function to load and log a model
def log_model_run(model_file, model_name):
    model = joblib.load(model_file)
    with mlflow.start_run(run_name=model_name):
        mlflow.set_tag("model_type", model_name)
        mlflow.sklearn.log_model(model, artifact_path="model")
        print(f"[{model_name}] Model logged to MLflow.")

# Run all model logging in parallel
threads = []
for model_info in models_info:
    t = threading.Thread(target=log_model_run, args=(model_info["file"], model_info["name"]))
    t.start()
    threads.append(t)

# Wait for all threads to finish
for t in threads:
    t.join()




[SVM] Model logged to MLflow.
[Logistic Regression] Model logged to MLflow.
🏃 View run SVM at: http://127.0.0.1:5000/#/experiments/273882292182050785/runs/32d3bd1b80db44f5a4b44c11a45cc92b
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/273882292182050785
🏃 View run Logistic Regression at: http://127.0.0.1:5000/#/experiments/273882292182050785/runs/2c264bec92f44b90949e7f615f0a30c7
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/273882292182050785
[Random Forest] Model logged to MLflow.
🏃 View run Random Forest at: http://127.0.0.1:5000/#/experiments/273882292182050785/runs/d930e903ea404eb7abf3257e745ca220
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/273882292182050785
