In [1]:
# Loan Default Prediction

# Step 1: Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, FunctionTransformer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression

In [2]:

# Step 2: Load Dataset
df = pd.read_csv(r'D:\Github_Share\Dataset\ameen_dataset.csv')
print("\n✅ Dataset Loaded Successfully!\n")
print(df.info())  # check data structure


✅ Dataset Loaded Successfully!

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Customer_ID           300 non-null    object
 1   Age                   300 non-null    int64 
 2   Gender                300 non-null    object
 3   Employment_Type       300 non-null    object
 4   Monthly_Income        300 non-null    int64 
 5   Loan_Amount           300 non-null    int64 
 6   Loan_Term_Months      300 non-null    int64 
 7   Credit_Score          300 non-null    int64 
 8   Existing_Loans_Count  300 non-null    int64 
 9   Previous_Default      300 non-null    int64 
 10  EMI_Burden            300 non-null    int64 
 11  Default               300 non-null    int64 
 12  Loan_Date             300 non-null    object
 13  Loan_Year             300 non-null    int64 
dtypes: int64(10), object(4)
memory usage: 32.9+ KB
None


In [3]:
# Step 3: Basic EDA
print(df.describe())
print(df.isnull().sum())


              Age  Monthly_Income    Loan_Amount  Loan_Term_Months  \
count  300.000000      300.000000     300.000000         300.00000   
mean    42.696667    84387.816667  532007.393333          36.40000   
std     13.048328    42000.322861  272304.042724          17.16339   
min     21.000000    10126.000000   54263.000000          12.00000   
25%     31.000000    50711.250000  295488.250000          24.00000   
50%     44.000000    85690.000000  549879.500000          36.00000   
75%     54.000000   121105.750000  745143.000000          48.00000   
max     64.000000   149789.000000  999491.000000          60.00000   

       Credit_Score  Existing_Loans_Count  Previous_Default    EMI_Burden  \
count    300.000000            300.000000        300.000000    300.000000   
mean     594.056667              2.093333          0.120000  23143.456667   
std      171.858346              1.401608          0.325504  13598.386578   
min      300.000000              0.000000          0.000000  

In [4]:
# Step 4: Define Target and Drop Columns
target = 'Default'
columns_to_drop = ['Customer_ID', 'Loan_Date']
x = df.drop(columns=target)
y = df[target]

In [5]:
# Step 5: Classify Columns
yes_no_columns = []
cat_columns = []
num_columns = []

for column in x.columns:
    if column in columns_to_drop:
        continue
    elif x[column].nunique() == 2:
        yes_no_columns.append(column)
    elif x[column].nunique() <= 4:
        cat_columns.append(column)
    else:
        num_columns.append(column)

print("✅ Column Classification:")
print("  - Yes/No Columns:", yes_no_columns)
print("  - Categorical Columns:", cat_columns)
print("  - Numerical Columns:", num_columns)

✅ Column Classification:
  - Yes/No Columns: ['Gender', 'Employment_Type', 'Previous_Default']
  - Categorical Columns: ['Loan_Year']
  - Numerical Columns: ['Age', 'Monthly_Income', 'Loan_Amount', 'Loan_Term_Months', 'Credit_Score', 'Existing_Loans_Count', 'EMI_Burden']


In [6]:
# Step 6: Define Preprocessing Steps
def drop_columns(X):
    return X.drop(columns=columns_to_drop)

preprocessing = ColumnTransformer([
    ('scale_num', StandardScaler(), num_columns),
    ('encode_cat', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), cat_columns),
    ('encode_yn', OrdinalEncoder(), yes_no_columns)
])


In [7]:
# Step 7: Train-Test Split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [8]:
x_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 240 entries, 232 to 102
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Customer_ID           240 non-null    object
 1   Age                   240 non-null    int64 
 2   Gender                240 non-null    object
 3   Employment_Type       240 non-null    object
 4   Monthly_Income        240 non-null    int64 
 5   Loan_Amount           240 non-null    int64 
 6   Loan_Term_Months      240 non-null    int64 
 7   Credit_Score          240 non-null    int64 
 8   Existing_Loans_Count  240 non-null    int64 
 9   Previous_Default      240 non-null    int64 
 10  EMI_Burden            240 non-null    int64 
 11  Loan_Date             240 non-null    object
 12  Loan_Year             240 non-null    int64 
dtypes: int64(9), object(4)
memory usage: 26.2+ KB


# Logistic Regression Model

In [9]:
log_model = LogisticRegression(max_iter=1000)

log_train_pipeline = Pipeline([
    ('drop_columns', FunctionTransformer(drop_columns)),
    ('preprocessing', preprocessing),
    ('model', log_model)
])

log_train_pipeline.fit(x_train, y_train)
y_pred_log = log_train_pipeline.predict(x_test)

print("\n🔹 Logistic Regression Report:")
print("Accuracy:", accuracy_score(y_test, y_pred_log))
print("\nClassification Report:\n", classification_report(y_test, y_pred_log))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_log))



🔹 Logistic Regression Report:
Accuracy: 0.9166666666666666

Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.96      0.95        49
           1       0.80      0.73      0.76        11

    accuracy                           0.92        60
   macro avg       0.87      0.84      0.86        60
weighted avg       0.91      0.92      0.92        60

Confusion Matrix:
 [[47  2]
 [ 3  8]]


In [10]:
# Save model to file
import joblib
filename = r'models\logistic_model.pkl'
joblib.dump(log_model, filename)
print(f"✅ Model saved to '{filename}'")

✅ Model saved to 'models\logistic_model.pkl'


# Support Vector Machine (SVM) Model

In [11]:
from sklearn.svm import SVC

# Define the SVM model
svm_model = SVC(kernel='rbf', probability=True)  

#  SVM model
svm_train_pipeline = Pipeline([
    ('drop_columns', FunctionTransformer(drop_columns)),
    ('preprocessing', preprocessing),
    ('model', svm_model)
])

# Fit the model
svm_train_pipeline.fit(x_train, y_train)

# Make predictions
y_pred_svm = svm_train_pipeline.predict(x_test)

# Evaluation
print("\n🔹 SVM Classification Report:")
print("Accuracy:", accuracy_score(y_test, y_pred_svm))
print("\nClassification Report:\n", classification_report(y_test, y_pred_svm))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_svm))



🔹 SVM Classification Report:
Accuracy: 0.8833333333333333

Classification Report:
               precision    recall  f1-score   support

           0       0.88      1.00      0.93        49
           1       1.00      0.36      0.53        11

    accuracy                           0.88        60
   macro avg       0.94      0.68      0.73        60
weighted avg       0.90      0.88      0.86        60

Confusion Matrix:
 [[49  0]
 [ 7  4]]


In [12]:
# Save model to file
import joblib
filename = r'models\svm_model.pkl'
joblib.dump(svm_model, filename)
print(f"✅ Model saved to '{filename}'")

✅ Model saved to 'models\svm_model.pkl'


# Random Forests Model

In [13]:
from sklearn.ensemble import RandomForestClassifier

# Define the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Random Forest model
rf_train_pipeline = Pipeline([
    ('drop_columns', FunctionTransformer(drop_columns)),
    ('preprocessing', preprocessing),
    ('model', rf_model)
])

# Fit the model
rf_train_pipeline.fit(x_train, y_train)

# Make predictions
y_pred_rf = rf_train_pipeline.predict(x_test)

# Evaluation
print("\n🔹 Random Forest Classification Report:")
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("\nClassification Report:\n", classification_report(y_test, y_pred_rf))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))



🔹 Random Forest Classification Report:
Accuracy: 0.9833333333333333

Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99        49
           1       1.00      0.91      0.95        11

    accuracy                           0.98        60
   macro avg       0.99      0.95      0.97        60
weighted avg       0.98      0.98      0.98        60

Confusion Matrix:
 [[49  0]
 [ 1 10]]


In [14]:
# Save model to file
import joblib
filename = r'models\rf_model.pkl'
joblib.dump(rf_model, filename)
print(f"✅ Model saved to '{filename}'")

✅ Model saved to 'models\rf_model.pkl'


In [15]:
preprocessor = rf_train_pipeline.named_steps['preprocessing']

num_features = preprocessor.named_transformers_['scale_num'].get_feature_names_out(num_columns)
cat_features = preprocessor.named_transformers_['encode_cat'].get_feature_names_out(cat_columns)
yn_features = [f"{col}_ordinal" for col in yes_no_columns]

final_features = list(num_features) + list(cat_features) + yn_features
print(final_features)
print(f"Total features: {len(final_features)}")

['Age', 'Monthly_Income', 'Loan_Amount', 'Loan_Term_Months', 'Credit_Score', 'Existing_Loans_Count', 'EMI_Burden', 'Loan_Year_2022', 'Loan_Year_2023', 'Loan_Year_2024', 'Loan_Year_2025', 'Gender_ordinal', 'Employment_Type_ordinal', 'Previous_Default_ordinal']
Total features: 14


# Synthetic Data Generator Code

In [16]:
import pandas as pd
import numpy as np

np.random.seed(42)

n = 300

# Numeric features
Age = np.random.randint(21, 60, size=n)
Monthly_Income = np.random.randint(20000, 150000, size=n)
Loan_Amount = np.random.randint(50000, 500000, size=n)
Loan_Term_Months = np.random.choice([12, 24, 36, 48, 60], size=n)
Credit_Score = np.random.randint(300, 850, size=n)
Existing_Loans_Count = np.random.randint(0, 5, size=n)
EMI_Burden = np.round(np.random.uniform(0.1, 0.5, size=n), 3)

# One-hot encoded Loan_Year columns (exactly one per sample)
Loan_Year_2022 = np.zeros(n, dtype=int)
Loan_Year_2023 = np.zeros(n, dtype=int)
Loan_Year_2024 = np.zeros(n, dtype=int)
Loan_Year_2025 = np.zeros(n, dtype=int)

for i in range(n):
    chosen = np.random.choice(4)
    if chosen == 0:
        Loan_Year_2022[i] = 1
    elif chosen == 1:
        Loan_Year_2023[i] = 1
    elif chosen == 2:
        Loan_Year_2024[i] = 1
    else:
        Loan_Year_2025[i] = 1

# Ordinal encoded categorical columns
Gender_ordinal = np.random.choice([0, 1], size=n)  # Male=0, Female=1
Employment_Type_ordinal = np.random.choice([0, 1, 2], size=n)  # Salaried=0, Self-Employed=1, Unemployed=2
Previous_Default_ordinal = np.random.choice([0, 1], size=n)  # No=0, Yes=1

# Create final DataFrame
df_final = pd.DataFrame({
    'Age': Age,
    'Monthly_Income': Monthly_Income,
    'Loan_Amount': Loan_Amount,
    'Loan_Term_Months': Loan_Term_Months,
    'Credit_Score': Credit_Score,
    'Existing_Loans_Count': Existing_Loans_Count,
    'EMI_Burden': EMI_Burden,
    'Loan_Year_2022': Loan_Year_2022,
    'Loan_Year_2023': Loan_Year_2023,
    'Loan_Year_2024': Loan_Year_2024,
    'Loan_Year_2025': Loan_Year_2025,
    'Gender_ordinal': Gender_ordinal,
    'Employment_Type_ordinal': Employment_Type_ordinal,
    'Previous_Default_ordinal': Previous_Default_ordinal,
})

# Save to CSV file
csv_path = "D:/Github_Share/Dataset/new_loan_testing.csv"
df_final.to_csv(csv_path, index=False)

print(f"Synthetic dataset with 14 features saved to:\n{csv_path}")
df_final.info()
print("Total features:", df_final.shape[1])


Synthetic dataset with 14 features saved to:
D:/Github_Share/Dataset/new_loan_testing.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 14 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Age                       300 non-null    int32  
 1   Monthly_Income            300 non-null    int32  
 2   Loan_Amount               300 non-null    int32  
 3   Loan_Term_Months          300 non-null    int64  
 4   Credit_Score              300 non-null    int32  
 5   Existing_Loans_Count      300 non-null    int32  
 6   EMI_Burden                300 non-null    float64
 7   Loan_Year_2022            300 non-null    int64  
 8   Loan_Year_2023            300 non-null    int64  
 9   Loan_Year_2024            300 non-null    int64  
 10  Loan_Year_2025            300 non-null    int64  
 11  Gender_ordinal            300 non-null    int64  
 12  Employment_Type_ordinal   300 

In [17]:
import pandas as pd
import numpy as np

np.random.seed(42)

n = 300

# Numeric features
Age = np.random.randint(21, 60, size=n)
Monthly_Income = np.random.randint(20000, 150000, size=n)
Loan_Amount = np.random.randint(50000, 500000, size=n)
Loan_Term_Months = np.random.choice([12, 24, 36, 48, 60], size=n)
Credit_Score = np.random.randint(300, 850, size=n)
Existing_Loans_Count = np.random.randint(0, 5, size=n)
EMI_Burden = np.random.randint(1000, 25000, size=n)  # Changed to int to match your previous data

# One-hot encoded Loan_Year columns (exactly one per sample)
Loan_Year_2022 = np.zeros(n, dtype=int)
Loan_Year_2023 = np.zeros(n, dtype=int)
Loan_Year_2024 = np.zeros(n, dtype=int)
Loan_Year_2025 = np.zeros(n, dtype=int)

for i in range(n):
    chosen = np.random.choice(4)
    if chosen == 0:
        Loan_Year_2022[i] = 1
    elif chosen == 1:
        Loan_Year_2023[i] = 1
    elif chosen == 2:
        Loan_Year_2024[i] = 1
    else:
        Loan_Year_2025[i] = 1

# Ordinal encoded categorical columns
Gender_ordinal = np.random.choice([0, 1], size=n)  # Male=0, Female=1
Employment_Type_ordinal = np.random.choice([0, 1, 2], size=n)  # Salaried=0, Self-Employed=1, Unemployed=2
Previous_Default_ordinal = np.random.choice([0, 1], size=n)  # No=0, Yes=1

# Binary target variable
Default = np.random.choice([0, 1], size=n)

# Create final DataFrame
df_final = pd.DataFrame({
    'Age': Age,
    'Monthly_Income': Monthly_Income,
    'Loan_Amount': Loan_Amount,
    'Loan_Term_Months': Loan_Term_Months,
    'Credit_Score': Credit_Score,
    'Existing_Loans_Count': Existing_Loans_Count,
    'EMI_Burden': EMI_Burden,
    'Loan_Year_2022': Loan_Year_2022,
    'Loan_Year_2023': Loan_Year_2023,
    'Loan_Year_2024': Loan_Year_2024,
    'Loan_Year_2025': Loan_Year_2025,
    'Gender_ordinal': Gender_ordinal,
    'Employment_Type_ordinal': Employment_Type_ordinal,
    'Previous_Default_ordinal': Previous_Default_ordinal,
    'Default': Default
})

# Split features and target
x_testing = df_final.drop(columns=['Default'])
y_testing = df_final['Default']

# Save to CSV files

x_path=x_testing.to_csv("D:/Github_Share/Dataset/x_testing.csv", index=False)
y_path=y_testing.to_csv("D:/Github_Share/Dataset/y_testing.csv", index=False)

print(f"Synthetic x_test saved to:{x_path}")
print(f"Synthetic y_test saved to:{y_path}")

print("\nX_test info:")
x_testing.info()
print("\nTotal features in x_test:", x_testing.shape[1])
print("\nSample target distribution:")
print(y_testing.value_counts(normalize=True))


Synthetic x_test saved to:None
Synthetic y_test saved to:None

X_test info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 14 columns):
 #   Column                    Non-Null Count  Dtype
---  ------                    --------------  -----
 0   Age                       300 non-null    int32
 1   Monthly_Income            300 non-null    int32
 2   Loan_Amount               300 non-null    int32
 3   Loan_Term_Months          300 non-null    int64
 4   Credit_Score              300 non-null    int32
 5   Existing_Loans_Count      300 non-null    int32
 6   EMI_Burden                300 non-null    int32
 7   Loan_Year_2022            300 non-null    int64
 8   Loan_Year_2023            300 non-null    int64
 9   Loan_Year_2024            300 non-null    int64
 10  Loan_Year_2025            300 non-null    int64
 11  Gender_ordinal            300 non-null    int64
 12  Employment_Type_ordinal   300 non-null    int64
 13  Previous_Default_or

# Run All 3 Models with Logging

In [18]:
import mlflow
import mlflow.sklearn
import joblib
import threading
from sklearn.metrics import accuracy_score
from pathlib import Path
import pandas as pd
from mlflow.models.signature import infer_signature
from sklearn.exceptions import ConvergenceWarning
import warnings

MLFLOW_TRACKING_URI = "http://127.0.0.1:5000"
EXPERIMENT_NAME = "Pretrained_Model_Logging"

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

models_info = [
    {"file": r"models\logistic_model.pkl", "name": "Logistic Regression"},
    {"file": r"models\svm_model.pkl", "name": "SVM"},
    {"file": r"models\rf_model.pkl", "name": "Random Forest"}
]

# Load test data (adjust paths as needed)
X_test = pd.read_csv(r'D:\Github_Share\Dataset\x_testing.csv')
y_test = pd.read_csv(r'D:\Github_Share\Dataset\y_testing.csv').squeeze()  # Convert to Series if possible

print(f"X_test shape: {X_test.shape}", flush=True)
print(f"y_test shape: {y_test.shape}", flush=True)

def log_model_run(model_file, model_name, X_test=None, y_test=None):
    try:
        model_path = Path(model_file)
        if not model_path.exists():
            print(f"[{model_name}] Model file not found: {model_file}", flush=True)
            return

        model = joblib.load(model_path)
        thread_name = threading.current_thread().name

        with mlflow.start_run(run_name=model_name):
            mlflow.set_tag("model_type", model_name)

            # Log model parameters if available
            try:
                params = model.get_params()
                for param_name, param_value in params.items():
                    mlflow.log_param(param_name, param_value)
            except Exception:
                print(f"[{model_name}] Could not get parameters from model.", flush=True)

            # Prepare input_example and signature for MLflow
            signature = None
            input_example = None
            if X_test is not None:
                try:
                    input_example = X_test.head(5)
                    preds_example = model.predict(input_example)
                    signature = infer_signature(input_example, preds_example)
                except Exception as e:
                    print(f"[{model_name}] Could not infer signature: {e}", flush=True)

            # Log model with signature and input_example
            mlflow.sklearn.log_model(model, artifact_path="model", signature=signature, input_example=input_example)

            # Log accuracy metric if test data provided
            if X_test is not None and y_test is not None:
                try:
                    preds = model.predict(X_test.values)  # avoid sklearn warning by removing feature names
                    acc = accuracy_score(y_test, preds)
                    mlflow.log_metric("accuracy", acc)
                    print(f"[{model_name}] Accuracy logged: {acc}", flush=True)
                except Exception as e:
                    print(f"[{model_name}] Failed to log accuracy metric: {e}", flush=True)

            print(f"[{model_name}] Model logged to MLflow by thread {thread_name}.", flush=True)

    except Exception as e:
        print(f"[{model_name}] Error during logging: {e}", flush=True)

threads = []
for model_info in models_info:
    t = threading.Thread(target=log_model_run, args=(model_info["file"], model_info["name"], X_test, y_test))
    t.start()
    threads.append(t)

for t in threads:
    t.join()


X_test shape: (300, 14)
y_test shape: (300,)




[Logistic Regression] Accuracy logged: 0.53
[Logistic Regression] Model logged to MLflow by thread Thread-3 (log_model_run).
[SVM] Accuracy logged: 0.5266666666666666
[SVM] Model logged to MLflow by thread Thread-4 (log_model_run).
🏃 View run Logistic Regression at: http://127.0.0.1:5000/#/experiments/126304146287965710/runs/73d247e982ea4ef3ad5d6e0373020b9c
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/126304146287965710
🏃 View run SVM at: http://127.0.0.1:5000/#/experiments/126304146287965710/runs/61f2f6a3d78c4f36a893ae2d57e23ee2
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/126304146287965710
[Random Forest] Accuracy logged: 0.49666666666666665
[Random Forest] Model logged to MLflow by thread Thread-5 (log_model_run).
🏃 View run Random Forest at: http://127.0.0.1:5000/#/experiments/126304146287965710/runs/69ab625d5ea64ffb86672d294abcecad
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/126304146287965710
