In [63]:
from pydantic import BaseModel
class LoanApplicationModel(BaseModel):
    # Pydantic schema for a single loan application. In this notebook the
    # instances are synthetic (built by generate_fake_application) and are
    # flattened with model_dump() into DataFrame rows.
    user_id: int  # applicant identifier; the generator draws it at random, so it is not guaranteed unique
    total_debt: float  # existing debt; a higher debt-to-income ratio lowers the computed loan amount
    avg_monthly_spending: float  # generated as a feature; not read by determine_loan_amount
    total_balance: float  # generated as a feature; not read by determine_loan_amount
    income: float  # determine_loan_amount uses 3x income as the base loan amount
    age: int  # generated as a feature; not read by determine_loan_amount
    gender: str  # dropped from the DataFrame before model training
    credit_score: int  # the generator clamps this to 300-850; determine_loan_amount normalizes over that range
    dependents: int  # each dependent reduces the computed loan amount, down to a floor
    loan_amount: float  # target value; constructed as 0 and then filled in by determine_loan_amount (or predicted)


In [None]:
import random
import pandas as pd

# Function to generate fake loan applications
def generate_fake_application():
    """Build one synthetic ``LoanApplicationModel`` with its loan amount filled in.

    Every field is drawn from the module-level ``random`` generator, so results
    are only repeatable if the caller seeds ``random`` beforehand. ``user_id``
    is a random 4-digit number and may repeat across calls.

    Returns:
        LoanApplicationModel: a populated application whose ``loan_amount`` has
        been set from ``determine_loan_amount``.
    """
    # Income: normal(mean 80k, SD 25k), floored at 20k.
    drawn_income = round(random.normalvariate(80000, 25000), 2)
    income = max(20000, drawn_income)

    # Existing debt: uniform between 0 and 50% of income.
    total_debt = round(random.uniform(0, income * 0.5), 2)

    # Monthly spending: normal(mean 2k, SD 500), floored at 500.
    drawn_spending = round(random.normalvariate(2000, 500), 2)
    avg_monthly_spending = max(500, drawn_spending)

    # Balance: normal centred on 80% of income with SD of 50% of income, floored at 0.
    drawn_balance = round(random.normalvariate(income * 0.8, income * 0.5), 2)
    total_balance = max(0, drawn_balance)

    # Credit score: normal(mean 650, SD 100) rounded to an integer, clamped to 300-850.
    drawn_score = round(random.normalvariate(650, 100))
    credit_score = min(850, max(300, drawn_score))

    # The remaining random draws happen here, in dict-literal order.
    fields = {
        "age": random.randint(18, 80),
        "dependents": random.randint(0, 5),
        "user_id": random.randint(1000, 9999),
        "gender": random.choice(["Male", "Female", "Non-Binary"]),
        "total_debt": total_debt,
        "avg_monthly_spending": avg_monthly_spending,
        "total_balance": total_balance,
        "income": income,
        "credit_score": credit_score,
    }

    # loan_amount is required by the model, so start with a placeholder and
    # overwrite it once the full application exists to compute it from.
    application = LoanApplicationModel(**fields, loan_amount=0)
    application.loan_amount = determine_loan_amount(application)
    return application

# Function to determine loan eligibility
def determine_loan_amount(application: LoanApplicationModel) -> float:
    """
    Compute the loan amount for an application from income, credit score,
    existing debt and number of dependents.

    The result is ``3 * income`` scaled by three factors:

    * credit factor: ``(credit_score - 300) / 550`` (0 at a score of 300, 1 at 850);
    * debt factor: ``1 - total_debt / (income + 1)``, floored at 0
      (the ``+ 1`` keeps the division defined when income is 0);
    * dependents factor: ``1 - 0.1 * dependents``, floored at 0.7.

    Args:
        application: object exposing ``income``, ``credit_score``,
            ``total_debt`` and ``dependents``.

    Returns:
        The loan amount rounded to 2 decimals, never negative.
    """
    income = application.income
    ceiling = income * 3

    score_factor = (application.credit_score - 300) / 550

    debt_ratio = application.total_debt / (income + 1)
    debt_factor = max(0, 1 - debt_ratio)

    household_factor = max(0.7, 1 - (application.dependents * 0.1))

    offer = ceiling * score_factor * debt_factor * household_factor
    # A credit score below 300 would make the product negative; clamp to 0.
    return round(max(0, offer), 2)

# Seed Python's RNG before generating so that Restart Kernel -> Run All yields
# the same synthetic dataset every time. generate_fake_application() draws every
# field from the `random` module, so without a seed each run produced different
# rows. 42 matches the random_state used for the train/test split and the forest.
random.seed(42)

# Generate 10,000 data points
applications = [generate_fake_application() for _ in range(10000)]

# Convert to DataFrame for easy analysis (one row per application, one column per model field)
df = pd.DataFrame([app.model_dump() for app in applications])


In [65]:
# Display the generated applications (the cell's last expression is rendered as a table)
df

Unnamed: 0,user_id,total_debt,avg_monthly_spending,total_balance,income,age,gender,credit_score,dependents,loan_amount
0,7147,17161.02,2050.62,135397.68,77292.33,39,Male,525,5,51658.45
1,5015,8828.77,2179.66,0.00,62148.07,76,Male,583,2,65844.66
2,8291,30635.57,2071.04,149569.25,99367.60,36,Non-Binary,726,2,127767.17
3,5605,31163.61,2385.51,61246.22,88173.19,46,Non-Binary,759,3,99912.50
4,3722,34325.62,2220.68,55612.41,74076.23,60,Female,641,0,73937.00
...,...,...,...,...,...,...,...,...,...,...
9995,5036,28379.64,2008.75,23432.46,78811.53,33,Female,625,0,89402.63
9996,5663,22324.41,1400.26,51972.07,76606.59,48,Male,732,1,115118.32
9997,8712,16420.35,1686.61,12972.26,42331.54,53,Female,661,0,51022.25
9998,4498,152.33,911.91,34886.14,26117.10,52,Female,696,2,44867.13


In [66]:
# The generator's random user_id values cannot all be distinct (9,000 possible
# ids for 10,000 rows), so replace them with the frame's index, exposed as the
# first column under the same name.
df = (
    df.drop(columns=["user_id"])
    .rename_axis("user_id")
    .reset_index()
)

# Write the dataset to disk; the index is omitted because user_id already carries it.
df.to_csv('loan_data.csv', index=False)

In [67]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Build a fresh synthetic dataset in memory rather than reading loan_data.csv.
# NOTE(review): these rows are newly drawn, so they are not the rows that were
# written to loan_data.csv, and user_id here is the generator's random value.
applications = []
for _ in range(10000):
    applications.append(generate_fake_application())

# One row per application, with the 'gender' column removed before training.
df = pd.DataFrame(
    [application.model_dump() for application in applications]
).drop(columns=["gender"])
# Add noise to the dataset to improve model generalization
def add_noise(data, noise_level=0.05):
    """Return ``data`` perturbed by a random relative amount.

    A single factor is drawn uniformly from [-1, 1) and scaled by
    ``noise_level``, so the result lies within ``data * (1 ± noise_level)``.
    ``noise_level`` is a fraction (0.05 means up to 5%).

    Only one random draw is made per call: if ``data`` is array-like, every
    element is scaled by the same factor.

    Args:
        data: value to perturb (anything supporting ``*`` and ``+`` with floats).
        noise_level: maximum relative deviation as a fraction.

    Returns:
        ``data`` plus the random perturbation.
    """
    jitter = 2 * (random.random() - 0.5)  # uniform in [-1, 1)
    return data + data * noise_level * jitter

# Jitter the listed numerical columns by up to ±5%, one independent draw per cell.
# The target (loan_amount) is included; age and dependents are left untouched.
# NOTE(review): this runs before the train/test split, so the held-out rows
# and their targets are noisy as well.
for col in ["total_debt", "avg_monthly_spending", "total_balance", "income", "credit_score", "loan_amount"]:
    df[col] = df[col].apply(add_noise, noise_level=0.05)

# Target is the loan amount; features are every other column except the id.
y = df["loan_amount"]
X = df.drop(columns=["loan_amount", "user_id"])
print(X.columns)

# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a 100-tree random forest; fit() returns the estimator itself.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the held-out rows.
y_pred = rf_model.predict(X_test)

mae, mse, r2 = (
    score(y_test, y_pred)
    for score in (mean_absolute_error, mean_squared_error, r2_score)
)

# Collect the metrics under readable labels and show them as the cell output.
evaluation_metrics = dict(zip(
    ["Mean Absolute Error (MAE)", "Mean Squared Error (MSE)", "R-squared (R²)"],
    [mae, mse, r2],
))

evaluation_metrics


Index(['total_debt', 'avg_monthly_spending', 'total_balance', 'income', 'age',
       'credit_score', 'dependents'],
      dtype='object')


{'Mean Absolute Error (MAE)': 6483.617004220547,
 'Mean Squared Error (MSE)': 76773587.85616672,
 'R-squared (R²)': 0.9632246743679234}

### Save the Model

In [68]:
import joblib

# Persist the fitted forest to disk so it can be reloaded for prediction later.
# joblib.dump returns the list of files it wrote, which the cell displays.
model_filename = "random_forest_loan_model.pkl"
joblib.dump(value=rf_model, filename=model_filename)


['random_forest_loan_model.pkl']

### Sample Prediction

In [69]:
import numpy as np

sample_loan_application = LoanApplicationModel(
    user_id=1234,
    total_debt=25000.50,
    avg_monthly_spending=2000.75,
    total_balance=50000.00,
    income=85000.00,
    age=35,
    gender="Male",
    credit_score=720,
    dependents=1,
    loan_amount=0  # Placeholder, since the model will predict this
)

# Load pretrained model.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code; only
# load model files produced by this notebook or another trusted source.
import joblib
import pandas as pd
loan_model = joblib.load("random_forest_loan_model.pkl")

# Column order used when the model was trained in this notebook. It is only
# needed as a fallback for a model that did not record its feature names.
expected_columns = [
    "total_debt",
    "avg_monthly_spending",
    "total_balance",
    "income",
    "age",
    "credit_score",
    "dependents",
]

# Build the input as a one-row DataFrame and let the model dictate the column
# order. The forest was fitted on a named DataFrame, so passing a bare array
# makes scikit-learn warn about missing feature names and silently relies on a
# hand-maintained ordering that breaks if the training columns ever change.
sample_row = pd.DataFrame([sample_loan_application.model_dump()])
fitted_names = getattr(loan_model, "feature_names_in_", None)
if fitted_names is not None:
    sample_features = sample_row[list(fitted_names)]
else:
    # Model was fitted without feature names: pass a plain array as before.
    sample_features = sample_row[expected_columns].to_numpy()

# Predict loan eligibility amount
approved_loan_amount = loan_model.predict(sample_features)[0]
approved_loan_amount = max(0, approved_loan_amount)  # Ensure non-negative loan amount

# Print the prediction
print("Predicted Loan Amount:", approved_loan_amount)


Predicted Loan Amount: 129687.26904747862


