### Using a weighted score to simulate loan defaulted/not defaulted (label sampled from the resulting default probability)

In [None]:
import pandas as pd
import numpy as np

# Synthetic loan dataset, variant 1: a weighted score built from min-max scaled
# features is converted into a per-applicant default probability, and the
# default label is then drawn at random from that probability.

# Set the random seed for reproducibility
np.random.seed(42)

# Define dataset size
num_samples = 1000

# Generate features with reasonable ranges
age = np.random.randint(20, 70, num_samples)  # Integers 20..69 (upper bound is exclusive)
# Right-skewed income: log-normal draw, clipped into [4000, 100000], rounded to cents
income = np.random.lognormal(mean=10, sigma=1, size=num_samples)
income = np.clip(income, 4000, 100000)
income = np.round(income, 2)

credit_score = np.random.randint(300, 850, num_samples)  # Integers 300..849
dependents = np.random.randint(0, 5, num_samples)  # Number of dependents, 0 to 4
home_owner = np.random.randint(0, 2, num_samples)  # Boolean flag, 0 or 1

# Contribution of each feature to the score. The positive weights sum to 0.90
# and "dependents" is the only negative one, so the score lies in [-0.10, 0.90].
weights = {
    "income": 0.30,
    "credit_score": 0.30,
    "age": 0.20,
    "dependents": -0.10,
    "home_owner": 0.10,
}

# Min-max normalize features to a 0-1 range for scoring purposes
# (home_owner is already 0/1 and is used as-is)
income_scaled = (income - income.min()) / (income.max() - income.min())
credit_score_scaled = (credit_score - credit_score.min()) / (credit_score.max() - credit_score.min())
age_scaled = (age - age.min()) / (age.max() - age.min())
dependents_scaled = (dependents - dependents.min()) / (dependents.max() - dependents.min())

# Compute the weighted score
score = (
    weights["income"] * income_scaled +
    weights["credit_score"] * credit_score_scaled +
    weights["age"] * age_scaled +
    weights["dependents"] * dependents_scaled +
    weights["home_owner"] * home_owner
)

# Map score to probability of default (higher score -> lower probability).
# Because the score can be negative (down to -0.10), 1 - score can exceed 1,
# and np.random.binomial raises ValueError for any p outside [0, 1].
# Clip so every entry is a valid probability.
default_probability = np.clip(1 - score, 0.0, 1.0)

# Draw the default status from the probability: 1 for default, 0 for no default
loan_default = np.random.binomial(1, default_probability)

# Create the DataFrame
data = pd.DataFrame({
    "age": age,
    "income": income,
    "credit_score": credit_score,
    "dependents": dependents,
    "home_owner": home_owner,
    "loan_default": loan_default
})

# Display the first 100 rows
data.head(100)


Unnamed: 0,age,income,credit_score,dependents,home_owner,loan_approved
0,58,11883.54,408,4,0,0
1,48,6255.13,375,4,1,0
2,34,72316.02,841,0,0,1
3,62,21135.12,613,2,1,1
4,27,26211.84,416,1,0,0
...,...,...,...,...,...,...
95,34,31513.52,828,1,0,1
96,64,19693.18,793,3,0,1
97,20,30528.73,497,3,1,0
98,44,14662.61,816,0,0,1


### Using a logistic (sigmoid) model with a probability threshold to determine loan defaulted/not defaulted

In [13]:
import pandas as pd
import numpy as np

# Synthetic loan dataset, variant 2: a hand-specified logistic model (fixed
# coefficients, no fitting) turns the features into a default probability,
# which is thresholded into a 0/1 label and saved to CSV.

# Seed the global NumPy RNG so this cell produces the same dataset (and the same
# CSV) on every run, including when it is executed on its own.
np.random.seed(42)

# Define dataset size
num_samples = 1000

# Generate features with reasonable ranges
age = np.random.randint(20, 70, num_samples)  # Integers 20..69 (upper bound is exclusive)
# Right-skewed income: log-normal draw, clipped into [4000, 100000], rounded to cents
income = np.random.lognormal(mean=10, sigma=1, size=num_samples)
income = np.clip(income, 4000, 100000)
income = np.round(income, 2)

credit_score = np.random.randint(300, 850, num_samples)  # Integers 300..849
dependents = np.random.randint(0, 5, num_samples)  # Number of dependents, 0 to 4
home_owner = np.random.randint(0, 2, num_samples)  # Boolean flag, 0 or 1

# Scale income and credit score to roughly 0-1 using the fixed generation bounds
# (not the sample min/max), so the coefficients below act on comparable ranges.
income_scaled = (income - 4000) / (100000 - 4000)
credit_score_scaled = (credit_score - 300) / (850 - 300)

# Coefficients of the logit. Negative values lower the default probability.
# Note that age and dependents enter the logit unscaled (years / head count).
coefficients = {
    "intercept": 0.2,    # Baseline log-odds; on its own this is sigmoid(0.2), about 0.55
    "income": -2,        # Higher income decreases default likelihood
    "credit_score": -3,  # Higher credit score decreases default likelihood
    "age": -0.01,        # Per year of raw age: -0.20 at age 20, -0.69 at age 69
    "dependents": 0.5,   # Per dependent: up to +2.0 at 4 dependents
    "home_owner": -0.5,  # Owning a home decreases default likelihood
}

# Linear predictor (log-odds of default)
logit = (
    coefficients["intercept"] +
    coefficients["income"] * income_scaled +
    coefficients["credit_score"] * credit_score_scaled +
    coefficients["age"] * age +
    coefficients["dependents"] * dependents +
    coefficients["home_owner"] * home_owner
)

# Add Gaussian noise (std 0.3) to the logit so the label is not a
# deterministic function of the features
noise = np.random.normal(0, 0.3, num_samples)
logit += noise

# Apply sigmoid function to get probability
probability = 1 / (1 + np.exp(-logit))

# Label as default (1) when the noisy probability exceeds 0.4, else 0
loan_default = (probability > 0.4).astype(int)

# Assemble the DataFrame
data = pd.DataFrame({
    "age": age,
    "income": income,
    "credit_score": credit_score,
    "dependents": dependents,
    "home_owner": home_owner,
    "loan_default": loan_default
})

# Display the class balance of the generated labels
default_counts = data['loan_default'].value_counts()
print("Default Distribution:\n", default_counts)

# Save the DataFrame to a CSV file in the current working directory
data.to_csv("loan_default_prediction.csv", index=False)


Default Distribution:
 loan_default
0    805
1    195
Name: count, dtype: int64
