### Imports

In [49]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import os

##### Parameters for each class

In [50]:
params = {
    "LowRisk": {
        "income": (85000, 15000),
        "age": (44, 10),
        "credit_score": (760, 30),
        "debt_ratio": (0.22, 0.10),
        "loan_amount": (12000, 5000)
    },
    "MediumRisk": {
        "income": (55000, 12000),
        "age": (37, 12),
        "credit_score": (660, 40),
        "debt_ratio": (0.38, 0.15),
        "loan_amount": (18000, 7000)
    },
    "HighRisk": {
        "income": (32000, 8000),
        "age": (30, 8),
        "credit_score": (550, 50),
        "debt_ratio": (0.58, 0.20),
        "loan_amount": (25000, 10000)
    }
}

##### Sample size per class

In [51]:
samples_per_class = 2000   # Total = 6000 samples

##### Function to create data for one class

In [52]:
def generate_class_data(class_name, params, n_samples):
    """Generate synthetic samples for a given risk class."""
    cls_params = params[class_name]

    data = {
        "income": np.random.normal(cls_params["income"][0], cls_params["income"][1], n_samples),
        "age": np.random.normal(cls_params["age"][0], cls_params["age"][1], n_samples),
        "credit_score": np.random.normal(cls_params["credit_score"][0], cls_params["credit_score"][1], n_samples),
        "debt_ratio": np.random.normal(cls_params["debt_ratio"][0], cls_params["debt_ratio"][1], n_samples),
        "loan_amount": np.random.normal(cls_params["loan_amount"][0], cls_params["loan_amount"][1], n_samples),
        "risk_class": [class_name] * n_samples
    }
    return pd.DataFrame(data)

##### Append all classes in dataframe list

In [53]:
df_list = []

for cls in params.keys():
    df_list.append(generate_class_data(cls, params, samples_per_class))

df = pd.concat(df_list, ignore_index=True)

In [54]:
print("Dataset shape:", df.shape)
df.head()

Dataset shape: (6000, 6)


Unnamed: 0,income,age,credit_score,debt_ratio,loan_amount,risk_class
0,96359.775783,24.498687,770.482515,0.310828,16307.668295,LowRisk
1,53382.929816,53.26455,792.647327,-0.054303,8267.389475,LowRisk
2,51643.648171,25.902914,720.66092,0.305462,5618.372157,LowRisk
3,85794.008145,36.870437,722.591723,0.376439,17509.397881,LowRisk
4,85317.488292,47.660473,775.629325,0.283255,9175.436815,LowRisk


##### Save to out.csv inside Dataset folder

In [55]:
folder_name = "Dataset"
if not os.path.exists(folder_name):
    os.mkdir(folder_name)
    print(f"Folder '{folder_name}' created.")
else:
    print(f"Folder '{folder_name}' already exists.")

# Source - https://stackoverflow.com/questions/16923281/writing-a-pandas-dataframe-to-csv-file
# Posted by Andy Hayden, modified by community. See post 'Timeline' for change history
# Retrieved 2025-12-11, License - CC BY-SA 4.0

df.to_csv("Dataset\out.csv", encoding='utf-8', index=False, header=True)

Folder 'Dataset' already exists.


##### Encode labels with Train/Test split

In [56]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df["risk_label"] = le.fit_transform(df["risk_class"])

X = df[["income", "age", "credit_score", "debt_ratio", "loan_amount"]]
y = df["risk_label"]

# test_size=0.30 has 1800 samples in test set (30% of 6000)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y
)

X_train.shape, X_test.shape


((4200, 5), (1800, 5))