In [29]:
import numpy as np
import pandas as pd

def make_practice_dataset_v2(n=15000, seed=99, save_path="loan_default_15k.csv"):
    rng = np.random.default_rng(seed)

    # categorical
    gender = rng.choice(["male", "female"], size=n, p=[0.55, 0.45])
    city = rng.choice(
        ["delhi", "mumbai", "bangalore", "hyderabad", "pune", "chennai", "kolkata", "jaipur"],
        size=n,
        p=[0.16, 0.15, 0.13, 0.12, 0.12, 0.10, 0.12, 0.10]
    )
    education = rng.choice(["school", "ug", "pg"], size=n, p=[0.35, 0.50, 0.15])
    job_type = rng.choice(["salaried", "self_employed", "student", "unemployed"], size=n, p=[0.55, 0.25, 0.12, 0.08])
    married = rng.choice(["yes", "no"], size=n, p=[0.52, 0.48])

    # numeric
    age = rng.integers(18, 66, size=n)
    experience_years = rng.integers(0, 41, size=n)
    monthly_income = rng.lognormal(mean=10.2, sigma=0.55, size=n)  # skewed income
    loan_amount = rng.normal(loc=450000, scale=220000, size=n).clip(50000, 1500000)
    credit_score = rng.normal(loc=680, scale=70, size=n).clip(300, 900)

    # target (default) with signal
    edu_w = np.where(education == "school", 0.18, np.where(education == "ug", 0.05, -0.06))
    job_w = np.where(job_type == "unemployed", 0.35, np.where(job_type == "student", 0.20, 0.0))
    mar_w = np.where(married == "no", 0.08, -0.03)

    debt_pressure = (loan_amount / (monthly_income * 12 + 1e-9))  # annual-ish ratio

    z = (
        -1.7
        + 1.2 * debt_pressure
        - 0.004 * (credit_score - 650)
        - 0.015 * experience_years
        + 0.10 * (age < 23)
        + edu_w + job_w + mar_w
        + rng.normal(0, 0.55, size=n)
    )
    p = 1 / (1 + np.exp(-z))
    default = (rng.random(n) < p).astype(int)

    df = pd.DataFrame({
        "applicant_id": np.arange(200000, 200000 + n),
        "gender": gender,
        "age": age,
        "city": city,
        "education": education,
        "job_type": job_type,
        "married": married,
        "experience_years": experience_years,
        "monthly_income": np.round(monthly_income, 2),
        "loan_amount": np.round(loan_amount, 0),
        "credit_score": np.round(credit_score, 0),
        "default": default
    })

    # inject missing values
    def inject_missing(col, frac):
        idx = rng.choice(df.index, size=int(frac * n), replace=False)
        df.loc[idx, col] = np.nan

    inject_missing("age", 0.05)
    inject_missing("city", 0.04)
    inject_missing("education", 0.03)
    inject_missing("monthly_income", 0.06)
    inject_missing("credit_score", 0.04)

    # dirty categories for handle_unknown practice
    bad_idx = rng.choice(df.index, size=140, replace=False)
    df.loc[bad_idx, "job_type"] = rng.choice(["unknown_job", "??", "n/a"], size=len(bad_idx))

    df.to_csv(save_path, index=False)
    return df

df = make_practice_dataset_v2()
df.head(), df.shape


(   applicant_id  gender   age     city education       job_type married  \
 0        200000    male   NaN   mumbai    school  self_employed      no   
 1        200001  female  36.0      NaN        ug       salaried     yes   
 2        200002    male  18.0  kolkata        pg       salaried     yes   
 3        200003  female  44.0    delhi    school        student      no   
 4        200004  female  18.0   mumbai        ug  self_employed     yes   
 
    experience_years  monthly_income  loan_amount  credit_score  default  
 0                 2        79174.71     574802.0         679.0        0  
 1                37        18171.36      50000.0         632.0        0  
 2                36        13916.89     350348.0         666.0        0  
 3                35        14802.06     494370.0         810.0        1  
 4                 6        31079.68      50000.0         652.0        0  ,
 (15000, 12))

In [30]:
df = df.drop(columns=["applicant_id"])
df.head()

Unnamed: 0,gender,age,city,education,job_type,married,experience_years,monthly_income,loan_amount,credit_score,default
0,male,,mumbai,school,self_employed,no,2,79174.71,574802.0,679.0,0
1,female,36.0,,ug,salaried,yes,37,18171.36,50000.0,632.0,0
2,male,18.0,kolkata,pg,salaried,yes,36,13916.89,350348.0,666.0,0
3,female,44.0,delhi,school,student,no,35,14802.06,494370.0,810.0,1
4,female,18.0,mumbai,ug,self_employed,yes,6,31079.68,50000.0,652.0,0


In [31]:
x = df.drop(columns=["default"])
y = df["default"]
x.head(), y.head()


(   gender   age     city education       job_type married  experience_years  \
 0    male   NaN   mumbai    school  self_employed      no                 2   
 1  female  36.0      NaN        ug       salaried     yes                37   
 2    male  18.0  kolkata        pg       salaried     yes                36   
 3  female  44.0    delhi    school        student      no                35   
 4  female  18.0   mumbai        ug  self_employed     yes                 6   
 
    monthly_income  loan_amount  credit_score  
 0        79174.71     574802.0         679.0  
 1        18171.36      50000.0         632.0  
 2        13916.89     350348.0         666.0  
 3        14802.06     494370.0         810.0  
 4        31079.68      50000.0         652.0  ,
 0    0
 1    0
 2    0
 3    1
 4    0
 Name: default, dtype: int64)

In [32]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

In [33]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=41, stratify=y)




In [34]:
x_train.isnull().sum()

Unnamed: 0,0
gender,0
age,559
city,468
education,333
job_type,0
married,0
experience_years,0
monthly_income,699
loan_amount,0
credit_score,444


In [35]:
# simple imputation numerical -> age, monthly_income, credit_score
num_col = ["age", "monthly_income", "credit_score"]

# simple imputation categorical -> city, education
cat_col = ["city", "education", 'job_type', 'married']

In [36]:
# Numeric pipeline: Imputation (median) + Scaling (MinMaxScaler)
num_trnf = Pipeline(steps=[
    ("impute", SimpleImputer(strategy="median")),
    ("scale", MinMaxScaler())
])



In [37]:
df.sample(10)

Unnamed: 0,gender,age,city,education,job_type,married,experience_years,monthly_income,loan_amount,credit_score,default
2483,female,37.0,kolkata,,unemployed,no,38,31346.23,408786.0,739.0,1
3593,male,65.0,mumbai,ug,salaried,yes,39,12801.37,279009.0,723.0,1
14490,male,,delhi,school,self_employed,no,2,39745.34,686528.0,609.0,1
1877,male,58.0,chennai,school,salaried,no,35,16131.87,611431.0,638.0,1
173,male,65.0,delhi,pg,self_employed,no,1,,772170.0,713.0,1
11277,male,26.0,mumbai,school,salaried,yes,35,26028.71,709270.0,703.0,0
4744,female,28.0,kolkata,ug,self_employed,no,15,41094.17,765438.0,661.0,0
4585,male,52.0,mumbai,ug,salaried,no,29,,334351.0,684.0,1
3715,male,64.0,pune,school,student,no,40,23523.96,408732.0,720.0,0
8814,female,20.0,,ug,self_employed,yes,37,10869.72,557116.0,676.0,1


In [38]:
# one hot encoding on gender cities, education, job type, married,

In [39]:
# Categorical pipeline: Imputation (most frequent) + One-hot encoding

cat_trnf = Pipeline(steps = [
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('encode', OneHotEncoder(handle_unknown='ignore', sparse_output=False))

])


In [40]:
pp = ColumnTransformer([
    ('num', num_trnf, num_col),
    ('cat', cat_trnf, cat_col)
], remainder='drop')

In [41]:
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline # Changed from make_pipeline for explicit Pipeline



# model
tnf5 = DecisionTreeClassifier(random_state=41, max_depth=6, min_samples_leaf=20)

# pipeline (explicitly defining steps)
pipe = Pipeline(steps=[
    ('preprocessor', pp),
    ('model', tnf5)
])

pipe.fit(x_train, y_train)

pred = pipe.predict(x_test)
print("Accuracy:", round(accuracy_score(y_test, pred), 4))

Accuracy: 0.6445


In [28]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

best = (0, None)

for d in [None, 6, 8, 10, 12]:
    for leaf in [1, 2, 5, 10, 20]:
        for split in [2, 5, 10, 20]:
            model = DecisionTreeClassifier(
                random_state=41,
                max_depth=d,
                min_samples_leaf=leaf,
                min_samples_split=split
            )
            pipe2 = Pipeline(steps=[
                ('preprocessor', pp),
                ('model', model)
            ])
            pipe2.fit(x_train, y_train)
            pred2 = pipe2.predict(x_test)
            acc = accuracy_score(y_test, pred2)
            if acc > best[0]:
                best = (acc, (d, leaf, split))
print("best_acc:", best[0], "best_params(max_depth, min_leaf, min_split):", best[1])


best_acc: 0.6469333333333334 best_params(max_depth, min_leaf, min_split): (6, 10, 2)
