In [None]:
!pip install numpy pandas scikit-learn matplotlib seaborn umap-learn

In [None]:
import pandas as pd
import numpy as np

# ---------- Step 0: Randomly sample up to 200,000 rows (2 lakh) from loan.csv ----------
# NOTE(review): this heading previously said "1 lakh" (100,000) while the code
# sampled 200,000. The code's value is kept; change SAMPLE_SIZE if 1 lakh was
# the intended size.
SAMPLE_SIZE = 200_000
SAMPLE_SEED = 42  # fixed seed so Restart & Run All draws the same sample

# Count physical lines, header included. The context manager closes the file
# handle once counting is done.
# Assumes one CSV record per physical line (no quoted fields that contain
# newlines) -- TODO confirm for loan.csv.
with open("loan.csv") as csv_file:
    total_rows = sum(1 for _ in csv_file)

# Line 0 is the header, so the data lines are 1 .. total_rows - 1.
n_data_rows = max(total_rows - 1, 0)
# Clamp at zero: a file with SAMPLE_SIZE data rows or fewer is read in full
# instead of asking the sampler for a negative number of lines.
n_skip = max(n_data_rows - SAMPLE_SIZE, 0)

# A local Generator makes the draw reproducible without touching NumPy's
# global random state.
rng = np.random.default_rng(SAMPLE_SEED)
skip = sorted(
    rng.choice(np.arange(1, total_rows), size=n_skip, replace=False).tolist()
)

# Skipping exactly n_skip data lines leaves min(SAMPLE_SIZE, n_data_rows) rows.
df = pd.read_csv("loan.csv", skiprows=skip)

# ---------- Step 1: Discard sparsely populated columns ----------
# A column is kept only when its count of non-null values reaches
# (1 - nan_threshold) of the row count, truncated to an integer.
nan_threshold = 0.10
min_non_null = int((1 - nan_threshold) * len(df))
df = df.dropna(axis="columns", thresh=min_non_null)

# ---------- Step 2: Create a clean binary target label ----------
# Label convention (unchanged): 1 = loan in good standing, 0 = bad / delinquent.
# NOTE(review): despite the column name "default_binary", 1 does NOT mean
# "defaulted" here. The name and encoding are kept so downstream consumers of
# the exported data keep working.
good_status = [
    "Fully Paid",
    "Current",
    "In Grace Period",
    "Does not meet the credit policy. Status:Fully Paid"
]

# The "Late" statuses are spelled with a plain ASCII hyphen. They were
# previously written with an en dash (U+2013), which can only match data that
# contains that exact character; the raw export is expected to use "-"
# (TODO confirm with df["loan_status"].unique()).
bad_status = [
    "Charged Off",
    "Late (31-120 days)",
    "Late (16-30 days)",
    "Does not meet the credit policy. Status:Charged Off"
]

# Single lookup table: status text -> label.
status_to_label = {
    **dict.fromkeys(good_status, 1),
    **dict.fromkeys(bad_status, 0),
}

# Normalise en dashes to hyphens before the lookup so both spellings of the
# "Late (...)" statuses are recognised, whichever one the file contains.
normalized_status = df["loan_status"].str.replace("\u2013", "-", regex=False)
labels = normalized_status.map(status_to_label)

# Rows whose status is in neither list (including missing statuses, and e.g. a
# plain "Default" if present) get no label and are dropped.
has_label = labels.notna()

# .copy() gives an independent frame, so adding the column below does not
# trigger SettingWithCopyWarning.
df = df.loc[has_label].copy()
# Assigned positionally: mask and frame were filtered with the same boolean
# mask, so the order matches row for row.
df["default_binary"] = labels[has_label].astype(int).to_numpy()

# ---------- Step 3: Keep only fully populated rows ----------
# Any row with at least one missing value in the remaining columns is removed.
df = df.dropna(how="any", axis="index")

# ---------- Step 4: Balance dataset ----------
# Draw the same number of rows from each class, then shuffle the combined set.
SAMPLES_PER_CLASS = 10_000  # target rows per class

good_pool = df[df["default_binary"] == 1]
bad_pool = df[df["default_binary"] == 0]

# DataFrame.sample raises ValueError when asked for more rows than exist
# (sampling without replacement). Cap the draw at the smaller class so the
# result stays balanced instead of crashing after the NaN filtering above.
n_per_class = min(SAMPLES_PER_CLASS, len(good_pool), len(bad_pool))
if n_per_class < SAMPLES_PER_CLASS:
    print(
        f"Warning: only {len(good_pool)} good / {len(bad_pool)} bad rows available; "
        f"sampling {n_per_class} per class instead of {SAMPLES_PER_CLASS}."
    )

good_df = good_pool.sample(n_per_class, random_state=42)
bad_df = bad_pool.sample(n_per_class, random_state=42)

# frac=1 returns every row in a (seeded) random order, i.e. a shuffle.
df_final = pd.concat([good_df, bad_df]).sample(frac=1, random_state=42)

# ---------- Step 5: Summarise the final working dataset ----------
# Show the (rows, columns) shape, then the number of rows per label value.
final_shape = df_final.shape
class_counts = df_final["default_binary"].value_counts()
print(final_shape)
print(class_counts)

In [None]:
# Persist the balanced, shuffled sample to disk; the row index is not written.
output_csv = "data2.csv"
df_final.to_csv(output_csv, index=False)