In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Input CSVs; the paths are resolved relative to the current working directory.
app_path = "../Dataset/application_record.csv"
cred_path = "../Dataset/credit_record.csv"

app = pd.read_csv(app_path)
cred = pd.read_csv(cred_path)

print("application_record shape:", app.shape)
print("credit_record shape:", cred.shape)


display(app.head())
display(cred.head())


# DataFrame.info() prints its report itself and returns None, so it is called
# directly: wrapping it in display() only adds a stray "None" to the output.
print("\n--- application_record info ---")
app.info()
# The 20 columns with the most missing values, worst first.
display(app.isna().sum().sort_values(ascending=False).head(20))

print("\n--- credit_record info ---")
cred.info()
display(cred.isna().sum())


def status_to_int(s):
    """Convert a raw STATUS code to an integer, or NaN when it has none.

    Args:
        s: A single STATUS value (a string code, a number, or a missing
            value).

    Returns:
        ``int(s)`` when the value can be converted to an integer; ``np.nan``
        for the codes 'C' and 'X', for missing values, and for any value
        that ``int`` cannot convert.
    """
    if s in ('C', 'X') or pd.isna(s):
        return np.nan
    try:
        return int(s)
    except (ValueError, TypeError, OverflowError):
        # Only conversion failures map to NaN; a bare ``except`` would also
        # swallow KeyboardInterrupt/SystemExit and hide unrelated bugs.
        return np.nan

# Numeric view of STATUS; 'C', 'X' and missing values become NaN.
cred['STATUS_NUM'] = cred['STATUS'].apply(status_to_int)


# Build one summary row per ID from credit_record.
# Every feature is a vectorised groupby reduction (instead of a Python lambda
# executed once per ID), and the pieces are aligned on the ID index rather
# than glued together positionally through ``.values``.
grp = cred.groupby('ID')
ids = cred['ID']
status = cred['STATUS']

num_records = grp.size()

# Row-level indicator flags; summing them per ID yields the counts.
flags = pd.DataFrame({
    'num_delinq': status.isin(['2', '3', '4', '5']),
    'num_missed': status.isin(['1', '2', '3', '4', '5']),
    'num_closed': status == 'C',
    'num_no_loan': status == 'X',
})
counts = flags.groupby(ids).sum()

# Highest numeric STATUS per ID; IDs with no numeric status at all get 0.
max_status = (
    pd.to_numeric(status, errors='coerce').groupby(ids).max().fillna(0)
)

# STATUS of the latest record of each ID, i.e. the row with the largest
# MONTHS_BALANCE. The previous code sorted ascending and took ``.iloc[0]``,
# which returns the row with the smallest MONTHS_BALANCE instead.
# NOTE(review): assumes a larger MONTHS_BALANCE means a more recent month
# (e.g. 0 = current, negative = months back) -- confirm against the dataset.
last_status = (
    cred.dropna(subset=['ID'])
    .sort_values('MONTHS_BALANCE', ascending=False, kind='mergesort')
    .drop_duplicates('ID')
    .set_index('ID')['STATUS']
    .reindex(num_records.index)
)

agg = pd.DataFrame({
    'num_records': num_records,
    'num_delinq': counts['num_delinq'],
    'num_missed': counts['num_missed'],
    'num_closed': counts['num_closed'],
    'num_no_loan': counts['num_no_loan'],
    'max_status': max_status,
    'last_status': last_status,
    'min_month': grp['MONTHS_BALANCE'].min(),
    'max_month': grp['MONTHS_BALANCE'].max(),
}).rename_axis('ID').reset_index()


# Derived per-ID features.
delinq_count = agg['num_delinq']
# Share of an ID's records that were counted in num_delinq.
agg['fraction_delinq'] = delinq_count / agg['num_records']
# Weighted score: each num_delinq record counts 1, each num_missed record 0.5.
agg['risk_score'] = delinq_count + 0.5 * agg['num_missed']

# Binary target: an ID is high-risk when any one of the three rules fires.
repeat_delinq = delinq_count >= 2
severe_status = agg['max_status'] >= 3
mostly_delinq = agg['fraction_delinq'] > 0.2
agg['high_risk'] = (repeat_delinq | severe_status | mostly_delinq).astype(int)

# Show the class balance, first as absolute counts and then as proportions.
print("Target distribution (0 low-risk, 1 high-risk):")
target = agg['high_risk']
display(target.value_counts())
display(target.value_counts(normalize=True))


# Inner join on ID: keeps only IDs present in both `app` and `agg`.
merged = pd.merge(app, agg, how='inner', on='ID')
print("Merged shape:", merged.shape)
display(merged.head())

# Write the joined table to disk without the DataFrame index column.
merged_csv = "../Dataset/merged_credit_data.csv"
merged.to_csv(merged_csv, index=False)
print("Saved merged data to Dataset/merged_credit_data.csv")