# In[None]:


import warnings
from collections import Counter
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Perceptron
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
warnings.filterwarnings("ignore")


# Input CSVs, resolved relative to the current working directory.
app_path = "../data/application_record.csv"
cred_path = "../data/credit_record.csv"

app = pd.read_csv(app_path)
cred = pd.read_csv(cred_path)

print("application_record shape:", app.shape)
print("credit_record shape:", cred.shape)


# Quick look at the first rows of each table.
display(app.head())
display(cred.head())


# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in display() only adds a stray "None" to the output.
print("\n--- application_record info ---")
app.info()
# Twenty columns with the most missing values, worst first.
display(app.isna().sum().sort_values(ascending=False).head(20))

print("\n--- credit_record info ---")
cred.info()
display(cred.isna().sum())


def status_to_int(s):
    """Convert a raw ``STATUS`` code to an integer, or NaN when it has none.

    Parameters
    ----------
    s : object
        A single status value, typically a string such as ``'0'``-``'5'``,
        ``'C'`` or ``'X'``, or a missing value.

    Returns
    -------
    int or float
        ``int(s)`` when the value parses as an integer; ``numpy.nan`` for
        ``'C'``, ``'X'``, missing values, and anything ``int()`` rejects.
    """
    if s in ['C', 'X'] or pd.isna(s):
        return np.nan
    try:
        return int(s)
    # Only the errors int() itself raises for unparseable input are treated as
    # "no numeric status"; a bare except would also hide KeyboardInterrupt.
    except (TypeError, ValueError, OverflowError):
        return np.nan

# Numeric view of STATUS: values that parse as integers are kept, everything
# else ('C', 'X', missing) becomes NaN.
cred['STATUS_NUM'] = cred['STATUS'].apply(status_to_int)


# Build one summary row per applicant ID.
#
# Fixes relative to the previous version of this cell:
#   * the first `groupby(...).agg(...)` call passed a lambda where named
#     aggregation expects a column label, so it raised before the real
#     aggregation ran; its result was overwritten anyway, so it is dropped;
#   * `last_status` took the first row after an ascending sort on
#     MONTHS_BALANCE, i.e. the row with the SMALLEST month value. It now takes
#     the row with the largest one.
#     NOTE(review): assumes a larger MONTHS_BALANCE means a more recent month
#     (0 = current, negative = months back) - confirm against the data
#     description.
#   * counts are vectorised group sums instead of one Python-level `apply`
#     per applicant.
severe_codes = ['2', '3', '4', '5']
missed_codes = ['1'] + severe_codes

# Per-row 0/1 indicators; summing them per ID yields the counts.
row_flags = pd.DataFrame({
    'ID': cred['ID'],
    'num_delinq': cred['STATUS'].isin(severe_codes).astype(int),
    'num_missed': cred['STATUS'].isin(missed_codes).astype(int),
    'num_closed': (cred['STATUS'] == 'C').astype(int),
    'num_no_loan': (cred['STATUS'] == 'X').astype(int),
    'status_value': pd.to_numeric(cred['STATUS'], errors='coerce'),
})
flag_grp = row_flags.groupby('ID')

grp = cred.groupby('ID')
num_records = grp.size()

# STATUS of the row with the largest MONTHS_BALANCE for each ID. Rows with a
# missing month sort first so they are never picked over a dated row.
latest_status = (
    cred.sort_values(['ID', 'MONTHS_BALANCE'], na_position='first')
    .drop_duplicates('ID', keep='last')
    .set_index('ID')['STATUS']
    .reindex(num_records.index)
)

agg = pd.DataFrame({
    'num_records': num_records,
    'num_delinq': flag_grp['num_delinq'].sum(),
    'num_missed': flag_grp['num_missed'].sum(),
    'num_closed': flag_grp['num_closed'].sum(),
    'num_no_loan': flag_grp['num_no_loan'].sum(),
    # Highest numeric status seen; 0 when the ID has no numeric status at all.
    'max_status': flag_grp['status_value'].max().fillna(0),
    'last_status': latest_status,
    'min_month': grp['MONTHS_BALANCE'].min(),
    'max_month': grp['MONTHS_BALANCE'].max(),
}).rename_axis('ID').reset_index()


# Share of an applicant's records that were counted in num_delinq.
agg['fraction_delinq'] = agg['num_delinq'].div(agg['num_records'])


# Heuristic score: each num_delinq record counts fully, each num_missed
# record adds a further half point.
agg['risk_score'] = agg['num_delinq'].add(agg['num_missed'] * 0.5)


# An applicant is labelled high risk when ANY of the three rules fires.
repeated_delinq = agg['num_delinq'].ge(2)
severe_peak = agg['max_status'].ge(3)
mostly_delinq = agg['fraction_delinq'].gt(0.2)
agg['high_risk'] = (repeated_delinq | severe_peak | mostly_delinq).astype(int)


# Show the label balance first as raw counts, then as proportions.
print("Target distribution (0 low-risk, 1 high-risk):")
for as_share in (False, True):
    display(agg['high_risk'].value_counts(normalize=as_share))


# Inner join on ID: only applicants present in both tables are kept.
merged = pd.merge(app, agg, how='inner', on='ID')
print("Merged shape:", merged.shape)
display(merged.head())


# Persist the joined table (without the row index) for later steps.
merged.to_csv("../data/merged_credit_data.csv", index=False)
print("Saved merged data to ../data/merged_credit_data.csv")


# Twenty columns of the merged table with the most missing values.
print("\nMissing values (merged):")
missing_per_col = merged.isna().sum()
display(missing_per_col.sort_values(ascending=False).head(20))


# Number of merged applicants in each target class.
counts = merged['high_risk'].value_counts()
print("Class counts:")
print(counts)


# Bar chart of the class balance, labelled through the axes object.
plt.figure(figsize=(5,4))
balance_ax = sns.barplot(x=counts.index, y=counts.values)
balance_ax.set_xlabel("High risk (1) vs Low risk (0)")
balance_ax.set_ylabel("Number of applicants")
balance_ax.set_title("Class balance - initial target")
plt.tight_layout()
plt.show()


# Mean of the 0/1 target per occupation, i.e. the high-risk rate; shown only
# when the column is present.
if 'OCCUPATION_TYPE' in merged.columns:
    rate_by_occupation = merged.groupby('OCCUPATION_TYPE')['high_risk'].mean()
    occ_tab = rate_by_occupation.sort_values(ascending=False).head(10)
    print("\nTop occupation types by mean high-risk rate (top 10):")
    display(occ_tab)


# Every numeric column except the identifier and the target.
numeric_cols = [
    col
    for col in merged.select_dtypes(include=['number']).columns.tolist()
    if col not in ('ID', 'high_risk')
]


# Candidate categorical features; only those present in the table are used.
possible_cats = ['OCCUPATION_TYPE','FLAG_MOBIL','CODE_GENDER','NAME_EDUCATION_TYPE','NAME_FAMILY_STATUS','NAME_HOUSING_TYPE']
available_columns = set(merged.columns)
categorical_cols = [name for name in possible_cats if name in available_columns]

print("Numeric cols used:", numeric_cols[:10])
print("Categorical cols used:", categorical_cols)


# Small hand-picked numeric feature set for the baseline model.
baseline_numeric_candidates = ['AMT_INCOME_TOTAL','DAYS_EMPLOYED','DAYS_BIRTH','CNT_FAM_MEMBERS']
numeric_subset = [name for name in baseline_numeric_candidates if name in available_columns]
print("Numeric subset for baseline:", numeric_subset)


# Feature matrix (baseline numeric subset + categorical columns) and target.
feature_columns = numeric_subset + categorical_cols
X = merged[feature_columns].copy()
y = merged['high_risk'].copy()


# Keep only rows whose target is present.
mask = y.notna()
X, y = X[mask], y[mask]

# First carve off 15% as the test set, then take 0.17647 of the remaining 85%
# as validation (0.85 * 0.17647 is roughly 15% of all rows). Both splits are
# stratified on the target and use the same fixed seed.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    stratify=y,
    random_state=42,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val,
    y_train_val,
    test_size=0.17647,
    stratify=y_train_val,
    random_state=42,
)


print("Train shape:", X_train.shape, "Val shape:", X_val.shape, "Test shape:", X_test.shape)
train_class_share = y_train.value_counts(normalize=True)
print("Train class dist:", train_class_share.to_dict())


# Numeric features: fill gaps with the column median, then standardise.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])


# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to
# `sparse_output` and 1.4 removed the old name, so `sparse=False` raises
# TypeError on current releases. Try the new spelling first and fall back for
# older installations; either way the encoder returns a dense array.
try:
    onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Categorical features: missing values become their own 'Missing' category,
# then one-hot encode; categories unseen during fit are ignored at predict time.
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
    ('onehot', onehot_encoder)
])

# Route each column group through its own transformer.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_subset),
    ('cat', cat_transformer, categorical_cols)
])


# Baseline: preprocessing followed by a Perceptron with a fixed seed.
baseline = Pipeline(steps=[
    ('preproc', preprocessor),
    ('clf', Perceptron(max_iter=1000, tol=1e-3, random_state=42))
])


# Fit on the training split only; validation and test stay unseen.
baseline.fit(X_train, y_train)


def evaluate_model(model, X, y, split_name="set"):
    """Print classification metrics for ``model`` on one data split.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict(X)``.
    X : array-like
        Features passed to ``model.predict``.
    y : array-like
        True labels compared against the predictions.
    split_name : str, default "set"
        Label used in the printed header.

    Returns
    -------
    None
        Results are printed: accuracy, precision, recall, F1, the confusion
        matrix and the classification report. Undefined precision/recall/F1
        values are reported as 0 (``zero_division=0``).
    """
    y_hat = model.predict(X)
    guard = {"zero_division": 0}

    # All scores are computed before anything is printed.
    scores = [
        ("Accuracy:", accuracy_score(y, y_hat)),
        ("Precision:", precision_score(y, y_hat, **guard)),
        ("Recall:", recall_score(y, y_hat, **guard)),
        ("F1:", f1_score(y, y_hat, **guard)),
    ]

    print(f"\nEvaluation on {split_name}:")
    for label, value in scores:
        print(label, value)
    print("Confusion matrix:\n", confusion_matrix(y, y_hat))
    print("\nClassification report:\n", classification_report(y, y_hat, **guard))

# Report the same metrics on every split, in train / validation / test order.
for split_features, split_target, split_label in (
    (X_train, y_train, "TRAIN"),
    (X_val, y_val, "VALIDATION"),
    (X_test, y_test, "TEST"),
):
    evaluate_model(baseline, split_features, split_target, split_label)


# joblib.dump does not create missing parent directories, so make sure the
# target folder exists before writing; otherwise the save fails after the
# model has already been trained and evaluated.
model_path = "../models/perceptron_baseline.joblib"
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(baseline, model_path)
print("Saved baseline model to ../models/perceptron_baseline.joblib")



