# In [None]:


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             confusion_matrix, classification_report, roc_auc_score, RocCurveDisplay)
from sklearn import set_config
set_config(display='diagram')  


# --- Load the merged credit dataset -----------------------------------------
merged_path = "../data/merged_credit_data.csv"   # adjust path if needed
df = pd.read_csv(merged_path)
print("Merged data shape:", df.shape)
display(df.head())


# --- Choose model inputs ----------------------------------------------------
# Only candidate columns that are actually present in the file are kept, so
# the notebook still runs when the merged data lacks some of them.
available_columns = set(df.columns)

possible_nums = ['AMT_INCOME_TOTAL', 'DAYS_EMPLOYED', 'DAYS_BIRTH', 'CNT_FAM_MEMBERS']
numeric_subset = [col for col in possible_nums if col in available_columns]

possible_cats = ['OCCUPATION_TYPE', 'FLAG_MOBIL', 'CODE_GENDER',
                 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE']
categorical_cols = [col for col in possible_cats if col in available_columns]

print("Numeric subset:", numeric_subset)
print("Categorical cols:", categorical_cols)

# Feature matrix (numeric columns first, then categorical) and target vector.
X = df.loc[:, numeric_subset + categorical_cols].copy()
y = df['high_risk'].copy()


# --- Drop rows whose target label is missing --------------------------------
mask = y.notna()
X = X.loc[mask]
y = y.loc[mask]


from sklearn.model_selection import train_test_split

# Two stratified splits: first hold out 15% of the data as the test set, then
# take 0.17647 of the remaining 85% (about 15% of the full data) as the
# validation set. The same seed is used for both so the split is reproducible.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    stratify=y,
    random_state=42,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val,
    y_train_val,
    test_size=0.17647,
    stratify=y_train_val,
    random_state=42,
)

split_shapes = (X_train.shape, X_val.shape, X_test.shape)
print("Train/Val/Test shapes:", *split_shapes)

train_class_share = y_train.value_counts(normalize=True).to_dict()
print("Train class distribution:", train_class_share)


# Numeric features: fill gaps with the column median, then standardize.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to
# `sparse_output` and 1.4 removed the old name. Build the encoder with
# whichever keyword the installed version accepts so that the notebook runs on
# both old and new releases; either way the encoder returns a dense array.
try:
    onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Categorical features: missing values become their own 'Missing' category,
# then each category is one-hot encoded. Categories unseen during fit are
# ignored at transform time instead of raising.
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
    ('onehot', onehot_encoder)
])

# Apply each sub-pipeline to its own column group (numeric first, then
# categorical).
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_subset),
    ('cat', cat_transformer, categorical_cols)
])


# Full model: preprocessing followed by a decision tree with a fixed seed.
dt_pipeline = Pipeline(steps=[
    ('preproc', preprocessor),
    ('clf', DecisionTreeClassifier(random_state=42))
])


# Hyper-parameter candidates for the tree; the `clf__` prefix routes each
# entry to the 'clf' step of the pipeline.
param_grid = {
    'clf__max_depth': [3, 5, 8, None],
    'clf__min_samples_split': [2, 10, 20],
    'clf__min_samples_leaf': [1, 5, 10],
    'clf__criterion': ['gini', 'entropy'],
}


# Shuffled, stratified 5-fold cross-validation with a fixed seed.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Exhaustive search over the grid, scored by F1 and run on all cores.
grid_search = GridSearchCV(
    estimator=dt_pipeline,
    param_grid=param_grid,
    cv=cv,
    scoring='f1',
    n_jobs=-1,
    verbose=1,
    return_train_score=True,
)


grid_search.fit(X_train, y_train)

print("Best params (from GridSearchCV):", grid_search.best_params_, sep="\n")
print("Best CV score (f1):", grid_search.best_score_)


import os

# Persist the complete cross-validation table for later inspection.
results_path = "../models/dt_gridsearch_results.csv"

# Create the output directory first: without it `to_csv` raises and the
# results of the (potentially long) grid search would not be written.
os.makedirs(os.path.dirname(results_path), exist_ok=True)

results_df = pd.DataFrame(grid_search.cv_results_)
results_df.to_csv(results_path, index=False)
print(f"Saved grid search results to {results_path}")


# Pipeline refit by GridSearchCV with the best parameter combination.
best_dt = grid_search.best_estimator_

def evaluate_and_report(model, X, y, split_name):
    """Print classification metrics for ``model`` on a single data split.

    Reports accuracy, precision, recall, F1, the confusion matrix and the
    classification report. ROC-AUC is added when the model can produce
    probability scores and ``y`` contains exactly two distinct values.

    Parameters
    ----------
    model : estimator
        Fitted object with a ``predict`` method. ``predict_proba`` is
        optional and is only used for ROC-AUC; its second column is taken
        as the score.
    X : array-like
        Features, passed unchanged to ``model.predict``.
    y : array-like
        True labels for ``X``.
    split_name : str
        Label shown in the report header (for example ``"TRAIN"``).

    Returns
    -------
    None
        All results are written to standard output.
    """
    preds = model.predict(X)

    # Probability scores are best-effort: a model without `predict_proba`, or
    # one whose output has no second column, just skips ROC-AUC. Only these
    # expected failures are caught, so genuine errors and KeyboardInterrupt
    # are no longer silently swallowed.
    probs = None
    try:
        probs = model.predict_proba(X)[:, 1]
    except (AttributeError, NotImplementedError, IndexError):
        pass

    acc = accuracy_score(y, preds)
    prec = precision_score(y, preds, zero_division=0)
    rec = recall_score(y, preds, zero_division=0)
    f1 = f1_score(y, preds, zero_division=0)

    print(f"\n--- Evaluation on {split_name} ---")
    print("Accuracy:", acc)
    print("Precision:", prec)
    print("Recall:", rec)
    print("F1-score:", f1)

    if probs is not None and len(np.unique(y)) == 2:
        try:
            auc = roc_auc_score(y, probs)
        except ValueError as exc:
            # Say why the metric is missing instead of dropping it silently.
            print("ROC-AUC: not available ({})".format(exc))
        else:
            print("ROC-AUC:", auc)

    print("Confusion matrix:\n", confusion_matrix(y, preds))
    print("\nClassification report:\n", classification_report(y, preds, zero_division=0))


# Report the tuned pipeline on every split, in the order
# train -> validation -> test.
for split_label, split_X, split_y in (
    ("TRAIN", X_train, y_train),
    ("VALIDATION", X_val, y_val),
    ("TEST", X_test, y_test),
):
    evaluate_and_report(best_dt, split_X, split_y, split_label)


# --- Recover feature names and rank them by importance ----------------------
fitted_steps = best_dt.named_steps
preproc = fitted_steps['preproc']

# Numeric columns keep their original names.
num_features = numeric_subset

# One-hot columns take their names from the fitted encoder; with no
# categorical inputs there is nothing to add.
if categorical_cols:
    cat_ohe = preproc.named_transformers_['cat'].named_steps['onehot']
    cat_names = list(cat_ohe.get_feature_names_out(categorical_cols))
else:
    cat_names = []
cat_features = cat_names

# The preprocessor lists the numeric transformer before the categorical one,
# so the names are assembled in that same order.
all_feature_names = num_features + cat_features


importances = fitted_steps['clf'].feature_importances_
fi_df = (
    pd.DataFrame({'feature': all_feature_names, 'importance': importances})
    .sort_values('importance', ascending=False)
    .reset_index(drop=True)
)
display(fi_df.head(20))


# Horizontal bar chart of the most important features (at most 15).
topk = min(15, len(fi_df))
top_features = fi_df.head(topk)

plt.figure(figsize=(8, 6))
importance_ax = sns.barplot(data=top_features, x='importance', y='feature')
importance_ax.set_title("Decision Tree - Feature Importances (top {})".format(topk))
plt.tight_layout()
plt.show()


# Draw the fitted tree only when it is shallow enough (depth <= 6) to be
# readable; the drawing itself is cut off at depth 4.
best_clf = best_dt.named_steps['clf']
tree_depth = best_clf.get_depth()
is_plottable = tree_depth is not None and tree_depth <= 6

if not is_plottable:
    print("Tree too deep to plot usefully (depth = {}). Consider limiting max_depth for visualization.".format(tree_depth))
else:
    plt.figure(figsize=(20, 10))
    plot_tree(
        best_clf,
        feature_names=all_feature_names,
        class_names=['low-risk', 'high-risk'],
        filled=True,
        max_depth=4,
        fontsize=8,
    )
    plt.title("Decision Tree (truncated view)")
    plt.show()



import os

# Persist the whole fitted pipeline (preprocessing + tree) so it can be
# reloaded and applied to raw feature columns later.
model_path = "../models/decision_tree_best.joblib"

# Create the output directory first; `joblib.dump` fails if it is missing.
os.makedirs(os.path.dirname(model_path), exist_ok=True)

joblib.dump(best_dt, model_path)
print(f"Saved best decision tree pipeline to {model_path}")


