In [None]:
# STEP 1: quick exploratory data analysis
import pandas as pd
import numpy as np

df = pd.read_csv('/content/application_train.csv')

# 1) Basic info: print the frame size and how many columns there are per dtype
print("=== SHAPE ===")
print(df.shape)
print("\n=== DTYPE COUNTS ===")
dtype_counts = df.dtypes.value_counts()
print(dtype_counts)

# 2) Top 20 columns ranked by share of missing values (shown as a percentage)
print("\n=== TOP 20 MISSING (%) ===")
missing_share = df.isna().mean().sort_values(ascending=False)
top_missing_pct = missing_share.head(20).mul(100).round(2)
print(top_missing_pct)

# 3) Target distribution: absolute counts, then percentages rendered as strings
print("\n=== TARGET DISTRIBUTION ===")
target_counts = df['TARGET'].value_counts()
print(target_counts.to_frame('count'))
target_pct = df['TARGET'].value_counts(normalize=True).mul(100).round(2)
print(target_pct.astype(str) + '%')

# 4) Numeric summary for key financial columns; columns absent from the file are skipped
cols_to_check = [
    'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'DAYS_BIRTH', 'DAYS_EMPLOYED',
    'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3',
]
present = [name for name in cols_to_check if name in df.columns]
print("\n=== DESCRIBE FOR KEY COLUMNS ===")
key_summary = df[present].describe().transpose()
print(key_summary)

# 5) Derive AGE_YEARS from DAYS_BIRTH (sign flipped, divided by 365, rounded)
#    and count rows per age bucket. pd.cut intervals are right-closed, so the
#    first bucket is (18, 25] and values outside (18, 100] fall into no bucket.
if 'DAYS_BIRTH' in df.columns:
    df['AGE_YEARS'] = df['DAYS_BIRTH'].div(-365).round().astype(int)
    print("\n=== AGE BUCKET COUNTS ===")
    age_edges = [18, 25, 35, 45, 55, 65, 100]
    age_buckets = pd.cut(df['AGE_YEARS'], bins=age_edges)
    print(age_buckets.value_counts().sort_index())

In [None]:
# STEP 2: missing-value detail & sample values for categorical columns
# Flag every column whose share of missing values exceeds 20%
missing_share = df.isna().mean()
high_miss = missing_share.gt(0.20)
high_miss_cols = high_miss[high_miss].index.tolist()
print("Columns with >20% missing:", len(high_miss_cols))
print(high_miss_cols[:30])   # show only the first 30 if there are many

# Peek at the most frequent values (NaN included) of the first 10 object-dtype columns
cat_cols = list(df.select_dtypes(include=['object']).columns)
print("\nSample categorical columns (count):", len(cat_cols))
preview_cols = cat_cols[:10]
print(preview_cols)
for col in preview_cols:
    print(f"--- Unique values for {col} (top 10) ---")
    value_freq = df[col].value_counts(dropna=False)
    print(value_freq.head(10))

In [None]:
# STEP 3: class imbalance + per-feature mean and median by TARGET for a few features
print("Target balance (counts):")
print(df['TARGET'].value_counts())
print("Target balance (percent):")
print((df['TARGET'].value_counts(normalize=True)*100).round(2))

# Compare central tendency of some numeric features by TARGET.
# include='number' accepts every numeric dtype, so a derived column such as
# AGE_YEARS is kept even on platforms where astype(int) yields int32.
num_feats = df.select_dtypes(include='number').columns.tolist()
check_feats = ['AMT_INCOME_TOTAL','AMT_CREDIT','AMT_ANNUITY','AGE_YEARS','EXT_SOURCE_1','EXT_SOURCE_2','EXT_SOURCE_3']
check_feats = [c for c in check_feats if c in num_feats]
# The label previously promised a mean while the code computed a median.
# Both are reported now so the heading matches the numbers: rows are
# (feature, statistic) pairs and columns are the TARGET classes.
print("\nMean and median of some features by TARGET:")
print(df.groupby('TARGET')[check_feats].agg(['mean', 'median']).T)

In [None]:
# STEP 4: Quick plots (histogram, boxplot by TARGET, KDE by TARGET). Run in Colab to see plots.
import matplotlib.pyplot as plt
import seaborn as sns

# Income histogram: values above the 99th percentile are clipped down to it
income_cap = df['AMT_INCOME_TOTAL'].quantile(0.99)
income_capped = df['AMT_INCOME_TOTAL'].clip(upper=income_cap)
plt.figure(figsize=(8,4))
sns.histplot(income_capped, bins=60, kde=False)
plt.title('AMT_INCOME_TOTAL (capped at 99th pct)')
plt.show()

# Credit boxplot per class: rows at or above the 99th percentile are filtered out
credit_cutoff = df['AMT_CREDIT'].quantile(0.99)
credit_below_cutoff = df.loc[df['AMT_CREDIT'] < credit_cutoff]
plt.figure(figsize=(8,4))
sns.boxplot(x='TARGET', y='AMT_CREDIT', data=credit_below_cutoff)
plt.title('AMT_CREDIT by TARGET (capped)')
plt.show()

# Age density per class, drawn in a fixed order (TARGET 0 first, then 1)
plt.figure(figsize=(8,4))
for target_value, legend_label in ((0, 'no default'), (1, 'default')):
    ages_for_class = df.loc[df['TARGET'] == target_value, 'AGE_YEARS']
    sns.kdeplot(ages_for_class, label=legend_label)
plt.title('AGE distribution by TARGET')
plt.legend()
plt.show()

In [None]:
# STEP 5: minimal preprocessing & train/test split
from sklearn.model_selection import train_test_split

# Build the feature matrix: drop the row identifier (if present) and the label
drop_cols = [c for c in ['SK_ID_CURR'] if c in df.columns]
X = df.drop(columns=drop_cols + ['TARGET'], errors='ignore')
y = df['TARGET'].astype(int)

# Simple categorical encoding: one-hot for low-cardinality columns, drop the rest
cat_cols = X.select_dtypes(include=['object']).columns.tolist()
# Only columns with <=50 distinct values are one-hot encoded (avoids a huge feature space)
small_card = [c for c in cat_cols if X[c].nunique() <= 50]
X = pd.get_dummies(X, columns=small_card, drop_first=True)

# Drop remaining high-cardinality categorical columns (those not in small_card)
high_card_cols = [c for c in cat_cols if c not in small_card]
X = X.drop(columns=high_card_cols, errors='ignore')

# Train-test split (80/20 stratified). Split BEFORE imputing so that the
# imputation statistics are learned from the training rows only; computing
# medians on the full matrix would leak test-set information into training.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Fill numeric missing values with the per-column median of the training set
train_medians = X_train.median()
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)
# Keep the full matrix NaN-free as before, but with the same train-only statistics
X = X.fillna(train_medians)

print("Train shape:", X_train.shape, "Test shape:", X_test.shape)
print("Train TARGET balance:", y_train.value_counts(normalize=True).round(3))

In [None]:
# STEP 6: Logistic Regression baseline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report, RocCurveDisplay
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# The feature columns live on very different scales (e.g. the AMT_* amounts vs
# the EXT_SOURCE_* columns), which can keep the default lbfgs solver from
# converging within max_iter. Standardising inside a pipeline fixes that, and
# the scaler is fitted on the training rows only. `logreg` remains a fitted
# estimator exposing predict / predict_proba; the underlying
# LogisticRegression is reachable as logreg[-1].
logreg = make_pipeline(
    StandardScaler(),
    LogisticRegression(max_iter=1000, class_weight='balanced'),
)
logreg.fit(X_train, y_train)

# Positive-class probability (for AUC) and hard labels at the default 0.5 threshold
y_prob_log = logreg.predict_proba(X_test)[:,1]
y_pred_log = logreg.predict(X_test)

print("LogReg AUC:", round(roc_auc_score(y_test, y_prob_log),4))
print("\nClassification report (threshold 0.5):")
print(classification_report(y_test, y_pred_log))
RocCurveDisplay.from_estimator(logreg, X_test, y_test)

In [None]:
# STEP 7: XGBoost
!pip install -q xgboost
import xgboost as xgb
from sklearn.metrics import roc_auc_score

xgb_clf = xgb.XGBClassifier(
    n_estimators=400, learning_rate=0.05, max_depth=4, subsample=0.8,
    colsample_bytree=0.8, random_state=42, use_label_encoder=False, eval_metric='logloss', n_jobs=-1
)
xgb_clf.fit(X_train, y_train)

y_prob_xgb = xgb_clf.predict_proba(X_test)[:,1]
y_pred_xgb = xgb_clf.predict(X_test)
print("XGBoost AUC:", round(roc_auc_score(y_test, y_prob_xgb),4))
print("\nClassification report:")
print(classification_report(y_test, y_pred_xgb))
RocCurveDisplay.from_estimator(xgb_clf, X_test, y_test)

In [None]:
# STEP 8: ROC curves of both models on one set of axes
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# ROC points for each model's positive-class scores (thresholds are discarded)
fpr_log, tpr_log, _ = roc_curve(y_test, y_prob_log)
fpr_xgb, tpr_xgb, _ = roc_curve(y_test, y_prob_xgb)

# (legend tag, false-positive rates, true-positive rates, scores), in drawing order
roc_specs = [
    ('LogReg', fpr_log, tpr_log, y_prob_log),
    ('XGB', fpr_xgb, tpr_xgb, y_prob_xgb),
]

plt.figure(figsize=(6,5))
for tag, fpr, tpr, scores in roc_specs:
    plt.plot(fpr, tpr, label=f'{tag} AUC={roc_auc_score(y_test, scores):.4f}')
# Diagonal reference line from (0, 0) to (1, 1)
plt.plot([0,1],[0,1],'k--')
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title('ROC Comparison')
plt.legend()
plt.show()

In [None]:
# STEP 9: quick automatic insights
# 1) Ten largest XGBoost feature importances, labelled with the training column names
importances = pd.Series(xgb_clf.feature_importances_, index=X_train.columns)
fi = importances.sort_values(ascending=False).head(10)
print("Top 10 feature importances (XGBoost):")
print(fi)

# 2) Risk concentration: default rate among the rows whose predicted score is
#    at or above the 90th percentile of test-set scores, versus the overall rate
X_test_copy = X_test.assign(y_true=y_test.values, y_score=y_prob_xgb)
top_decile = X_test_copy['y_score'].quantile(0.9)
in_top_decile = X_test_copy['y_score'] >= top_decile
cohort = X_test_copy.loc[in_top_decile]
print("\nTop decile size:", len(cohort))
print("Default rate in top decile:", cohort['y_true'].mean())
print("Overall default rate:", y_test.mean())