# Home Credit ML Pipeline
This notebook covers exploratory data analysis (EDA), feature engineering, modeling, and evaluation.

In [ ]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load dataset
# Colab-only: files.upload() prompts for a file; choose the dataset CSV there.
from google.colab import files
uploaded = files.upload()
# Fail with a readable message instead of the bare StopIteration that
# next(iter(uploaded)) raises when nothing was uploaded (e.g. dialog cancelled).
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and select the dataset CSV.")
# The first key of `uploaded` is handed to read_csv as the path to read.
df = pd.read_csv(next(iter(uploaded)))
df.head()

In [ ]:
# Structural overview of the loaded frame (pandas' DataFrame.info summary).
df.info()

In [ ]:
# Missing values
# Share of null entries per column, ordered from highest to lowest; the
# 20 columns with the largest share are displayed.
null_share = df.isnull().mean()
null_share.sort_values(ascending=False).head(20)

In [ ]:
# TARGET distribution
# Bar chart of row counts per TARGET value, titled via the returned Axes.
target_ax = sns.countplot(x='TARGET', data=df)
target_ax.set_title('Target Distribution')
plt.show()
# Relative frequency of each TARGET value (shown as the cell output).
df['TARGET'].value_counts(normalize=True)

In [ ]:
# Feature Engineering
# This cell updates `df` in place (adds AGE, fills missing values) and builds
# `df_encoded`, the one-hot encoded frame used by the modeling cell.
# NOTE(review): the fill statistics below are computed over all rows, i.e.
# before the train/test split performed in the modeling cell, so held-out rows
# influence the imputed values.

# Age
# DAYS_BIRTH divided by -365; presumably DAYS_BIRTH is a negative day count so
# AGE comes out as positive years — confirm against the data dictionary.
df['AGE'] = df['DAYS_BIRTH'] / -365

# Fill numeric
num_cols = df.select_dtypes(include=['int64','float64']).columns
# The label must never be imputed: median-filling TARGET would silently invent
# labels for rows that lack one. Every other numeric column gets its median.
impute_cols = num_cols.drop('TARGET', errors='ignore')
df[impute_cols] = df[impute_cols].fillna(df[impute_cols].median())

# Fill categorical
cat_cols = df.select_dtypes(include=['object']).columns
cat_modes = df[cat_cols].mode()
# mode() yields no rows when there are no object columns (or none has a mode);
# .iloc[0] would raise IndexError in that case, so only fill when a row exists.
if not cat_modes.empty:
    df[cat_cols] = df[cat_cols].fillna(cat_modes.iloc[0])

# Encoding
# One-hot encode the remaining non-numeric columns, dropping the first level
# of each to avoid a redundant indicator.
df_encoded = pd.get_dummies(df, drop_first=True)
df_encoded.head()

In [ ]:
# Modeling
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Features are every encoded column except the label.
X = df_encoded.drop('TARGET', axis=1)
y = df_encoded['TARGET']

# 80/20 split, stratified on the label so both parts keep the same class mix.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Logistic regression is fitted on standardized features: the default solver
# struggles to converge when columns live on very different scales. The scaler
# is part of the pipeline, so it is fitted on X_train only and applied
# automatically whenever `logreg` predicts. The underlying LogisticRegression
# is available as `logreg[-1]`. max_iter is only an upper bound on iterations.
logreg = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
logreg.fit(X_train, y_train)

# n_jobs=-1 trains the trees in parallel on all cores; with a fixed
# random_state the fitted forest is unchanged.
rf = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)

In [ ]:
# Evaluation
from sklearn.metrics import roc_auc_score, classification_report, RocCurveDisplay

# Score the held-out split with each fitted model. The second predict_proba
# column is used as the score for ROC AUC; predict() supplies the hard labels
# for the classification reports.
y_prob_logreg = logreg.predict_proba(X_test)[:, 1]
y_pred_logreg = logreg.predict(X_test)

y_prob_rf = rf.predict_proba(X_test)[:, 1]
y_pred_rf = rf.predict(X_test)

# Ranking quality of each model's scores.
auc_logreg = roc_auc_score(y_test, y_prob_logreg)
auc_rf = roc_auc_score(y_test, y_prob_rf)
print("AUC Logistic Regression:", auc_logreg)
print("AUC Random Forest:", auc_rf)

# Per-class precision / recall / F1 for the hard predictions.
report_logreg = classification_report(y_test, y_pred_logreg)
report_rf = classification_report(y_test, y_pred_rf)

print('\n--- Logistic Regression Report ---')
print(report_logreg)

print('\n--- Random Forest Report ---')
print(report_rf)

In [ ]:
# One ROC figure per fitted model, evaluated on the held-out split. Each
# title is set on the Axes that the display object drew into.
roc_logreg = RocCurveDisplay.from_estimator(logreg, X_test, y_test)
roc_logreg.ax_.set_title('ROC Logistic Regression')
plt.show()

roc_rf = RocCurveDisplay.from_estimator(rf, X_test, y_test)
roc_rf.ax_.set_title('ROC Random Forest')
plt.show()