{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pathlib import Path\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import seaborn as sns\n",
    "from imblearn.over_sampling import SMOTE\n",
    "from imblearn.pipeline import Pipeline as ImbPipeline\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.impute import SimpleImputer\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.metrics import (\n",
    "    classification_report,\n",
    "    confusion_matrix,\n",
    "    roc_auc_score,\n",
    "    roc_curve,\n",
    ")\n",
    "from sklearn.model_selection import GridSearchCV, train_test_split\n",
    "from sklearn.preprocessing import StandardScaler"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Everything a reader might want to tune lives in this one cell.\n",
    "DATA_DIR = Path('home-credit-default-risk')\n",
    "ID_COL = 'SK_ID_CURR'\n",
    "TARGET_COL = 'TARGET'\n",
    "\n",
    "SEED = 42\n",
    "TEST_SIZE = 0.2\n",
    "CV_FOLDS = 5\n",
    "N_JOBS = -1  # parallel grid-search workers; set to 1 if memory is tight\n",
    "\n",
    "MISSING_THRESHOLD = 0.7  # keep only columns whose missing share is below this\n",
    "INCOME_CLIP = 1e6  # upper cap applied to income for the histogram only"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load dataset\n",
    "app_train = pd.read_csv(DATA_DIR / 'application_train.csv')\n",
    "bureau = pd.read_csv(DATA_DIR / 'bureau.csv')\n",
    "\n",
    "# Aggregate bureau records to one row per applicant. The active-credit flag is\n",
    "# computed up front so every statistic is a vectorised built-in aggregation\n",
    "# instead of a Python lambda evaluated once per group.\n",
    "bureau_agg = (\n",
    "    bureau.assign(IS_ACTIVE=bureau['CREDIT_ACTIVE'].eq('Active').astype(int))\n",
    "    .groupby(ID_COL)\n",
    "    .agg(\n",
    "        BUREAU_ACTIVE_COUNT=('IS_ACTIVE', 'sum'),\n",
    "        BUREAU_CREDIT_SUM_MEAN=('AMT_CREDIT_SUM', 'mean'),\n",
    "        BUREAU_DAYS_CREDIT_MEAN=('DAYS_CREDIT', 'mean'),\n",
    "        BUREAU_CREDIT_COUNT=('SK_ID_BUREAU', 'count'),\n",
    "    )\n",
    "    .reset_index()\n",
    ")\n",
    "\n",
    "# Join onto application_train. Applicants with no bureau rows get NaN in the\n",
    "# four BUREAU_* columns; validate= fails loudly if the aggregate is not unique\n",
    "# per applicant, which would silently duplicate application rows.\n",
    "df = app_train.merge(bureau_agg, on=ID_COL, how='left', validate='many_to_one')\n",
    "assert len(df) == len(app_train), 'left join must not change the row count'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Eksplorasi Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# TARGET distribution\n",
    "fig, ax = plt.subplots()\n",
    "sns.countplot(x=TARGET_COL, data=df, ax=ax)\n",
    "ax.set_title('Distribusi TARGET (0: Non-Default, 1: Default)')\n",
    "fig.savefig('target_distribution.png', dpi=300)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Missing values (example): share of nulls per column, worst first\n",
    "missing_share = df.isnull().mean().sort_values(ascending=False)\n",
    "print('Top 5 kolom dengan missing values:')\n",
    "print(missing_share.head())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Income distribution, clipped so a few extreme incomes do not flatten the plot\n",
    "fig, ax = plt.subplots()\n",
    "sns.histplot(df['AMT_INCOME_TOTAL'].clip(upper=INCOME_CLIP), bins=50, ax=ax)\n",
    "ax.set_title('Distribusi Pendapatan')\n",
    "fig.savefig('income_distribution.png', dpi=300)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Data Cleaning dan Processing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Drop columns whose missing share is MISSING_THRESHOLD or more.\n",
    "# Each step below builds a new frame, so re-running this cell is idempotent.\n",
    "df_reduced = df.loc[:, df.isnull().mean() < MISSING_THRESHOLD]\n",
    "\n",
    "# Imputation, part 1: fills whose value is fixed by meaning rather than\n",
    "# estimated from the data, so doing them before the split leaks nothing.\n",
    "#   - text columns: a missing category becomes its own 'Unknown' level\n",
    "#   - bureau counts: no bureau rows for the applicant means a count of 0\n",
    "# Median imputation of the remaining numeric columns is learned from data and\n",
    "# therefore happens inside the model pipeline, on training folds only.\n",
    "cat_cols = df_reduced.select_dtypes(include=['object']).columns.tolist()\n",
    "fill_values = {col: 'Unknown' for col in cat_cols}\n",
    "fill_values.update({'BUREAU_ACTIVE_COUNT': 0, 'BUREAU_CREDIT_COUNT': 0})\n",
    "df_filled = df_reduced.fillna(fill_values)\n",
    "\n",
    "# One-hot encoding of every text column: SMOTE and both estimators accept\n",
    "# numeric input only, so no text column may reach the feature matrix.\n",
    "df_encoded = pd.get_dummies(df_filled, columns=cat_cols, drop_first=True, dtype=float)\n",
    "assert df_encoded.select_dtypes(include=['object']).empty, 'text columns remain'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Split data. Stratifying keeps the default rate equal in both parts, which\n",
    "# matters because the positive class is rare.\n",
    "X = df_encoded.drop(columns=[TARGET_COL, ID_COL])\n",
    "y = df_encoded[TARGET_COL]\n",
    "X_train, X_test, y_train, y_test = train_test_split(\n",
    "    X, y, test_size=TEST_SIZE, random_state=SEED, stratify=y\n",
    ")\n",
    "assert len(X_train) + len(X_test) == len(X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def build_pipeline(estimator):\n",
    "    \"\"\"Wrap ``estimator`` with the preprocessing it needs.\n",
    "\n",
    "    Median imputation, standardisation and SMOTE are pipeline steps so that\n",
    "    GridSearchCV refits them on the training part of every fold. The imblearn\n",
    "    Pipeline runs the sampler during ``fit`` only, so validation folds and the\n",
    "    test set are always scored on real, un-resampled rows.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    estimator : classifier\n",
    "        Unfitted scikit-learn classifier; it becomes the step named 'model'.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    imblearn.pipeline.Pipeline\n",
    "        Unfitted pipeline with steps imputer, scaler, smote, model.\n",
    "    \"\"\"\n",
    "    return ImbPipeline(steps=[\n",
    "        ('imputer', SimpleImputer(strategy='median')),\n",
    "        # Scaling precedes SMOTE because SMOTE interpolates between nearest\n",
    "        # neighbours, and distances on raw amounts would be dominated by the\n",
    "        # largest-valued columns.\n",
    "        ('scaler', StandardScaler()),\n",
    "        ('smote', SMOTE(random_state=SEED)),\n",
    "        ('model', estimator),\n",
    "    ])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Pemodelan"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Logistic Regression\n",
    "lr_grid = GridSearchCV(\n",
    "    build_pipeline(LogisticRegression(random_state=SEED, max_iter=1000)),\n",
    "    param_grid={'model__C': [0.01, 0.1, 1, 10]},\n",
    "    cv=CV_FOLDS,\n",
    "    scoring='roc_auc',\n",
    "    n_jobs=N_JOBS,\n",
    ")\n",
    "lr_grid.fit(X_train, y_train)\n",
    "print('Best LR AUC:', lr_grid.best_score_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Random Forest\n",
    "rf_grid = GridSearchCV(\n",
    "    build_pipeline(RandomForestClassifier(random_state=SEED)),\n",
    "    param_grid={\n",
    "        'model__n_estimators': [100, 200],\n",
    "        'model__max_depth': [10, 20, None],\n",
    "    },\n",
    "    cv=CV_FOLDS,\n",
    "    scoring='roc_auc',\n",
    "    n_jobs=N_JOBS,\n",
    ")\n",
    "rf_grid.fit(X_train, y_train)\n",
    "print('Best RF AUC:', rf_grid.best_score_)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. Evaluasi"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def report_model(name, grid, X_eval, y_eval):\n",
    "    \"\"\"Print the classification report and ROC AUC of a fitted search.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    name : str\n",
    "        Label printed above the report.\n",
    "    grid : GridSearchCV\n",
    "        Fitted search; predictions come from its refitted best estimator.\n",
    "    X_eval, y_eval\n",
    "        Held-out features and true labels.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    numpy.ndarray\n",
    "        Predicted probability of class 1 for each row of ``X_eval``, returned\n",
    "        so the ROC plot can reuse it instead of predicting again.\n",
    "    \"\"\"\n",
    "    proba = grid.predict_proba(X_eval)[:, 1]\n",
    "    print(f'{name}:')\n",
    "    print(classification_report(y_eval, grid.predict(X_eval)))\n",
    "    print('ROC AUC:', roc_auc_score(y_eval, proba))\n",
    "    return proba\n",
    "\n",
    "\n",
    "def plot_roc(y_true, proba, name, color, path):\n",
    "    \"\"\"Draw one model's ROC curve in its own figure and save it to ``path``.\"\"\"\n",
    "    fpr, tpr, _ = roc_curve(y_true, proba)\n",
    "    fig, ax = plt.subplots()\n",
    "    ax.plot(fpr, tpr, label=name, color=color)\n",
    "    ax.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal for reference\n",
    "    ax.set_xlabel('False Positive Rate')\n",
    "    ax.set_ylabel('True Positive Rate')\n",
    "    ax.set_title(f'ROC Curve: {name}')\n",
    "    ax.legend()\n",
    "    fig.savefig(path, dpi=300)\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Predictions and evaluation on the untouched test set\n",
    "proba_lr = report_model('Logistic Regression', lr_grid, X_test, y_test)\n",
    "proba_rf = report_model('Random Forest', rf_grid, X_test, y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ROC curves (one figure per model, for the slide deck)\n",
    "plot_roc(y_test, proba_lr, 'Logistic Regression', 'blue', 'roc_curve_lr.png')\n",
    "plot_roc(y_test, proba_rf, 'Random Forest', 'red', 'roc_curve_rf.png')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Feature importance. The forest is the last step of the best pipeline.\n",
    "# X_train.columns lines up with the importances only while the imputer keeps\n",
    "# every column (it drops columns that are entirely NaN); pd.Series raises on a\n",
    "# length mismatch, so a misalignment cannot pass silently.\n",
    "rf_model = rf_grid.best_estimator_.named_steps['model']\n",
    "feat_importance = pd.Series(rf_model.feature_importances_, index=X_train.columns)\n",
    "\n",
    "fig, ax = plt.subplots()\n",
    "feat_importance.nlargest(5).plot(kind='barh', color='green', ax=ax)\n",
    "ax.set_title('Top 5 Feature Importance')\n",
    "# A tight bounding box keeps long feature names from being cut off in the PNG.\n",
    "fig.savefig('feature_importance.png', dpi=300, bbox_inches='tight')\n",
    "plt.show()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}