# Feature Engineering

## Setup and Load Data

In [2]:
import pandas as pd
import numpy as np

In [4]:
# Location of the raw application table, relative to this notebook.
RAW_TRAIN_CSV = '../../data/dataset/application_train.csv'
data = pd.read_csv(RAW_TRAIN_CSV)

In [5]:
# Work on an independent copy so the frame loaded into `data` stays untouched
# by the feature columns added below.
df = data.copy(deep=True)

## Handling Special Values (Flag & Normalization)

### Flag Employment Status

In [11]:
# FLAG_EMPLOYED: 1 = employed, 0 = not employed.
# A DAYS_EMPLOYED value of 365243 is the marker for "not employed";
# every other value is flagged as employed.
df['FLAG_EMPLOYED'] = df['DAYS_EMPLOYED'].ne(365243).astype(int)

### Years Employed

In [12]:
# Blank out the 365243 marker (those rows become NaN), then convert the
# remaining day counts to years. Dividing by -365 also flips the sign --
# presumably because DAYS_EMPLOYED is stored as a negative offset; confirm
# against the dataset description.
df['EMPLOYED_YEARS'] = (
    df['DAYS_EMPLOYED']
    .mask(df['DAYS_EMPLOYED'].eq(365243))
    .div(-365)
)

### Age Transformation

In [13]:
# Convert DAYS_BIRTH to years using a 365-day year; the negative divisor
# flips the sign (presumably DAYS_BIRTH is a negative day offset -- confirm).
df['AGE_YEARS'] = df['DAYS_BIRTH'].div(-365)

### Credit to Income Ratio

In [14]:
# Credit amount expressed as a multiple of total income.
# A zero income is masked to NaN before dividing, so such rows yield NaN
# instead of +/-inf; inf values would not be counted by an isnull() check.
# Rows with a non-zero income are computed exactly as a plain division.
df['CREDIT_INCOME_RATIO'] = (
    df['AMT_CREDIT']
    / df['AMT_INCOME_TOTAL'].where(df['AMT_INCOME_TOTAL'] != 0)
)

### Annuity to Income Ratio

In [15]:
# Annuity amount expressed as a fraction of total income.
# A zero income is masked to NaN before dividing, so such rows yield NaN
# instead of +/-inf; inf values would not be counted by an isnull() check.
# Rows with a non-zero income are computed exactly as a plain division.
df['ANNUITY_INCOME_RATIO'] = (
    df['AMT_ANNUITY']
    / df['AMT_INCOME_TOTAL'].where(df['AMT_INCOME_TOTAL'] != 0)
)

### Log Transformation

In [16]:
# log(1 + x) of each monetary amount; the mapping is new column -> source
# column, and its insertion order fixes the order the columns are added in.
for log_col, raw_col in {
    'LOG_INCOME': 'AMT_INCOME_TOTAL',
    'LOG_CREDIT': 'AMT_CREDIT',
    'LOG_ANNUITY': 'AMT_ANNUITY',
}.items():
    df[log_col] = np.log1p(df[raw_col])

### Family Burden Feature

In [17]:
# Family members per unit of income. The +1 in the denominator keeps the
# division finite when the income is zero.
df['FAMILY_BURDEN'] = df['CNT_FAM_MEMBERS'].div(
    df['AMT_INCOME_TOTAL'].add(1)
)

## Categorical Feature Selection

In [18]:
# Categorical columns selected for one-hot encoding. The order is kept
# as-is because it determines the order of the generated dummy columns.
cat_features = [
    # education / income source / occupation
    'NAME_EDUCATION_TYPE', 'NAME_INCOME_TYPE', 'OCCUPATION_TYPE',
    # housing type and ownership flags
    'NAME_HOUSING_TYPE', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY',
]

### One-Hot Encoding

In [19]:
# One-hot encode only the selected categorical columns; every other column
# of `df` is carried over unchanged.
#   drop_first=True -> the first level of each feature is dropped
#   dummy_na=True   -> an extra indicator column per feature marks NaN
df_encoded = pd.get_dummies(
    data=df,
    columns=cat_features,
    drop_first=True,
    dummy_na=True,
)

## Drop Raw Columns

In [20]:
# Columns removed from the modelling frame: SK_ID_CURR (presumably a row
# identifier -- confirm) plus the raw day counts and amounts from which
# AGE_YEARS, EMPLOYED_YEARS, the ratio features and the LOG_* columns
# were derived above.
drop_cols = [
    'SK_ID_CURR',
    'DAYS_BIRTH', 'DAYS_EMPLOYED',
    'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY',
]

# drop() returns a new frame; df_encoded itself is left intact.
df_model = df_encoded.drop(labels=drop_cols, axis='columns')


## Final Dataset Check

In [21]:
# (rows, columns) of the final modelling frame.
df_model.shape


(307511, 160)

In [22]:
# Share of missing values per column, ten highest first.
(
    df_model
    .isna()
    .mean()
    .sort_values(ascending=False)
    .iloc[:10]
)

COMMONAREA_MODE             0.698723
COMMONAREA_MEDI             0.698723
COMMONAREA_AVG              0.698723
NONLIVINGAPARTMENTS_MODE    0.694330
NONLIVINGAPARTMENTS_AVG     0.694330
NONLIVINGAPARTMENTS_MEDI    0.694330
FONDKAPREMONT_MODE          0.683862
LIVINGAPARTMENTS_AVG        0.683550
LIVINGAPARTMENTS_MODE       0.683550
LIVINGAPARTMENTS_MEDI       0.683550
dtype: float64

## Final Summary
Feature engineering dilakukan dengan menekankan rasio finansial, stabilitas pekerjaan, dan karakteristik sosio-ekonomi untuk meningkatkan kemampuan model dalam memprediksi risiko gagal bayar secara interpretable.