# Feature Engineering

## Setup and Load Data

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

In [2]:
# Read the raw application training table from disk.
data = pd.read_csv(filepath_or_buffer='../../data/dataset/application_train.csv')

In [3]:
# Work on an independent deep copy so `data` keeps the values exactly as loaded.
df = data.copy(deep=True)

## Handling Special Values (Flag & Normalization)

### Flag Employment Status

In [5]:
# 1 = employed; 0 = not employed (DAYS_EMPLOYED holds the special value 365243)
df['FLAG_EMPLOYED'] = df['DAYS_EMPLOYED'].ne(365243).astype(int)

### Years Employed

In [6]:
# Blank out the 365243 special value (it becomes NaN), then convert the
# remaining day counts to years; dividing by -365 also flips the sign.
df['EMPLOYED_YEARS'] = (
    df['DAYS_EMPLOYED']
    .mask(df['DAYS_EMPLOYED'].eq(365243))
    .div(-365)
)

### Age Transformation

In [7]:
# Convert DAYS_BIRTH to years; dividing by -365 also flips the sign.
df['AGE_YEARS'] = df['DAYS_BIRTH'].div(-365)

### Credit to Income Ratio

In [8]:
# Credit amount per unit of total income.
df['CREDIT_INCOME_RATIO'] = df['AMT_CREDIT'].div(df['AMT_INCOME_TOTAL'])

### Annuity to Income Ratio

In [9]:
# Annuity amount per unit of total income.
df['ANNUITY_INCOME_RATIO'] = df['AMT_ANNUITY'].div(df['AMT_INCOME_TOTAL'])

### Log Transformation

In [10]:
# log(1 + x) of each monetary amount, stored under a LOG_* column.
df['LOG_INCOME'] = df['AMT_INCOME_TOTAL'].pipe(np.log1p)
df['LOG_CREDIT'] = df['AMT_CREDIT'].pipe(np.log1p)
df['LOG_ANNUITY'] = df['AMT_ANNUITY'].pipe(np.log1p)

### Family Burden Feature

In [11]:
# Family members per unit of income; the +1 keeps the denominator non-zero
# when income is 0.
df['FAMILY_BURDEN'] = df['CNT_FAM_MEMBERS'].div(df['AMT_INCOME_TOTAL'].add(1))

## Categorical Feature Selection

### Binary Label Encoding

In [20]:
# Y/N ownership flags and the integer code assigned to each label.
binary_map = {
    'FLAG_OWN_CAR': {'Y': 1, 'N': 0},
    'FLAG_OWN_REALTY': {'Y': 1, 'N': 0}
}

for col, mapping in binary_map.items():
    # Series.map turns every value that is missing from `mapping` into NaN, so
    # running this cell a second time would wipe the already-encoded 0/1
    # column. Skip columns whose non-null values are all codes already, which
    # makes the cell safe to re-run.
    already_encoded = df[col].dropna().isin(list(mapping.values())).all()
    if not already_encoded:
        df[col] = df[col].map(mapping)

### One-Hot Encoding

In [21]:
# Nominal columns that get expanded into indicator columns.
cat_features = [
    'NAME_EDUCATION_TYPE', 'NAME_INCOME_TYPE',
    'OCCUPATION_TYPE', 'NAME_HOUSING_TYPE',
]

# dummy_na=True adds a NaN indicator per listed feature; drop_first=True
# removes the first level of each feature. Columns not listed pass through.
df_encoded = pd.get_dummies(data=df, columns=cat_features, drop_first=True, dummy_na=True)

## Drop Raw Columns

In [23]:
drop_cols = [
    # presumably the row identifier; confirm before relying on that
    'SK_ID_CURR',
    # raw inputs that the engineered features above were derived from
    'DAYS_BIRTH', 'DAYS_EMPLOYED',
    'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY',
    # other columns left out of the modeling table
    'NAME_CONTRACT_TYPE', 'CODE_GENDER', 'NAME_TYPE_SUITE',
    'NAME_FAMILY_STATUS', 'WEEKDAY_APPR_PROCESS_START', 'ORGANIZATION_TYPE',
    'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE',
    'EMERGENCYSTATE_MODE',
]

# Returns a new frame; df_encoded itself is left untouched.
df_model = df_encoded.drop(labels=drop_cols, axis='columns')


## Final Dataset Check

In [24]:
# Display (rows, columns) of the final modeling table.
df_model.shape


(307511, 148)

In [25]:
# Ten columns with the largest share of missing values, highest first.
df_model.isna().mean().sort_values(ascending=False).iloc[:10]

COMMONAREA_MODE             0.698723
COMMONAREA_MEDI             0.698723
COMMONAREA_AVG              0.698723
NONLIVINGAPARTMENTS_AVG     0.694330
NONLIVINGAPARTMENTS_MEDI    0.694330
NONLIVINGAPARTMENTS_MODE    0.694330
LIVINGAPARTMENTS_AVG        0.683550
LIVINGAPARTMENTS_MEDI       0.683550
LIVINGAPARTMENTS_MODE       0.683550
FLOORSMIN_MODE              0.678486
dtype: float64

In [26]:
# List any columns that are still stored as object/category dtype.
(
    df_model
    .select_dtypes(include=['object', 'category'])
    .columns
)

Index([], dtype='object')

Fitur kategorikal dipilih secara selektif berdasarkan relevansi bisnis dan stabilitas data. 
Encoding hanya dilakukan pada fitur yang telah ditentukan sebelumnya untuk menjaga konsistensi pipeline dan menghindari feature leakage.


## Final Summary
Feature engineering dilakukan dengan menekankan rasio finansial, stabilitas pekerjaan, dan karakteristik sosio-ekonomi untuk meningkatkan kemampuan model dalam memprediksi risiko gagal bayar secara interpretable.

## Generate new CSV

In [27]:
# Save final dataset for modeling.
# to_csv does not create missing folders, so make sure the target directory
# exists first; otherwise the write fails on a fresh checkout.
output_path = Path('../../data/processed/df_model.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df_model.to_csv(output_path, index=False)