## Load data

In [5]:
import pandas as pd 
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline 

In [6]:
IS_LOCAL = True

import os

if IS_LOCAL:
    PATH = r"/Users/trinhthilananh/Documents/Feature Engineering/home-credit-default-risk"
else:
    PATH = "../input"

print(os.listdir(PATH))


['test_final_selected_features.csv', 'application_test.csv', '.DS_Store', 'HomeCredit_columns_description.csv', 'POS_CASH_balance.csv', 'credit_card_balance.csv', 'installments_payments.csv', 'application_train.csv', 'bureau.csv', 'train_final_selected_features.csv', 'previous_application.csv', 'bureau_balance.csv', 'sample_submission.csv']


In [7]:
application_train = pd.read_csv(PATH+"/application_train.csv")
application_test = pd.read_csv(PATH+"/application_test.csv")
bureau = pd.read_csv(PATH+"/bureau.csv")
bureau_balance = pd.read_csv(PATH+"/bureau_balance.csv")
credit_card_balance = pd.read_csv(PATH+"/credit_card_balance.csv")
installments_payments = pd.read_csv(PATH+"/installments_payments.csv")
previous_application = pd.read_csv(PATH+"/previous_application.csv")
POS_CASH_balance = pd.read_csv(PATH+"/POS_CASH_balance.csv")

## Loading Libraries

In [8]:
#importing Useful DataStructures
import pandas as pd
import numpy as np
from scipy.stats import uniform

#importing plotting libraries
import matplotlib.pyplot as plt
import seaborn as sns
from prettytable import PrettyTable

#importing Misc Libraries
import os
import gc
import pickle
import warnings
warnings.filterwarnings('ignore')
from datetime import datetime

#for 100% jupyter notebook cell width
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

#sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_curve
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.calibration import CalibratedClassifierCV


## 1. X·ª≠ l√Ω b·∫£ng application_train.csv v√† application_test.csv

In [9]:
# L∆∞u l·∫°i ID c·ªßa t·∫≠p test ƒë·ªÉ d√πng ·ªü cu·ªëi
test_ids = application_test['SK_ID_CURR']

# Th√™m c·ªôt TARGET v√†o t·∫≠p test (v·ªõi gi√° tr·ªã NaN) ƒë·ªÉ g·ªôp 2 file
application_test['TARGET'] = np.nan

# G·ªôp train v√† test
df_app = pd.concat([application_train, application_test], ignore_index=True)

print(f"K√≠ch th∆∞·ªõc df_app (g·ªôp): {df_app.shape}")

# --- Feature Engineering C∆° b·∫£n ---

# 1. L√†m s·∫°ch d·ªØ li·ªáu b·∫•t th∆∞·ªùng (Anomaly)
df_app['DAYS_EMPLOYED'].replace(365243, np.nan, inplace=True)
df_app['CODE_GENDER'].replace('XNA', np.nan, inplace=True)

# 2. T·∫°o c√°c thu·ªôc t√≠nh t·ª∑ l·ªá (Domain Features)
# Th√™m 0.00001 ƒë·ªÉ tr√°nh l·ªói chia cho 0
df_app['CREDIT_INCOME_RATIO'] = df_app['AMT_CREDIT'] / (df_app['AMT_INCOME_TOTAL'] + 0.00001)
df_app['ANNUITY_INCOME_RATIO'] = df_app['AMT_ANNUITY'] / (df_app['AMT_INCOME_TOTAL'] + 0.00001)
df_app['CREDIT_ANNUITY_RATIO'] = df_app['AMT_CREDIT'] / (df_app['AMT_ANNUITY'] + 0.00001)
df_app['CREDIT_GOODS_PRICE_RATIO'] = df_app['AMT_CREDIT'] / (df_app['AMT_GOODS_PRICE'] + 0.00001)
df_app['DAYS_EMPLOYED_TO_BIRTH_RATIO'] = df_app['DAYS_EMPLOYED'] / (df_app['DAYS_BIRTH'] + 0.00001)

# 3. K·∫øt h·ª£p c√°c thu·ªôc t√≠nh EXT_SOURCE
df_app['EXT_SOURCES_MEAN'] = df_app[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].mean(axis=1)
df_app['EXT_SOURCES_PROD'] = df_app['EXT_SOURCE_1'] * df_app['EXT_SOURCE_2'] * df_app['EXT_SOURCE_3']
df_app['EXT_SOURCES_MIN'] = df_app[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].min(axis=1)
df_app['EXT_SOURCES_MAX'] = df_app[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].max(axis=1)

# 4. X·ª≠ l√Ω c√°c c·ªôt Categorical (d√πng One-Hot Encoding)
# L·∫•y danh s√°ch c√°c c·ªôt 'object'
categorical_cols = df_app.select_dtypes(include=['object']).columns

# D√πng pd.get_dummies
df_app = pd.get_dummies(df_app, columns=categorical_cols, dummy_na=True)

print(f"K√≠ch th∆∞·ªõc df_app sau khi x·ª≠ l√Ω: {df_app.shape}")

K√≠ch th∆∞·ªõc df_app (g·ªôp): (356255, 122)
K√≠ch th∆∞·ªõc df_app sau khi x·ª≠ l√Ω: (356255, 270)


## 2. X·ª≠ l√Ω bureau.csv v√† bureau_balance.csv

In [10]:
print("2.1: X·ª≠ l√Ω bureau_balance")

# 1. X·ª≠ l√Ω c·ªôt STATUS (Ch·∫°y get_dummies m·ªôt c√°ch r√µ r√†ng)
# Ch√∫ng ta s·∫Ω l∆∞u l·∫°i t√™n c√°c c·ªôt m·ªõi ƒë∆∞·ª£c t·∫°o
original_cols = set(bureau_balance.columns)

if 'STATUS' in bureau_balance.columns:
    print("ƒêang ch·∫°y pd.get_dummies cho c·ªôt 'STATUS'...")
    # Th√™m dtype=int ƒë·ªÉ ƒë·∫£m b·∫£o c√°c c·ªôt m·ªõi l√† S·ªê
    bureau_balance = pd.get_dummies(bureau_balance, columns=['STATUS'], dummy_na=True, dtype=int)
    # L·∫•y danh s√°ch c√°c c·ªôt m·ªõi
    new_status_cols = list(set(bureau_balance.columns) - original_cols)
else:
    print("C·ªôt 'STATUS' d∆∞·ªùng nh∆∞ ƒë√£ ƒë∆∞·ª£c get_dummies. ƒêang t√¨m c√°c c·ªôt STATUS_...")
    new_status_cols = [col for col in bureau_balance.columns if col.startswith('STATUS_')]

# Ki·ªÉm tra xem c√≥ c·ªôt n√†o ƒë·ªÉ t·ªïng h·ª£p kh√¥ng
if not new_status_cols:
    print("!!! L·ªñI: Kh√¥ng t√¨m th·∫•y c·ªôt STATUS n√†o (STATUS_0, STATUS_C...) ƒë·ªÉ t·ªïng h·ª£p.")
    raise ValueError("Kh√¥ng t√¨m th·∫•y c·ªôt STATUS_... ƒë·ªÉ t·ªïng h·ª£p. Vui l√≤ng ki·ªÉm tra l·∫°i file bureau_balance.")

print(f"C√°c c·ªôt STATUS s·∫Ω ƒë∆∞·ª£c t·ªïng h·ª£p: {new_status_cols}")

# 2. ƒê·ªãnh nghƒ©a c√°c ph√©p t·ªïng h·ª£p
# Ch·ªâ t·ªïng h·ª£p tr√™n c√°c c·ªôt STATUS m·ªõi
balance_aggregations = {}
for col in new_status_cols:
    balance_aggregations[col] = ['mean', 'min', 'max', 'sum', 'count']

# 3. T·ªïng h·ª£p (Aggregate) theo SK_ID_BUREAU
# D√≤ng n√†y s·∫Ω ho·∫°t ƒë·ªông v√¨ balance_aggregations KH√îNG r·ªóng
agg_bureau_balance = bureau_balance.groupby('SK_ID_BUREAU').agg(balance_aggregations)

# 4. GI·∫¢I PH√ÅP S·ª¨A L·ªñI (MultiIndex) - Gi·ªØ nguy√™n nh∆∞ tr∆∞·ªõc
# Chuy·ªÉn 2 c·∫•p ƒë·ªô c·ªôt (v√≠ d·ª•: ('STATUS_0', 'MEAN')) v·ªÅ 1 c·∫•p ƒë·ªô ('STATUS_0_MEAN')
agg_bureau_balance.columns = ['_'.join(col).upper() for col in agg_bureau_balance.columns.values]
agg_bureau_balance.reset_index(inplace=True) # ƒê∆∞a SK_ID_BUREAU v·ªÅ l√†m c·ªôt

print(f"K√≠ch th∆∞·ªõc agg_bureau_balance (ƒë√£ agg): {agg_bureau_balance.shape}")
print(f"5 c·ªôt ƒë·∫ßu ti√™n: {list(agg_bureau_balance.columns[:5])}")

2.1: X·ª≠ l√Ω bureau_balance
ƒêang ch·∫°y pd.get_dummies cho c·ªôt 'STATUS'...
C√°c c·ªôt STATUS s·∫Ω ƒë∆∞·ª£c t·ªïng h·ª£p: ['STATUS_nan', 'STATUS_C', 'STATUS_2', 'STATUS_0', 'STATUS_1', 'STATUS_X', 'STATUS_3', 'STATUS_5', 'STATUS_4']
K√≠ch th∆∞·ªõc agg_bureau_balance (ƒë√£ agg): (817395, 46)
5 c·ªôt ƒë·∫ßu ti√™n: ['SK_ID_BUREAU', 'STATUS_NAN_MEAN', 'STATUS_NAN_MIN', 'STATUS_NAN_MAX', 'STATUS_NAN_SUM']


In [11]:
print("2.2: X·ª≠ l√Ω bureau...")

# 1. H·ª£p nh·∫•t (merge) bureau v·ªõi agg_bureau_balance ƒë√£ t·∫°o ·ªü b∆∞·ªõc 2.1
df_bureau = bureau.merge(agg_bureau_balance, on='SK_ID_BUREAU', how='left')

# 2. X·ª≠ l√Ω categorical trong bureau
bureau_categorical_cols = df_bureau.select_dtypes(include=['object']).columns
df_bureau = pd.get_dummies(df_bureau, columns=bureau_categorical_cols, dummy_na=True)

# 3. ƒê·ªãnh nghƒ©a c√°c ph√©p t·ªïng h·ª£p
# L·∫•y c√°c c·ªôt s·ªë (numeric)
numeric_aggregations = {
    'DAYS_CREDIT': ['min', 'max', 'mean', 'count', 'std'], 
    'CREDIT_DAY_OVERDUE': ['mean', 'max', 'std'], 
    'AMT_CREDIT_SUM': ['sum', 'mean', 'max', 'std'], 
    'AMT_CREDIT_SUM_DEBT': ['sum', 'mean', 'std'], 
    'AMT_CREDIT_SUM_OVERDUE': ['sum', 'mean', 'std'],
    'CNT_CREDIT_PROLONG': ['sum', 'mean', 'std'], 
}
# L·∫•y c√°c c·ªôt categorical ƒë√£ one-hot (t·ª´ bureau v√† bureau_balance)
categorical_cols = [col for col in df_bureau.columns if col.startswith(('CREDIT_ACTIVE_', 'CREDIT_CURRENCY_', 'CREDIT_TYPE_', 'STATUS_'))]
for col in categorical_cols:
    numeric_aggregations[col] = ['mean', 'sum'] # L·∫•y t·ª∑ l·ªá v√† s·ªë l∆∞·ª£ng

# 4. T·ªïng h·ª£p (Aggregate) theo SK_ID_CURR
agg_bureau = df_bureau.groupby('SK_ID_CURR').agg(numeric_aggregations)

# 5. √ÅP D·ª§NG GI·∫¢I PH√ÅP S·ª¨A L·ªñI (ƒê·ªïi t√™n c·ªôt MultiIndex)
agg_bureau.columns = ['BUREAU_' + '_'.join(col).upper() for col in agg_bureau.columns.values]
agg_bureau.reset_index(inplace=True) # SK_ID_CURR v·ªÅ l√†m c·ªôt

print(f"K√≠ch th∆∞·ªõc agg_bureau (ƒë√£ agg): {agg_bureau.shape}")
print(f"5 c·ªôt ƒë·∫ßu ti√™n: {list(agg_bureau.columns[:5])}")

2.2: X·ª≠ l√Ω bureau...
K√≠ch th∆∞·ªõc agg_bureau (ƒë√£ agg): (305811, 164)
5 c·ªôt ƒë·∫ßu ti√™n: ['SK_ID_CURR', 'BUREAU_DAYS_CREDIT_MIN', 'BUREAU_DAYS_CREDIT_MAX', 'BUREAU_DAYS_CREDIT_MEAN', 'BUREAU_DAYS_CREDIT_COUNT']


## 3. X·ª≠ l√Ω Previous_applicaiton.csv

In [12]:
print("3: X·ª≠ l√Ω previous_application...")

# 1. L√†m s·∫°ch gi√° tr·ªã 365243
previous_application['DAYS_FIRST_DRAWING'].replace(365243, np.nan, inplace=True)
previous_application['DAYS_FIRST_DUE'].replace(365243, np.nan, inplace=True)
previous_application['DAYS_LAST_DUE_1ST_VERSION'].replace(365243, np.nan, inplace=True)
previous_application['DAYS_LAST_DUE'].replace(365243, np.nan, inplace=True)
previous_application['DAYS_TERMINATION'].replace(365243, np.nan, inplace=True)

# 2. T·∫°o thu·ªôc t√≠nh m·ªõi
previous_application['AMT_CREDIT_GOODS_PRICE_DIFF'] = previous_application['AMT_CREDIT'] - previous_application['AMT_GOODS_PRICE']

# 3. X·ª≠ l√Ω categorical
prev_categorical_cols = previous_application.select_dtypes(include=['object']).columns
previous_application = pd.get_dummies(previous_application, columns=prev_categorical_cols, dummy_na=True)

# 4. ƒê·ªãnh nghƒ©a c√°c ph√©p t·ªïng h·ª£p
prev_aggregations = {
    'AMT_ANNUITY': ['min', 'max', 'mean', 'sum', 'std'],
    'AMT_APPLICATION': ['min', 'max', 'mean'],
    'AMT_CREDIT': ['min', 'max', 'mean', 'sum', 'std'],
    'AMT_CREDIT_GOODS_PRICE_DIFF': ['mean', 'max', 'sum', 'std'],
    'AMT_DOWN_PAYMENT': ['mean', 'sum', 'std'],
    'DAYS_DECISION': ['min', 'max', 'mean', 'std'],
    'CNT_PAYMENT': ['mean', 'sum', 'std'],
}

# L·∫•y c√°c c·ªôt categorical (v√≠ d·ª•: T·ª∑ l·ªá c√°c h·ª£p ƒë·ªìng b·ªã t·ª´ ch·ªëi/ch·∫•p nh·∫≠n)
cat_cols = [col for col in previous_application.columns if col.startswith(('NAME_CONTRACT_STATUS_', 'CODE_REJECT_REASON_'))]
for col in cat_cols:
    prev_aggregations[col] = ['mean', 'sum']
    
# 5. T·ªïng h·ª£p (Aggregate) theo SK_ID_CURR
agg_previous = previous_application.groupby('SK_ID_CURR').agg(prev_aggregations)

# 6. √ÅP D·ª§NG GI·∫¢I PH√ÅP S·ª¨A L·ªñI (ƒê·ªïi t√™n c·ªôt MultiIndex)
agg_previous.columns = ['PREV_' + '_'.join(col).upper() for col in agg_previous.columns.values]
agg_previous.reset_index(inplace=True) # SK_ID_CURR v·ªÅ l√†m c·ªôt

print(f"K√≠ch th∆∞·ªõc agg_previous (ƒë√£ agg): {agg_previous.shape}")

3: X·ª≠ l√Ω previous_application...
K√≠ch th∆∞·ªõc agg_previous (ƒë√£ agg): (338857, 58)


## 4. X·ª≠ l√Ω installments_payments.csv

In [13]:
print("4. X·ª≠ l√Ω installments_payments...")

# 1. T·∫°o thu·ªôc t√≠nh m·ªõi (S·ªë ng√†y tr·∫£ ch·∫≠m, S·ªë ti·ªÅn tr·∫£ thi·∫øu)
installments_payments['DAYS_PAYMENT_DIFF'] = installments_payments['DAYS_INSTALMENT'] - installments_payments['DAYS_ENTRY_PAYMENT']
installments_payments['AMT_PAYMENT_DIFF'] = installments_payments['AMT_INSTALMENT'] - installments_payments['AMT_PAYMENT']

# 2. ƒê·ªãnh nghƒ©a t·ªïng h·ª£p
install_aggregations = {
    'NUM_INSTALMENT_VERSION': ['nunique'],
    'DAYS_PAYMENT_DIFF': ['mean', 'max', 'sum', 'std'],
    'AMT_PAYMENT_DIFF': ['mean', 'max', 'sum', 'std'],
    'AMT_INSTALMENT': ['mean', 'sum', 'std'],
    'AMT_PAYMENT': ['mean', 'sum', 'std'],
    'SK_ID_PREV': ['nunique'] # ƒê·∫øm s·ªë kho·∫£n vay
}

# 3. T·ªïng h·ª£p
agg_installments = installments_payments.groupby('SK_ID_CURR').agg(install_aggregations)

# 4. √ÅP D·ª§NG GI·∫¢I PH√ÅP S·ª¨A L·ªñI
agg_installments.columns = ['INSTALL_' + '_'.join(col).upper() for col in agg_installments.columns.values]
agg_installments.reset_index(inplace=True)

print(f"K√≠ch th∆∞·ªõc agg_installments (ƒë√£ agg): {agg_installments.shape}")

4. X·ª≠ l√Ω installments_payments...
K√≠ch th∆∞·ªõc agg_installments (ƒë√£ agg): (339587, 17)


## 5. X·ª≠ l√Ω credit_card_balance.csv

In [14]:
print("5: X·ª≠ l√Ω credit_card_balance...")

# 1. T·∫°o thu·ªôc t√≠nh m·ªõi
credit_card_balance['BALANCE_LIMIT_RATIO'] = credit_card_balance['AMT_BALANCE'] / (credit_card_balance['AMT_CREDIT_LIMIT_ACTUAL'] + 0.00001)

# 2. ƒê·ªãnh nghƒ©a t·ªïng h·ª£p
cc_aggregations = {
    'AMT_BALANCE': ['mean', 'max', 'sum', 'std'],
    'AMT_CREDIT_LIMIT_ACTUAL': ['mean', 'max', 'std'],
    'AMT_DRAWINGS_ATM_CURRENT': ['mean', 'max', 'sum', 'std'],
    'AMT_DRAWINGS_CURRENT': ['mean', 'max', 'sum', 'std'],
    'CNT_DRAWINGS_ATM_CURRENT': ['mean', 'max', 'sum', 'std'],
    'CNT_INSTALMENT_MATURE_CUM': ['max', 'mean', 'std'],
    'SK_DPD': ['mean', 'max', 'sum', 'std'], # S·ªë ng√†y qu√° h·∫°n
    'SK_DPD_DEF': ['mean', 'max', 'sum', 'std'], # S·ªë ng√†y qu√° h·∫°n (nghi√™m tr·ªçng)
    'BALANCE_LIMIT_RATIO': ['mean', 'max', 'std']
}

# 3. T·ªïng h·ª£p
agg_credit_card = credit_card_balance.groupby('SK_ID_CURR').agg(cc_aggregations)

# 4. √ÅP D·ª§NG GI·∫¢I PH√ÅP S·ª¨A L·ªñI
agg_credit_card.columns = ['CC_' + '_'.join(col).upper() for col in agg_credit_card.columns.values]
agg_credit_card.reset_index(inplace=True)

print(f"K√≠ch th∆∞·ªõc agg_credit_card (ƒë√£ agg): {agg_credit_card.shape}")

5: X·ª≠ l√Ω credit_card_balance...
K√≠ch th∆∞·ªõc agg_credit_card (ƒë√£ agg): (103558, 34)


## 6. X·ª≠ l√Ω POSH_CASH_balance.csv

In [15]:
print("6: X·ª≠ l√Ω POS_CASH_balance (Phi√™n b·∫£n s·ª≠a l·ªói KeyError)")

# 1. X·ª≠ l√Ω categorical (M·ªôt c√°ch an to√†n)
# KI·ªÇM TRA tr∆∞·ªõc khi ch·∫°y get_dummies
if 'NAME_CONTRACT_STATUS' in POS_CASH_balance.columns:
    print("ƒêang ch·∫°y pd.get_dummies cho c·ªôt 'NAME_CONTRACT_STATUS'...")
    POS_CASH_balance = pd.get_dummies(POS_CASH_balance, columns=['NAME_CONTRACT_STATUS'], dummy_na=True)
else:
    print("C·ªôt 'NAME_CONTRACT_STATUS' d∆∞·ªùng nh∆∞ ƒë√£ ƒë∆∞·ª£c get_dummies. B·ªè qua.")
# 2. ƒê·ªãnh nghƒ©a t·ªïng h·ª£p
pos_aggregations = {
    'MONTHS_BALANCE': ['min', 'max', 'mean', 'std'], 
    'CNT_INSTALMENT': ['min', 'max', 'mean', 'std'], 
    'CNT_INSTALMENT_FUTURE': ['min', 'max', 'mean', 'std'], 
    'SK_DPD': ['mean', 'max', 'sum', 'std'], 
    'SK_DPD_DEF': ['mean', 'max', 'sum', 'std'], 
}

# L·∫•y c√°c c·ªôt categorical (ƒë√£ ƒë∆∞·ª£c one-hot)
# D√≤ng n√†y s·∫Ω t√¨m c√°c c·ªôt 'NAME_CONTRACT_STATUS_Active' v.v.
cat_cols = [col for col in POS_CASH_balance.columns if col.startswith('NAME_CONTRACT_STATUS_')]
for col in cat_cols:
    pos_aggregations[col] = ['mean', 'sum']

# 3. T·ªïng h·ª£p
agg_pos_cash = POS_CASH_balance.groupby('SK_ID_CURR').agg(pos_aggregations)

# 4. √ÅP D·ª§NG GI·∫¢I PH√ÅP S·ª¨A L·ªñI (MultiIndex)
agg_pos_cash.columns = ['POS_' + '_'.join(col).upper() for col in agg_pos_cash.columns.values]
agg_pos_cash.reset_index(inplace=True)

print(f"K√≠ch th∆∞·ªõc agg_pos_cash (ƒë√£ agg): {agg_pos_cash.shape}")

6: X·ª≠ l√Ω POS_CASH_balance (Phi√™n b·∫£n s·ª≠a l·ªói KeyError)
ƒêang ch·∫°y pd.get_dummies cho c·ªôt 'NAME_CONTRACT_STATUS'...
K√≠ch th∆∞·ªõc agg_pos_cash (ƒë√£ agg): (337252, 41)


## 7. Merge c√°c b·∫£ng d·ªØ li·ªáu

In [16]:
print("7: H·ª£p nh·∫•t (Merge) t·∫•t c·∫£...")

# 1. H·ª£p nh·∫•t df_app (t·ª´ B∆∞·ªõc 1) v·ªõi agg_bureau (t·ª´ B∆∞·ªõc 2.2)
df_final = df_app.merge(agg_bureau, on='SK_ID_CURR', how='left')
print(f"K√≠ch th∆∞·ªõc sau khi merge bureau: {df_final.shape}")

# 2. H·ª£p nh·∫•t v·ªõi agg_previous (t·ª´ B∆∞·ªõc 2.3)
df_final = df_final.merge(agg_previous, on='SK_ID_CURR', how='left')
print(f"K√≠ch th∆∞·ªõc sau khi merge previous: {df_final.shape}")

# 3. H·ª£p nh·∫•t v·ªõi agg_installments (t·ª´ B∆∞·ªõc 2.4)
df_final = df_final.merge(agg_installments, on='SK_ID_CURR', how='left')
print(f"K√≠ch th∆∞·ªõc sau khi merge installments: {df_final.shape}")

# 4. H·ª£p nh·∫•t v·ªõi agg_credit_card (t·ª´ B∆∞·ªõc 2.5)
df_final = df_final.merge(agg_credit_card, on='SK_ID_CURR', how='left')
print(f"K√≠ch th∆∞·ªõc sau khi merge credit card: {df_final.shape}")

# 5. H·ª£p nh·∫•t v·ªõi agg_pos_cash (t·ª´ B∆∞·ªõc 2.6)
df_final = df_final.merge(agg_pos_cash, on='SK_ID_CURR', how='left')
print(f"K√≠ch th∆∞·ªõc sau khi merge POS cash: {df_final.shape}")

print("--- H·ª¢P NH·∫§T TH√ÄNH C√îNG, df_final ƒê√É ƒê∆Ø·ª¢C T·∫†O ---")

7: H·ª£p nh·∫•t (Merge) t·∫•t c·∫£...
K√≠ch th∆∞·ªõc sau khi merge bureau: (356255, 433)
K√≠ch th∆∞·ªõc sau khi merge previous: (356255, 490)
K√≠ch th∆∞·ªõc sau khi merge installments: (356255, 506)
K√≠ch th∆∞·ªõc sau khi merge credit card: (356255, 539)
K√≠ch th∆∞·ªõc sau khi merge POS cash: (356255, 579)
--- H·ª¢P NH·∫§T TH√ÄNH C√îNG, df_final ƒê√É ƒê∆Ø·ª¢C T·∫†O ---


## 8. T·∫°o Interaction Features

In [17]:
print("B·∫Øt ƒë·∫ßu B∆∞·ªõc 4: T·∫°o c√°c thu·ªôc t√≠nh t∆∞∆°ng t√°c (Interaction Features)...")

# T∆∞∆°ng t√°c Thu nh·∫≠p v√† Kho·∫£n vay hi·ªán t·∫°i
df_final['APP_INCOME_CREDIT_RATIO'] = df_final['AMT_INCOME_TOTAL'] / (df_final['AMT_CREDIT'] + 0.00001)
df_final['APP_INCOME_ANNUITY_RATIO'] = df_final['AMT_INCOME_TOTAL'] / (df_final['AMT_ANNUITY'] + 0.00001)
df_final['APP_PAYMENT_RATE'] = df_final['AMT_ANNUITY'] / (df_final['AMT_CREDIT'] + 0.00001) # T·ª∑ l·ªá tr·∫£ g√≥p

# T∆∞∆°ng t√°c Thu nh·∫≠p v√† L·ªãch s·ª≠ Bureau (kho·∫£n vay c≈© ·ªü n∆°i kh√°c)
df_final['BUREAU_INCOME_CREDIT_RATIO'] = df_final['BUREAU_AMT_CREDIT_SUM_SUM'] / (df_final['AMT_INCOME_TOTAL'] + 0.00001)
df_final['BUREAU_INCOME_DEBT_RATIO'] = df_final['BUREAU_AMT_CREDIT_SUM_DEBT_SUM'] / (df_final['AMT_INCOME_TOTAL'] + 0.00001)

# T∆∞∆°ng t√°c Thu nh·∫≠p v√† L·ªãch s·ª≠ Previous App (kho·∫£n vay c≈© ·ªü Home Credit)
df_final['PREV_INCOME_CREDIT_RATIO'] = df_final['PREV_AMT_CREDIT_SUM'] / (df_final['AMT_INCOME_TOTAL'] + 0.00001)

# T∆∞∆°ng t√°c Thu nh·∫≠p v√† L·ªãch s·ª≠ Tr·∫£ g√≥p
df_final['INSTALL_INCOME_PAYMENT_RATIO'] = df_final['INSTALL_AMT_PAYMENT_SUM'] / (df_final['AMT_INCOME_TOTAL'] + 0.00001)

print(f"K√≠ch th∆∞·ªõc df_final sau khi th√™m interaction features: {df_final.shape}")

B·∫Øt ƒë·∫ßu B∆∞·ªõc 4: T·∫°o c√°c thu·ªôc t√≠nh t∆∞∆°ng t√°c (Interaction Features)...
K√≠ch th∆∞·ªõc df_final sau khi th√™m interaction features: (356255, 586)


## 9. Feature Selection

In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
import re

In [19]:
print("L·ªçc Feature")

# T√°ch train_df t·∫°m th·ªùi ƒê·ªÇ FIT b·ªô l·ªçc (tr√°nh data leakage)
# df_final l√∫c n√†y ƒë√£ c√≥ 586 features (bao g·ªìm c·∫£ interaction)
train_df_for_selection = df_final[df_final['TARGET'].notnull()].copy()

# === 1. L·ªçc c√°c c·ªôt thi·∫øu qu√° nhi·ªÅu d·ªØ li·ªáu ===
missing_threshold = 0.8 # Ng∆∞·ª°ng 80%
missing_counts = train_df_for_selection.isnull().mean()
cols_to_drop_missing = missing_counts[missing_counts > missing_threshold].index

# √Åp d·ª•ng l·ªçc tr√™n df_final
df_final = df_final.drop(columns=cols_to_drop_missing)
print(f"ƒê√£ lo·∫°i b·ªè {len(cols_to_drop_missing)} c·ªôt do thi·∫øu d·ªØ li·ªáu > 80%.")


# === 2. L·ªçc c√°c c·ªôt c√≥ gi√° tr·ªã duy nh·∫•t (√≠t bi·∫øn thi√™n) ===

# C·∫≠p nh·∫≠t l·∫°i train_df_for_selection sau khi ƒë√£ drop ·ªü b∆∞·ªõc 1
train_df_for_selection = df_final[df_final['TARGET'].notnull()].copy()

unique_counts = train_df_for_selection.nunique()
cols_to_drop_unique = unique_counts[unique_counts == 1].index

# √Åp d·ª•ng l·ªçc tr√™n df_final
df_final = df_final.drop(columns=cols_to_drop_unique)
print(f"ƒê√£ lo·∫°i b·ªè {len(cols_to_drop_unique)} c·ªôt do ch·ªâ c√≥ 1 gi√° tr·ªã duy nh·∫•t.")


print(f"--- K√≠ch th∆∞·ªõc df_final sau L·ªçc 'Th√¥': {df_final.shape} ---")

L·ªçc Feature
ƒê√£ lo·∫°i b·ªè 6 c·ªôt do thi·∫øu d·ªØ li·ªáu > 80%.
ƒê√£ lo·∫°i b·ªè 35 c·ªôt do ch·ªâ c√≥ 1 gi√° tr·ªã duy nh·∫•t.
--- K√≠ch th∆∞·ªõc df_final sau L·ªçc 'Th√¥': (356255, 545) ---


In [20]:
print("ƒê√£ import RandomForestClassifier v√† SelectFromModel t·ª´ sklearn.")

# 1. T√°ch l·∫°i t·∫≠p train (ch·ªâ train m·ªõi c√≥ TARGET)
train_df = df_final[df_final['TARGET'].notnull()].copy()
print(f"K√≠ch th∆∞·ªõc t·∫≠p train ƒë·ªÉ ch·ªçn l·ªçc: {train_df.shape}")

# 2. X·ª≠ l√Ω t√™n c·ªôt (cho an to√†n)
train_df.columns = [re.sub(r'[^A-Za-z0-9_]+', '', col) for col in train_df.columns]

# 3. ƒê·ªãnh nghƒ©a X (features) v√† y (target)
y_train = train_df['TARGET']
cols_to_drop = ['TARGET', 'SK_ID_CURR']
X_train = train_df.drop(columns=cols_to_drop)

print(f"S·ªë l∆∞·ª£ng features ban ƒë·∫ßu: {X_train.shape[1]}")

# 4. X·ª¨ L√ù NaN/Infinity (B·∫ÆT BU·ªòC cho RandomForest)
X_train.replace([np.inf, -np.inf], 0, inplace=True)
X_train.fillna(0, inplace=True)

print("ƒê√£ x·ª≠ l√Ω NaN v√† Infinity cho X_train.")

ƒê√£ import RandomForestClassifier v√† SelectFromModel t·ª´ sklearn.
K√≠ch th∆∞·ªõc t·∫≠p train ƒë·ªÉ ch·ªçn l·ªçc: (307511, 545)
S·ªë l∆∞·ª£ng features ban ƒë·∫ßu: 543
ƒê√£ x·ª≠ l√Ω NaN v√† Infinity cho X_train.


In [21]:
print("9.1: Train RandomForest Selector...")

# 1. Kh·ªüi t·∫°o RandomForest
rf_selector = RandomForestClassifier(
    n_estimators=100,
    n_jobs=-1,
    random_state=42
)

print("ƒêang training m√¥ h√¨nh RandomForest... (C√≥ th·ªÉ m·∫•t v√†i ph√∫t)")
rf_selector.fit(X_train, y_train)
print("Training ho√†n t·∫•t.")

9.1: Train RandomForest Selector...
ƒêang training m√¥ h√¨nh RandomForest... (C√≥ th·ªÉ m·∫•t v√†i ph√∫t)
Training ho√†n t·∫•t.


In [22]:
print("9.2: Ch·ªçn l·ªçc t·ª± ƒë·ªông (Threshold='median')...")

# 1. Kh·ªüi t·∫°o SelectFromModel
# ƒê∆∞a m√¥ h√¨nh ƒë√£ train (rf_selector) v√†o
# 'threshold="median"': T·ª± ƒë·ªông t√≠nh to√°n ƒë·ªô quan tr·ªçng trung v·ªã 
#                       v√† ch·ªâ gi·ªØ l·∫°i c√°c feature c√≥ ƒë·ªô quan tr·ªçng cao h∆°n.
# 'prefit=True': B√°o cho SelectFromModel bi·∫øt ch√∫ng ta ƒë√£ train rf_selector r·ªìi.
selector = SelectFromModel(rf_selector, threshold="median", prefit=True)

print(f"ƒê√£ t·∫°o b·ªô ch·ªçn (selector) v·ªõi ng∆∞·ª°ng = 'median'.")

# 2. L·∫•y danh s√°ch c√°c feature quan tr·ªçng
# .get_support() tr·∫£ v·ªÅ m·ªôt m·∫£ng True/False cho m·ªói feature
feature_mask = selector.get_support()
# L·∫•y t√™n c·ªßa c√°c feature ƒë∆∞·ª£c ch·ªçn (l√† True)
important_features = X_train.columns[feature_mask]

# Chuy·ªÉn v·ªÅ d·∫°ng list
important_features = important_features.tolist()

print(f"\n--- Ph√¢n t√≠ch ch·ªçn l·ªçc ---")
print(f"T·ªïng s·ªë features ban ƒë·∫ßu: {X_train.shape[1]}")
print(f"S·ªë l∆∞·ª£ng features quan tr·ªçng (tr√™n m·ª©c trung v·ªã) ƒë∆∞·ª£c gi·ªØ l·∫°i: {len(important_features)}")

print("\n--- Features ƒë∆∞·ª£c ch·ªçn: ---")
# Ch√∫ng ta c√≥ th·ªÉ xem l·∫°i DataFrame c≈© ƒë·ªÉ th·∫•y
features_df = pd.DataFrame({
    'feature': X_train.columns,
    'importance': rf_selector.feature_importances_
}).sort_values(by='importance', ascending=False)
print(features_df[features_df['feature'].isin(important_features)])

9.2: Ch·ªçn l·ªçc t·ª± ƒë·ªông (Threshold='median')...
ƒê√£ t·∫°o b·ªô ch·ªçn (selector) v·ªõi ng∆∞·ª°ng = 'median'.

--- Ph√¢n t√≠ch ch·ªçn l·ªçc ---
T·ªïng s·ªë features ban ƒë·∫ßu: 543
S·ªë l∆∞·ª£ng features quan tr·ªçng (tr√™n m·ª©c trung v·ªã) ƒë∆∞·ª£c gi·ªØ l·∫°i: 272

--- Features ƒë∆∞·ª£c ch·ªçn: ---
                                              feature  importance
109                                  EXT_SOURCES_MEAN    0.022812
111                                   EXT_SOURCES_MIN    0.017340
112                                   EXT_SOURCES_MAX    0.016145
28                                       EXT_SOURCE_2    0.014417
6                                          DAYS_BIRTH    0.008653
..                                                ...         ...
142     NAME_EDUCATION_TYPE_Secondarysecondaryspecial    0.000910
530  POS_NAME_CONTRACT_STATUS_RETURNEDTOTHESTORE_MEAN    0.000908
179                WEEKDAY_APPR_PROCESS_START_TUESDAY    0.000906
121                           

## 10. X·ª≠ l√Ω Missing value, C√¢n b·∫±ng d·ªØ li·ªáu

In [23]:
# Th∆∞ vi·ªán c·∫ßn thi·∫øt cho c√°c b∆∞·ªõc ti·∫øp theo
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split

In [25]:
print("--- B∆Ø·ªöC 10: X·ª≠ l√Ω NaN, Chia Train/Test & C√¢n b·∫±ng D·ªØ li·ªáu ---")

# S·ª≠a ƒë·ªïi logic ƒë·ªÉ gi·ªØ l·∫°i SK_ID_CURR trong t·∫≠p features khi ƒëi·ªÅn NaN

# 1. ƒê·ªãnh nghƒ©a l·∫°i t·∫≠p train v√† test
train_df_filtered = df_final[df_final['TARGET'].notnull()].copy()
test_df_filtered = df_final[df_final['TARGET'].isnull()].copy()
print(f"K√≠ch th∆∞·ªõc t·∫≠p Train ƒë√£ l·ªçc (v·∫´n c√≤n NaN): {train_df_filtered.shape}")


# 2. X·ª¨ L√ù MISSING VALUE TR√äN T·∫¨P TRAIN (Imputation) 
X_all_features = train_df_filtered.drop(columns=['TARGET']).copy()
y_train = train_df_filtered['TARGET']

X_features_only = X_all_features.drop(columns=['SK_ID_CURR'])

X_filled_features = X_features_only.fillna(X_features_only.mean())

for col in X_filled_features.select_dtypes(include=['object']).columns:
    X_filled_features[col] = X_filled_features[col].fillna(X_filled_features[col].mode()[0]) 

# Th√™m l·∫°i SK_ID_CURR v√†o t·∫≠p features ƒë√£ fill
X_filled_features['SK_ID_CURR'] = X_all_features['SK_ID_CURR'].values 

# clean_df gi·ªù ƒë√¢y s·∫Ω bao g·ªìm t·∫•t c·∫£ features, SK_ID_CURR, v√† TARGET
clean_df = pd.concat(
    [X_filled_features.reset_index(drop=True), y_train.reset_index(drop=True)], 
    axis=1
)


# 3. CHIA L·∫†I TRAIN/TEST 
X = clean_df.drop(columns=["TARGET", "SK_ID_CURR"])
y = clean_df["TARGET"]

# Chia d·ªØ li·ªáu train/test (80% - 20%)
X_train_full, X_test_full, y_train_full, y_test_full = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Gi·∫£m k√≠ch th∆∞·ªõc (Sampling, theo logic c·ªßa AI_PROJECT.ipynb)
train_frac = 0.2 
test_frac = 0.1

X_train_small = X_train_full.sample(frac=train_frac, random_state=42)
y_train_small = y_train_full.loc[X_train_small.index]

X_test_small = X_test_full.sample(frac=test_frac, random_state=42)
y_test_small = y_test_full.loc[X_test_small.index]

print(f"\nK√≠ch th∆∞·ªõc t·∫≠p train nh·ªè: {X_train_small.shape}, Test nh·ªè: {X_test_small.shape}")

# 4. C√ÇN B·∫∞NG D·ªÆ LI·ªÜU (UnderSampling)
rus = RandomUnderSampler(random_state=42)
X_train_resampled, y_train_resampled = rus.fit_resample(X_train_small, y_train_small)

print("‚úÖ Train nh·ªè ƒë√£ ƒë∆∞·ª£c c√¢n b·∫±ng b·∫±ng UnderSampling. K√≠ch th∆∞·ªõc m·ªõi:")
print(pd.Series(y_train_resampled).value_counts())

# 5. X·ª¨ L√ù MISSING VALUE TR√äN T·∫¨P TEST (√Åp d·ª•ng Mean t·ª´ t·∫≠p Train g·ªëc)
# T√°ch X_test_final (t·∫≠p test nh·ªè) v√† x·ª≠ l√Ω NaN b·∫±ng Mean t·ª´ T·∫¨P TRAIN g·ªëc (X_train)
X_test_final = X_test_small.fillna(X_train.mean()) # D√πng mean c·ªßa t·∫≠p train g·ªëc

# G·ªôp l·∫°i th√†nh DataFrame Test cu·ªëi c√πng (ƒë√£ clean)
test_df_final = pd.concat(
    [X_test_final.reset_index(drop=True),
     y_test_small.reset_index(drop=True)],
    axis=1
)

# 6. XU·∫§T FILE CU·ªêI C√ôNG
train_balanced_df = pd.concat(
    [pd.DataFrame(X_train_resampled, columns=X.columns),
     pd.Series(y_train_resampled, name='TARGET')],
    axis=1
)

# S·ª≠ d·ª•ng m·ªôt ƒë∆∞·ªùng d·∫´n chung (PATH ƒë√£ ƒë·ªãnh nghƒ©a ·ªü ƒë·∫ßu file)
output_path_train_balanced = PATH + "/train_balanced_under.csv"
output_path_test_small = PATH + "/test_modeling_small.csv" # ƒê·ªïi t√™n cho r√µ r√†ng h∆°n

train_balanced_df.to_csv(output_path_train_balanced, index=False)
test_df_final.to_csv(output_path_test_small, index=False)


print("\n--- HO√ÄN T·∫§T QUY TR√åNH FEATURE ENGINEERING & TI·ªÄN M√î H√åNH ---")
print(f"üíæ Train balanced ƒë√£ l∆∞u: {train_balanced_df.shape}")
print(f"üíæ Test modeling ƒë√£ l∆∞u: {test_df_final.shape}")

--- B∆Ø·ªöC 10: X·ª≠ l√Ω NaN, Chia Train/Test & C√¢n b·∫±ng D·ªØ li·ªáu ---
K√≠ch th∆∞·ªõc t·∫≠p Train ƒë√£ l·ªçc (v·∫´n c√≤n NaN): (307511, 545)

K√≠ch th∆∞·ªõc t·∫≠p train nh·ªè: (49202, 543), Test nh·ªè: (6150, 543)
‚úÖ Train nh·ªè ƒë√£ ƒë∆∞·ª£c c√¢n b·∫±ng b·∫±ng UnderSampling. K√≠ch th∆∞·ªõc m·ªõi:
TARGET
0.0    3958
1.0    3958
Name: count, dtype: int64

--- HO√ÄN T·∫§T QUY TR√åNH FEATURE ENGINEERING & TI·ªÄN M√î H√åNH ---
üíæ Train balanced ƒë√£ l∆∞u: (7916, 544)
üíæ Test modeling ƒë√£ l∆∞u: (6150, 544)
