# 0. Set Up
- 기본 분석: numpy, pandas, matplotlib, seaborn
- 모델: scikit-learn, torch, tqdm, lightgbm, xgboost

In [None]:
!pip install lightgbm
!pip install xgboost
!pip install tqdm
!pip install torch

# 1. Data Analysis

In [None]:
import time
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the competition CSVs: training labels, training features, test features.
original_train_labels, original_train_values, original_test_values = map(
    pd.read_csv,
    (
        "./data/train_labels.csv",
        "./data/train_values.csv",
        "./data/test_values.csv",
    ),
)

In [None]:
# data info: print a per-column summary (dtype, non-null count) of the training features
original_train_values.info()

In [None]:
# Count missing values per column of the training features.
original_train_values.isna().sum()

In [None]:
# damage_grade distribution (class imbalance): one bar per damage grade
sns.countplot(x="damage_grade", data=original_train_labels)
plt.title("damage grade distribution")  # fixed typo ("drade")
plt.show()

In [None]:
# data count: frequency of each value in the has_secondary_use_use_police column
original_train_values['has_secondary_use_use_police'].value_counts()

In [None]:
# Detailed histogram of area_percentage, split by damage grade.
# Values are clipped to [0, 30] so everything above 30 is counted in the last bar.
# The labels previously said "height", although the plotted column is area_percentage.
# NOTE(review): hue relies on train values and labels being row-aligned — confirm.
plt.figure(figsize=(18,9))
sns.countplot(x=np.clip(original_train_values["area_percentage"],0,30), hue=original_train_labels["damage_grade"])
plt.ylabel("frequency")
plt.xlabel("area percentage")
plt.xticks(rotation=90)
plt.title("area percentage histograms")
plt.legend(["damage_grade = 1","damage_grade = 2","damage_grade = 3"])
plt.show()

# 2. Selecting Columns
- train a lightGBM model
- perform hyperparameter tuning
- TODO: 중요도가 낮은 것부터 하나씩 지워나가며 성능 변화를 관찰하기

In [None]:
import pickle
from tqdm import tqdm

from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold

import lightgbm as lgb

## 2-1. Basic Model

In [None]:
# Columns treated as categorical by the baseline models:
# the three geographic id levels first, then the building/site descriptors.
original_categorical_cols = [
    *(f"geo_level_{level}_id" for level in (1, 2, 3)),
    "foundation_type",
    "roof_type",
    "ground_floor_type",
    "land_surface_condition",
    "other_floor_type",
    "position",
    "plan_configuration",
    "legal_ownership_status",
]

In [None]:
# Attach the label to every feature row (left join on building_id), then set up
# the baseline model: categorical encoding followed by a 5-fold split.
original_train_merge = pd.merge(
    original_train_values,
    original_train_labels,
    how="left",
    on="building_id",
)
lgb_original = LightGBMSmall(target_col="damage_grade", df=original_train_merge)
lgb_original.encoding(categorical_cols=original_categorical_cols)
lgb_original.kfold(n_splits=5)

In [None]:
# Sanity check (optional): is building_id among the model features, and how many features are there?
original_train_columns = list(lgb_original.X.columns)
("building_id" in original_train_columns, len(original_train_columns))

In [None]:
# train the model (LightGBMSmall.train is defined outside this view)
lgb_original.train()

In [None]:
# evaluate the model (LightGBMSmall.eval is defined outside this view)
lgb_original.eval()

In [None]:
# save models: pickle the whole wrapper object (including its .models list) to the working directory
with open("lgb_original.pkl", "wb") as f:
    pickle.dump(lgb_original, f)

In [None]:
# load models
# NOTE: pickle.load executes arbitrary code from the file; only load pickles produced by this notebook.
with open("lgb_original.pkl", "rb") as f:
    lgb_original = pickle.load(f)

# Report how many models the wrapper holds and the type of the first one.
print(f"모델 개수: {len(lgb_original.models)}")
print(f"모델 타입: {type(lgb_original.models[0])}")

## 2-2. Column Importance Measure

In [None]:
# Per-feature importance table, most important first.
# Only the first model in lgb_original.models is inspected here.
cols = lgb_original.X.columns
importances = lgb_original.models[0].feature_importances_

featimp_df = pd.DataFrame({"feature": cols, "importance": importances})
featimp_df = featimp_df.sort_values("importance", ascending=False)

In [None]:
# Number of least-important features to set aside.
low_K = 10
# Show every feature except the low_K least important ones.
n_kept = len(featimp_df) - low_K
print(featimp_df.iloc[:n_kept])

In [None]:
# Show the low_K least important features (featimp_df is sorted by descending importance).
print(featimp_df.tail(low_K))

## 2-3. Column Selection Experiment
- omitted from the current pipeline

In [None]:
# Keep every feature except the low_K least important ones, and add the target column.
top_col = featimp_df['feature'].head(len(featimp_df) - low_K).tolist() + ["damage_grade"]

In [None]:
# Work on a copy of the merged frame and cap height_percentage to the range [0, 11].
top_train_merge = original_train_merge.copy()
top_train_merge["height_percentage"] = top_train_merge["height_percentage"].clip(0, 11)
# The analogous cap for area_percentage (range [0, 30]) is currently disabled:
# top_train_merge["area_percentage"] = np.clip(original_train_values["area_percentage"],0,30)

In [None]:
# Define a new model restricted to the selected columns.
# The categorical columns are filtered in the order of original_categorical_cols;
# a set intersection would give an arbitrary, run-dependent order.
top_categorical_cols = [col for col in original_categorical_cols if col in top_col]
lgb_top = LightGBMSmall(df=top_train_merge[top_col], target_col="damage_grade")
lgb_top.encoding(categorical_cols=top_categorical_cols)
lgb_top.kfold(n_splits=5)

In [None]:
# Train the reduced-feature model (LightGBMSmall.train is defined outside this view).
lgb_top.train()

In [None]:
# Evaluate the reduced-feature model (LightGBMSmall.eval is defined outside this view).
lgb_top.eval()

In [None]:
# Save the reduced-feature model.
# (Previously lgb_original was dumped into lgb_top.pkl by mistake.)
with open("lgb_top.pkl", "wb") as f:
    pickle.dump(lgb_top, f)

# 3. Data Preprocessing
- Low-dim mapping

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torch.utils.data import Dataset, DataLoader, random_split
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
from tqdm import tqdm

In [None]:
# This part is optional.
# Turn the selected-column list into a feature list: drop the target if it is
# present and make sure the join key building_id appears exactly once, so
# re-running the cell does not create duplicate columns.
if 'damage_grade' in top_col:
    top_col.remove('damage_grade')
if 'building_id' not in top_col:
    top_col.append('building_id')
modified_train_values = original_train_values[top_col]
# Shift the labels down by one so they start at 0.
modified_train_labels = original_train_labels['damage_grade'] - 1
modified_test_values = original_test_values[top_col]

## 3-1. Low-dim Mapping

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the training rows for validation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    original_train_values,
    original_train_labels,
    test_size=0.2,
    random_state=42,
)

# Subtract 1 from the labels so they start at 0; keep the series name "damage_grade".
y_train_idx = (y_train['damage_grade'] - 1).rename("damage_grade")
y_test_idx = (y_test['damage_grade'] - 1).rename("damage_grade")

# Features and shifted label side by side (rows are matched on the index).
att_train_dataset = pd.concat([X_train, y_train_idx], axis=1)
att_val_dataset = pd.concat([X_test, y_test_idx], axis=1)

train_dataset, val_dataset = (
    GeoOnlyDataset(frame, label_col="damage_grade")
    for frame in (att_train_dataset, att_val_dataset)
)

# Shuffle only the training batches.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=512)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=1024)

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Number of categories per geo level (largest id + 1).
# Computed over the full train and test feature tables instead of the 80%
# training split: an id that occurs only in the validation split (or test set)
# would otherwise be out of range for a table sized from the training split.
# (n1..n3 are presumably used as embedding-table sizes by GeoEncoderWithHead.)
n1, n2, n3 = (
    int(max(original_train_values[col].max(), original_test_values[col].max())) + 1
    for col in ("geo_level_1_id", "geo_level_2_id", "geo_level_3_id")
)
n_classes = att_train_dataset["damage_grade"].nunique()  # e.g. 3

model = GeoEncoderWithHead(
    n1=n1, n2=n2, n3=n3,
    n_classes=n_classes,
    d=32,
    geo_dim=32,
    n_heads=1,
).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)

# Training loop: 10 epochs, reporting train and validation loss/accuracy after each.
for epoch in range(1, 11):
    train_loss, train_acc = train_one_epoch(model, train_loader, optimizer, criterion, device)
    val_loss, val_acc = eval_one_epoch(model, val_loader, criterion, device)
    print(f"[Epoch {epoch:02d}] "
          f"Train loss: {train_loss:.4f}, acc: {train_acc:.4f} | "
          f"Val loss: {val_loss:.4f}, acc: {val_acc:.4f}")



In [None]:
# Inputs for the low-dimensional mapping step.
# The feature frames are copied: the geo-encoding step adds *_enc columns, and
# doing that on the column-subset frames themselves raises pandas'
# SettingWithCopyWarning and silently changes modified_*_values as well.
ldim_train_values = modified_train_values.copy()
ldim_train_labels = original_train_labels
ldim_test_values = modified_test_values.copy()

In [None]:
geo_cols = ["geo_level_1_id", "geo_level_2_id", "geo_level_3_id"]
# Stack the train and test geo ids so each encoder sees every value it must transform.
all_geo_data = pd.concat(
    [ldim_train_values[geo_cols], ldim_test_values[geo_cols]],
    axis=0,
)

# One LabelEncoder per geo column, fitted on train + test values.
encoders = {}
for col in geo_cols:
    le = LabelEncoder()
    encoders[col] = le
    le.fit(all_geo_data[col])

    # Add the encoded ids as a new "<col>_enc" column on both frames.
    enc_name = f"{col}_enc"
    ldim_train_values[enc_name] = le.transform(ldim_train_values[col])
    ldim_test_values[enc_name] = le.transform(ldim_test_values[col])

# Number of distinct classes per geo level.
N1_DIM, N2_DIM, N3_DIM = (len(encoders[name].classes_) for name in geo_cols)

print(f"Geo Level 1: {N1_DIM} classes")
print(f"Geo Level 2: {N2_DIM} classes")
print(f"Geo Level 3: {N3_DIM} classes")

In [None]:
# create dataset: attach labels to the encoded features via building_id
ldim_merge_df = ldim_train_values.merge(ldim_train_labels, on="building_id", how="left")
full_ldim_dataset = GeoDataset(ldim_merge_df, label_col="damage_grade")

# 8:2 train/validation split. A seeded generator makes the split reproducible
# across runs, matching the fixed seed used for train_test_split earlier.
ldim_train_size = int(0.8 * len(full_ldim_dataset))
ldim_val_size = len(full_ldim_dataset) - ldim_train_size
ldim_train_dataset, ldim_val_dataset = random_split(
    full_ldim_dataset,
    [ldim_train_size, ldim_val_size],
    generator=torch.Generator().manual_seed(42),
)

# DataLoader: shuffle only the training batches
batch_size = 256
ldim_train_loader = DataLoader(ldim_train_dataset, batch_size=batch_size, shuffle=True)
ldim_val_loader = DataLoader(ldim_val_dataset, batch_size=batch_size, shuffle=False)

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Per-level sizes passed to the model after the class counts
# (presumably embedding widths — confirm against HierarchicalEmbeddingModel).
EMBED_D1, EMBED_D2, EMBED_D3 = 16, 128, 160
ldim_model = HierarchicalEmbeddingModel(
    N1_DIM, N2_DIM, N3_DIM,
    EMBED_D1, EMBED_D2, EMBED_D3,
    3,
)
ldim_model = ldim_model.to(device)

# Class-weighted loss to counter the class imbalance.
ldim_weights = torch.tensor([0.5, 0.2, 0.3], device=device)
ldim_criterion = nn.CrossEntropyLoss(weight=ldim_weights)
ldim_optimizer = optim.Adam(ldim_model.parameters(), lr=1e-3)

In [None]:
# save ldim_base model
# The state dict is stored as returned by state_dict(); nothing here moves the
# tensors to the CPU, despite the variable name.
# NOTE(review): this writes 'ldim_base.pth' to the working directory, whereas the
# load cell reads 'model/ldim_base.pth' — confirm which path is intended.
# NOTE(review): no training step for ldim_model is visible before this save — confirm.
state_dict_cpu = ldim_model.state_dict()
torch.save(state_dict_cpu, 'ldim_base.pth')

In [None]:
# load ldim_base model
# map_location places the checkpoint tensors on the CPU while loading;
# load_state_dict then copies them into ldim_model's parameters.
# NOTE(review): the save cell writes 'ldim_base.pth' (no 'model/' prefix) — confirm the path.
ldim_base_file = torch.load('model/ldim_base.pth', map_location=torch.device('cpu'))
ldim_model.load_state_dict(ldim_base_file)
ldim_model.eval()  # switch the model to evaluation mode

In [None]:
# Run the train and test features through extract_embeddings with the loaded model
# (extract_embeddings is defined outside this view).
train_values_embedded = extract_embeddings(ldim_train_values, ldim_model, device)
test_values_embedded = extract_embeddings(ldim_test_values, ldim_model, device)
# Labels are used unchanged.
train_labels_embedded = ldim_train_labels

# Optional height_percentage clipping, currently disabled:
# train_values_embedded["height_percentage"] = np.clip(train_values_embedded["height_percentage"],0,11)
# test_values_embedded["height_percentage"] = np.clip(test_values_embedded["height_percentage"],0,11)

print("dataset shape with embedding columns:")
print("Train:", train_values_embedded.shape)
print("Test:", test_values_embedded.shape)

# 4. Model Training
- 기본 모델들을 최종적으로 학습시킨다.
- 여기서 완성된 모델이 추론을 수행하거나, 더 큰 모델로 합쳐짐.

In [None]:
# Join the labels onto the embedded features, then drop the join key so it is not used as a feature.
final_train_df = (
    train_values_embedded
    .merge(train_labels_embedded, how="left", on="building_id")
    .drop(columns='building_id')
)

In [None]:
# XGBoost model on the final training frame: categorical encoding + 5-fold split.
# NOTE(review): encoding uses original_categorical_cols (which includes the geo_level_* ids),
# while the XGBoost prediction cells pass a geo-free categorical list — confirm this is intended.
xgb_ensemble = XGBoostSmall(df=final_train_df, target_col="damage_grade")
xgb_ensemble.encoding(categorical_cols=original_categorical_cols)
xgb_ensemble.kfold(n_splits=5)

In [None]:
# Train the XGBoost model (XGBoostSmall.train is defined outside this view).
xgb_ensemble.train()

In [None]:
# Evaluate the XGBoost model (XGBoostSmall.eval is defined outside this view).
xgb_ensemble.eval()

In [None]:
# save models: pickle the whole XGBoost wrapper object to the working directory
with open("xgb_ensemble.pkl", "wb") as f:
    pickle.dump(xgb_ensemble, f)

In [None]:
# load models
# NOTE: pickle.load executes arbitrary code from the file; only load pickles produced by this notebook.
with open("xgb_ensemble.pkl", "rb") as f:
    xgb_ensemble = pickle.load(f)

# Report how many models the wrapper holds and the type of the first one.
print(f"모델 개수: {len(xgb_ensemble.models)}")
print(f"모델 타입: {type(xgb_ensemble.models[0])}")

In [None]:
# LightGBMLimit model on the final training frame: categorical encoding + 5-fold split
# (LightGBMLimit is defined outside this view).
lgb_limit = LightGBMLimit(df=final_train_df, target_col="damage_grade")
lgb_limit.encoding(categorical_cols=original_categorical_cols)
lgb_limit.kfold(n_splits=5)

In [None]:
# Train the LightGBMLimit model.
lgb_limit.train()

In [None]:
# Evaluate the LightGBMLimit model.
lgb_limit.eval()

In [None]:
# save models: pickle the whole LightGBMLimit wrapper object to the working directory
with open("lgb_limit.pkl", "wb") as f:
    pickle.dump(lgb_limit, f)

In [None]:
# Categorical columns for the final models: the non-geographic descriptors only
# (the geo_level_* ids are not listed here).
final_categorical_cols = [
    # construction type
    "foundation_type", "roof_type", "ground_floor_type",
    # site
    "land_surface_condition",
    # layout
    "other_floor_type", "position", "plan_configuration",
    # ownership
    "legal_ownership_status",
]

In [None]:
# Final LightGBM model: categorical encoding (non-geo columns only) + 5-fold split.
lgb_ensemble = LightGBM(df=final_train_df, target_col="damage_grade")
lgb_ensemble.encoding(categorical_cols=final_categorical_cols)
lgb_ensemble.kfold(n_splits=5)
# Sanity check: is building_id among the features, and how many features are there?
train_columns = lgb_ensemble.X.columns.to_list()
"building_id" in train_columns, len(train_columns)

In [None]:
# Train the final LightGBM model.
lgb_ensemble.train()

In [None]:
# Evaluate the final LightGBM model.
lgb_ensemble.eval()

In [None]:
# save model: pickle the whole LightGBM wrapper object to the working directory
with open("lgb_ensemble.pkl", "wb") as f:
    pickle.dump(lgb_ensemble, f)

In [None]:
# load model
# NOTE: pickle.load executes arbitrary code from the file; only load pickles produced by this notebook.
with open("lgb_ensemble.pkl", "rb") as f:
    lgb_ensemble = pickle.load(f)

# Report how many models the wrapper holds and the type of the first one.
print(f"모델 개수: {len(lgb_ensemble.models)}")
print(f"모델 타입: {type(lgb_ensemble.models[0])}")

# 5. Combining Baseline Models

- Ensemble Method를 사용해서 모델의 성능을 높인다.
- Stacking 기법. LinearSVM을 이용해 Meta 모델을 만들기
- TODO: lgb_ensemble + xgb_ensemble 모델 만들어보기

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold

In [None]:
# Run predict_prob with both model lists.
# Note that the training features are passed as test_df here.
prob_df = predict_prob(
    lgb_models=lgb_ensemble.models,
    xgb_models=xgb_ensemble.models,
    test_df=train_values_embedded,
    categorical_cols=final_categorical_cols,
    train_cols=xgb_ensemble.X.columns.tolist(),
)

# Check the result
print(prob_df.shape)
prob_df.head()

In [None]:
# 1. Name of the target column (usually 'damage_grade' or 'label').
# Check the columns of train_labels_embedded; it presumably holds 'building_id' and 'damage_grade'.
target_col = 'damage_grade' 

# If the column name differs, uncomment the line below, inspect the columns and adjust.
# print(train_labels_embedded.columns) 

# 2. Pass only the target values, without building_id.
# NOTE(review): the fold models passed in were already trained above; confirm that
# get_oof_predictions scores each row only with a model that did not see it.
X_train_meta_df = get_oof_predictions(
    lgb_models=lgb_ensemble.models,
    xgb_models=xgb_ensemble.models,
    X=train_values_embedded,
    
    # [changed] select the single target column instead of passing the whole DataFrame.
    y=train_labels_embedded[target_col], 
    
    categorical_cols=final_categorical_cols,
    train_cols=xgb_ensemble.X.columns.tolist()
)

print("OOF Shape:", X_train_meta_df.shape)
# Expected to print (260601, 6).

In [None]:
# Labels with 1 subtracted so they start at 0
# (a pandas Series, despite the variable name).
y_train_tensor = train_labels_embedded['damage_grade'] - 1
# Take the meta-model input width from the meta-feature frame instead of
# hard-coding 6, so it stays correct if base models or classes change.
meta_model = train_stacking_model(X_train_meta_df, y_train_tensor, input_dim=X_train_meta_df.shape[1])

In [None]:
# 2-3. Build the meta-features for the test data (returned together with the test ids)
X_test_meta, test_ids = get_test_meta_data(
    lgb_models=lgb_ensemble.models,
    xgb_models=xgb_ensemble.models,
    test_df=test_values_embedded,
    categorical_cols=final_categorical_cols,
    train_cols=xgb_ensemble.X.columns.tolist()
)

In [None]:
# 2-4. Final prediction
meta_model.eval()
# Use the device the meta model's parameters are on, rather than re-guessing
# from CUDA availability, so the input tensor and the model cannot mismatch.
device = next(meta_model.parameters()).device

with torch.no_grad():
    X_test_tensor = torch.tensor(X_test_meta.values, dtype=torch.float32, device=device)
    outputs = meta_model(X_test_tensor)
    # Pick the highest-scoring class per row.
    # Classes come out as 0,1,2, so add 1 to restore the original 1,2,3 grades.
    final_preds = torch.argmax(outputs, dim=1).cpu().numpy() + 1

# 2-5. Build the submission file
submission = pd.DataFrame({
    'building_id': test_ids,
    'damage_grade': final_preds
})

print("최종 예측 완료!")
print(submission.head())

# Save
submission.to_csv('submission_stacking_nn.csv', index=False)

# 6. Submission
- 모델 예측 결과를 통해 최종 결과를 도출한다.

In [None]:
# Compare the number of training features used by the LightGBM and XGBoost models.
print("train feature count:", len(lgb_ensemble.X.columns.tolist()), len(xgb_ensemble.X.columns.tolist()))

In [None]:
# Feature columns of the LightGBM model.
lgb_ensemble.X.columns.tolist()

In [None]:
# Feature columns of the XGBoost model.
xgb_ensemble.X.columns.tolist()

In [None]:
# LightGBM class probabilities for the test set (presumably averaged over the fold models).
# The call previously used `models`, `test_df` and `categorical_cols`, which are not
# defined anywhere in the visible notebook; it now uses the notebook-level objects.
avg_test_probabilities = predict_lgb(
    models=lgb_ensemble.models,
    test_df=test_values_embedded.drop(columns=['building_id']),
    categorical_cols=final_categorical_cols,
)

## 6-1. Single Model Submission

### 6-1-1. LightGBM

In [None]:
# Build the LightGBM-only submission from the embedded test features.
lgb_submission_df = create_lgb_submission(
    models=lgb_ensemble.models,
    test_df=test_values_embedded,
    test_building_ids=test_values_embedded['building_id'], # building_id column of the test set
    categorical_cols=final_categorical_cols
)

# Check the result
print(lgb_submission_df.head(5))
lgb_submission_df.to_csv("lgb_submission.csv", index=False)

### 6-1-2. XGBoost

In [None]:
# Categorical columns for XGBoost prediction: the original list without the geo ids.
geo_cols = ['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id']
xgb_categorical_cols = [col for col in original_categorical_cols if col not in geo_cols]

# Build the XGBoost-only submission from the embedded TEST features.
# (Previously the training frame and its building ids were passed, so the
# "submission" contained predictions for training rows.)
xgb_submission_df = create_xgb_submission(
    models=xgb_ensemble.models,
    test_df=test_values_embedded,
    test_building_ids=test_values_embedded['building_id'], # building_id column of the test set
    categorical_cols=xgb_categorical_cols,
    train_cols=xgb_ensemble.X.columns.tolist()
)

# Check the result
print(xgb_submission_df.head(5))
xgb_submission_df.to_csv("xgb_submission.csv", index=False)

## 6-2. Ensemble Model Submission

In [None]:
# LightGBM 60%, XGBoost 40% 의견반영
ensemble_submission_df = create_ensemble_submission(
    lgb_models=lgb_ensemble.models,
    xgb_models=xgb_ensemble.models,
    test_df=test_values_embedded,
    test_building_ids=test_values_embedded['building_id'],
    categorical_cols=final_categorical_cols,
    train_cols=xgb_ensemble.X.columns.tolist(),
    weights=(0.6, 0.4)
)

print(ensemble_submission_df.head(10))
ensemble_submission_df.to_csv("final_submission.csv", index=False)