# 0. Set Up
- 기본 분석: numpy, pandas, matplotlib, seaborn
- 모델: scikit-learn, torch, tqdm, lightgbm, xgboost

In [None]:
!pip install torch lightgbm xgboost

In [None]:
!pip install numpy pandas matplotlib seaborn scikit-learn tqdm

# 1. Data Analysis

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# data loading
original_train_labels = pd.read_csv("./data/train_labels.csv")
original_train_values = pd.read_csv("./data/train_values.csv")
original_test_values = pd.read_csv("./data/test_values.csv")

In [6]:
# data info
original_train_values.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260601 entries, 0 to 260600
Data columns (total 39 columns):
 #   Column                                  Non-Null Count   Dtype 
---  ------                                  --------------   ----- 
 0   building_id                             260601 non-null  int64 
 1   geo_level_1_id                          260601 non-null  int64 
 2   geo_level_2_id                          260601 non-null  int64 
 3   geo_level_3_id                          260601 non-null  int64 
 4   count_floors_pre_eq                     260601 non-null  int64 
 5   age                                     260601 non-null  int64 
 6   area_percentage                         260601 non-null  int64 
 7   height_percentage                       260601 non-null  int64 
 8   land_surface_condition                  260601 non-null  object
 9   foundation_type                         260601 non-null  object
 10  roof_type                               260601 non-null 

In [7]:
# check null
original_train_values.isnull().sum()

building_id                               0
geo_level_1_id                            0
geo_level_2_id                            0
geo_level_3_id                            0
count_floors_pre_eq                       0
age                                       0
area_percentage                           0
height_percentage                         0
land_surface_condition                    0
foundation_type                           0
roof_type                                 0
ground_floor_type                         0
other_floor_type                          0
position                                  0
plan_configuration                        0
has_superstructure_adobe_mud              0
has_superstructure_mud_mortar_stone       0
has_superstructure_stone_flag             0
has_superstructure_cement_mortar_stone    0
has_superstructure_mud_mortar_brick       0
has_superstructure_cement_mortar_brick    0
has_superstructure_timber                 0
has_superstructure_bamboo       

In [None]:
# damage_grade distribution (class imbalance)
sns.countplot(x="damage_grade", data=original_train_labels)
plt.title("damage drade distribution")
plt.show()

In [None]:
# data count
original_train_values['has_secondary_use_use_police'].value_counts()

In [None]:
# detailed histogram (height_percentage)
plt.figure(figsize=(18,9))
sns.countplot(x=np.clip(original_train_values["area_percentage"],0,30), hue=original_train_labels["damage_grade"])
plt.ylabel("frequency")
plt.xlabel("height Percentage")
plt.xticks(rotation=90)
plt.title("height percentage histograms")
plt.legend(["damage_grade = 1","damage_grade = 2","damage_grade = 3"])
plt.show()

# 2. Selecting Columns
- train a lightGBM model
- perform a hyperparameter tuning
- TODO: 중요도가 낮은 것부터 하나씩 지워나가며 성능 변화를 관찰하기

In [39]:
import pickle
from model import LightGBM

## 2-1. Basic Model

In [50]:
# categorical cols definition
original_categorical_cols = [
    "geo_level_1_id", 
    "geo_level_2_id",
    "geo_level_3_id",
    "foundation_type",
    "roof_type",
    "ground_floor_type",
    "land_surface_condition",
    "other_floor_type",
    "position",
    "plan_configuration",
    "legal_ownership_status",
]

In [10]:
# data merge + define model
original_train_merge = original_train_values.merge(original_train_labels, on="building_id", how="left")
lgb_original = LightGBM(df=original_train_merge, target_col="damage_grade")
lgb_original.encoding(categorical_cols=original_categorical_cols)
lgb_original.kfold(n_splits=5)

In [11]:
# check validity (not necessary)
original_train_columns = lgb_original.X.columns.to_list()
"building_id" in original_train_columns, len(original_train_columns)

(False, 38)

In [12]:
# train the model
lgb_original.train()

0it [00:00, ?it/s]

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[131]	valid_0's multi_logloss: 0.56082


1it [00:22, 22.56s/it]

[Fold 0] done.
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[146]	valid_0's multi_logloss: 0.560175


2it [00:40, 20.02s/it]

[Fold 1] done.
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[138]	valid_0's multi_logloss: 0.557229


3it [00:58, 19.00s/it]

[Fold 2] done.
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[140]	valid_0's multi_logloss: 0.562868


4it [01:16, 18.46s/it]

[Fold 3] done.
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[123]	valid_0's multi_logloss: 0.562515


5it [01:32, 18.58s/it]

[Fold 4] done.





In [13]:
# evaluate the model
lgb_original.eval()

LightGBM Small Model CV Micro F1: 0.75014


0.7501429388221841

In [14]:
# save models
with open("lgb_original.pkl", "wb") as f:
    pickle.dump(lgb_original, f)

In [15]:
# load models
with open("lgb_original.pkl", "rb") as f:
    lgb_original = pickle.load(f)

print(f"모델 개수: {len(lgb_original.models)}")
print(f"모델 타입: {type(lgb_original.models[0])}")

모델 개수: 5
모델 타입: <class 'lightgbm.sklearn.LGBMClassifier'>


## 2-2. Column Importance Measure

In [16]:
# importance measure on each features
importances = lgb_original.models[0].feature_importances_
cols = lgb_original.X.columns

featimp_df = pd.DataFrame({
    "feature": cols,
    "importance": importances
}).sort_values("importance", ascending=False)

In [17]:
low_K = 10
print(featimp_df.head(len(featimp_df)-low_K))

                                   feature  importance
2                           geo_level_3_id       24941
1                           geo_level_2_id       11057
4                                      age        2890
5                          area_percentage        2016
0                           geo_level_1_id        1520
6                        height_percentage        1118
26                          count_families         781
9                                roof_type         644
12                                position         495
3                      count_floors_pre_eq         448
15     has_superstructure_mud_mortar_stone         407
19  has_superstructure_cement_mortar_brick         373
8                          foundation_type         367
11                        other_floor_type         357
7                   land_surface_condition         331
10                       ground_floor_type         312
27                       has_secondary_use         251
18     has

In [18]:
print(featimp_df.tail(low_K))

                          feature  importance
29        has_secondary_use_hotel          65
21      has_superstructure_bamboo          57
30       has_secondary_use_rental          44
37        has_secondary_use_other          17
33     has_secondary_use_industry           1
31  has_secondary_use_institution           0
32       has_secondary_use_school           0
34  has_secondary_use_health_post           0
35   has_secondary_use_gov_office           0
36   has_secondary_use_use_police           0


## 2-3. Column Selection Experiment
- omitted on the current pipeline

In [19]:
# select columns
top_col = featimp_df.head(len(featimp_df)-low_K)['feature'].tolist()
top_col.append("damage_grade")

In [20]:
top_train_merge = original_train_merge.copy()
top_train_merge["height_percentage"] = np.clip(top_train_merge["height_percentage"],0,11)
# top_train_merge["area_percentage"] = np.clip(original_train_values["area_percentage"],0,30)

In [21]:
# define a new model
top_categorical_cols = list(set(top_col).intersection(original_categorical_cols))
lgb_top = LightGBM(df=top_train_merge[top_col], target_col="damage_grade")
lgb_top.encoding(categorical_cols=top_categorical_cols)
lgb_top.kfold(n_splits=5)

In [22]:
lgb_top.train()

0it [00:00, ?it/s]

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[133]	valid_0's multi_logloss: 0.561004


1it [00:17, 17.16s/it]

[Fold 0] done.
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[116]	valid_0's multi_logloss: 0.559979


2it [00:33, 16.75s/it]

[Fold 1] done.
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[133]	valid_0's multi_logloss: 0.557605


3it [00:52, 17.55s/it]

[Fold 2] done.
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[148]	valid_0's multi_logloss: 0.56314


4it [01:10, 18.04s/it]

[Fold 3] done.
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[115]	valid_0's multi_logloss: 0.562342


5it [01:27, 17.42s/it]

[Fold 4] done.





In [23]:
lgb_top.eval()

LightGBM Small Model CV Micro F1: 0.75024


0.7502350336337926

In [24]:
with open("lgb_top.pkl", "wb") as f:
    pickle.dump(lgb_original, f)

# 3. Data Preprocessing
- Low-dim mapping
- columns add
- colum transformer

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

import pandas as pd
from sklearn.model_selection import train_test_split

from preprocess import GeoHierDataset
from preprocess import GeoEncoderWithHead
from preprocess import hier_train, hier_eval, hier_embed

In [None]:
# 이 파트는 선택적임
try:
    top_col.remove('damage_grade')
except:
    pass
top_col.append('building_id')
modified_train_values = original_train_values[top_col]
modified_train_labels = original_train_labels['damage_grade'] - 1
modified_test_values = original_test_values[top_col]

## 3-1. Low-dim Mapping

In [7]:
X_train, X_test, y_train, y_test = train_test_split(original_train_values, original_train_labels, test_size=0.2, random_state=42)

y_train_idx = y_train['damage_grade'] - 1
y_test_idx  = y_test['damage_grade'] - 1

y_train_idx = y_train_idx.rename("damage_grade")
y_test_idx  = y_test_idx.rename("damage_grade")

att_train_dataset = pd.concat([X_train, y_train_idx], axis=1)
att_val_dataset   = pd.concat([X_test,  y_test_idx],  axis=1)

train_dataset = GeoHierDataset(att_train_dataset, label_col="damage_grade")
val_dataset = GeoHierDataset(att_val_dataset, label_col="damage_grade")

train_loader = DataLoader(train_dataset, batch_size=512, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=1024, shuffle=False)

In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 카테고리 개수
n1 = att_train_dataset["geo_level_1_id"].max() + 1
n2 = att_train_dataset["geo_level_2_id"].max() + 1
n3 = att_train_dataset["geo_level_3_id"].max() + 1
n_classes = att_train_dataset["damage_grade"].nunique()  # 예: 3

model = GeoEncoderWithHead(
    n1=n1, n2=n2, n3=n3,
    n_classes=n_classes,
    d=32,
    geo_dim=32,
    n_heads=1,
).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)

In [None]:
# 실제 학습
epochs = 10
for epoch in range(1, epochs+1):
    train_loss, train_acc = hier_train(model, train_loader, optimizer, criterion, device)
    val_loss, val_acc = hier_eval(model, val_loader, criterion, device)
    print(f"[Epoch {epoch:02d}] "
          f"Train loss: {train_loss:.4f}, acc: {train_acc:.4f} | "
          f"Val loss: {val_loss:.4f}, acc: {val_acc:.4f}")

In [36]:
torch.save(model.state_dict(), "geo_encoder_best_weights.pth")
print(f"모델 가중치를 저장했습니다.")

모델 가중치를 저장했습니다.


In [9]:
loaded_model = GeoEncoderWithHead(
    n1=n1, n2=n2, n3=n3,
    n_classes=n_classes,
    d=32,
    geo_dim=32,
    n_heads=1,
).to(device)

loaded_model.load_state_dict(torch.load("geo_encoder_best_weights.pth"))
loaded_model.eval()

print("저장된 모델 가중치 로드 완료.")

저장된 모델 가중치 로드 완료.


In [10]:
train_values_embedded = hier_embed(original_train_values, loaded_model, device)
test_values_embedded = hier_embed(original_test_values, loaded_model, device)
train_labels_embedded = original_train_labels

# train_values_embedded["height_percentage"] = np.clip(train_values_embedded["height_percentage"],0,11)
# test_values_embedded["height_percentage"] = np.clip(test_values_embedded["height_percentage"],0,11)

print("dataset shape with embedding columns:")
print("Train:", train_values_embedded.shape)
print("Test:", test_values_embedded.shape)

Extracting: 100%|██████████| 255/255 [00:18<00:00, 13.71it/s]
Extracting: 100%|██████████| 85/85 [00:06<00:00, 13.98it/s]

dataset shape with embedding columns:
Train: (260601, 68)
Test: (86868, 68)





## 3-2. Auto Encoder

In [1]:
from preprocess.auto import GeoAutoEncoder

In [None]:
df_train_geo = original_train_values[['geo_level_3_id', 'geo_level_2_id', 'geo_level_1_id']]
df_test_geo = original_test_values[['geo_level_3_id', 'geo_level_2_id', 'geo_level_1_id']]

df_combined = pd.concat([df_train_geo, df_test_geo], ignore_index=True)

In [20]:
autoencoder = GeoAutoEncoder(
    latent_dim=16,
    epochs=10,
    batch_size=1024,
    device='cpu' # TODO: GPU 사용하기
)

# 학습
print("학습 시작")
autoencoder.fit(df_combined) 

학습 시작
Detected sizes: L1=31, L2=1428, L3=12568
Training on cpu for 10 epochs...


                                                             

Epoch 1: Avg Loss = 12.1276


                                                             

Epoch 2: Avg Loss = 2.7655


                                                             

Epoch 3: Avg Loss = 0.9652


                                                             

Epoch 4: Avg Loss = 0.5165


                                                             

Epoch 5: Avg Loss = 0.3315


                                                             

Epoch 6: Avg Loss = 0.2321


                                                             

Epoch 7: Avg Loss = 0.1706


                                                             

Epoch 8: Avg Loss = 0.1293


                                                             

Epoch 9: Avg Loss = 0.0996


                                                              

Epoch 10: Avg Loss = 0.0784




0,1,2
,latent_dim,16
,epochs,10
,batch_size,1024
,learning_rate,0.001
,device,'cpu'


In [21]:
autoencoder.save("models/geo_ae_model.pth")

Model saved to models/geo_ae_model.pth


In [22]:
loaded_ae = GeoAutoEncoder().load("models/geo_ae_model.pth")

In [23]:
print("임베딩 추출")
train_embeddings = autoencoder.transform(original_train_values)
test_embeddings = autoencoder.transform(original_test_values)

임베딩 추출


Extracting Embeddings: 100%|██████████| 128/128 [00:01<00:00, 101.48it/s]
Extracting Embeddings: 100%|██████████| 43/43 [00:00<00:00, 77.40it/s] 


In [24]:
print(f"추출된 임베딩 형태: {train_embeddings.shape}")
print(f"추출된 임베딩 형태: {test_embeddings.shape}")

추출된 임베딩 형태: (260601, 16)
추출된 임베딩 형태: (86868, 16)


In [35]:
import pandas as pd
EMBEDDING_DIM = train_embeddings.shape[1]
embed_cols = [f"geo_emb2_{i}" for i in range(EMBEDDING_DIM)]

train_embeddings_df = pd.DataFrame(
    train_embeddings,
    columns=embed_cols
)
test_embeddings_df = pd.DataFrame(
    test_embeddings,
    columns=embed_cols
)

# 결과 확인
print("훈련 임베딩 DataFrame 생성 완료:")
print(f"차원: {train_embeddings_df.shape}")
print(train_embeddings_df.head())

print("테스트 임베딩 DataFrame 생성 완료:")
print(f"차원: {test_embeddings_df.shape}")
print(test_embeddings_df.head())

훈련 임베딩 DataFrame 생성 완료:
차원: (260601, 16)
   geo_emb2_0  geo_emb2_1  geo_emb2_2  geo_emb2_3  geo_emb2_4  geo_emb2_5  \
0    6.828994   21.101688    8.474803   14.649942   21.070877   47.433128   
1   37.395012   30.637888   18.684908   12.039928   37.425091   21.644316   
2   35.091362   15.104170   52.822598   48.692177    0.657782   42.670250   
3   28.726034   28.856403   38.410576   11.262142   46.244617   40.991688   
4    1.030481   12.293140   47.031216   47.473938    4.417999    6.388467   

   geo_emb2_6  geo_emb2_7  geo_emb2_8  geo_emb2_9  geo_emb2_10  geo_emb2_11  \
0         0.0   29.018898   59.155689   48.986164          0.0     0.632905   
1         0.0   37.616325   27.948729   44.663967          0.0     1.550905   
2         0.0   16.617846   38.701488   57.741741          0.0    29.273264   
3         0.0    0.000000   52.789894   45.588905          0.0    20.769320   
4         0.0   39.977135   54.841244   11.737732          0.0    46.425812   

   geo_emb2_12  geo_e

## 3-3. Hierarical Model by using Metric Learning

In [26]:
from preprocess.triplet import GeoTripletDataset, GeoMetricEncoder, train_metric, metric_embed

In [28]:
num_l3 = df_combined['geo_level_3_id'].max() + 1
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [29]:
trained_model = train_metric(df_combined, num_l3=num_l3, epochs=6, device=device)

Epoch 1: 100%|██████████| 1358/1358 [00:39<00:00, 34.41it/s]


Epoch 1 Avg Loss: 0.4948


Epoch 2: 100%|██████████| 1358/1358 [00:38<00:00, 34.93it/s]


Epoch 2 Avg Loss: 0.1717


Epoch 3: 100%|██████████| 1358/1358 [00:40<00:00, 33.71it/s]


Epoch 3 Avg Loss: 0.1132


Epoch 4: 100%|██████████| 1358/1358 [00:51<00:00, 26.54it/s]


Epoch 4 Avg Loss: 0.0931


Epoch 5: 100%|██████████| 1358/1358 [00:48<00:00, 28.10it/s]


Epoch 5 Avg Loss: 0.0840


Epoch 6: 100%|██████████| 1358/1358 [00:39<00:00, 34.59it/s]

Epoch 6 Avg Loss: 0.0770





In [30]:
import torch
from pathlib import Path

MODEL_DIR = Path('./models') 
MODEL_DIR.mkdir(exist_ok=True) # 폴더가 없으면 생성

MODEL_FILENAME = f'geo_metric_encoder_e6_dim{trained_model.projection[-1].out_features}.pth'
SAVE_PATH = MODEL_DIR / MODEL_FILENAME

torch.save(trained_model.state_dict(), SAVE_PATH)

In [31]:
import torch
from pathlib import Path

# 로드 설정
MODEL_DIR = Path('./models')
MODEL_FILENAME = 'geo_metric_encoder_e6_dim32.pth'
SAVE_PATH = MODEL_DIR / MODEL_FILENAME 

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
map_device = device 

loaded_model = GeoMetricEncoder(num_l3=num_l3, embedding_dim=32) 
state_dict = torch.load(SAVE_PATH, map_location=map_device)
loaded_model.load_state_dict(state_dict)
loaded_model.eval()

print(f"모델 가중치 로드 완료. 모델은 현재 {map_device}에 있습니다.")

모델 가중치 로드 완료. 모델은 현재 cpu에 있습니다.


In [32]:
unique_l3 = df_combined['geo_level_3_id'].unique()
vector_map = metric_embed(loaded_model, unique_l3, device)

In [33]:
vector_map.shape

(11861, 32)

## 3.4 Data Combining

In [34]:
import pandas as pd
import numpy as np

EMBEDDING_DIM = vector_map.shape[1]
embed_cols = [f"geo_emb_{i}" for i in range(EMBEDDING_DIM)]

# Lookup table DataFrame (Geo L3 ID와 Vector 연결)
vector_map_df = pd.DataFrame(vector_map, columns=embed_cols)
vector_map_df['geo_level_3_id'] = unique_l3
# 이 DataFrame은 각 Geo L3 ID에 대한 고유한 임베딩 벡터를 포함합니다.

def merge_embeddings_and_clean(df_input, map_df):
    """원본 DataFrame에 임베딩을 병합하고, 원래의 Geo ID 컬럼을 삭제합니다."""
    
    # 1. Geo Level 3 ID를 기준으로 임베딩 벡터를 병합 (how='left'로 행 순서 유지)
    df_merged = df_input.merge(
        map_df, 
        on='geo_level_3_id', 
        how='left'
    )
    
    # 2. 삭제할 컬럼 정의 (원본 Geo ID)
    cols_to_drop = [
        'geo_level_1_id', 
        'geo_level_2_id', 
        'geo_level_3_id'
    ]
    
    # 3. 원본 Geo ID 컬럼들을 삭제하고 반환
    # errors='ignore'를 사용하여 이미 제거되었을 경우 오류 방지
    df_cleaned = df_merged.drop(columns=cols_to_drop, errors='ignore')
    
    return df_cleaned

train_features_embedded = merge_embeddings_and_clean(original_train_values, vector_map_df)
test_features_embedded = merge_embeddings_and_clean(original_test_values, vector_map_df)

# 3. 결과 출력
print("훈련 데이터 피처 (Geo ID 제거 후 임베딩 추가):")
print(f"새로운 형태: {train_features_embedded.shape}")
print(train_features_embedded[[col for col in train_features_embedded.columns if 'geo_emb' in col]].head())

print("테스트 데이터 피처 (Geo ID 제거 후 임베딩 추가):")
print(f"새로운 형태: {test_features_embedded.shape}")
print(test_features_embedded[[col for col in test_features_embedded.columns if 'geo_emb' in col]].head())

훈련 데이터 피처 (Geo ID 제거 후 임베딩 추가):
새로운 형태: (260601, 68)
   geo_emb_0  geo_emb_1  geo_emb_2  geo_emb_3  geo_emb_4  geo_emb_5  \
0   0.052421  -0.226410   0.014361   0.077549   0.031915   0.109300   
1  -0.304638   0.076604   0.085841   0.021475  -0.286006  -0.226572   
2   0.196524  -0.397095  -0.125743  -0.124669   0.209139   0.008871   
3  -0.063439   0.203958   0.061428   0.252623  -0.127502   0.186127   
4   0.146469  -0.358771  -0.043425  -0.032648   0.146772   0.075012   

   geo_emb_6  geo_emb_7  geo_emb_8  geo_emb_9  ...  geo_emb_22  geo_emb_23  \
0  -0.072760  -0.186794  -0.281290   0.367311  ...    0.116633    0.286511   
1  -0.026238  -0.070000  -0.111879  -0.198614  ...    0.066415    0.200670   
2   0.083708  -0.244732  -0.152054   0.338706  ...    0.015785    0.027957   
3  -0.110770   0.117965  -0.099924   0.039697  ...    0.011037    0.245822   
4  -0.012329  -0.246072  -0.243154   0.417256  ...    0.096162    0.168784   

   geo_emb_24  geo_emb_25  geo_emb_26  geo_emb_27  

In [36]:
# 순서 보장 가정 하에 안전하게 결합하는 방법
test_features_with_embeddings = pd.concat(
    [test_features_embedded.reset_index(drop=True), test_embeddings_df], 
    axis=1
)
test_features_with_embeddings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86868 entries, 0 to 86867
Data columns (total 84 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   building_id                             86868 non-null  int64  
 1   count_floors_pre_eq                     86868 non-null  int64  
 2   age                                     86868 non-null  int64  
 3   area_percentage                         86868 non-null  int64  
 4   height_percentage                       86868 non-null  int64  
 5   land_surface_condition                  86868 non-null  object 
 6   foundation_type                         86868 non-null  object 
 7   roof_type                               86868 non-null  object 
 8   ground_floor_type                       86868 non-null  object 
 9   other_floor_type                        86868 non-null  object 
 10  position                                86868 non-null  ob

In [37]:
# 순서 보장 가정 하에 안전하게 결합하는 방법
train_features_with_embeddings = pd.concat(
    [train_features_embedded.reset_index(drop=True), train_embeddings_df], 
    axis=1
)
train_features_with_embeddings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260601 entries, 0 to 260600
Data columns (total 84 columns):
 #   Column                                  Non-Null Count   Dtype  
---  ------                                  --------------   -----  
 0   building_id                             260601 non-null  int64  
 1   count_floors_pre_eq                     260601 non-null  int64  
 2   age                                     260601 non-null  int64  
 3   area_percentage                         260601 non-null  int64  
 4   height_percentage                       260601 non-null  int64  
 5   land_surface_condition                  260601 non-null  object 
 6   foundation_type                         260601 non-null  object 
 7   roof_type                               260601 non-null  object 
 8   ground_floor_type                       260601 non-null  object 
 9   other_floor_type                        260601 non-null  object 
 10  position                                2606

# 4. Model Training
- 기본 모델들을 최종적으로 학습시킨다.
- 여기서 완성된 모델이 추론을 수행하거나, 더 큰 모델로 합쳐짐.

In [11]:
from model import XGBoost

In [47]:
final_train_df = train_features_with_embeddings.merge(original_train_labels, on="building_id", how="left")
final_train_df = final_train_df.drop(columns='building_id')

In [15]:
xgb_ensemble = XGBoost(df=final_train_df, target_col="damage_grade")
xgb_ensemble.encoding(categorical_cols=original_categorical_cols)
xgb_ensemble.kfold(n_splits=5)

In [16]:
xgb_ensemble.train()

[0]	validation_0-mlogloss:0.97983
[1]	validation_0-mlogloss:0.95607
[2]	validation_0-mlogloss:0.93410
[3]	validation_0-mlogloss:0.91376
[4]	validation_0-mlogloss:0.89482
[5]	validation_0-mlogloss:0.87724
[6]	validation_0-mlogloss:0.86080
[7]	validation_0-mlogloss:0.84523
[8]	validation_0-mlogloss:0.83070
[9]	validation_0-mlogloss:0.81708
[10]	validation_0-mlogloss:0.80426
[11]	validation_0-mlogloss:0.79228
[12]	validation_0-mlogloss:0.78091
[13]	validation_0-mlogloss:0.77029
[14]	validation_0-mlogloss:0.76024
[15]	validation_0-mlogloss:0.75082
[16]	validation_0-mlogloss:0.74191
[17]	validation_0-mlogloss:0.73346
[18]	validation_0-mlogloss:0.72547
[19]	validation_0-mlogloss:0.71800
[20]	validation_0-mlogloss:0.71088
[21]	validation_0-mlogloss:0.70416
[22]	validation_0-mlogloss:0.69777
[23]	validation_0-mlogloss:0.69175
[24]	validation_0-mlogloss:0.68605
[25]	validation_0-mlogloss:0.68063
[26]	validation_0-mlogloss:0.67550
[27]	validation_0-mlogloss:0.67061
[28]	validation_0-mlogloss:0.6

In [17]:
xgb_ensemble.eval()

XGBoost K-Fold CV Micro F1 Score: 0.75581


0.755814444303744

In [18]:
# save models
with open("xgb_ensemble.pkl", "wb") as f:
    pickle.dump(xgb_ensemble, f)

In [19]:
# load models
with open("xgb_ensemble.pkl", "rb") as f:
    xgb_ensemble = pickle.load(f)

print(f"모델 개수: {len(xgb_ensemble.models)}")
print(f"모델 타입: {type(xgb_ensemble.models[0])}")

모델 개수: 5
모델 타입: <class 'xgboost.sklearn.XGBClassifier'>


In [None]:
'''
from model import LightGBMLimit

lgb_limit = LightGBMLimit(df=final_train_df, target_col="damage_grade")
lgb_limit.encoding(categorical_cols=original_categorical_cols)
lgb_limit.kfold(n_splits=5)

lgb_limit.train()
lgb_limit.eval()

# save models
with open("lgb_limit.pkl", "wb") as f:
    pickle.dump(lgb_limit, f)
'''

In [51]:
final_categorical_cols = [
    "foundation_type",
    "roof_type",
    "ground_floor_type",
    "land_surface_condition",
    "other_floor_type",
    "position",
    "plan_configuration",
    "legal_ownership_status",
]

In [52]:
lgb_ensemble = LightGBM(df=final_train_df, target_col="damage_grade")
lgb_ensemble.encoding(categorical_cols=final_categorical_cols)
lgb_ensemble.kfold(n_splits=5)
train_columns = lgb_ensemble.X.columns.to_list()
"building_id" in train_columns, len(train_columns)

(False, 83)

In [53]:
lgb_ensemble.train()

0it [00:00, ?it/s]

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[678]	valid_0's multi_logloss: 0.55639


1it [00:19, 19.30s/it]

[Fold 0] done.
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[575]	valid_0's multi_logloss: 0.556535


2it [00:33, 16.35s/it]

[Fold 1] done.
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[641]	valid_0's multi_logloss: 0.553782


3it [00:49, 16.04s/it]

[Fold 2] done.
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[614]	valid_0's multi_logloss: 0.557798


4it [01:04, 15.63s/it]

[Fold 3] done.
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[574]	valid_0's multi_logloss: 0.558963


5it [01:18, 15.76s/it]

[Fold 4] done.





In [54]:
lgb_ensemble.eval()

LightGBM Small Model CV Micro F1: 0.75256


0.7525604276269086

In [55]:
# save model
with open("lgb_ensemble.pkl", "wb") as f:
    pickle.dump(lgb_ensemble, f)

In [25]:
# load model
with open("lgb_ensemble.pkl", "rb") as f:
    lgb_ensemble = pickle.load(f)

print(f"모델 개수: {len(lgb_ensemble.models)}")
print(f"모델 타입: {type(lgb_ensemble.models[0])}")

모델 개수: 5
모델 타입: <class 'lightgbm.sklearn.LGBMClassifier'>


# 5. Combining Baseline Models

- Ensemble Method를 사용해서 모델의 성능을 높인다.
- Stacking 기법. LinearSVM을 이용해 Meta 모델을 만들기
- TODO: lgb_ensemble + xgb_ensemble 모델 만들어보기

In [58]:
import torch
import pandas as pd

from prediction import predict_prob
from prediction import predict_lgb
from prediction import get_test_meta_data
from prediction import get_oof_predictions

from prediction import create_xgb_submission
from prediction import create_lgb_submission
from prediction import create_ensemble_submission


In [27]:
# 함수 실행
prob_df = predict_prob(
    lgb_models=lgb_ensemble.models,
    xgb_models=xgb_ensemble.models,
    test_df=train_values_embedded,
    categorical_cols=final_categorical_cols,
    train_cols=xgb_ensemble.X.columns.tolist(),
)

# 결과 확인
print(prob_df.shape)
prob_df.head()

(260601, 6)


Unnamed: 0,lgb_class0,lgb_class1,lgb_class2,xgb_class0,xgb_class1,xgb_class2
0,0.002196,0.141024,0.85678,0.001845,0.105661,0.892493
1,0.004886,0.851847,0.143267,0.004857,0.849888,0.145255
2,0.005324,0.211425,0.783251,0.008953,0.257848,0.733199
3,0.002202,0.90089,0.096907,0.00252,0.900067,0.097413
4,0.001123,0.556464,0.442413,0.00117,0.540729,0.458101


In [None]:
# 1. 타겟 컬럼 이름 확인 (보통 'damage_grade' 혹은 'label' 입니다)
# train_labels_embedded의 컬럼을 확인해보세요. 보통 'building_id'와 'damage_grade'가 있을 겁니다.
target_col = 'damage_grade'

# 만약 컬럼명이 다르다면 아래 주석을 풀고 컬럼명을 확인 후 수정하세요
# print(train_labels_embedded.columns) 

# 2. building_id를 제외하고 타겟값만 넘겨주도록 수정
X_train_meta_df = get_oof_predictions(
    lgb_models=lgb_ensemble.models,
    xgb_models=xgb_ensemble.models,
    X=train_values_embedded,
    
    # [수정된 부분] 전체 DF 대신 특정 컬럼만 선택해서 넘깁니다.
    y=train_labels_embedded[target_col], 
    
    categorical_cols=final_categorical_cols,
    train_cols=xgb_ensemble.X.columns.tolist()
)

print("OOF Shape:", X_train_meta_df.shape)
# 이제 (260601, 6)이 정상적으로 출력될 것입니다.

Generating OOF Predictions...
OOF Shape: (260601, 6)


In [None]:
y_train_tensor = train_labels_embedded['damage_grade'] - 1 
meta_model = train_stacking_model(X_train_meta_df, y_train_tensor, input_dim=6)

In [None]:
# 2-3. 테스트 데이터 메타 데이터 생성
X_test_meta, test_ids = get_test_meta_data(
    lgb_models=lgb_ensemble.models,
    xgb_models=xgb_ensemble.models,
    test_df=test_values_embedded,
    categorical_cols=final_categorical_cols,
    train_cols=xgb_ensemble.X.columns.tolist()
)

In [None]:
# 2-4. 최종 예측 수행
meta_model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

with torch.no_grad():
    X_test_tensor = torch.FloatTensor(X_test_meta.values).to(device)
    outputs = meta_model(X_test_tensor)
    # Logits -> 확률 -> 가장 높은 클래스 선택
    # 0,1,2 로 나오므로 다시 +1 해서 1,2,3으로 복구
    final_preds = torch.argmax(outputs, dim=1).cpu().numpy() + 1 

# 2-5. 제출 파일 생성
submission = pd.DataFrame({
    'building_id': test_ids,
    'damage_grade': final_preds
})

print("최종 예측 완료!")
print(submission.head())

# 저장
submission.to_csv('submission_stacking_nn.csv', index=False)

# 6. Submission
- 모델 예측 결과를 통해 최종 결과를 도출한다.

In [None]:
print("train feature count:", len(lgb_ensemble.X.columns.tolist()), len(xgb_ensemble.X.columns.tolist()))

In [56]:
lgb_ensemble.X.columns.tolist()

['count_floors_pre_eq',
 'age',
 'area_percentage',
 'height_percentage',
 'land_surface_condition',
 'foundation_type',
 'roof_type',
 'ground_floor_type',
 'other_floor_type',
 'position',
 'plan_configuration',
 'has_superstructure_adobe_mud',
 'has_superstructure_mud_mortar_stone',
 'has_superstructure_stone_flag',
 'has_superstructure_cement_mortar_stone',
 'has_superstructure_mud_mortar_brick',
 'has_superstructure_cement_mortar_brick',
 'has_superstructure_timber',
 'has_superstructure_bamboo',
 'has_superstructure_rc_non_engineered',
 'has_superstructure_rc_engineered',
 'has_superstructure_other',
 'legal_ownership_status',
 'count_families',
 'has_secondary_use',
 'has_secondary_use_agriculture',
 'has_secondary_use_hotel',
 'has_secondary_use_rental',
 'has_secondary_use_institution',
 'has_secondary_use_school',
 'has_secondary_use_industry',
 'has_secondary_use_health_post',
 'has_secondary_use_gov_office',
 'has_secondary_use_use_police',
 'has_secondary_use_other',
 'geo

In [None]:
xgb_ensemble.X.columns.tolist()

In [None]:
avg_test_probabilities = predict_lgb(
        models=models,
        test_df=test_df.drop(columns=['building_id']),
        categorical_cols=categorical_cols,
    )

## 6-1. Single Model Submission

### 6-1-1. LightGBM

In [None]:
lgb_submission_df = create_lgb_submission(
    models=lgb_ensemble.models,
    test_df=test_features_with_embeddings,
    test_building_ids=test_features_with_embeddings['building_id'], # 테스트셋 building_id 컬럼
    categorical_cols=final_categorical_cols
)

# 결과 확인
print(lgb_submission_df.head(5))
lgb_submission_df.to_csv("lgb_submission22.csv", index=False)

NameError: name 'test_values_embedded' is not defined

### 6-1-2. XGBoost

In [33]:
geo_cols = ['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id']
xgb_categorical_cols = [col for col in original_categorical_cols if col not in geo_cols]

xgb_submission_df = create_xgb_submission(
    models=xgb_ensemble.models,
    test_df=train_values_embedded,
    test_building_ids=train_values_embedded['building_id'], # 테스트셋 building_id 컬럼
    categorical_cols=xgb_categorical_cols,
    train_cols=xgb_ensemble.X.columns.tolist()
)

# 결과 확인
print(xgb_submission_df.head(5))
xgb_submission_df.to_csv("xgb_submission.csv", index=False)

테스트 데이터 예측 확률 계산 중
xgb submission: 260601개 예측 결과 생성됨
   building_id  damage_grade
0       802906             3
1        28830             2
2        94947             3
3       590882             2
4       201944             2


## 6-2. Ensemble Model Submission

In [34]:
# LightGBM 60%, XGBoost 40% 의견반영
ensemble_submission_df = create_ensemble_submission(
    lgb_models=lgb_ensemble.models,
    xgb_models=xgb_ensemble.models,
    test_df=test_values_embedded,
    test_building_ids=test_values_embedded['building_id'],
    categorical_cols=final_categorical_cols,
    train_cols=xgb_ensemble.X.columns.tolist(),
    weights=(0.6, 0.4)
)

print(ensemble_submission_df.head(10))
ensemble_submission_df.to_csv("final_submission.csv", index=False)

xgb 테스트 데이터 예측 확률 계산 중
lgb 테스트 데이터 예측 확률 계산 중
모델 결과 가중 합산 중
ensemble submission: 86868개 예측 결과 생성됨
   building_id  damage_grade
0       300051             3
1        99355             2
2       890251             3
3       745817             1
4       421793             3
5       871976             2
6       691228             2
7       896100             3
8       343471             2
9       766647             2
