In [None]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import os
import warnings
import matplotlib.pyplot as plt
import logging
logging.basicConfig(level=logging.ERROR)
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
model_path = './transformer_model/'
os.makedirs(model_path, exist_ok=True)
torch.cuda.is_available()

In [81]:
# 캐글 노트북이면 True, 아니면 False
if 'KAGGLE_URL_BASE' in os.environ:
    train_data = pd.read_csv('/kaggle/input/playground-series-s4e12/train.csv', index_col=0)
    test_data = pd.read_csv('/kaggle/input/playground-series-s4e12/test.csv', index_col=0)
else:
    train_data = pd.read_csv('./kaggle/input/playground-series-s4e12/train.csv', index_col=0)
    test_data = pd.read_csv('./kaggle/input/playground-series-s4e12/test.csv', index_col=0)

In [82]:
train_data.columns = train_data.columns.str.lower().str.replace(' ', '_')
test_data.columns = test_data.columns.str.lower().str.replace(' ', '_')

# 타겟 변수 및 피처 분리
target_column = 'premium_amount'
train_columns = train_data.columns.drop(target_column)
data = pd.concat([train_data, test_data]).drop(columns=target_column)
y = train_data[target_column].dropna().values
del train_data, test_data

In [83]:
# policy_start_date 피처 전처리
data['policy_start_date'] = pd.to_datetime(data['policy_start_date'])
data['policy_start_date'] = (data['policy_start_date'] - pd.to_datetime('1970-01-01')).dt.days

In [84]:
from sklearn.metrics import mean_squared_log_error, r2_score, mean_squared_error
# --- 2. 결측치 처리 ---
# 범주형 변수의 결측치를 'unknown'으로 대체
categorical_columns = data.select_dtypes(include='object').columns
for col in categorical_columns:
    data[col].fillna('unknown', inplace=True)

# 수치형 변수의 결측치를 최대값 + 10%로 대체
float_columns = data.select_dtypes(include=['float64']).columns
for col in float_columns:
    data[col].fillna(data[col].max() * 1.1, inplace=True)
    
int_columns = data.select_dtypes(include=['int64']).columns
for col in int_columns:
    data[col].fillna(data[col].max()+int(data[col].std()), inplace=True)
numerical_columns = float_columns.append(int_columns)

# --- 3. 범주형 변수 인코딩 및 수치형 스케일링 ---
# 범주형 변수 인코딩
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

# 수치형 변수 스케일링
scaler = StandardScaler()
data[numerical_columns] = scaler.fit_transform(data[numerical_columns])

y_scaler = StandardScaler().fit(np.log1p(y).reshape(-1,1))
y = y_scaler.transform(np.log1p(y).reshape(-1,1)).reshape(-1)

data[target_column] = np.pad(y, (0, data.shape[0]-len(y)), 'constant', constant_values=np.nan)
X = data[:len(y)].drop(columns=target_column).copy()

In [None]:

def inverse_y(y):
    return np.expm1(y_scaler.inverse_transform(y.reshape(-1,1)).reshape(-1))

def evaluate_model(model, X_val, y_val):
    y_pred = model.predict(X_val)
    r2 = r2_score(y_val, y_pred)
    mse = mean_squared_error(y_val, y_pred)
    
    
    y_pred = inverse_y(y_pred)
    y_val = inverse_y(y_val)
    
    rmsle = mean_squared_log_error(y_val, y_pred) ** (1/2)
    return r2, mse, rmsle

def rmsle(y_true, y_pred):
    y_true = inverse_y(y_true)
    y_pred = inverse_y(y_pred)
    return np.sqrt(mean_squared_log_error(y_true, y_pred))

In [85]:
data

Unnamed: 0_level_0,age,gender,annual_income,marital_status,number_of_dependents,education_level,occupation,health_score,location,policy_type,previous_claims,vehicle_age,credit_score,insurance_duration,policy_start_date,customer_feedback,smoking_status,exercise_frequency,property_type,premium_amount
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0,-1.624171,0,-0.685909,1,-0.810261,0,1,-0.359399,2,2,-0.408224,1.286535,-1.459636,-0.007139,1.300820,2,0,3,2,1.248538
1,-0.186740,0,-0.149665,0,0.509198,2,3,-0.823814,0,1,-0.647950,0.420678,0.347957,-1.163606,0.928832,0,1,1,2,0.646643
2,-1.336685,1,-0.300306,0,0.509198,1,1,1.264546,1,2,-0.647950,0.767021,1.694669,-0.778117,1.139753,1,1,3,2,-0.229752
3,-1.480428,1,2.581937,1,-0.150531,0,3,-1.129828,0,0,-0.647950,-1.657378,-1.487704,-1.549096,1.630624,2,1,0,0,0.043158
4,-1.480428,1,0.048009,2,-0.810261,0,1,-0.506253,0,2,-0.887676,-0.272007,-0.190953,-0.392628,-0.141113,2,1,3,2,0.929390
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1999995,0.603847,0,0.026464,1,-0.810261,0,3,-0.894582,0,2,1.485609,-0.272007,-1.813296,-1.163606,-0.419145,0,1,0,1,
1999996,2.070026,0,0.886280,2,-1.469990,2,3,-1.314324,0,0,-0.408224,-1.657378,1.694669,-1.163606,0.783105,1,0,0,0,
1999997,-1.121070,0,-0.062890,2,-1.469990,2,0,-1.414036,2,1,1.485609,0.074335,1.694669,0.378351,-1.661661,2,0,1,0,
1999998,-0.546098,0,0.197014,2,0.509198,2,3,-0.799532,2,2,-0.408224,1.286535,-0.926340,0.763840,0.163764,0,0,3,1,


In [86]:
# --- LGBoost 모델 Feature 로 넣어주기 ---
from lightgbm import LGBMRegressor
from sklearn.model_selection import KFold
FOLDS=5
kf = KFold(n_splits=FOLDS, 
           shuffle=False,
)

test_idx = data[target_column].isnull()
lgb_fold_predictions = []
for train_idx, val_idx in kf.split(X):
    lgb_predictions = np.zeros(len(data))  # 예측값 저장
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]
    lgbr = LGBMRegressor(
        num_leaves=31,
        min_child_samples=20,
        min_child_weight=0.001,
        
        n_estimators=500,
        max_depth=20,
        learning_rate=0.1,
        n_jobs=-1,
        random_state=42,
        # verbosity=1,
        force_row_wise=True,
    )
    lgbr.fit(X_train, y_train, eval_set=[(X_val, y_val)])
    print(evaluate_model(lgbr, X_val, y_val))
    lgb_predictions[val_idx] = lgbr.predict(X_val)
    lgb_predictions[test_idx] = lgbr.predict(data.loc[test_idx][train_columns])
    lgb_fold_predictions.append(lgb_predictions)
lgb_predictions = np.sum(lgb_fold_predictions, axis=0)
lgb_predictions[:len(y)] = lgb_predictions[:len(y)] / (FOLDS-1)
lgb_predictions[len(y):] = lgb_predictions[len(y):] / FOLDS

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011426 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1146
[LightGBM] [Info] Number of data points in the train set: 960000, number of used features: 19
[LightGBM] [Info] Start training from score 0.000023
(0.08951739665142144, 0.912290839924178, 1.0466646556626518)
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010762 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1147
[LightGBM] [Info] Number of data points in the train set: 960000, number of used features: 19
[LightGBM] [Info] Start training from score 0.000404
(0.08967290867938316, 0.9152120162325399, 1.0483390385633506)
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the o

In [87]:
data['lgb_pred'] = lgb_predictions

In [88]:
X = data[:len(y)].drop(columns=target_column).copy()

In [89]:
X

Unnamed: 0_level_0,age,gender,annual_income,marital_status,number_of_dependents,education_level,occupation,health_score,location,policy_type,previous_claims,vehicle_age,credit_score,insurance_duration,policy_start_date,customer_feedback,smoking_status,exercise_frequency,property_type,lgb_pred
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0,-1.624171,0,-0.685909,1,-0.810261,0,1,-0.359399,2,2,-0.408224,1.286535,-1.459636,-0.007139,1.300820,2,0,3,2,0.063543
1,-0.186740,0,-0.149665,0,0.509198,2,3,-0.823814,0,1,-0.647950,0.420678,0.347957,-1.163606,0.928832,0,1,1,2,-0.004758
2,-1.336685,1,-0.300306,0,0.509198,1,1,1.264546,1,2,-0.647950,0.767021,1.694669,-0.778117,1.139753,1,1,3,2,0.024123
3,-1.480428,1,2.581937,1,-0.150531,0,3,-1.129828,0,0,-0.647950,-1.657378,-1.487704,-1.549096,1.630624,2,1,0,0,0.006562
4,-1.480428,1,0.048009,2,-0.810261,0,1,-0.506253,0,2,-0.887676,-0.272007,-0.190953,-0.392628,-0.141113,2,1,3,2,0.012957
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1199995,-0.402355,0,-0.257811,1,-1.469990,2,2,-0.942532,2,2,1.485609,-0.791521,-1.459636,-0.778117,0.852133,2,0,0,0,0.065871
1199996,0.891333,1,-0.047816,0,1.432819,2,1,-1.093797,0,1,1.485609,0.074335,-0.196567,-0.392628,0.401530,2,0,3,0,0.017443
1199997,-1.624171,1,0.351300,0,-1.469990,2,3,-0.879662,1,0,-0.887676,1.632877,1.694669,0.378351,-0.505431,1,0,1,1,-0.319328
1199998,0.963204,1,3.155685,2,-0.810261,3,3,-0.627078,1,2,-0.647950,-0.445179,-1.263159,-0.392628,-0.281087,2,0,0,0,0.011066


In [90]:

# --- 4. 데이터셋 및 DataLoader 정의 ---
class TabularDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X.values, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32).unsqueeze(1)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

# Train/Test Split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

train_dataset = TabularDataset(X_train, y_train)
val_dataset = TabularDataset(X_val, y_val)
test_dataset = torch.tensor(data[data[target_column].isnull()].drop(columns=[target_column]).values, dtype=torch.float32)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64)
test_loader = DataLoader(test_dataset, batch_size=64)

In [95]:
# --- 5. 트랜스포머 모델 정의 ---
class TransformerModel(nn.Module):
    def __init__(self, input_dim, embed_dim, num_heads, num_layers, hidden_dim, output_dim, dropout=0.1):
        super(TransformerModel, self).__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.num_layers = num_layers
        self.hidden_dim = hidden_dim
        self.dropout = dropout
        
        self.embedding = nn.Linear(input_dim, embed_dim)
        self.dropout = nn.Dropout(dropout)  # Dropout 추가
        encoder_layer = nn.TransformerEncoderLayer(d_model=embed_dim, nhead=num_heads, dim_feedforward=hidden_dim, dropout=dropout)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.layer_norm = nn.LayerNorm(embed_dim)  # Layer Normalization 추가
        self.fc = nn.Linear(embed_dim, output_dim)

    def forward(self, x):
        # 입력 임베딩 및 드롭아웃 적용
        x = self.embedding(x)
        x = self.dropout(x)

        # Transformer 인코더와 Layer Normalization 적용
        x = self.transformer_encoder(x)
        x = self.layer_norm(x)  # Normalization 추가

        # 출력 레이어
        x = self.fc(x)
        return x

# 모델 초기화
input_dim = X.shape[1]
model = TransformerModel(input_dim=input_dim, embed_dim=64, num_heads=4, num_layers=2, hidden_dim=128, output_dim=1)
model = TransformerModel(input_dim=input_dim, embed_dim=16, num_heads=1, num_layers=2, hidden_dim=32, output_dim=1)

In [97]:
# --- 6. 모델 학습 ---
device = torch.device("mps")
model.to(device)

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# 학습 루프
def train_model(model: torch.nn.Module, 
    train_loader: torch.utils.data.DataLoader, 
    val_loader: torch.utils.data.DataLoader, 
    criterion: torch.nn.Module, 
    optimizer: torch.optim.Optimizer, 
    epochs=10
):
    for epoch in range(epochs):
        model.train()
        train_loss = 0.0
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        # 검증 루프
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                outputs = model(X_batch)
                loss = criterion(outputs, y_batch)
                val_loss += loss.item()
        
        torch.save(model.state_dict(), model_path+f"model_{epoch}_embed_dim={model.embed_dim}_num_heads={model.num_heads}_num_layers={model.num_layers}_hidden_dim={model.hidden_dim}_dropout={model.dropout}.pt")

        print(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss/len(train_loader):.4f}, Val Loss: {val_loss/len(val_loader):.4f}")


In [None]:
# target val loss is under 0.9124044109944525
# with dropout, layer normalization
train_model(model, train_loader, val_loader, criterion, optimizer, epochs=10)

Epoch 1/10, Train Loss: 0.9233, Val Loss: 0.9171
Epoch 2/10, Train Loss: 0.9153, Val Loss: 0.9152
Epoch 3/10, Train Loss: 0.9141, Val Loss: 0.9151


In [93]:
# target val loss is under 0.9124044109944525
train_model(model, train_loader, val_loader, criterion, optimizer, epochs=10)

Epoch 1/10, Train Loss: 0.9210, Val Loss: 0.9170
Epoch 2/10, Train Loss: 0.9148, Val Loss: 0.9145
Epoch 3/10, Train Loss: 0.9137, Val Loss: 0.9146
Epoch 4/10, Train Loss: 0.9133, Val Loss: 0.9140
Epoch 5/10, Train Loss: 0.9129, Val Loss: 0.9135
Epoch 6/10, Train Loss: 0.9127, Val Loss: 0.9134
Epoch 7/10, Train Loss: 0.9125, Val Loss: 0.9140
Epoch 8/10, Train Loss: 0.9124, Val Loss: 0.9137
Epoch 9/10, Train Loss: 0.9122, Val Loss: 0.9135
Epoch 10/10, Train Loss: 0.9121, Val Loss: 0.9135


In [98]:
model.eval()
predict = []
with torch.no_grad():
    for X_batch in test_loader:
        X_batch = X_batch.to(device)
        outputs = model(X_batch).detach().cpu().numpy()
        predict.append(outputs)
        
outputs = np.concatenate(predict).reshape(-1)

In [104]:
from sklearn.metrics import root_mean_squared_log_error
y_pred = inverse_y(outputs)

In [105]:
y_pred

array([612.74677, 896.30774, 970.8416 , ..., 898.1336 , 834.6991 ,
       833.2415 ], dtype=float32)