# K리그 패스 예측 - V8 (V7 기반 개선)

## 현재 상황
- V7: Val 13.90
- 앙상블 5개: Val 13.21, Test 13.96
- 1등: 12.25

## 개선 전략
1. **Y축 대칭 데이터 증강** - 학습 시 적용 (TTA 효과 내도록)
2. **더 큰 모델** - d_model 192, n_layers 5
3. **더 많은 에폭 + 강한 정규화**
4. **개선된 Loss** - 거리 직접 최적화 추가
5. **앙상블 10개** - 다양한 시드

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from tqdm import tqdm
import warnings
import gc
import math
import random
import pickle
warnings.filterwarnings('ignore')

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from torch.optim.lr_scheduler import OneCycleLR, CosineAnnealingWarmRestarts
from torch.cuda.amp import autocast, GradScaler
from sklearn.model_selection import train_test_split

DATA_DIR = 'open_track1'
CACHE_DIR = 'cache'
os.makedirs(CACHE_DIR, exist_ok=True)

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: {DEVICE}")

def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(42)

Device: cuda


In [2]:
# 설정 - 더 큰 모델
CONFIG = {
    # 모델 (더 큼)
    'd_model': 192,       # 128 → 192
    'n_heads': 6,         # 4 → 6
    'n_layers': 5,        # 4 → 5
    'd_ff': 384,          # 256 → 384
    'dropout': 0.15,      # 0.1 → 0.15 (더 강한 정규화)
    'max_seq_len': 64,  # 64
    
    # 학습
    'batch_size': 64,     # 128 → 64 (더 큰 모델이라 줄임)
    'lr': 2e-4,           # 5e-4 → 3e-4
    'weight_decay': 0.02, # 0.01 → 0.02
    'epochs': 60,         # 40 → 60
    'patience': 15,       # 10 → 15
    
    # 데이터 증강
    'use_y_flip_augment': True,  # Y축 대칭 증강
    'augment_prob': 0.5,         # 50% 확률로 증강
    
    # 기타
    'use_amp': True,
    'num_workers': 0,
}

N_SEQ_FEATURES = 22
print(f"Config: {CONFIG}")

Config: {'d_model': 192, 'n_heads': 6, 'n_layers': 5, 'd_ff': 384, 'dropout': 0.15, 'max_seq_len': 64, 'batch_size': 64, 'lr': 0.0002, 'weight_decay': 0.02, 'epochs': 60, 'patience': 15, 'use_y_flip_augment': True, 'augment_prob': 0.5, 'use_amp': True, 'num_workers': 0}


In [3]:
# 데이터 로딩
train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
test_df = pd.read_csv(f'{DATA_DIR}/test.csv')
match_info = pd.read_csv(f'{DATA_DIR}/match_info.csv')

all_games = train_df['game_id'].unique()
train_games, val_games = train_test_split(all_games, test_size=0.15, random_state=42)
train_episodes = train_df[train_df['game_id'].isin(train_games)]['game_episode'].unique()
val_episodes = train_df[train_df['game_id'].isin(val_games)]['game_episode'].unique()

print(f"Train: {len(train_episodes)} episodes")
print(f"Val: {len(val_episodes)} episodes")

Train: 13097 episodes
Val: 2338 episodes


## 1. 피처 엔지니어링 (Y축 대칭 지원)

In [4]:
def compute_sequence_features(episode_df, flip_y=False):
    """
    확장된 시퀀스 피처 (22개)
    flip_y=True면 Y좌표를 68-y로 변환
    """
    df = episode_df.copy().reset_index(drop=True)
    n = len(df)
    
    # Y축 반전
    if flip_y:
        df['start_y'] = 68 - df['start_y']
        df['end_y'] = 68 - df['end_y']
    
    pass_indices = df[df['type_name'] == 'Pass'].index.tolist()
    last_pass_idx = pass_indices[-1] if pass_indices else n - 1 # 삼항 연산자
    
    df['end_x_filled'] = df['end_x'].copy()
    df['end_y_filled'] = df['end_y'].copy()
    
    for i in range(n):  # 결측값 채우기
        if pd.isna(df.loc[i, 'end_x_filled']):
            df.loc[i, 'end_x_filled'] = df.loc[i, 'start_x']
        if pd.isna(df.loc[i, 'end_y_filled']):
            df.loc[i, 'end_y_filled'] = df.loc[i, 'start_y']
    # 마지막 패스의 end값을 start값으로 대체(end값을 예측하기 위해서)
    df.loc[last_pass_idx, 'end_x_filled'] = df.loc[last_pass_idx, 'start_x']
    df.loc[last_pass_idx, 'end_y_filled'] = df.loc[last_pass_idx, 'start_y']
    
    # 마지막 패스 직전에 일어난 패스들
    # avg_dx: x좌표 변화량들의 평균, avg_dy: y좌표 변화량들의 평균
    # team_attack_angle: 아크탄젠트를 이용한 팀의 공격 방향, 0에 가까울수록 직선적, 절댓값이 클수록 측면 활용 높고, 백패스
    # team_attack_strength: 공격 강도, 평균적인 패스 길이
    # 든 생각: 피쳐 추출 시에 평균 계산을 많이 하는데, 마지막 패스에 가까울수록 가중치를 줘보자.
    passes_before_last = df[(df['type_name'] == 'Pass') & (df.index < last_pass_idx)]
    if len(passes_before_last) > 0:
        avg_dx = (passes_before_last['end_x_filled'] - passes_before_last['start_x']).mean()
        avg_dy = (passes_before_last['end_y_filled'] - passes_before_last['start_y']).mean()
        team_attack_angle = np.arctan2(avg_dy, avg_dx) / np.pi
        team_attack_strength = np.sqrt(avg_dx**2 + avg_dy**2) / 30
    else:
        team_attack_angle = 0
        team_attack_strength = 0


    # if len(passes_before_last) > 0:
    #     # 1. 인덱스 기반 가중치 생성 (선형 가중치 예시)
    #     # 마지막 패스에 가까울수록(i가 클수록) 큰 값을 가짐
    #     weights = np.arange(len(passes_before_last)) + 1 
    #     weights = weights / weights.sum()  # 합이 1이 되도록 정규화
        
    #     # 2. 가중평균 적용 (np.average 사용)
    #     dx_values = (passes_before_last['end_x_filled'] - passes_before_last['start_x']).values
    #     dy_values = (passes_before_last['end_y_filled'] - passes_before_last['start_y']).values
        
    #     avg_dx = np.average(dx_values, weights=weights)
    #     avg_dy = np.average(dy_values, weights=weights)
        
    #     # 이후 계산은 동일
    #     team_attack_angle = np.arctan2(avg_dy, avg_dx) / np.pi
    #     team_attack_strength = np.sqrt(avg_dx**2 + avg_dy**2) / 30
    # else:
    #     team_attack_angle = 0
    #     team_attack_strength = 0
    
    features = []
    for i in range(n):
        row = df.iloc[i]
        # 마지막 패스인지 확인하는 부호
        # 든 생각: 지금은 모든 행에 대해 피쳐를 계산하는데, carry나 recovery 같은 행은 필터링해서 이상치를 제거하는게 낫지 않을까?
        is_last_pass = (i == last_pass_idx)
        
        # 좌표 정규화
        start_x = row['start_x'] / 105
        start_y = row['start_y'] / 68
        
        # 한 이벤트 내에서 공 이동(패스, non패스 포함)의 이동거리, 각도, 시간 간격, 속도 계산
        if is_last_pass:
            end_x, end_y = start_x, start_y
            distance, angle, speed = 0, 0, 0
        else:
            end_x = row['end_x_filled'] / 105
            end_y = row['end_y_filled'] / 68
            distance = np.sqrt((row['end_x_filled']-row['start_x'])**2 + 
                              (row['end_y_filled']-row['start_y'])**2) / 50
            angle = np.arctan2(row['end_y_filled']-row['start_y'],
                              row['end_x_filled']-row['start_x']) / np.pi
            td = df.iloc[i+1]['time_seconds'] - row['time_seconds'] if i < n-1 else 1
            td = max(td, 0.1)
            speed = min(distance * 50 / td, 50) / 50
        
        # dist_to_goal: 시작 위치에서 골대까지의 거리
        # y_from_center: 경기장 세로 중앙으로부터 y좌표가 떨어진 거리(측면 편차)
        # x_progress: x좌표가 0으로 부터 떨어진 거리
        # position_in_seq: 에피소드 내에서의 행의 위치
        # remaining: 에피소드 내에서 남은 행
        dist_to_goal = np.sqrt((row['start_x']-105)**2 + (row['start_y']-34)**2) / 120
        y_from_center = (row['start_y'] - 34) / 34
        x_progress = row['start_x'] / 105
        position_in_seq = i / max(n-1, 1)
        remaining = (n-1-i) / max(n-1, 1)
        
        # continuity_x: 연속된 두 이벤트 사이에 x좌표 위치 연결성, 값이 크면 데이터 누락
        # continuity_y: 연속된 두 이벤트 사이에 y좌표 위치 연결성, 값이 크면 데이터 누락
        # time_diff:연속된 두 이벤트 사이에 시간 간격, 값이 크면 경기가 중단되었거나 한참 뒤에 일어난 플레이
        # 값이 작으면: 아주 빠른 템포의 플레이 (예: 원터치 패스)
        if i > 0:
            continuity_x = (row['start_x'] - df.iloc[i-1]['end_x_filled']) / 30
            continuity_y = (row['start_y'] - df.iloc[i-1]['end_y_filled']) / 30
            time_diff = (row['time_seconds'] - df.iloc[i-1]['time_seconds']) / 10
        else:
            continuity_x, continuity_y, time_diff = 0, 0, 0
        
        feat = [
            start_x, start_y, end_x, end_y,
            distance, angle, speed,
            dist_to_goal, y_from_center, x_progress,
            position_in_seq, remaining,
            continuity_x, continuity_y, time_diff,
            float(row['is_home']),
            float(row['type_name'] == 'Pass'),
            float(row['type_name'] == 'Carry'),
            float(is_last_pass),
            team_attack_angle,
            team_attack_strength,
            len(passes_before_last) / 20,
        ]
        features.append(feat)
    
    return np.array(features, dtype=np.float32)


def get_last_pass_info(episode_df, flip_y=False):
    df = episode_df.copy().reset_index(drop=True)
    
    if flip_y:
        df['start_y'] = 68 - df['start_y']
        df['end_y'] = 68 - df['end_y']
    
    passes = df[df['type_name'] == 'Pass']
    last_pass = passes.iloc[-1]
    
    sx, sy = last_pass['start_x'], last_pass['start_y']
    
    if len(passes) > 1:
        pp = passes.iloc[:-1].copy()
        pp['end_x'] = pp['end_x'].fillna(pp['start_x'])
        pp['end_y'] = pp['end_y'].fillna(pp['start_y'])
        avg_dist = np.sqrt((pp['end_x']-pp['start_x'])**2 + (pp['end_y']-pp['start_y'])**2).mean()
    else:
        avg_dist = 15
    
    features = np.array([
        sx / 105, sy / 68,
        np.sqrt((sx-105)**2 + (sy-34)**2) / 120,
        np.arctan2(34-sy, 105-sx) / np.pi,
        float(last_pass['is_home']),
        avg_dist / 30,
        len(df) / 50,
        (sy - 34) / 34,
    ], dtype=np.float32)
    
    return features, np.array([sx, sy], dtype=np.float32)


def get_meta_features(game_id, period_id, match_info):
    match = match_info[match_info['game_id'] == game_id]
    if len(match) > 0:
        m = match.iloc[0]
        return np.array([
            (m['home_score'] - m['away_score']) / 5,
            (m['home_score'] + m['away_score']) / 10,
            m['game_day'] / 38,
            period_id / 2
        ], dtype=np.float32)
    return np.array([0, 0, 0.5, period_id / 2], dtype=np.float32)


def get_target(episode_df, start_point, flip_y=False):
    df = episode_df.copy()
    if flip_y:
        df['end_y'] = 68 - df['end_y']
    
    last_pass = df[df['type_name'] == 'Pass'].iloc[-1]
    end_x, end_y = last_pass['end_x'], last_pass['end_y']
    
    return np.array([
        (end_x - start_point[0]) / 50,
        (end_y - start_point[1]) / 50
    ], dtype=np.float32), np.array([end_x, end_y], dtype=np.float32)

In [5]:
def precompute_features(df, episodes, match_info, config, cache_path=None, include_flip=False):
    """
    피처 사전 계산
    include_flip=True면 Y축 반전 버전도 같이 저장
    """
    if cache_path and os.path.exists(cache_path):
        print(f"캐시 로딩: {cache_path}")
        with open(cache_path, 'rb') as f:
            return pickle.load(f)
    
    print("피처 사전 계산 중...")
    precomputed = {}
    max_len = config['max_seq_len']
    
    for ep_name in tqdm(episodes, desc='Precomputing'):
        ep_df = df[df['game_episode'] == ep_name].copy()
        
        # 원본
        seq_feat = compute_sequence_features(ep_df, flip_y=False)
        seq_len = len(seq_feat)
        
        if seq_len > max_len:
            seq_feat = seq_feat[-max_len:]
            mask = np.ones(max_len, dtype=np.float32)
        else:
            pad_len = max_len - seq_len
            seq_feat = np.vstack([np.zeros((pad_len, N_SEQ_FEATURES), dtype=np.float32), seq_feat])
            mask = np.concatenate([np.zeros(pad_len), np.ones(seq_len)]).astype(np.float32)
        
        lp_feat, start_point = get_last_pass_info(ep_df, flip_y=False)
        
        game_id = ep_df['game_id'].iloc[0]
        period_id = ep_df['period_id'].iloc[0]
        meta_feat = get_meta_features(game_id, period_id, match_info)
        
        try:
            target_offset, target_abs = get_target(ep_df, start_point, flip_y=False)
        except:
            target_offset = np.zeros(2, dtype=np.float32)
            target_abs = np.zeros(2, dtype=np.float32)
        
        precomputed[ep_name] = {
            'seq_feat': seq_feat,
            'mask': mask,
            'lp_feat': lp_feat,
            'meta_feat': meta_feat,
            'start_point': start_point,
            'target_offset': target_offset,
            'target_abs': target_abs,
        }
        
        # Y축 반전 버전
        if include_flip:
            seq_feat_flip = compute_sequence_features(ep_df, flip_y=True)
            if len(seq_feat_flip) > max_len:
                seq_feat_flip = seq_feat_flip[-max_len:]
            else:
                pad_len = max_len - len(seq_feat_flip)
                seq_feat_flip = np.vstack([np.zeros((pad_len, N_SEQ_FEATURES), dtype=np.float32), seq_feat_flip])
            
            lp_feat_flip, start_point_flip = get_last_pass_info(ep_df, flip_y=True)
            target_offset_flip, target_abs_flip = get_target(ep_df, start_point_flip, flip_y=True)
            
            precomputed[ep_name]['seq_feat_flip'] = seq_feat_flip
            precomputed[ep_name]['lp_feat_flip'] = lp_feat_flip
            precomputed[ep_name]['start_point_flip'] = start_point_flip
            precomputed[ep_name]['target_offset_flip'] = target_offset_flip
            precomputed[ep_name]['target_abs_flip'] = target_abs_flip
    
    if cache_path:
        with open(cache_path, 'wb') as f:
            pickle.dump(precomputed, f)
    
    return precomputed

# 피처 계산 (증강 포함)
train_cache = precompute_features(
    train_df, train_episodes, match_info, CONFIG,
    cache_path=f'{CACHE_DIR}/train_v8_aug.pkl',
    include_flip=True
)

val_cache = precompute_features(
    train_df, val_episodes, match_info, CONFIG,
    cache_path=f'{CACHE_DIR}/val_v8.pkl',
    include_flip=True  # TTA용
)

print(f"Train: {len(train_cache)}, Val: {len(val_cache)}")

캐시 로딩: cache/train_v8_aug.pkl
캐시 로딩: cache/val_v8.pkl
Train: 13097, Val: 2338


## 2. 데이터셋 (증강 지원)

In [6]:
class AugmentedDataset(Dataset):
    """
    Y축 대칭 증강을 지원하는 데이터셋
    """
    def __init__(self, precomputed, episodes, augment_prob=0.5, is_train=True):
        self.precomputed = precomputed
        self.episodes = list(episodes)
        self.augment_prob = augment_prob
        self.is_train = is_train
    
    def __len__(self):
        return len(self.episodes)
    
    def __getitem__(self, idx):
        ep_name = self.episodes[idx]
        data = self.precomputed[ep_name]
        
        # 학습 중 augment_prob 확률로 Y축 반전 사용
        use_flip = self.is_train and random.random() < self.augment_prob and 'seq_feat_flip' in data
        
        if use_flip:
            return {
                'sequence': torch.from_numpy(data['seq_feat_flip']),
                'seq_mask': torch.from_numpy(data['mask']),
                'last_pass': torch.from_numpy(data['lp_feat_flip']),
                'meta': torch.from_numpy(data['meta_feat']),
                'start_point': torch.from_numpy(data['start_point_flip']),
                'target_offset': torch.from_numpy(data['target_offset_flip']),
                'target_abs': torch.from_numpy(data['target_abs_flip']),
            }
        else:
            return {
                'sequence': torch.from_numpy(data['seq_feat']),
                'seq_mask': torch.from_numpy(data['mask']),
                'last_pass': torch.from_numpy(data['lp_feat']),
                'meta': torch.from_numpy(data['meta_feat']),
                'start_point': torch.from_numpy(data['start_point']),
                'target_offset': torch.from_numpy(data['target_offset']),
                'target_abs': torch.from_numpy(data['target_abs']),
            }

## 3. 모델 (더 크고 깊게)

In [7]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=100, dropout=0.1):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe.unsqueeze(0))
    
    def forward(self, x):
        return self.dropout(x + self.pe[:, :x.size(1)])


class ImprovedModel(nn.Module):
    """
    V7 기반 개선 모델
    - 더 큰 d_model
    - 더 깊은 Transformer
    - Residual connections 강화
    """
    def __init__(self, config):
        super().__init__()
        d = config['d_model']
        dropout = config['dropout']
        
        # 시퀀스 인코더
        self.seq_embedding = nn.Sequential(
            nn.Linear(N_SEQ_FEATURES, d),
            nn.LayerNorm(d),
            nn.GELU(),
            nn.Dropout(dropout)
        )
        self.pos_encoding = PositionalEncoding(d, config['max_seq_len'], dropout)
        
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d, nhead=config['n_heads'],
            dim_feedforward=config['d_ff'],
            dropout=dropout, activation='gelu', batch_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=config['n_layers'])
        
        # 마지막 패스 인코더 (더 깊게)
        self.lp_encoder = nn.Sequential(
            nn.Linear(8, d // 2),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(d // 2, d),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(d, d),
            nn.LayerNorm(d)
        )
        
        # 메타 인코더
        self.meta_encoder = nn.Sequential(
            nn.Linear(4, 64),
            nn.GELU(),
            nn.Linear(64, 64)
        )
        
        # Cross-Attention
        self.cross_attn = nn.MultiheadAttention(d, config['n_heads'], 
                                                dropout=dropout, batch_first=True)
        self.cross_norm = nn.LayerNorm(d)
        
        # Fusion (더 깊게)
        fusion_dim = d * 2 + 64   #64
        self.fusion = nn.Sequential(
            nn.Linear(fusion_dim, 512),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(512, 256),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(256, 128),
            nn.GELU(),
            nn.Dropout(dropout)
        )
        
        # 출력 헤드
        self.regressor = nn.Sequential(
            nn.Linear(128, 64),
            nn.GELU(),
            nn.Linear(64, 2)
        )
    
    def forward(self, sequence, seq_mask, last_pass, meta):
        B = sequence.size(0)
        
        # Transformer
        seq_emb = self.pos_encoding(self.seq_embedding(sequence))
        padding_mask = (seq_mask == 0)
        encoded = self.transformer(seq_emb, src_key_padding_mask=padding_mask)
        
        # 마지막 토큰
        lengths = seq_mask.sum(dim=1).long().clamp(min=1) - 1
        seq_out = encoded[torch.arange(B, device=encoded.device), lengths]
        
        # Cross-Attention
        lp_emb = self.lp_encoder(last_pass)
        cross_out, _ = self.cross_attn(lp_emb.unsqueeze(1), encoded, encoded,
                                       key_padding_mask=padding_mask)
        cross_out = self.cross_norm(cross_out.squeeze(1) + lp_emb)
        
        # Meta
        meta_emb = self.meta_encoder(meta)
        
        # Fusion
        combined = torch.cat([seq_out, cross_out, meta_emb], dim=1)
        fused = self.fusion(combined)
        
        return self.regressor(fused)

## 4. 손실 함수 (개선)

In [8]:
class ImprovedLoss(nn.Module):
    """
    개선된 손실 함수
    - MSE + Smooth L1
    - 방향 손실 (cosine)
    - 거리 손실 (직접 최적화)
    """
    def __init__(self, direction_weight=0.2, distance_weight=0.3):
        super().__init__()
        self.direction_weight = direction_weight
        self.distance_weight = distance_weight
    
    def forward(self, pred_offset, target_offset, start_point, target_abs):
        # 1. MSE + Smooth L1
        mse_loss = F.mse_loss(pred_offset, target_offset)
        smooth_loss = F.smooth_l1_loss(pred_offset, target_offset)
        
        # 2. 방향 손실
        pred_norm = torch.norm(pred_offset, dim=1, keepdim=True).clamp(min=1e-8)
        target_norm = torch.norm(target_offset, dim=1, keepdim=True).clamp(min=1e-8)
        cos_sim = (pred_offset * target_offset).sum(dim=1) / (pred_norm.squeeze() * target_norm.squeeze() + 1e-8)
        direction_loss = (1 - cos_sim).mean()
        
        # 3. 거리 손실 (절대 좌표로 직접 계산)
        pred_abs = start_point + pred_offset * 50
        dist_error = torch.sqrt(((pred_abs - target_abs) ** 2).sum(dim=1) + 1e-8)
        distance_loss = dist_error.mean() / 50  # 정규화
        
        # 총 손실
        total_loss = (
            mse_loss + 0.5 * smooth_loss +
            self.direction_weight * direction_loss +
            self.distance_weight * distance_loss
        )
        
        return total_loss, dist_error.mean().item()

## 5. 학습

In [9]:
def train_one_model(config, train_cache, val_cache, train_episodes, val_episodes, 
                    device, seed, model_name='model'):
    """단일 모델 학습"""
    set_seed(seed)
    
    # 데이터셋
    train_ds = AugmentedDataset(train_cache, train_episodes, 
                                augment_prob=config['augment_prob'], is_train=True)
    val_ds = AugmentedDataset(val_cache, val_episodes, 
                              augment_prob=0, is_train=False)
    
    train_loader = DataLoader(train_ds, batch_size=config['batch_size'], 
                             shuffle=True, num_workers=config['num_workers'], pin_memory=True)
    val_loader = DataLoader(val_ds, batch_size=config['batch_size'],
                           shuffle=False, num_workers=config['num_workers'], pin_memory=True)
    
    # 모델
    model = ImprovedModel(config).to(device)
    optimizer = AdamW(model.parameters(), lr=config['lr'], weight_decay=config['weight_decay'])
    scheduler = OneCycleLR(optimizer, max_lr=config['lr'],
                          total_steps=len(train_loader) * config['epochs'], pct_start=0.1)
    criterion = ImprovedLoss(direction_weight=0.2, distance_weight=0.3)
    scaler = GradScaler() if config['use_amp'] else None
    
    best_val = float('inf')
    best_state = None
    patience_cnt = 0
    
    print(f"\n{'='*50}")
    print(f"Training {model_name} (seed={seed})")
    print(f"{'='*50}")
    
    for epoch in range(config['epochs']):
        # Train
        model.train()
        train_dist = 0
        
        for batch in tqdm(train_loader, desc=f'Epoch {epoch+1}', leave=False):
            seq = batch['sequence'].to(device)
            mask = batch['seq_mask'].to(device)
            lp = batch['last_pass'].to(device)
            meta = batch['meta'].to(device)
            start = batch['start_point'].to(device)
            target_offset = batch['target_offset'].to(device)
            target_abs = batch['target_abs'].to(device)
            
            optimizer.zero_grad()
            
            if config['use_amp']:
                with autocast():
                    pred = model(seq, mask, lp, meta)
                    loss, dist = criterion(pred, target_offset, start, target_abs)
                scaler.scale(loss).backward()
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                scaler.step(optimizer)
                scaler.update()
            else:
                pred = model(seq, mask, lp, meta)
                loss, dist = criterion(pred, target_offset, start, target_abs)
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
            
            scheduler.step()
            train_dist += dist
        
        train_dist /= len(train_loader)
        
        # Validate (with TTA)
        model.eval()
        val_dist = 0
        val_dist_tta = 0
        
        with torch.no_grad():
            for batch in val_loader:
                seq = batch['sequence'].to(device)
                mask = batch['seq_mask'].to(device)
                lp = batch['last_pass'].to(device)
                meta = batch['meta'].to(device)
                start = batch['start_point'].to(device)
                target_abs = batch['target_abs'].to(device)
                
                # 원본 예측
                pred = model(seq, mask, lp, meta)
                pred_abs = start + pred * 50
                dist = torch.sqrt(((pred_abs - target_abs) ** 2).sum(dim=1) + 1e-8).mean()
                val_dist += dist.item()
        
        val_dist /= len(val_loader)
        
        print(f"Epoch {epoch+1:2d} | Train: {train_dist:.2f} | Val: {val_dist:.2f}", end='')
        
        if val_dist < best_val:
            best_val = val_dist
            best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}
            patience_cnt = 0
            print(f" ← Best!")
        else:
            patience_cnt += 1
            print()
        
        if patience_cnt >= config['patience']:
            print(f"Early stopping at epoch {epoch+1}")
            break
    
    model.load_state_dict(best_state)
    torch.save(best_state, f'{model_name}.pth')
    
    print(f"\n{model_name} 완료! Best Val: {best_val:.2f}")
    
    return model, best_val

In [10]:
# 단일 모델 먼저 테스트
model, val_dist = train_one_model(
    CONFIG, train_cache, val_cache, train_episodes, val_episodes,
    DEVICE, seed=42, model_name='v8_model_1'
)

print(f"\n단일 모델 Val: {val_dist:.2f}")
print(f"기존 V7: 13.90")
print(f"개선: {13.90 - val_dist:.2f}")


Training v8_model_1 (seed=42)


                                                          

Epoch  1 | Train: 18.47 | Val: 18.16 ← Best!


                                                          

Epoch  2 | Train: 17.85 | Val: 17.83 ← Best!


                                                          

Epoch  3 | Train: 17.36 | Val: 17.65 ← Best!


                                                          

RuntimeError: CUDA error: an illegal memory access was encountered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


## 6. 앙상블 (10개)

In [None]:
# 앙상블 학습 (10개)
SEEDS = [42, 123, 456, 789, 2024, 777, 888, 999, 1234, 5678]
models = []
val_scores = []

for i, seed in enumerate(SEEDS):
    model, val_dist = train_one_model(
        CONFIG, train_cache, val_cache, train_episodes, val_episodes,
        DEVICE, seed=seed, model_name=f'v8_model_{i+1}'
    )
    models.append(model)
    val_scores.append(val_dist)
    gc.collect()
    torch.cuda.empty_cache() if DEVICE.type == 'cuda' else None

print(f"\n{'='*50}")
print(f"개별 모델 Val:")
for i, (seed, score) in enumerate(zip(SEEDS, val_scores)):
    print(f"  Model {i+1} (seed={seed}): {score:.2f}")
print(f"평균: {np.mean(val_scores):.2f}")


Training v8_model_1 (seed=42)


                                                          

Epoch  1 | Train: 18.47 | Val: 18.16 ← Best!


                                                          

Epoch  2 | Train: 17.85 | Val: 17.83 ← Best!


                                                          

Epoch  3 | Train: 17.36 | Val: 17.65 ← Best!


                                                          

Epoch  4 | Train: 17.11 | Val: 16.88 ← Best!


                                                         

KeyboardInterrupt: 

In [None]:
# 앙상블 Val 검증
@torch.no_grad()
def validate_ensemble(models, val_cache, val_episodes, device, use_tta=True):
    for m in models:
        m.eval()
    
    all_dists = []
    
    for ep_name in tqdm(val_episodes, desc='Ensemble Val'):
        data = val_cache[ep_name]
        
        # 원본 피처
        seq = torch.from_numpy(data['seq_feat']).unsqueeze(0).to(device)
        mask = torch.from_numpy(data['mask']).unsqueeze(0).to(device)
        lp = torch.from_numpy(data['lp_feat']).unsqueeze(0).to(device)
        meta = torch.from_numpy(data['meta_feat']).unsqueeze(0).to(device)
        start = data['start_point']
        target = data['target_abs']
        
        # 원본 앙상블 예측
        preds_orig = []
        for model in models:
            pred = model(seq, mask, lp, meta)[0].cpu().numpy()
            pred_abs = start + pred * 50
            preds_orig.append(pred_abs)
        ensemble_orig = np.mean(preds_orig, axis=0)
        
        if use_tta and 'seq_feat_flip' in data:
            # 반전 피처
            seq_flip = torch.from_numpy(data['seq_feat_flip']).unsqueeze(0).to(device)
            lp_flip = torch.from_numpy(data['lp_feat_flip']).unsqueeze(0).to(device)
            start_flip = data['start_point_flip']
            
            preds_flip = []
            for model in models:
                pred = model(seq_flip, mask, lp_flip, meta)[0].cpu().numpy()
                pred_abs = start_flip + pred * 50
                pred_abs[1] = 68 - pred_abs[1]  # Y 다시 반전
                preds_flip.append(pred_abs)
            ensemble_flip = np.mean(preds_flip, axis=0)
            
            # TTA 평균
            final_pred = (ensemble_orig + ensemble_flip) / 2
        else:
            final_pred = ensemble_orig
        
        dist = np.sqrt((final_pred[0] - target[0])**2 + (final_pred[1] - target[1])**2)
        all_dists.append(dist)
    
    return np.mean(all_dists)

# 앙상블 검증
ensemble_val = validate_ensemble(models, val_cache, val_episodes, DEVICE, use_tta=False)
ensemble_val_tta = validate_ensemble(models, val_cache, val_episodes, DEVICE, use_tta=True)

print(f"\n{'='*50}")
print(f"앙상블 결과:")
print(f"  앙상블 (No TTA): {ensemble_val:.2f}")
print(f"  앙상블 (TTA):    {ensemble_val_tta:.2f}")
print(f"\n기존 최고 (앙상블 5개): 13.21")
print(f"1등: 12.25")

Ensemble Val: 100%|██████████| 2338/2338 [02:56<00:00, 13.26it/s]
Ensemble Val: 100%|██████████| 2338/2338 [05:51<00:00,  6.64it/s]


앙상블 결과:
  앙상블 (No TTA): 13.49
  앙상블 (TTA):    13.51

기존 최고 (앙상블 5개): 13.21
1등: 12.25





## 7. Test 예측

In [None]:
@torch.no_grad()
def predict_test(models, test_df, match_info, config, device, use_tta=True, test_dir=f'{DATA_DIR}/test'):
    for m in models:
        m.eval()
    
    predictions = []
    max_len = config['max_seq_len']
    
    for idx, row in tqdm(test_df.iterrows(), total=len(test_df), desc='Test'):
        path = row['path'].replace('./test/', f'{test_dir}/')
        ep_df = pd.read_csv(path)
        ep_name = row['game_episode']
        
        # 원본 피처
        seq_feat = compute_sequence_features(ep_df, flip_y=False)
        seq_len = len(seq_feat)
        if seq_len > max_len:
            seq_feat = seq_feat[-max_len:]
            mask = np.ones(max_len, dtype=np.float32)
        else:
            pad_len = max_len - seq_len
            seq_feat = np.vstack([np.zeros((pad_len, N_SEQ_FEATURES), dtype=np.float32), seq_feat])
            mask = np.concatenate([np.zeros(pad_len), np.ones(seq_len)]).astype(np.float32)
        
        lp_feat, start = get_last_pass_info(ep_df, flip_y=False)
        meta_feat = get_meta_features(ep_df['game_id'].iloc[0], ep_df['period_id'].iloc[0], match_info)
        
        seq = torch.from_numpy(seq_feat).unsqueeze(0).to(device)
        mask_t = torch.from_numpy(mask).unsqueeze(0).to(device)
        lp = torch.from_numpy(lp_feat).unsqueeze(0).to(device)
        meta = torch.from_numpy(meta_feat).unsqueeze(0).to(device)
        
        # 원본 앙상블
        preds_orig = []
        for model in models:
            pred = model(seq, mask_t, lp, meta)[0].cpu().numpy()
            pred_abs = start + pred * 50
            preds_orig.append(pred_abs)
        ensemble_orig = np.mean(preds_orig, axis=0)
        
        if use_tta:
            # 반전 피처
            seq_feat_flip = compute_sequence_features(ep_df, flip_y=True)
            if len(seq_feat_flip) > max_len:
                seq_feat_flip = seq_feat_flip[-max_len:]
            else:
                pad_len = max_len - len(seq_feat_flip)
                seq_feat_flip = np.vstack([np.zeros((pad_len, N_SEQ_FEATURES), dtype=np.float32), seq_feat_flip])
            
            lp_feat_flip, start_flip = get_last_pass_info(ep_df, flip_y=True)
            
            seq_flip = torch.from_numpy(seq_feat_flip).unsqueeze(0).to(device)
            lp_flip = torch.from_numpy(lp_feat_flip).unsqueeze(0).to(device)
            
            preds_flip = []
            for model in models:
                pred = model(seq_flip, mask_t, lp_flip, meta)[0].cpu().numpy()
                pred_abs = start_flip + pred * 50
                pred_abs[1] = 68 - pred_abs[1]
                preds_flip.append(pred_abs)
            ensemble_flip = np.mean(preds_flip, axis=0)
            
            final_pred = (ensemble_orig + ensemble_flip) / 2
        else:
            final_pred = ensemble_orig
        
        predictions.append({
            'game_episode': ep_name,
            'end_x': np.clip(final_pred[0], 0, 105),
            'end_y': np.clip(final_pred[1], 0, 68)
        })
    
    return pd.DataFrame(predictions)

# Test 예측
submission = predict_test(models, test_df, match_info, CONFIG, DEVICE, use_tta=True)
submission.to_csv('submission_v8.csv', index=False)

print("\n제출 파일 저장!")
print(submission.head())

Test: 100%|██████████| 2414/2414 [06:32<00:00,  6.15it/s]


제출 파일 저장!
  game_episode      end_x      end_y
0     153363_1  60.445557  22.038307
1     153363_2  24.488165  47.759064
2     153363_6  40.602531  61.833672
3     153363_7  60.556396  11.537536
4     153363_8  81.772652   9.559481





In [None]:
# 최종 요약
print("\n" + "="*60)
print("V8 최종 요약")
print("="*60)
print(f"\n모델 구조:")
print(f"  d_model: {CONFIG['d_model']}")
print(f"  n_layers: {CONFIG['n_layers']}")
print(f"  n_heads: {CONFIG['n_heads']}")
print(f"  dropout: {CONFIG['dropout']}")
print(f"\n개선사항:")
print(f"  ✓ Y축 대칭 데이터 증강 (학습 시 50%)")
print(f"  ✓ 더 큰 모델 (128→192, 4→5 layers)")
print(f"  ✓ 강한 정규화 (dropout 0.15)")
print(f"  ✓ 거리 직접 최적화 Loss")
print(f"  ✓ 앙상블 10개 + TTA")
print(f"\n결과:")
print(f"  개별 모델 평균: {np.mean(val_scores):.2f}")
print(f"  앙상블 (No TTA): {ensemble_val:.2f}")
print(f"  앙상블 (TTA): {ensemble_val_tta:.2f}")
print(f"\n비교:")
print(f"  V7 단일: 13.90")
print(f"  기존 앙상블 5개: 13.21")
print(f"  1등: 12.25")


V8 최종 요약

모델 구조:
  d_model: 192
  n_layers: 5
  n_heads: 6
  dropout: 0.15

개선사항:
  ✓ Y축 대칭 데이터 증강 (학습 시 50%)
  ✓ 더 큰 모델 (128→192, 4→5 layers)
  ✓ 강한 정규화 (dropout 0.15)
  ✓ 거리 직접 최적화 Loss
  ✓ 앙상블 10개 + TTA

결과:
  개별 모델 평균: 13.60
  앙상블 (No TTA): 13.49
  앙상블 (TTA): 13.51

비교:
  V7 단일: 13.90
  기존 앙상블 5개: 13.21
  1등: 12.25
