In [None]:
import os
import pandas as pd
import polars as pl
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, r2_score
import optuna
from optuna.trial import Trial
import warnings
import gc
from torch.utils.data import DataLoader, TensorDataset

# Suppress every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

# Use CUDA when torch reports it as available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# device = torch.device('cpu')
# NOTE(review): the text inside the print literals below is mis-encoded
# (mojibake) in this file; it is left untouched because it is runtime output.
print(f"–ò—Å–ø–æ–ª—å–∑—É–µ—Ç—Å—è —É—Å—Ç—Ä–æ–π—Å—Ç–≤–æ: {device}")
print(f"–î–æ—Å—Ç—É–ø–Ω–æ GPU: {torch.cuda.device_count()}")
if torch.cuda.is_available():
    print(f"–ù–∞–∑–≤–∞–Ω–∏–µ GPU: {torch.cuda.get_device_name(0)}")

# NOTE(review): presumably the number of target columns to predict; it is not
# referenced in the cells visible here - confirm against the training code.
NUM_TARGET_COLUMNS = 424

# Global variables (mutable notebook-wide state).
# Only `base_cols` and `feature_cols` are used by the cells visible here
# (see prepare_features); the purpose of the others is inferred from their
# names and should be confirmed against the cells that assign them.
models = []
scaler = None
feature_cols = None  # cached by prepare_features on its first call
base_cols = None  # read by prepare_features to pick the base columns
is_initialized = False
model_val_losses = []
optuna_study = None
best_hyperparams = None
ensemble_weights_global = None

In [None]:
class SmartScaler:
    """Standard scaler whose statistics are fitted on outlier-clipped data.

    ``fit`` replaces every value lying more than ``n_sigmas`` standard
    deviations from its column mean with that mean and fits a
    ``StandardScaler`` on the result.  ``transform`` applies the fitted
    scaler to the data as given (no outlier replacement at transform time).

    Attributes:
        scaler: the wrapped ``StandardScaler``.
        feature_importance: per-column standard deviation of the cleaned
            training data; only populated when ``fit`` receives ``y``.
    """

    def __init__(self):
        self.scaler = StandardScaler()
        self.feature_importance = None

    def fit(self, X, y=None):
        """Fit the wrapped scaler on an outlier-cleaned copy of ``X``.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).
            y: optional; its values are never read, it only switches on the
                computation of ``feature_importance``.

        Returns:
            self
        """
        # Detect and neutralise outliers before estimating mean/variance.
        X_clean = self._remove_outliers(X)
        self.scaler.fit(X_clean)

        # Rough feature-importance proxy: spread of each cleaned column.
        if y is not None:
            self.feature_importance = np.std(X_clean, axis=0)

        return self

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return the scaled ``X`` in one call."""
        self.fit(X, y)
        return self.transform(X)

    def _remove_outliers(self, X, n_sigmas=3):
        """Return a float copy of ``X`` with n-sigma outliers set to the column mean.

        Args:
            X: 2-D array-like (NumPy array, DataFrame, nested list).
            n_sigmas: half-width of the accepted band in standard deviations.

        Returns:
            A new floating-point ndarray; ``X`` itself is never modified.
        """
        X_arr = np.asarray(X)
        # The replacement value is a float mean, so an integer buffer would
        # silently truncate it; promote non-float input to float64.
        if not np.issubdtype(X_arr.dtype, np.floating):
            X_arr = X_arr.astype(np.float64)
        X_clean = X_arr.copy()

        for i in range(X_arr.shape[1]):
            col = X_arr[:, i]
            std = np.std(col)
            # Skip (near-)constant columns; a NaN std also fails this test.
            if std > 1e-6:
                mean = np.mean(col)
                mask = (col >= mean - n_sigmas * std) & (col <= mean + n_sigmas * std)
                X_clean[~mask, i] = mean
        return X_clean

    def transform(self, X):
        """Scale ``X`` with the statistics learned in ``fit``."""
        return self.scaler.transform(X)

In [None]:
class ImprovedModel(nn.Module):
    """Configurable MLP with normalisation, dropout and a linear skip path.

    Each hidden stage is ``Linear -> LayerNorm/BatchNorm1d -> activation ->
    Dropout``.  When ``input_size != output_size`` a linear projection of the
    raw input, scaled by 0.1, is added to the output.

    Args:
        input_size: number of input features.
        output_size: number of outputs.
        hidden_sizes: width of every hidden stage.
        dropout_rates: dropout probability of every hidden stage; must have
            the same length as ``hidden_sizes``.
        use_layer_norm: ``True`` for ``LayerNorm``, ``False`` for ``BatchNorm1d``.
        activation: ``'relu'`` selects ``ReLU``; any other value selects
            ``LeakyReLU(0.1)``.

    Raises:
        ValueError: if ``hidden_sizes`` and ``dropout_rates`` differ in length
            (previously the longer one was silently truncated by ``zip``).
    """
    def __init__(self, input_size, output_size, hidden_sizes=(512, 256, 128),
                 dropout_rates=(0.3, 0.2, 0.1), use_layer_norm=True, activation='relu'):
        super().__init__()

        # Materialise once so generators and tuples are handled uniformly.
        hidden_sizes = list(hidden_sizes)
        dropout_rates = list(dropout_rates)
        if len(hidden_sizes) != len(dropout_rates):
            raise ValueError(
                f"hidden_sizes has {len(hidden_sizes)} entries but dropout_rates "
                f"has {len(dropout_rates)}; they must match"
            )

        layers = []
        prev_size = input_size

        for hidden_size, dropout_rate in zip(hidden_sizes, dropout_rates):
            layers.extend([
                nn.Linear(prev_size, hidden_size),
                nn.LayerNorm(hidden_size) if use_layer_norm else nn.BatchNorm1d(hidden_size),
                nn.ReLU() if activation == 'relu' else nn.LeakyReLU(0.1),
                nn.Dropout(dropout_rate)
            ])
            prev_size = hidden_size

        self.features = nn.Sequential(*layers)
        self.output = nn.Linear(prev_size, output_size)

        # Skip connection from the input straight to the output; omitted when
        # both sizes are equal (in that case no skip term is added at all).
        self.skip = nn.Linear(input_size, output_size) if input_size != output_size else None

    def forward(self, x):
        """Return predictions for ``x``, whose last dimension is ``input_size``."""
        features = self.features(x)
        output = self.output(features)

        if self.skip is not None:
            output = output + 0.1 * self.skip(x)  # small weight for the skip path

        return output

In [None]:
class ResidualBlock(nn.Module):
    """Width-preserving residual block: ``x + Dropout(net(x))``.

    ``net`` is ``Linear -> LayerNorm -> ReLU -> Dropout -> Linear -> LayerNorm``
    with every layer ``size`` wide.

    Args:
        size: feature width of both input and output.
        dropout: probability used by the inner and the outer dropout.
    """
    def __init__(self, size, dropout=0.2):
        super().__init__()
        transform_layers = [
            nn.Linear(size, size),
            nn.LayerNorm(size),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(size, size),
            nn.LayerNorm(size),
        ]
        self.net = nn.Sequential(*transform_layers)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Add the (dropped-out) transformed signal back onto the input."""
        residual = self.net(x)
        residual = self.dropout(residual)
        return x + residual

In [None]:
class AdvancedEnsembleModel(nn.Module):
    """Mixture-of-experts regressor with a softmax gate.

    ``num_experts`` ``ImprovedModel`` instances each map the input to a
    256-wide representation.  A small gating network produces one softmax
    weight per expert and sample; the weighted sum of the expert outputs is
    passed through a two-layer output head.

    Args:
        input_size: number of input features.
        output_size: number of outputs.
        num_experts: how many expert sub-networks to build.
    """
    def __init__(self, input_size, output_size, num_experts=3):
        super().__init__()

        self.num_experts = num_experts

        expert_modules = []
        for _ in range(num_experts):
            expert_modules.append(ImprovedModel(input_size, 256, [512, 384], [0.3, 0.25]))
        self.experts = nn.ModuleList(expert_modules)

        gate_layers = [
            nn.Linear(input_size, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, num_experts),
            nn.Softmax(dim=1),
        ]
        self.gate = nn.Sequential(*gate_layers)

        head_layers = [
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(128, output_size),
        ]
        self.output = nn.Sequential(*head_layers)

    def forward(self, x):
        """Blend the expert outputs with the gate weights and decode them."""
        weights = self.gate(x)  # [batch_size, num_experts]
        per_expert = [expert(x) for expert in self.experts]  # each [batch_size, 256]

        # Accumulate in expert order; each [batch_size, 1] weight column
        # broadcasts across the 256 features of its expert.
        mixed = torch.zeros_like(per_expert[0])
        for idx in range(self.num_experts):
            mixed = mixed + weights[:, idx:idx + 1] * per_expert[idx]

        return self.output(mixed)

In [None]:
class TransformerModel(nn.Module):
    """Transformer-encoder regressor operating on a single token per sample.

    The input vector is projected to 512 features, wrapped as a sequence of
    length one, run through ``num_layers`` ``TransformerEncoderLayer``s and
    decoded by a two-layer head.  Because the sequence length is always 1,
    self-attention only ever attends to that one token.

    Args:
        input_size: number of input features.
        output_size: number of outputs.
        num_heads: attention heads per encoder layer (must divide 512).
        num_layers: number of stacked encoder layers.
    """
    def __init__(self, input_size, output_size, num_heads=8, num_layers=2):
        super().__init__()

        model_dim = 512
        self.input_proj = nn.Linear(input_size, model_dim)

        layer_template = nn.TransformerEncoderLayer(
            d_model=model_dim,
            nhead=num_heads,
            dim_feedforward=1024,
            dropout=0.1,
            batch_first=True,
        )
        self.transformer = nn.TransformerEncoder(layer_template, num_layers=num_layers)

        head_layers = [
            nn.Linear(model_dim, 256),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(256, output_size),
        ]
        self.output = nn.Sequential(*head_layers)

    def forward(self, x):
        """Map ``[batch_size, input_size]`` to ``[batch_size, output_size]``."""
        projected = self.input_proj(x)
        tokens = torch.unsqueeze(projected, 1)  # add sequence dim: [batch_size, 1, 512]
        encoded = self.transformer(tokens)
        pooled = torch.squeeze(encoded, 1)  # drop sequence dim: [batch_size, 512]
        return self.output(pooled)

In [None]:
class SmartScaler:
    """Standard scaler whose statistics are fitted on outlier-clipped data.

    ``fit`` replaces every value lying more than ``n_sigmas`` standard
    deviations from its column mean with that mean and fits a
    ``StandardScaler`` on the result.  ``transform`` applies the fitted
    scaler to the data as given (no outlier replacement at transform time).

    Attributes:
        scaler: the wrapped ``StandardScaler``.
        feature_importance: per-column standard deviation of the cleaned
            training data; only populated when ``fit`` receives ``y``.
    """

    def __init__(self):
        self.scaler = StandardScaler()
        self.feature_importance = None

    def fit(self, X, y=None):
        """Fit the wrapped scaler on an outlier-cleaned copy of ``X``.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).
            y: optional; its values are never read, it only switches on the
                computation of ``feature_importance``.

        Returns:
            self
        """
        # Detect and neutralise outliers before estimating mean/variance.
        X_clean = self._remove_outliers(X)
        self.scaler.fit(X_clean)

        # Rough feature-importance proxy: spread of each cleaned column.
        if y is not None:
            self.feature_importance = np.std(X_clean, axis=0)

        return self

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return the scaled ``X`` in one call."""
        self.fit(X, y)
        return self.transform(X)

    def _remove_outliers(self, X, n_sigmas=3):
        """Return a float copy of ``X`` with n-sigma outliers set to the column mean.

        Args:
            X: 2-D array-like (NumPy array, DataFrame, nested list).
            n_sigmas: half-width of the accepted band in standard deviations.

        Returns:
            A new floating-point ndarray; ``X`` itself is never modified.
        """
        X_arr = np.asarray(X)
        # The replacement value is a float mean, so an integer buffer would
        # silently truncate it; promote non-float input to float64.
        if not np.issubdtype(X_arr.dtype, np.floating):
            X_arr = X_arr.astype(np.float64)
        X_clean = X_arr.copy()

        for i in range(X_arr.shape[1]):
            col = X_arr[:, i]
            std = np.std(col)
            # Skip (near-)constant columns; a NaN std also fails this test.
            if std > 1e-6:
                mean = np.mean(col)
                mask = (col >= mean - n_sigmas * std) & (col <= mean + n_sigmas * std)
                X_clean[~mask, i] = mean
        return X_clean

    def transform(self, X):
        """Scale ``X`` with the statistics learned in ``fit``."""
        return self.scaler.transform(X)

# ==========================================
# IMPROVED MODEL ARCHITECTURES
# ==========================================

class Model1_Deep(nn.Module):
    """Deep feed-forward network regularised with LayerNorm and dropout.

    Stages: 512 -> 384 -> 256 (each ``Linear -> LayerNorm -> ReLU -> Dropout``),
    then 128 (``Linear -> ReLU -> Dropout``, no normalisation) and a final
    linear layer to ``output_size``.

    Args:
        input_size: number of input features.
        output_size: number of outputs.
    """
    def __init__(self, input_size, output_size):
        super().__init__()

        # (width, use LayerNorm, dropout probability) for every hidden stage.
        stage_specs = [
            (512, True, 0.3),
            (384, True, 0.25),
            (256, True, 0.2),
            (128, False, 0.15),
        ]

        stack = []
        fan_in = input_size
        for fan_out, with_norm, drop_p in stage_specs:
            stack.append(nn.Linear(fan_in, fan_out))
            if with_norm:
                stack.append(nn.LayerNorm(fan_out))  # LayerNorm rather than BatchNorm
            stack.append(nn.ReLU())
            stack.append(nn.Dropout(drop_p))
            fan_in = fan_out
        stack.append(nn.Linear(fan_in, output_size))

        self.net = nn.Sequential(*stack)

    def forward(self, x):
        """Run ``x`` through the sequential stack."""
        return self.net(x)

class Model2_Wide(nn.Module):
    """Wide three-stage MLP with a linear shortcut from the input.

    Stages 768 -> 512 -> 384 each apply ``Linear -> LayerNorm -> ReLU ->
    Dropout``.  A separate linear projection of the raw input to 384 features
    is added to the last hidden activation before the output layer.

    Args:
        input_size: number of input features.
        output_size: number of outputs.
    """
    def __init__(self, input_size, output_size):
        super().__init__()

        # (in, out, dropout) per stage; the ``bn*`` attributes hold LayerNorm
        # modules (the name is kept so existing checkpoints still load).
        stage_specs = (
            (input_size, 768, 0.4),
            (768, 512, 0.3),
            (512, 384, 0.2),
        )
        for stage, (fan_in, fan_out, drop_p) in enumerate(stage_specs, start=1):
            setattr(self, f'fc{stage}', nn.Linear(fan_in, fan_out))
            setattr(self, f'bn{stage}', nn.LayerNorm(fan_out))
            setattr(self, f'drop{stage}', nn.Dropout(drop_p))

        self.output = nn.Linear(384, output_size)
        self.skip = nn.Linear(input_size, 384)
        self.activation = nn.ReLU()

    def forward(self, x):
        """Return predictions for ``x`` of shape ``[batch, input_size]``."""
        shortcut = self.skip(x)

        hidden = x
        stages = (
            (self.fc1, self.bn1, self.drop1),
            (self.fc2, self.bn2, self.drop2),
            (self.fc3, self.bn3, self.drop3),
        )
        for linear, norm, drop in stages:
            hidden = drop(self.activation(norm(linear(hidden))))

        return self.output(hidden + shortcut)

class Model3_Residual(nn.Module):
    """Residual MLP built from three pre-activation BatchNorm blocks.

    The input is projected to 384 features and batch-normalised, then three
    identical residual branches (``BN -> ReLU -> Dropout -> Linear`` twice)
    are added onto the running activation.  The head is
    ``BN -> ReLU -> Linear(384, 256) -> ReLU -> Linear(256, output_size)``.

    Args:
        input_size: number of input features.
        output_size: number of outputs.
    """

    @staticmethod
    def _pre_activation_branch(width, drop_p):
        """Build one pre-activation residual branch of constant ``width``."""
        return nn.Sequential(
            nn.BatchNorm1d(width),
            nn.ReLU(),
            nn.Dropout(drop_p),
            nn.Linear(width, width),
            nn.BatchNorm1d(width),
            nn.ReLU(),
            nn.Dropout(drop_p),
            nn.Linear(width, width),
        )

    def __init__(self, input_size, output_size):
        super().__init__()
        width = 384

        self.input_proj = nn.Linear(input_size, width)
        self.bn_input = nn.BatchNorm1d(width)

        # Pre-activation residual blocks, created in order block1..block3.
        self.block1 = self._pre_activation_branch(width, 0.3)
        self.block2 = self._pre_activation_branch(width, 0.3)
        self.block3 = self._pre_activation_branch(width, 0.3)

        head_layers = [
            nn.BatchNorm1d(width),
            nn.ReLU(),
            nn.Linear(width, 256),
            nn.ReLU(),
            nn.Linear(256, output_size),
        ]
        self.output_proj = nn.Sequential(*head_layers)

    def forward(self, x):
        """Return predictions for ``x`` of shape ``[batch, input_size]``."""
        hidden = self.bn_input(self.input_proj(x))
        for branch in (self.block1, self.block2, self.block3):
            hidden = hidden + branch(hidden)
        return self.output_proj(hidden)

class Model4_DeepWide(nn.Module):
    """Deep & Wide network with two parallel paths over the same input.

    * deep path: 512 -> 384 -> 256 (``Linear -> BatchNorm1d -> ReLU ->
      Dropout``) followed by ``Linear(256, 128) -> ReLU``;
    * wide path: ``Linear(input, 512) -> ReLU -> Dropout -> Linear(512, 128)
      -> ReLU``.

    The two 128-wide outputs are concatenated and decoded by ``combine``.

    Args:
        input_size: number of input features.
        output_size: number of outputs.
    """
    def __init__(self, input_size, output_size):
        super().__init__()

        # Deep path (narrow and deep).
        deep_layers = []
        fan_in = input_size
        for fan_out, drop_p in ((512, 0.3), (384, 0.25), (256, 0.2)):
            deep_layers += [
                nn.Linear(fan_in, fan_out),
                nn.BatchNorm1d(fan_out),
                nn.ReLU(),
                nn.Dropout(drop_p),
            ]
            fan_in = fan_out
        deep_layers += [nn.Linear(fan_in, 128), nn.ReLU()]
        self.deep = nn.Sequential(*deep_layers)

        # Wide path (wide and shallow).
        wide_layers = [
            nn.Linear(input_size, 512),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(512, 128),
            nn.ReLU(),
        ]
        self.wide = nn.Sequential(*wide_layers)

        # Fusion head; its input is the 128 + 128 = 256 concatenated features.
        fusion_layers = [
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.15),
            nn.Linear(128, output_size),
        ]
        self.combine = nn.Sequential(*fusion_layers)

    def forward(self, x):
        """Return predictions for ``x`` of shape ``[batch, input_size]``."""
        both_paths = torch.cat((self.deep(x), self.wide(x)), dim=1)
        return self.combine(both_paths)

class Model5_Bottleneck(nn.Module):
    """Encoder / decoder network with feature gating at the bottleneck.

    The encoder compresses the input to 128 features (512 -> 256 -> 128).  The
    ``attention`` sub-network maps that code to a softmax distribution over
    its 128 features, which multiplies the code element-wise before the
    decoder (256 -> 384 -> ``output_size``) expands it again.

    Args:
        input_size: number of input features.
        output_size: number of outputs.
    """
    def __init__(self, input_size, output_size):
        super().__init__()

        # Encoder
        encoder_layers = [
            nn.Linear(input_size, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(0.25),
        ]
        encoder_layers += [
            nn.Linear(512, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.2),
        ]
        encoder_layers += [
            nn.Linear(256, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
        ]
        self.encoder = nn.Sequential(*encoder_layers)

        # Gating ("self-attention") over the bottleneck features.
        gating_layers = [
            nn.Linear(128, 64),
            nn.Tanh(),
            nn.Linear(64, 128),
            nn.Softmax(dim=1),
        ]
        self.attention = nn.Sequential(*gating_layers)

        # Decoder
        decoder_layers = []
        fan_in = 128
        for fan_out, drop_p in ((256, 0.2), (384, 0.15)):
            decoder_layers += [
                nn.Linear(fan_in, fan_out),
                nn.BatchNorm1d(fan_out),
                nn.ReLU(),
                nn.Dropout(drop_p),
            ]
            fan_in = fan_out
        decoder_layers.append(nn.Linear(fan_in, output_size))
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Encode, re-weight the code with its own softmax gate, then decode."""
        code = self.encoder(x)
        gate = self.attention(code)
        return self.decoder(code * gate)

class Model6_Transformer(nn.Module):
    """Single-block, post-norm transformer applied to one token per sample.

    The input is projected to 512 features and treated as a length-1
    sequence.  One multi-head self-attention sub-layer and one feed-forward
    sub-layer follow, each wrapped as ``LayerNorm(x + sublayer(x))``; a
    two-layer head produces the outputs.

    Args:
        input_size: number of input features.
        output_size: number of outputs.
        num_heads: number of attention heads (must divide 512).
    """
    def __init__(self, input_size, output_size, num_heads=8):
        super().__init__()

        embed_dim = 512
        self.input_proj = nn.Linear(input_size, embed_dim)

        # Multi-head attention
        self.attention = nn.MultiheadAttention(
            embed_dim=embed_dim,
            num_heads=num_heads,
            dropout=0.2,
            batch_first=True,
        )

        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)

        # Feed-forward
        ffn_layers = [
            nn.Linear(embed_dim, 1024),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(1024, embed_dim),
        ]
        self.ffn = nn.Sequential(*ffn_layers)

        # Output
        head_layers = [
            nn.Linear(embed_dim, 256),
            nn.ReLU(),
            nn.Dropout(0.15),
            nn.Linear(256, output_size),
        ]
        self.output = nn.Sequential(*head_layers)

    def forward(self, x):
        """Map ``[batch, input_size]`` to ``[batch, output_size]``."""
        # Project, then add a sequence dimension for attention: [batch, 1, features].
        tokens = self.input_proj(x).unsqueeze(1)

        # Self-attention sub-layer with residual connection.
        attended = self.attention(tokens, tokens, tokens)[0]
        tokens = self.norm1(tokens + attended)

        # Feed-forward sub-layer with residual connection.
        tokens = self.norm2(tokens + self.ffn(tokens))

        # Remove the sequence dimension again: [batch, features].
        return self.output(tokens.squeeze(1))

class Model7_EnsembleBlock(nn.Module):
    """Model with an internal, gated ensemble of three parallel paths.

    Three two-layer paths of different hidden width (384, 512, 640) each
    produce a 256-wide representation.  A gating network yields three softmax
    weights per sample; the weighted sum of the paths feeds the output head.

    Args:
        input_size: number of input features.
        output_size: number of outputs.
    """

    @staticmethod
    def _make_path(input_size, hidden, drop_p):
        """Build one ``Linear -> ReLU -> Dropout -> Linear(…, 256) -> ReLU`` path."""
        return nn.Sequential(
            nn.Linear(input_size, hidden),
            nn.ReLU(),
            nn.Dropout(drop_p),
            nn.Linear(hidden, 256),
            nn.ReLU(),
        )

    def __init__(self, input_size, output_size):
        super().__init__()

        # Three parallel paths, created in order path1..path3.
        self.path1 = self._make_path(input_size, 384, 0.25)
        self.path2 = self._make_path(input_size, 512, 0.3)
        self.path3 = self._make_path(input_size, 640, 0.35)

        # Gating mechanism that weights the three paths.
        gate_layers = [
            nn.Linear(input_size, 128),
            nn.ReLU(),
            nn.Linear(128, 3),
            nn.Softmax(dim=1),
        ]
        self.gate = nn.Sequential(*gate_layers)

        # Final layers
        head_layers = [
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.15),
            nn.Linear(128, output_size),
        ]
        self.output = nn.Sequential(*head_layers)

    def forward(self, x):
        """Blend the three path outputs with per-sample gate weights."""
        path_outputs = [path(x) for path in (self.path1, self.path2, self.path3)]

        weights = self.gate(x)  # [batch, 3]

        # Left-to-right weighted sum; each [batch, 1] weight broadcasts over 256 features.
        mixed = weights[:, 0:1] * path_outputs[0]
        for k in (1, 2):
            mixed = mixed + weights[:, k:k + 1] * path_outputs[k]

        return self.output(mixed)

# Additional improved models
class ImprovedModel(nn.Module):
    """Configurable MLP with normalisation, dropout and a linear skip path.

    Each hidden stage is ``Linear -> LayerNorm/BatchNorm1d -> activation ->
    Dropout``.  When ``input_size != output_size`` a linear projection of the
    raw input, scaled by 0.1, is added to the output.

    Args:
        input_size: number of input features.
        output_size: number of outputs.
        hidden_sizes: width of every hidden stage.
        dropout_rates: dropout probability of every hidden stage; must have
            the same length as ``hidden_sizes``.
        use_layer_norm: ``True`` for ``LayerNorm``, ``False`` for ``BatchNorm1d``.
        activation: ``'relu'`` selects ``ReLU``; any other value selects
            ``LeakyReLU(0.1)``.

    Raises:
        ValueError: if ``hidden_sizes`` and ``dropout_rates`` differ in length
            (previously the longer one was silently truncated by ``zip``).
    """
    def __init__(self, input_size, output_size, hidden_sizes=(512, 256, 128),
                 dropout_rates=(0.3, 0.2, 0.1), use_layer_norm=True, activation='relu'):
        super().__init__()

        # Materialise once so generators and tuples are handled uniformly.
        hidden_sizes = list(hidden_sizes)
        dropout_rates = list(dropout_rates)
        if len(hidden_sizes) != len(dropout_rates):
            raise ValueError(
                f"hidden_sizes has {len(hidden_sizes)} entries but dropout_rates "
                f"has {len(dropout_rates)}; they must match"
            )

        layers = []
        prev_size = input_size

        for hidden_size, dropout_rate in zip(hidden_sizes, dropout_rates):
            layers.extend([
                nn.Linear(prev_size, hidden_size),
                nn.LayerNorm(hidden_size) if use_layer_norm else nn.BatchNorm1d(hidden_size),
                nn.ReLU() if activation == 'relu' else nn.LeakyReLU(0.1),
                nn.Dropout(dropout_rate)
            ])
            prev_size = hidden_size

        self.features = nn.Sequential(*layers)
        self.output = nn.Linear(prev_size, output_size)

        # Skip connection from the input straight to the output; omitted when
        # both sizes are equal (in that case no skip term is added at all).
        self.skip = nn.Linear(input_size, output_size) if input_size != output_size else None

    def forward(self, x):
        """Return predictions for ``x``, whose last dimension is ``input_size``."""
        features = self.features(x)
        output = self.output(features)

        if self.skip is not None:
            output = output + 0.1 * self.skip(x)  # small weight for the skip path

        return output

class AdvancedEnsembleModel(nn.Module):
    """Mixture-of-experts regressor with a softmax gate.

    ``num_experts`` ``ImprovedModel`` instances each map the input to a
    256-wide representation.  A small gating network produces one softmax
    weight per expert and sample; the weighted sum of the expert outputs is
    passed through a two-layer output head.

    Args:
        input_size: number of input features.
        output_size: number of outputs.
        num_experts: how many expert sub-networks to build.
    """
    def __init__(self, input_size, output_size, num_experts=3):
        super().__init__()

        self.num_experts = num_experts

        expert_modules = []
        for _ in range(num_experts):
            expert_modules.append(ImprovedModel(input_size, 256, [512, 384], [0.3, 0.25]))
        self.experts = nn.ModuleList(expert_modules)

        gate_layers = [
            nn.Linear(input_size, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, num_experts),
            nn.Softmax(dim=1),
        ]
        self.gate = nn.Sequential(*gate_layers)

        head_layers = [
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(128, output_size),
        ]
        self.output = nn.Sequential(*head_layers)

    def forward(self, x):
        """Blend the expert outputs with the gate weights and decode them."""
        weights = self.gate(x)  # [batch_size, num_experts]
        per_expert = [expert(x) for expert in self.experts]  # each [batch_size, 256]

        # Accumulate in expert order; each [batch_size, 1] weight column
        # broadcasts across the 256 features of its expert.
        mixed = torch.zeros_like(per_expert[0])
        for idx in range(self.num_experts):
            mixed = mixed + weights[:, idx:idx + 1] * per_expert[idx]

        return self.output(mixed)

In [None]:
def _enhanced_indicators_for_column(series, col, out):
    """Compute every technical indicator for one base column into ``out``.

    ``out`` is filled key by key in a fixed order, so if a computation raises
    midway the indicators produced so far are kept by the caller.
    """
    if not isinstance(series, pd.Series):
        # Duplicate column labels make ``df[col]`` a DataFrame; treat as a failure.
        raise TypeError(f"column {col!r} does not resolve to a single Series")

    # ===== BASIC RETURNS =====
    for horizon in (1, 5, 20, 60):
        out[f'{col}_return_{horizon}d'] = series.pct_change(horizon)

    # ===== SIMPLE MOVING AVERAGES (SMA) =====
    sma = {w: series.rolling(w, min_periods=1).mean() for w in (5, 10, 20, 60)}
    for w, values in sma.items():
        out[f'{col}_ma_{w}'] = values

    # ===== EXPONENTIAL MOVING AVERAGES (EMA) =====
    ema = {span: series.ewm(span=span, adjust=False).mean() for span in (5, 10, 20)}
    for span, values in ema.items():
        out[f'{col}_ema_{span}'] = values

    # ===== MA CROSSOVERS =====
    out[f'{col}_ma_5_20_diff'] = sma[5] - sma[20]
    out[f'{col}_ma_10_60_diff'] = sma[10] - sma[60]
    out[f'{col}_ema_5_20_diff'] = ema[5] - ema[20]

    # ===== PRICE TO MA DISTANCE =====
    out[f'{col}_to_ma_5'] = (series - sma[5]) / sma[5]
    out[f'{col}_to_ma_20'] = (series - sma[20]) / sma[20]
    out[f'{col}_to_ema_10'] = (series - ema[10]) / ema[10]

    # ===== VOLATILITY =====
    std = {w: series.rolling(w, min_periods=1).std() for w in (5, 20, 60)}
    for w, values in std.items():
        out[f'{col}_std_{w}'] = values

    # ===== BOLLINGER BANDS =====
    bb_upper = sma[20] + 2 * std[20]
    bb_lower = sma[20] - 2 * std[20]
    out[f'{col}_bb_upper'] = bb_upper
    out[f'{col}_bb_lower'] = bb_lower
    out[f'{col}_bb_width'] = (bb_upper - bb_lower) / sma[20]
    out[f'{col}_bb_position'] = (series - bb_lower) / (bb_upper - bb_lower)

    # ===== RSI (Relative Strength Index) =====
    delta = series.diff()
    gain = delta.where(delta > 0, 0).rolling(window=14, min_periods=1).mean()
    loss = -delta.where(delta < 0, 0).rolling(window=14, min_periods=1).mean()
    rs = gain / loss
    out[f'{col}_rsi_14'] = 100 - (100 / (1 + rs))

    # ===== MACD =====
    macd = series.ewm(span=12, adjust=False).mean() - series.ewm(span=26, adjust=False).mean()
    macd_signal = macd.ewm(span=9, adjust=False).mean()
    out[f'{col}_macd'] = macd
    out[f'{col}_macd_signal'] = macd_signal
    out[f'{col}_macd_diff'] = macd - macd_signal

    # Shifted copies shared by the momentum, ROC and lag features below.
    lagged = {n: series.shift(n) for n in (1, 2, 3, 5, 10, 20)}

    # ===== MOMENTUM =====
    for n in (5, 10, 20):
        out[f'{col}_momentum_{n}'] = series - lagged[n]

    # ===== RATE OF CHANGE (ROC) =====
    for n in (5, 10, 20):
        out[f'{col}_roc_{n}'] = ((series - lagged[n]) / lagged[n]) * 100

    # ===== LAG FEATURES =====
    for n in (1, 2, 3, 5):
        out[f'{col}_lag_{n}'] = lagged[n]

    # ===== MIN/MAX OVER WINDOWS =====
    max_20 = series.rolling(20, min_periods=1).max()
    min_20 = series.rolling(20, min_periods=1).min()
    out[f'{col}_max_5'] = series.rolling(5, min_periods=1).max()
    out[f'{col}_min_5'] = series.rolling(5, min_periods=1).min()
    out[f'{col}_max_20'] = max_20
    out[f'{col}_min_20'] = min_20

    # Distance to recent high/low
    out[f'{col}_dist_to_max_20'] = (series - max_20) / max_20
    out[f'{col}_dist_to_min_20'] = (series - min_20) / min_20

    # ===== ACCELERATION (second derivative) =====
    out[f'{col}_acceleration'] = delta.diff()

    # ===== Z-SCORE (standardized price) =====
    # Same 20-row rolling mean/std as the SMA / volatility features above.
    out[f'{col}_zscore'] = (series - sma[20]) / std[20]


def create_enhanced_features(df, base_cols_ref=None):
    """Create an extended set of technical indicators for every base column.

    For each base column 46 derived columns are produced (returns, SMA/EMA,
    crossovers, volatility, Bollinger bands, RSI, MACD, momentum, ROC, lags,
    rolling min/max, acceleration and z-score), named ``<col>_<indicator>``.
    NaN / inf values arising from short history or division by zero are left
    in the result.

    Args:
        df: input frame; it is not modified.
        base_cols_ref: columns to derive features from (those missing from
            ``df`` are ignored).  When ``None``, every numeric column except
            ``date_id`` and ``is_scored`` is used.

    Returns:
        A new DataFrame holding the original columns followed by the derived
        ones.  A column whose computation raises keeps whatever indicators
        were produced before the failure and is reported on stdout.
    """
    features = df.copy()

    if base_cols_ref is not None:
        numeric_cols = [c for c in base_cols_ref if c in df.columns]
    else:
        numeric_cols = [c for c in df.columns
                        if c not in ['date_id', 'is_scored']
                        and pd.api.types.is_numeric_dtype(df[c])]

    derived = {}
    skipped = []
    for col in numeric_cols:
        # Only ordinary errors are treated as "skip this column"; a bare
        # ``except`` would also swallow KeyboardInterrupt / SystemExit.
        try:
            with np.errstate(divide='ignore', invalid='ignore'):
                _enhanced_indicators_for_column(df[col], col, derived)
        except Exception:
            skipped.append(col)

    if skipped:
        print(f"create_enhanced_features: {len(skipped)} column(s) raised and were "
              f"only partially processed: {skipped[:10]}")

    # Attach everything in one concat instead of thousands of single-column
    # inserts (which fragment the frame).  A derived name that collides with
    # an existing column overwrites it in place, as plain assignment would.
    fresh = []
    for name, values in derived.items():
        if name in features.columns:
            features[name] = values
        else:
            fresh.append(values.rename(name))
    if fresh:
        features = pd.concat([features] + fresh, axis=1)

    return features


def add_spread_features(train, features, target_pairs):
    """Add spread and ratio features for pairs of instruments.

    Every row of ``target_pairs`` whose ``pair`` value has the form
    ``"<col_a> - <col_b>"`` (both columns present in ``train``) contributes
    nine columns named ``spread_<row index>_<feature>``: rolling mean and std
    of the spread (5 and 20 rows), 5-row momentum and return, 20-row z-score,
    the ratio ``col_a / col_b`` and its 5-row rolling mean.

    Args:
        train: frame holding the raw instrument columns.
        features: frame that receives the new columns; modified in place.
        target_pairs: frame with a ``pair`` column.

    Returns:
        The same ``features`` object, after printing a short summary.
        Pairs that cannot be parsed or computed are skipped silently.
    """
    print("–î–æ–±–∞–≤–ª–µ–Ω–∏–µ spread features...")

    spread_count = 0
    for idx, row in target_pairs.iterrows():
        pair = row['pair']

        # Only "A - B" style entries describe a pair (e.g.
        # "LME_CA_Close - US_Stock_CCJ_adj_close").
        if ' - ' not in str(pair):
            continue

        try:
            col_a, col_b = (part.strip() for part in pair.split(' - '))

            if col_a not in train.columns or col_b not in train.columns:
                continue

            spread = train[col_a] - train[col_b]
            prefix = f'spread_{idx}'

            with np.errstate(divide='ignore', invalid='ignore'):
                # Rolling mean of the spread
                for window in (5, 20):
                    features[f'{prefix}_ma_{window}'] = spread.rolling(window, min_periods=1).mean()

                # Rolling volatility of the spread
                for window in (5, 20):
                    features[f'{prefix}_std_{window}'] = spread.rolling(window, min_periods=1).std()

                # Momentum of the spread
                features[f'{prefix}_momentum_5'] = spread - spread.shift(5)
                features[f'{prefix}_return_5d'] = spread.pct_change(5)

                # Z-score of the spread over a 20-row window
                window_20 = spread.rolling(20, min_periods=1)
                features[f'{prefix}_zscore'] = (spread - window_20.mean()) / window_20.std()

                # Ratio features; the epsilon avoids division by zero
                ratio = train[col_a] / (train[col_b] + 1e-10)
                features[f'{prefix}_ratio'] = ratio
                features[f'{prefix}_ratio_ma_5'] = ratio.rolling(5, min_periods=1).mean()

                spread_count += 1
        except Exception:
            # Deliberately skip problematic pairs without reporting them.
            continue

    print(f"Spread features –¥–æ–±–∞–≤–ª–µ–Ω—ã –¥–ª—è {spread_count} –ø–∞—Ä. –í—Å–µ–≥–æ —Ñ–∏—á–µ–π: {len(features.columns)}")
    return features

In [None]:
def create_smart_features(df, base_cols_ref=None, max_cols=200):
    """Create a compact, filtered set of rolling features.

    Base columns are kept only if they have more than 10 rows, a standard
    deviation above 1e-6 and more than 5 distinct values; the first
    ``max_cols`` survivors each yield 14 derived columns (1- and 5-row
    returns, rolling mean/std over 5, 10 and 20 rows, 5-row momentum, 20-row
    coefficient of variation, 20-row z-score and lags 1-3).  All NaN values in
    the result, including those in the original columns, are replaced by 0;
    infinite values are left as they are.

    Note that the filter is evaluated on the frame passed in, so a frame with
    10 rows or fewer produces no derived columns at all.

    Args:
        df: input frame; it is not modified.
        base_cols_ref: candidate base columns (those missing from ``df`` are
            ignored).  When ``None``, every numeric column except ``date_id``
            and ``is_scored`` is a candidate.
        max_cols: upper bound on the number of base columns that get derived
            features; ``None`` removes the bound.  Defaults to 200.

    Returns:
        A new DataFrame with the original columns followed by the derived ones.

    Raises:
        ValueError: if ``max_cols`` is negative.
    """
    if max_cols is not None and max_cols < 0:
        raise ValueError("max_cols must be non-negative or None")

    features = df.copy()

    if base_cols_ref is not None:
        numeric_cols = [c for c in base_cols_ref if c in df.columns]
    else:
        numeric_cols = [c for c in df.columns
                        if c not in ['date_id', 'is_scored']
                        and pd.api.types.is_numeric_dtype(df[c])]

    # Keep only columns with enough rows and enough variability.
    useful_cols = []
    for col in numeric_cols:
        if len(df[col]) > 10:  # enough data
            std_val = df[col].std()
            unique_vals = df[col].nunique()
            if std_val > 1e-6 and unique_vals > 5 and not np.isnan(std_val):
                useful_cols.append(col)

    print(f"–ò—Å–ø–æ–ª—å–∑—É–µ—Ç—Å—è {len(useful_cols)} –∏–∑ {len(numeric_cols)} –∫–æ–ª–æ–Ω–æ–∫")

    # Cap the number of base columns for performance (applied after the
    # count above is printed, exactly as before).
    if max_cols is not None:
        useful_cols = useful_cols[:max_cols]

    derived = {}
    for col in useful_cols:
        try:
            with np.errstate(divide='ignore', invalid='ignore'):
                series = df[col]

                # Basic transformations
                derived[f'{col}_return_1d'] = series.pct_change(1)
                derived[f'{col}_return_5d'] = series.pct_change(5)

                # Rolling statistics
                rolling_mean = {}
                rolling_std = {}
                for window in [5, 10, 20]:
                    rolling_mean[window] = series.rolling(window, min_periods=1).mean()
                    rolling_std[window] = series.rolling(window, min_periods=1).std()
                    derived[f'{col}_ma_{window}'] = rolling_mean[window]
                    derived[f'{col}_std_{window}'] = rolling_std[window]

                # Key combinations (volatility uses full 20-row windows only)
                derived[f'{col}_momentum_5'] = series / series.shift(5) - 1
                derived[f'{col}_volatility_20'] = series.rolling(20).std() / series.rolling(20).mean()

                # Z-score against the 20-row rolling mean/std computed above
                derived[f'{col}_zscore'] = (series - rolling_mean[20]) / rolling_std[20]

                # Lag features
                for lag in [1, 2, 3]:
                    derived[f'{col}_lag_{lag}'] = series.shift(lag)

        except Exception:
            # Best effort: keep what was computed for this column and move on.
            continue

    # Attach everything in one concat instead of one insert per feature
    # (which fragments the frame).  A derived name that collides with an
    # existing column overwrites it in place, as plain assignment would.
    fresh = []
    for name, values in derived.items():
        if name in features.columns:
            features[name] = values
        else:
            fresh.append(values.rename(name))
    if fresh:
        features = pd.concat([features] + fresh, axis=1)

    # Fill NaN
    features = features.fillna(0)

    return features

def prepare_features(df):
    """Build the dense feature matrix for ``df``, aligned to the module-level ``feature_cols``.

    Features are produced by ``create_smart_features``. The base-column list
    handed to it is the global ``base_cols`` when set; otherwise every numeric
    column of ``df`` except ``date_id`` and ``is_scored`` is used. If the global
    ``feature_cols`` is still unset, it is initialised here from the numeric
    columns of the generated feature frame (excluding ``date_id``).

    Args:
        df: pandas DataFrame of raw inputs.

    Returns:
        A float numpy array of shape ``(len(df), len(feature_cols))``. Columns
        listed in ``feature_cols`` but missing from the generated features stay
        zero; NaN and +/-inf values are replaced by 0.0.
    """
    global base_cols, feature_cols

    # Choose the reference column list handed to the feature builder.
    if base_cols is not None:
        reference_cols = base_cols
    else:
        excluded = ['date_id', 'is_scored']
        reference_cols = [
            name for name in df.columns
            if name not in excluded and pd.api.types.is_numeric_dtype(df[name])
        ]
    test_features = create_smart_features(df, base_cols_ref=reference_cols)

    # First call without a trained pipeline: derive the feature list from this frame.
    if feature_cols is None:
        feature_cols = [
            name for name in test_features.columns
            if name != 'date_id' and pd.api.types.is_numeric_dtype(test_features[name])
        ]

    # Zero-initialised so that features absent from this frame default to 0.
    X_test = np.zeros((len(df), len(feature_cols)))
    for position, name in enumerate(feature_cols):
        if name in test_features.columns:
            X_test[:, position] = test_features[name].fillna(0).values

    return np.nan_to_num(X_test, nan=0.0, posinf=0.0, neginf=0.0)

In [None]:
class SpearmanLoss(nn.Module):
    """Negative mean Spearman-style rank correlation, averaged over target columns.

    Each column of ``pred`` and ``target`` is converted to ordinal ranks
    (argsort of argsort along the batch axis), the ranks are standardised, and
    the mean product of the standardised ranks is used as the correlation.
    The loss is the negated average of those per-column correlations.

    NOTE: the ranks come from ``argsort``, which yields integer indices, so no
    gradient flows from this loss back to ``pred``. It can be combined with a
    differentiable term or used as a metric, but calling ``backward()`` on it
    alone will not update a model.
    """
    def __init__(self):
        super().__init__()

    @staticmethod
    def _standardized_ranks(values):
        """Return per-column ordinal ranks of ``values``, centred and scaled by their std."""
        ranks = values.argsort(dim=0).argsort(dim=0).float()
        centred = ranks - ranks.mean(dim=0, keepdim=True)
        # Small epsilon guards against division by zero for degenerate columns.
        return centred / (ranks.std(dim=0, keepdim=True) + 1e-6)

    def forward(self, pred, target):
        """Compute the loss.

        Args:
            pred: tensor of shape (batch, num_targets), or (batch,) for a single target.
            target: tensor with the same shape as ``pred``.

        Returns:
            A scalar tensor: minus the mean rank correlation across targets.
        """
        # Accept a single target passed as a 1-D tensor.
        if pred.dim() == 1:
            pred = pred.unsqueeze(1)
        if target.dim() == 1:
            target = target.unsqueeze(1)

        # All columns are ranked at once instead of looping over targets in Python.
        pred_rank = self._standardized_ranks(pred)
        target_rank = self._standardized_ranks(target)

        # Mean over the batch axis gives one correlation per target column.
        per_target_correlation = (pred_rank * target_rank).mean(dim=0)

        # Minimising the negative correlation maximises the correlation.
        return -per_target_correlation.mean()


class CombinedLoss(nn.Module):
    """Weighted sum of a Huber term, a Spearman rank term and a sign-agreement term.

    The regression term is ``nn.HuberLoss`` (stored under the attribute name
    ``mse``), the ranking term is ``SpearmanLoss``, and the direction term is
    the fraction of elements whose predicted sign differs from the target sign.
    """
    def __init__(self, mse_weight=0.3, spearman_weight=0.5, direction_weight=0.2):
        super().__init__()
        # Component criteria.
        self.mse = nn.HuberLoss()
        self.spearman = SpearmanLoss()

        # Mixing coefficients for the three components.
        self.mse_weight = mse_weight
        self.spearman_weight = spearman_weight
        self.direction_weight = direction_weight

    def forward(self, pred, target):
        """Return the weighted combination of the three component losses."""
        # Magnitude component (Huber).
        magnitude_term = self.mse(pred, target)

        # Ranking component.
        ranking_term = self.spearman(pred, target)

        # Direction component: share of elements with mismatching sign.
        sign_agreement = torch.eq(torch.sign(pred), torch.sign(target)).float()
        direction_term = 1 - sign_agreement.mean()

        return (self.mse_weight * magnitude_term +
                self.spearman_weight * ranking_term +
                self.direction_weight * direction_term)

In [None]:
def objective(trial, X_train_t, y_train_t, X_val_t, y_val_t, input_size, output_size):
    """Optuna objective: briefly train one sampled configuration and score it.

    Samples an architecture ('improved' or 'transformer'), learning rate,
    weight decay, dropout rate and optimizer from ``trial``, trains the model
    full-batch for at most 80 epochs with early stopping, and reports the best
    validation loss observed.

    Args:
        trial: the Optuna trial used to sample hyperparameters.
        X_train_t, y_train_t: training inputs and targets (tensors on ``device``).
        X_val_t, y_val_t: validation inputs and targets (tensors on ``device``).
        input_size: number of input features.
        output_size: number of regression targets.

    Returns:
        The lowest validation loss seen during training as a float, or
        ``float('inf')`` if the trial raised an exception (the error is printed
        and the trial is treated as a very bad result rather than aborting the
        study).
    """
    model = None
    try:
        # Hyperparameters shared by both architectures.
        model_type = trial.suggest_categorical('model_type', ['improved', 'transformer'])
        lr = trial.suggest_float('lr', 1e-5, 1e-2, log=True)
        weight_decay = trial.suggest_float('weight_decay', 1e-6, 1e-3, log=True)
        dropout_rate = trial.suggest_float('dropout_rate', 0.1, 0.5)

        if model_type == 'improved':
            hidden_size1 = trial.suggest_int('hidden_size1', 256, 1024)
            hidden_size2 = trial.suggest_int('hidden_size2', 128, 512)
            model = ImprovedModel(
                input_size, output_size,
                hidden_sizes=[hidden_size1, hidden_size2],
                dropout_rates=[dropout_rate, dropout_rate * 0.8]
            ).to(device)
        else:  # transformer
            # Head counts are limited to divisors of 512, the embedding width the
            # author fixed for the transformer (confirm against TransformerModel).
            possible_heads = [2, 4, 8, 16]
            num_heads = trial.suggest_categorical('num_heads', possible_heads)
            num_layers = trial.suggest_int('num_layers', 1, 4)
            model = TransformerModel(input_size, output_size, num_heads=num_heads, num_layers=num_layers).to(device)

        # Optimizer and learning-rate scheduler.
        optimizer_name = trial.suggest_categorical('optimizer', ['Adam', 'AdamW', 'RMSprop'])
        if optimizer_name == 'Adam':
            optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
        elif optimizer_name == 'AdamW':
            optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
        else:
            optimizer = optim.RMSprop(model.parameters(), lr=lr, weight_decay=weight_decay)

        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=5, factor=0.5)

        # The weighted criterion is CombinedLoss; SpearmanLoss takes no
        # constructor arguments, so passing these weights to it raised
        # TypeError and made every trial fail.
        criterion = CombinedLoss(
            mse_weight=0.3,       # Huber term for magnitude
            spearman_weight=0.5,  # rank-correlation term
            direction_weight=0.2  # sign-agreement term
        )

        # Short full-batch training run with early stopping.
        best_val_loss = float('inf')
        patience = 10
        epochs_no_improve = 0

        for epoch in range(80):
            model.train()
            optimizer.zero_grad()
            train_outputs = model(X_train_t)
            train_loss = criterion(train_outputs, y_train_t)
            train_loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

            # Validation pass.
            model.eval()
            with torch.no_grad():
                val_loss = criterion(model(X_val_t), y_val_t).item()

            scheduler.step(val_loss)

            if val_loss < best_val_loss:
                best_val_loss = val_loss
                epochs_no_improve = 0
            else:
                epochs_no_improve += 1
                if epochs_no_improve >= patience:
                    break

        return best_val_loss

    except Exception as e:
        # Report the failure and let the study continue with the next trial.
        print(f"Trial {trial.number} failed: {e}")
        return float('inf')

    finally:
        # Release the model and cached GPU memory on success and on failure alike.
        model = None
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        gc.collect()

def optimize_hyperparameters(X_train_t, y_train_t, X_val_t, y_val_t, input_size, output_size, n_trials=20):
    """Run an Optuna study over ``objective`` and return the best parameters.

    A minimising study with a TPE sampler seeded at 42 is created, ``n_trials``
    trials are executed with a progress bar, and the best trial's parameters
    and loss are printed.

    Returns:
        ``study.best_params``: dict of the best hyperparameters found.
    """
    tpe_sampler = optuna.samplers.TPESampler(seed=42)
    study = optuna.create_study(direction='minimize', sampler=tpe_sampler)

    print(f"–ó–∞–ø—É—Å–∫ –æ–ø—Ç–∏–º–∏–∑–∞—Ü–∏–∏ –≥–∏–ø–µ—Ä–ø–∞—Ä–∞–º–µ—Ç—Ä–æ–≤ ({n_trials} trials)...")

    def run_trial(trial):
        # Bind the fixed data/size arguments so Optuna only supplies the trial.
        return objective(trial, X_train_t, y_train_t, X_val_t, y_val_t, input_size, output_size)

    study.optimize(run_trial, n_trials=n_trials, show_progress_bar=True)

    # Report the winning configuration.
    print("\n–õ—É—á—à–∏–µ –≥–∏–ø–µ—Ä–ø–∞—Ä–∞–º–µ—Ç—Ä—ã:")
    for param_name, param_value in study.best_trial.params.items():
        print(f"  {param_name}: {param_value}")
    print(f"–õ—É—á—à–µ–µ –∑–Ω–∞—á–µ–Ω–∏–µ loss: {study.best_value:.6f}")

    return study.best_params

def create_model_from_params(params, input_size, output_size):
    """Instantiate the model described by an optimised parameter dict and move it to ``device``.

    ``params['model_type'] == 'improved'`` builds an ``ImprovedModel`` from
    ``hidden_size1``, ``hidden_size2`` and ``dropout_rate``; any other value
    builds a ``TransformerModel`` from ``num_heads`` and ``num_layers``.
    """
    if params['model_type'] != 'improved':
        # Every non-'improved' type is handled as the transformer variant.
        transformer = TransformerModel(
            input_size, output_size,
            num_heads=params['num_heads'],
            num_layers=params['num_layers']
        )
        return transformer.to(device)

    layer_widths = [params['hidden_size1'], params['hidden_size2']]
    # The second layer uses 80% of the first layer's dropout rate.
    layer_dropouts = [params['dropout_rate'], params['dropout_rate'] * 0.8]
    improved = ImprovedModel(
        input_size, output_size,
        hidden_sizes=layer_widths,
        dropout_rates=layer_dropouts
    )
    return improved.to(device)

In [None]:
def improved_training_loop(model, X_train_t, y_train_t, X_val_t, y_val_t, params, epochs=300):
    """Train ``model`` full-batch with Huber loss, early stopping and best-weight restore.

    Args:
        model: the ``nn.Module`` to train (modified in place).
        X_train_t, y_train_t: training inputs and targets as tensors.
        X_val_t, y_val_t: validation inputs and targets as tensors.
        params: dict of hyperparameters; reads ``'optimizer'`` ('Adam', 'AdamW',
            anything else selects RMSprop; default 'AdamW'), ``'lr'`` (default
            0.001) and ``'weight_decay'`` (default 1e-5).
        epochs: maximum number of epochs.

    Returns:
        ``(model, best_val_loss, best_r2)`` where the model carries the weights
        of the epoch with the lowest validation loss and is left in eval mode.
    """
    criterion = nn.HuberLoss(delta=1.0)

    # Optimizer selection.
    optimizer_name = params.get('optimizer', 'AdamW')
    lr = params.get('lr', 0.001)
    weight_decay = params.get('weight_decay', 1e-5)

    if optimizer_name == 'Adam':
        optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    elif optimizer_name == 'AdamW':
        optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    else:
        optimizer = optim.RMSprop(model.parameters(), lr=lr, weight_decay=weight_decay)

    # The `verbose` argument is intentionally not passed, for compatibility
    # across PyTorch versions.
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=8, factor=0.5)

    # Early-stopping state.
    patience = 50
    best_val_loss = float('inf')
    best_r2 = -float('inf')
    best_model_state = None
    epochs_no_improve = 0

    model.train()

    for epoch in range(epochs):
        if epochs_no_improve >= patience:
            print(f"Early stopping –Ω–∞ —ç–ø–æ—Ö–µ {epoch+1}")
            break

        # Training step (full batch).
        optimizer.zero_grad()
        train_outputs = model(X_train_t)
        train_loss = criterion(train_outputs, y_train_t)
        train_loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        # Validation step and metrics.
        model.eval()
        with torch.no_grad():
            val_outputs = model(X_val_t)
            val_loss = criterion(val_outputs, y_val_t)

            val_preds = val_outputs.cpu().numpy()
            val_true = y_val_t.cpu().numpy()

            mae = mean_absolute_error(val_true.flatten(), val_preds.flatten())
            direction_correct = np.mean(np.sign(val_preds) == np.sign(val_true))
            r2 = r2_score(val_true.flatten(), val_preds.flatten())

            train_preds = train_outputs.detach().cpu().numpy()
            train_true = y_train_t.cpu().numpy()
            train_r2 = r2_score(train_true.flatten(), train_preds.flatten())

        # Track the best epoch.
        if val_loss < best_val_loss:
            best_val_loss = val_loss.item()
            best_r2 = r2
            # Clone every tensor: state_dict() values share storage with the live
            # parameters, so a shallow dict copy would keep changing as training
            # continues and the "best" weights would never actually be restored.
            best_model_state = {
                key: value.detach().clone()
                for key, value in model.state_dict().items()
            }
            epochs_no_improve = 0
            improvement_msg = "‚úì –£–õ–£–ß–®–ï–ù–ò–ï"
        else:
            epochs_no_improve += 1
            improvement_msg = f"NO IMPROVE ({epochs_no_improve}/{patience})"

        model.train()
        scheduler.step(val_loss)

        # Progress log: first epoch, every 10th epoch, and the epoch that exhausts patience.
        if (epoch + 1) % 10 == 0 or epoch == 0 or epochs_no_improve >= patience:
            current_lr = optimizer.param_groups[0]['lr']
            print(f"Ep {epoch+1:3d}/{epochs} | "
                  f"TrL: {train_loss.item():.6f} | "
                  f"VaL: {val_loss.item():.6f} | "
                  f"MAE: {mae:.6f} | "
                  f"R¬≤_tr: {train_r2:7.4f} | "
                  f"R¬≤_val: {r2:7.4f} | "
                  f"Dir: {direction_correct:.4f} | "
                  f"LR: {current_lr:.6f} | {improvement_msg}")

        # Periodically release cached GPU memory.
        if epoch % 50 == 0 and torch.cuda.is_available():
            torch.cuda.empty_cache()

    # Restore the weights of the best epoch.
    if best_model_state is not None:
        model.load_state_dict(best_model_state)

    model.eval()

    return model, best_val_loss, best_r2

def calculate_dynamic_weights(models, X_val_t, y_val_t):
    """Derive ensemble weights from each model's validation error.

    Every model is scored as ``1 / (MAE + 0.1 * MSE + 1e-8)`` on the validation
    tensors (lower error gives a higher score), and the scores are passed
    through a numerically stabilised softmax.

    Returns:
        A numpy array of weights, one per model, summing to 1.
    """
    l1_criterion = nn.L1Loss()
    l2_criterion = nn.MSELoss()
    scores = []

    with torch.no_grad():
        for candidate in models:
            val_pred = candidate(X_val_t)
            abs_error = l1_criterion(val_pred, y_val_t).item()
            sq_error = l2_criterion(val_pred, y_val_t).item()

            # Combined inverse-error score; the epsilon avoids division by zero.
            scores.append(1.0 / (abs_error + 0.1 * sq_error + 1e-8))

    # Softmax over the scores, shifted by the maximum for numerical stability.
    scores = np.array(scores)
    exp_scores = np.exp(scores - np.max(scores))
    return exp_scores / exp_scores.sum()

In [None]:
def initialize_models():
    """Load the training data, build features, train the ensemble and publish it via globals.

    Does nothing if ``is_initialized`` is already True. Otherwise it:

    1. reads ``train.csv``, ``train_labels.csv`` and (optionally) ``target_pairs.csv``;
    2. builds features with ``create_enhanced_features`` and, when target pairs
       are available, ``add_spread_features``;
    3. fits a ``StandardScaler`` and splits the rows 90/10 in order into train
       and validation tensors on ``device``;
    4. trains each configured model full-batch with Huber loss, AdamW,
       ReduceLROnPlateau and early stopping, restoring the best weights;
    5. computes inverse-validation-loss ensemble weights.

    Side effects: sets the globals ``base_cols``, ``feature_cols``, ``scaler``,
    ``ensemble_weights_global`` and ``is_initialized``, and appends to
    ``models`` and ``model_val_losses``.

    Raises:
        Exception: if no model could be trained.
    """
    global models, scaler, feature_cols, base_cols, model_val_losses, is_initialized, device
    global ensemble_weights_global

    if is_initialized:
        return

    print("="*70)
    print(f"–ò–ù–ò–¶–ò–ê–õ–ò–ó–ê–¶–ò–Ø –ò –û–ë–£–ß–ï–ù–ò–ï –ú–û–î–ï–õ–ï–ô –ù–ê {device}")
    print("="*70)

    # ===== LOAD DATA =====
    train = pd.read_csv('/home/nicolaedrabcinski/sd_kaggle/data/raw/train.csv')
    train_labels = pd.read_csv('/home/nicolaedrabcinski/sd_kaggle/data/raw/train_labels.csv')

    # Instrument-pair metadata is optional; training continues without it.
    try:
        target_pairs = pd.read_csv('/home/nicolaedrabcinski/sd_kaggle/data/raw/target_pairs.csv')
        print("\n–ò–Ω—Ñ–æ—Ä–º–∞—Ü–∏—è –æ targets:")
        print(f"–í—Å–µ–≥–æ targets: {len(target_pairs)}")
        for lag in sorted(target_pairs['lag'].unique()):
            count = len(target_pairs[target_pairs['lag'] == lag])
            print(f"  Lag {lag}: {count} targets")
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit still propagate.
        print("\n–í–Ω–∏–º–∞–Ω–∏–µ: target_pairs.csv –Ω–µ –Ω–∞–π–¥–µ–Ω, –ø—Ä–æ–¥–æ–ª–∂–∞–µ–º –±–µ–∑ –Ω–µ–≥–æ")
        target_pairs = None

    # ===== BASE COLUMNS =====
    base_cols = [c for c in train.columns
                 if c not in ['date_id']
                 and pd.api.types.is_numeric_dtype(train[c])]

    print(f"\n–ë–∞–∑–æ–≤—ã—Ö –∫–æ–ª–æ–Ω–æ–∫: {len(base_cols)}")
    print("–°–æ–∑–¥–∞–Ω–∏–µ –ø—Ä–∏–∑–Ω–∞–∫–æ–≤...")

    # ===== FEATURE ENGINEERING =====
    train_features = create_enhanced_features(train, base_cols_ref=base_cols)

    # Add spread features when pair metadata is available.
    if target_pairs is not None:
        train_features = add_spread_features(train, train_features, target_pairs)

    feature_cols = [c for c in train_features.columns
                   if c != 'date_id' and pd.api.types.is_numeric_dtype(train_features[c])]

    print(f"–°–æ–∑–¥–∞–Ω–æ {len(feature_cols)} –ø—Ä–∏–∑–Ω–∞–∫–æ–≤")

    # ===== DATA PREPARATION =====
    target_cols = [f'target_{i}' for i in range(NUM_TARGET_COLUMNS)]

    X_train = train_features[feature_cols].fillna(0).values
    y_train = train_labels[target_cols].fillna(0).values

    X_train = np.nan_to_num(X_train, nan=0.0, posinf=0.0, neginf=0.0)
    y_train = np.nan_to_num(y_train, nan=0.0, posinf=0.0, neginf=0.0)

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)

    # ===== MOVE TO DEVICE =====
    X_tensor = torch.FloatTensor(X_train_scaled).to(device)
    y_tensor = torch.FloatTensor(y_train).to(device)

    # Ordered split: the last 10% of rows form the validation set.
    split_idx = int(len(X_train_scaled) * 0.9)
    X_train_t, X_val_t = X_tensor[:split_idx], X_tensor[split_idx:]
    y_train_t, y_val_t = y_tensor[:split_idx], y_tensor[split_idx:]

    print(f"Train: {len(X_train_t)}, Validation: {len(X_val_t)}")

    # ===== MODEL CONFIGURATION (without Model 3) =====
    model_configs = [
        (Model1_Deep, "Deep+LayerNorm", 1000),
        # (Model5_Bottleneck, "Bottleneck+Attention", 1000),
        (Model6_Transformer, "Transformer", 1000),
        (Model7_EnsembleBlock, "EnsembleBlock+Gating", 1000),
    ]

    # Names of the models that trained successfully, in the same order as the
    # entries appended to `models` / `model_val_losses` by this call.
    trained_names = []

    # ===== TRAINING =====
    for i, (ModelClass, name, epochs) in enumerate(model_configs):
        print(f"\n{'='*70}")
        print(f"–ú–û–î–ï–õ–¨ {i+1}/{len(model_configs)}: {name}")
        print(f"{'='*70}")

        try:
            model = ModelClass(X_train_scaled.shape[1], NUM_TARGET_COLUMNS).to(device)

            criterion = nn.HuberLoss(delta=1.0)
            optimizer = torch.optim.AdamW(model.parameters(), lr=0.001, weight_decay=1e-5)
            scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
                optimizer, 'min', patience=5, factor=0.5
            )

            # Early-stopping state.
            patience = 15
            best_val_loss = float('inf')
            best_r2 = -float('inf')
            best_model_state = None
            epochs_no_improve = 0
            early_stop = False

            model.train()
            for epoch in range(epochs):
                if early_stop:
                    print(f"Early stopping –Ω–∞ —ç–ø–æ—Ö–µ {epoch+1}")
                    break

                # Training step (full batch).
                optimizer.zero_grad()
                train_outputs = model(X_train_t)
                train_loss = criterion(train_outputs, y_train_t)
                train_loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()

                # Validation step and metrics.
                model.eval()
                with torch.no_grad():
                    val_outputs = model(X_val_t)
                    val_loss = criterion(val_outputs, y_val_t)

                    val_preds = val_outputs.cpu().numpy()
                    val_true = y_val_t.cpu().numpy()

                    mae = mean_absolute_error(val_true.flatten(), val_preds.flatten())
                    direction_correct = np.mean(np.sign(val_preds) == np.sign(val_true))
                    r2 = r2_score(val_true.flatten(), val_preds.flatten())

                    train_preds = train_outputs.detach().cpu().numpy()
                    train_true = y_train_t.cpu().numpy()
                    train_r2 = r2_score(train_true.flatten(), train_preds.flatten())

                # Early-stopping bookkeeping.
                if val_loss < best_val_loss:
                    # Keep a plain float so the value is usable even if no epoch improves.
                    best_val_loss = val_loss.item()
                    best_r2 = r2
                    # Clone every tensor: state_dict() values alias the live
                    # parameters, so a shallow copy would follow later updates
                    # and restoring it would not bring back the best weights.
                    best_model_state = {
                        key: value.detach().clone()
                        for key, value in model.state_dict().items()
                    }
                    epochs_no_improve = 0
                    improvement_msg = "‚úì –£–õ–£–ß–®–ï–ù–ò–ï"
                else:
                    epochs_no_improve += 1
                    improvement_msg = f"NO IMPROVE ({epochs_no_improve}/{patience})"

                    if epochs_no_improve >= patience:
                        early_stop = True

                model.train()
                scheduler.step(val_loss)

                # Progress log: first epoch, every 5th epoch, and the stopping epoch.
                if (epoch + 1) % 5 == 0 or epoch == 0 or early_stop:
                    current_lr = optimizer.param_groups[0]['lr']
                    print(f"Ep {epoch+1:3d}/{epochs} | "
                          f"TrL: {train_loss.item():.6f} | "
                          f"VaL: {val_loss.item():.6f} | "
                          f"MAE: {mae:.6f} | "
                          f"R¬≤_tr: {train_r2:7.4f} | "
                          f"R¬≤_val: {r2:7.4f} | "
                          f"Dir: {direction_correct:.4f} | "
                          f"LR: {current_lr:.6f} | {improvement_msg}")

            # A model whose validation loss never improved (e.g. NaN losses) must
            # not enter the ensemble; raising here skips it before anything is
            # appended, which keeps `models` and `model_val_losses` aligned.
            if best_model_state is None:
                raise RuntimeError(f"validation loss never improved for model {name}")

            # Restore the best weights.
            model.load_state_dict(best_model_state)

            model.eval()
            models.append(model)
            model_val_losses.append(best_val_loss)
            trained_names.append(name)

            print(f"–ó–∞–≤–µ—Ä—à–µ–Ω–∞. Best Val Loss: {best_val_loss:.6f}, Best R¬≤: {best_r2:.4f}")

        except Exception as e:
            print(f"–û–®–ò–ë–ö–ê –ø—Ä–∏ –æ–±—É—á–µ–Ω–∏–∏ –º–æ–¥–µ–ª–∏ {name}: {e}")
            import traceback
            traceback.print_exc()
            continue

        # Release cached GPU memory between models.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    # ===== MAKE SURE AT LEAST ONE MODEL TRAINED =====
    if not models:
        raise Exception("‚ùå –ù–ò –û–î–ù–ê –ú–û–î–ï–õ–¨ –ù–ï –ë–´–õ–ê –£–°–ü–ï–®–ù–û –û–ë–£–ß–ï–ù–ê!")

    # ===== ENSEMBLE WEIGHTS =====
    # Inverse validation loss, normalised to sum to 1.
    weights = 1.0 / np.array(model_val_losses)
    weights = weights / weights.sum()
    # Publish the weights so create_submission_file(), which reads this global,
    # uses the same weighting as predict() instead of a uniform average.
    ensemble_weights_global = weights

    print(f"\n{'='*70}")
    print(f"–í–ï–°–ê –ú–û–î–ï–õ–ï–ô –î–õ–Ø –í–ó–í–ï–®–ï–ù–ù–û–ì–û –ê–ù–°–ê–ú–ë–õ–Ø:")
    # Iterate over the successfully trained names so labels stay matched to
    # their weights even when an earlier configuration failed.
    for i, (name, weight, val_loss) in enumerate(zip(trained_names, weights, model_val_losses)):
        print(f"  –ú–æ–¥–µ–ª—å {i+1} ({name}): –≤–µ—Å={weight:.4f}, val_loss={val_loss:.6f}")
    print(f"{'='*70}\n")

    is_initialized = True


# ===== HELPER FOR ADDING SPREAD FEATURES =====
def add_spread_features(train, features, target_pairs):
    """Append spread and ratio features for each instrument pair listed in ``target_pairs``.

    Every row of ``target_pairs`` whose ``'pair'`` value has the form
    ``"<col_a> - <col_b>"`` (for example "LME_CA_Close - US_Stock_CCJ_adj_close")
    and whose two columns both exist in ``train`` contributes nine columns named
    ``spread_<row index>_*``: rolling means and standard deviations of the
    spread, a 5-step difference and percentage change, a rolling z-score, and
    the ratio of the two columns with its 5-step rolling mean.

    Pairs that cannot be parsed or processed are skipped silently.

    Args:
        train: DataFrame holding the raw instrument columns.
        features: DataFrame the new columns are written into (modified in place).
        target_pairs: DataFrame with a ``'pair'`` column.

    Returns:
        The same ``features`` object with the new columns added.
    """
    print("–î–æ–±–∞–≤–ª–µ–Ω–∏–µ spread features...")

    for pair_idx, pair_row in target_pairs.iterrows():
        pair_spec = pair_row['pair']

        # Only "<a> - <b>" style entries describe a spread.
        if ' - ' not in str(pair_spec):
            continue

        try:
            left_name, right_name = (part.strip() for part in pair_spec.split(' - '))

            if left_name not in train.columns or right_name not in train.columns:
                continue

            spread = train[left_name] - train[right_name]
            tag = f'spread_{pair_idx}'

            with np.errstate(divide='ignore', invalid='ignore'):
                short_window = spread.rolling(5, min_periods=1)
                long_window = spread.rolling(20, min_periods=1)

                # Rolling level of the spread.
                features[f'{tag}_ma_5'] = short_window.mean()
                features[f'{tag}_ma_20'] = long_window.mean()

                # Rolling volatility of the spread.
                features[f'{tag}_std_5'] = short_window.std()
                features[f'{tag}_std_20'] = long_window.std()

                # Spread momentum over 5 steps.
                features[f'{tag}_momentum_5'] = spread - spread.shift(5)
                features[f'{tag}_return_5d'] = spread.pct_change(5)

                # Z-score of the spread against its 20-step window.
                features[f'{tag}_zscore'] = (spread - long_window.mean()) / long_window.std()

                # Ratio of the two legs.
                ratio = train[left_name] / train[right_name]
                features[f'{tag}_ratio'] = ratio
                features[f'{tag}_ratio_ma_5'] = ratio.rolling(5, min_periods=1).mean()
        except Exception:
            # Problematic pairs are deliberately skipped without reporting.
            pass

    print(f"Spread features –¥–æ–±–∞–≤–ª–µ–Ω—ã. –í—Å–µ–≥–æ —Ñ–∏—á–µ–π: {len(features.columns)}")
    return features

In [None]:
def predict(test, label_lags_1_batch, label_lags_2_batch, label_lags_3_batch, label_lags_4_batch):
    """Predict all targets for the most recent row of ``test`` with the weighted ensemble.

    Initialises the models on first use. Features are built for the whole
    ``test`` frame, but only the last row is scaled and fed to the models.
    Model outputs are averaged with weights proportional to the inverse of each
    model's validation loss, clipped to [-0.1, 0.1], and NaN/inf values are
    replaced by 0.

    Args:
        test: frame exposing ``to_pandas()`` (a polars DataFrame is expected).
        label_lags_1_batch .. label_lags_4_batch: accepted but not used here.

    Returns:
        A one-row polars DataFrame with columns ``target_0`` ..
        ``target_{NUM_TARGET_COLUMNS - 1}``. On any error the message is printed
        and a row of zeros is returned.
    """
    global models, scaler, feature_cols, model_val_losses, is_initialized, device

    if not is_initialized:
        initialize_models()

    try:
        # Only the latest row is scored.
        latest_features = prepare_features(test.to_pandas())[-1:]
        latest_scaled = scaler.transform(latest_features)
        model_input = torch.FloatTensor(latest_scaled).to(device)

        # One prediction vector per ensemble member.
        with torch.no_grad():
            member_preds = [member(model_input).cpu().numpy()[0] for member in models]

        # Weights are inversely proportional to validation loss.
        inverse_losses = 1.0 / np.array(model_val_losses)
        ensemble_weights = inverse_losses / inverse_losses.sum()

        blended = np.average(member_preds, axis=0, weights=ensemble_weights)
        blended = np.nan_to_num(np.clip(blended, -0.1, 0.1), nan=0.0, posinf=0.0, neginf=0.0)

        return pl.DataFrame({f'target_{i}': [float(blended[i])] for i in range(NUM_TARGET_COLUMNS)})

    except Exception as e:
        print(f"–û—à–∏–±–∫–∞ predict: {e}")
        return pl.DataFrame({f'target_{i}': [0.0] for i in range(NUM_TARGET_COLUMNS)})

In [None]:
@torch.no_grad()
def efficient_predict_batch(test_data, models, weights=None, batch_size=1024):
    """Predict every row of ``test_data`` with the weighted ensemble, in batches.

    Args:
        test_data: pandas DataFrame, or any object exposing ``to_pandas()``.
        models: iterable of trained models.
        weights: per-model weights; defaults to a uniform average.
        batch_size: number of rows sent through the models at a time.

    Returns:
        A numpy array with one row of predictions per input row, clipped to
        [-0.1, 0.1] with NaN/inf replaced by 0. Uses the module-level ``scaler``
        and ``device``.
    """
    if weights is None:
        weights = np.ones(len(models)) / len(models)

    # Convert to pandas only when the input offers a converter.
    test_pd = test_data.to_pandas() if hasattr(test_data, 'to_pandas') else test_data

    X_test_scaled = scaler.transform(prepare_features(test_pd))

    def _ensemble_chunk(start):
        """Weighted sum of all model outputs for the rows starting at ``start``."""
        chunk_tensor = torch.FloatTensor(X_test_scaled[start:start + batch_size]).to(device)
        weighted_outputs = [
            member(chunk_tensor).cpu().numpy() * member_weight
            for member, member_weight in zip(models, weights)
        ]
        return np.sum(weighted_outputs, axis=0)

    # Process the rows chunk by chunk to bound device memory use.
    chunk_outputs = [
        _ensemble_chunk(start)
        for start in range(0, len(X_test_scaled), batch_size)
    ]

    stacked = np.vstack(chunk_outputs)
    stacked = np.clip(stacked, -0.1, 0.1)
    return np.nan_to_num(stacked, nan=0.0, posinf=0.0, neginf=0.0)

def create_submission_file():
    """Predict the test set with the trained ensemble and write ``submission.parquet``.

    Reads ``test.csv``, obtains batched ensemble predictions (weighted by
    ``ensemble_weights_global``), keeps only rows flagged ``is_scored`` when
    that column exists, replaces NaN/inf with 0 and writes the parquet file.

    Raises:
        Exception: if the models have not been initialised.
    """
    global models, scaler, feature_cols, is_initialized, device, ensemble_weights_global

    print("\n–°–æ–∑–¥–∞–Ω–∏–µ submission.parquet...")

    if not is_initialized:
        raise Exception("–ú–æ–¥–µ–ª–∏ –Ω–µ –∏–Ω–∏—Ü–∏–∞–ª–∏–∑–∏—Ä–æ–≤–∞–Ω—ã!")

    test = pd.read_csv('/home/nicolaedrabcinski/sd_kaggle/data/raw/test.csv')

    print("–ì–µ–Ω–µ—Ä–∞—Ü–∏—è –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–π...")

    # Batched ensemble prediction over the whole test set.
    predictions = efficient_predict_batch(test, models, ensemble_weights_global)

    # Assemble all columns first, then build the frame in one step.
    column_data = {'date_id': test['date_id'].values}
    column_data.update({f'target_{i}': predictions[:, i] for i in range(424)})
    submission = pd.DataFrame(column_data)

    # Keep only the scored rows when the flag is present.
    if 'is_scored' in test.columns:
        scored_mask = test['is_scored'] == True
        submission = submission[scored_mask].reset_index(drop=True)

    submission = submission.fillna(0).replace([np.inf, -np.inf], 0)
    submission.to_parquet('submission.parquet', index=False, engine='pyarrow')

    print(f"–ì–æ—Ç–æ–≤–æ: {submission.shape}")

    # Release cached GPU memory and collect garbage.
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    gc.collect()

In [None]:
# def main():
#     global is_initialized
    
#     print("–ó–ê–ü–£–°–ö –£–õ–£–ß–®–ï–ù–ù–û–ì–û –ü–ê–ô–ü–õ–ê–ô–ù–ê –° OPTUNA")
#     print("="*50)
    
#     # –®–∞–≥ 1: –ò–Ω–∏—Ü–∏–∞–ª–∏–∑–∞—Ü–∏—è –∏ –æ–±—É—á–µ–Ω–∏–µ –º–æ–¥–µ–ª–µ–π —Å Optuna
#     if not is_initialized:
#         print("–ù–∞—á–∞–ª–æ –æ–±—É—á–µ–Ω–∏—è –º–æ–¥–µ–ª–µ–π —Å –æ–ø—Ç–∏–º–∏–∑–∞—Ü–∏–µ–π –≥–∏–ø–µ—Ä–ø–∞—Ä–∞–º–µ—Ç—Ä–æ–≤...")
#         initialize_models()
#     else:
#         print("–ú–æ–¥–µ–ª–∏ —É–∂–µ –∏–Ω–∏—Ü–∏–∞–ª–∏–∑–∏—Ä–æ–≤–∞–Ω—ã, –ø—Ä–æ–ø—É—Å–∫–∞–µ–º –æ–±—É—á–µ–Ω–∏–µ")
    
#     # –®–∞–≥ 2: –°–æ–∑–¥–∞–Ω–∏–µ submission —Ñ–∞–π–ª–∞
#     print("–°–æ–∑–¥–∞–Ω–∏–µ submission —Ñ–∞–π–ª–∞...")
#     create_submission_file()
    
#     print("\n–ü–ê–ô–ü–õ–ê–ô–ù –£–°–ü–ï–®–ù–û –ó–ê–í–ï–†–®–ï–ù!")

# # –ó–∞–ø—É—Å–∫–∞–µ–º –æ—Å–Ω–æ–≤–Ω–æ–π –ø–∞–π–ø–ª–∞–π–Ω
# if __name__ == "__main__":
#     main()

In [None]:
def calculate_kaggle_score_correct(predictions, targets):
    """
    Score predictions with a Sharpe-style ratio of Spearman correlations.

    For every target column ``i`` the Spearman rank correlation between
    ``predictions[:, i]`` and ``targets[:, i]`` is computed. Columns where
    either side is constant, or where the correlation is not finite, are
    counted as failed and excluded.

    Score = (mean correlation / std correlation) * 100,000

    NOTE(review): the correlation here is taken per target column across rows.
    Confirm against the competition's official metric definition, which may
    instead correlate across targets for each date.

    Args:
        predictions: 2-D numpy array, rows are samples and columns are targets.
        targets: 2-D numpy array with the same shape as ``predictions``.

    Returns:
        A dict with the keys ``kaggle_score``, ``sharpe_ratio``,
        ``mean_correlation``, ``std_correlation``, ``median_correlation``,
        ``min_correlation``, ``max_correlation``, ``correlations`` (numpy
        array), ``successful_targets`` and ``failed_targets``. The same keys are
        present when no correlation could be computed (all numeric values 0).
    """
    from scipy.stats import spearmanr

    print("\n" + "="*70)
    print("–í–´–ß–ò–°–õ–ï–ù–ò–ï KAGGLE SCORE (Modified Sharpe Ratio)")
    print("="*70)

    correlations = []
    failed_targets = 0

    # One Spearman correlation per target column.
    for i in range(targets.shape[1]):
        pred_col = predictions[:, i]
        true_col = targets[:, i]

        # A constant column has no defined rank correlation.
        if len(np.unique(pred_col)) < 2 or len(np.unique(true_col)) < 2:
            failed_targets += 1
            continue

        try:
            corr, _ = spearmanr(pred_col, true_col)
        except Exception:
            failed_targets += 1
            continue

        if np.isfinite(corr):
            correlations.append(corr)
        else:
            failed_targets += 1

    if len(correlations) == 0:
        print("\n‚ö†Ô∏è –ö–†–ò–¢–ò–ß–ï–°–ö–ê–Ø –û–®–ò–ë–ö–ê: –ù–µ —É–¥–∞–ª–æ—Å—å –≤—ã—á–∏—Å–ª–∏—Ç—å –Ω–∏ –æ–¥–Ω–æ–π –∫–æ—Ä—Ä–µ–ª—è—Ü–∏–∏!")
        print(f"   –ü—Ä–æ–≤–∞–ª–µ–Ω–æ targets: {failed_targets}/{targets.shape[1]}")
        print("\n   –ü—Ä–æ–≤–µ—Ä–∫–∞ –¥–∞–Ω–Ω—ã—Ö:")
        print(f"   Predictions shape: {predictions.shape}")
        print(f"   Predictions range: [{predictions.min():.6f}, {predictions.max():.6f}]")
        print(f"   Predictions unique values: {len(np.unique(predictions))}")
        print(f"   Targets shape: {targets.shape}")
        print(f"   Targets range: [{targets.min():.6f}, {targets.max():.6f}]")

        # Same keys as the success path, so callers never hit a KeyError.
        return {
            'kaggle_score': 0.0,
            'sharpe_ratio': 0.0,
            'mean_correlation': 0.0,
            'std_correlation': 0.0,
            'median_correlation': 0.0,
            'min_correlation': 0.0,
            'max_correlation': 0.0,
            'correlations': np.array([]),
            'successful_targets': 0,
            'failed_targets': failed_targets
        }

    correlations = np.array(correlations)

    # Summary statistics of the per-target correlations.
    mean_corr = np.mean(correlations)
    std_corr = np.std(correlations)
    median_corr = np.median(correlations)
    min_corr = np.min(correlations)
    max_corr = np.max(correlations)

    # Scaled Sharpe-style ratio; guarded against a (near-)zero spread.
    if std_corr > 1e-8:
        sharpe_ratio = mean_corr / std_corr
        kaggle_score = sharpe_ratio * 100000
    else:
        sharpe_ratio = 0.0
        kaggle_score = 0.0

    print(f"\n–°—Ç–∞—Ç–∏—Å—Ç–∏–∫–∞ Spearman Correlations:")
    print(f"  –£—Å–ø–µ—à–Ω–æ: {len(correlations)}/{targets.shape[1]} targets")
    if failed_targets > 0:
        print(f"  –ü—Ä–æ–≤–∞–ª–µ–Ω–æ: {failed_targets} targets (–∫–æ–Ω—Å—Ç–∞–Ω—Ç–Ω—ã–µ –∑–Ω–∞—á–µ–Ω–∏—è)")
    print(f"  Mean:    {mean_corr:.6f}")
    print(f"  Median:  {median_corr:.6f}")
    print(f"  Std:     {std_corr:.6f}")
    print(f"  Min:     {min_corr:.6f}")
    print(f"  Max:     {max_corr:.6f}")

    # Distribution of the correlations.
    positive = (correlations > 0).sum()
    negative = (correlations < 0).sum()
    near_zero = (np.abs(correlations) < 0.01).sum()

    print(f"\n  Positive correlations: {positive}/{len(correlations)} ({100*positive/len(correlations):.1f}%)")
    print(f"  Negative correlations: {negative}/{len(correlations)} ({100*negative/len(correlations):.1f}%)")
    print(f"  Near zero (|r| < 0.01): {near_zero}/{len(correlations)} ({100*near_zero/len(correlations):.1f}%)")

    print(f"\n{'='*70}")
    print(f"KAGGLE SCORE (Modified Sharpe Ratio):")
    print(f"{'='*70}")
    print(f"  Sharpe Ratio:        {sharpe_ratio:.6f}")
    print(f"  KAGGLE SCORE:        {kaggle_score:.2f}")

    # Human-readable interpretation of the score.
    if kaggle_score > 100000:
        print(f"\n  ‚úÖ –û–¢–õ–ò–ß–ù–´–ô —Ä–µ–∑—É–ª—å—Ç–∞—Ç! (> 100,000)")
    elif kaggle_score > 50000:
        print(f"\n  ‚úì –•–æ—Ä–æ—à–∏–π —Ä–µ–∑—É–ª—å—Ç–∞—Ç (> 50,000)")
    elif kaggle_score > 0:
        print(f"\n  ‚ö† –°–ª–∞–±—ã–π —Ä–µ–∑—É–ª—å—Ç–∞—Ç (> 0, –Ω–æ < 50,000)")
    else:
        print(f"\n  ‚ùå –ü–õ–û–•–û–ô —Ä–µ–∑—É–ª—å—Ç–∞—Ç (–æ—Ç—Ä–∏—Ü–∞—Ç–µ–ª—å–Ω—ã–π score)")
        print(f"     –ú–æ–¥–µ–ª—å –ø—Ä–µ–¥—Å–∫–∞–∑—ã–≤–∞–µ—Ç –≤ –ø—Ä–æ—Ç–∏–≤–æ–ø–æ–ª–æ–∂–Ω–æ–º –Ω–∞–ø—Ä–∞–≤–ª–µ–Ω–∏–∏!")

    print(f"{'='*70}\n")

    return {
        'kaggle_score': kaggle_score,
        'sharpe_ratio': sharpe_ratio,
        'mean_correlation': mean_corr,
        'std_correlation': std_corr,
        'median_correlation': median_corr,
        'min_correlation': min_corr,
        'max_correlation': max_corr,
        'correlations': correlations,
        'successful_targets': len(correlations),
        'failed_targets': failed_targets
    }

In [None]:
def diagnose_predictions(predictions, targets):
    """Print a diagnostic report comparing a prediction matrix with its targets.

    The report covers, in this order: array shapes, whole-array summary
    statistics, columns holding a single distinct value, NaN/Inf counts,
    the sign distribution, and columns whose standard deviation is below 1e-6.

    Both arguments are indexed as ``array[:, i]`` for every ``i`` in
    ``range(predictions.shape[1])``, so they must be 2-D NumPy arrays and
    ``targets`` needs at least as many columns as ``predictions``.

    Returns:
        None. Everything is written to stdout.
    """
    def _print_summary(values):
        # Whole-array (not per-column) summary statistics.
        print(f"  Mean:   {values.mean():.8f}")
        print(f"  Std:    {values.std():.8f}")
        print(f"  Min:    {values.min():.8f}")
        print(f"  Max:    {values.max():.8f}")
        print(f"  Median: {np.median(values):.8f}")

    rule = "=" * 70
    print("\n" + rule)
    print("–î–ò–ê–ì–ù–û–°–¢–ò–ö–ê –ü–†–ï–î–°–ö–ê–ó–ê–ù–ò–ô")
    print(rule)

    print(f"\n–§–æ—Ä–º–∞ –¥–∞–Ω–Ω—ã—Ö:")
    print(f"  Predictions: {predictions.shape}")
    print(f"  Targets: {targets.shape}")

    print(f"\n–°—Ç–∞—Ç–∏—Å—Ç–∏–∫–∞ predictions:")
    _print_summary(predictions)

    print(f"\n–°—Ç–∞—Ç–∏—Å—Ç–∏–∫–∞ targets:")
    _print_summary(targets)

    # Columns with fewer than two distinct values. The column range is taken
    # from `predictions` for both arrays.
    column_ids = range(predictions.shape[1])
    flat_pred_cols = [c for c in column_ids if len(np.unique(predictions[:, c])) < 2]
    flat_target_cols = [c for c in column_ids if len(np.unique(targets[:, c])) < 2]

    if flat_pred_cols:
        print(f"\n‚ö†Ô∏è –ö–æ–Ω—Å—Ç–∞–Ω—Ç–Ω—ã–µ predictions –∫–æ–ª–æ–Ω–∫–∏: {len(flat_pred_cols)}/{predictions.shape[1]}")
        print(f"   –ü—Ä–∏–º–µ—Ä—ã –∏–Ω–¥–µ–∫—Å–æ–≤: {flat_pred_cols[:10]}")
    else:
        print(f"\n‚úÖ –ù–µ—Ç –∫–æ–Ω—Å—Ç–∞–Ω—Ç–Ω—ã—Ö predictions –∫–æ–ª–æ–Ω–æ–∫")

    if flat_target_cols:
        print(f"\n‚ö†Ô∏è –ö–æ–Ω—Å—Ç–∞–Ω—Ç–Ω—ã–µ target –∫–æ–ª–æ–Ω–∫–∏: {len(flat_target_cols)}/{targets.shape[1]}")
        print(f"   –ü—Ä–∏–º–µ—Ä—ã –∏–Ω–¥–µ–∫—Å–æ–≤: {flat_target_cols[:10]}")
    else:
        print(f"\n‚úÖ –ù–µ—Ç –∫–æ–Ω—Å—Ç–∞–Ω—Ç–Ω—ã—Ö target –∫–æ–ª–æ–Ω–æ–∫")

    # Non-finite values; all four counts are taken before anything is printed.
    pred_nan_count = np.isnan(predictions).sum()
    pred_inf_count = np.isinf(predictions).sum()
    target_nan_count = np.isnan(targets).sum()
    target_inf_count = np.isinf(targets).sum()

    if not (pred_nan_count > 0 or pred_inf_count > 0):
        print(f"\n‚úÖ Predictions: –ù–µ—Ç NaN/Inf")
    else:
        print(f"\n‚ö†Ô∏è Predictions: NaN={pred_nan_count}, Inf={pred_inf_count}")

    if not (target_nan_count > 0 or target_inf_count > 0):
        print(f"\n‚úÖ Targets: –ù–µ—Ç NaN/Inf")
    else:
        print(f"\n‚ö†Ô∏è Targets: NaN={target_nan_count}, Inf={target_inf_count}")

    # Sign distribution over every element of each array.
    pred_pos, pred_neg, pred_zero = (
        (predictions > 0).sum(),
        (predictions < 0).sum(),
        (predictions == 0).sum(),
    )
    target_pos, target_neg, target_zero = (
        (targets > 0).sum(),
        (targets < 0).sum(),
        (targets == 0).sum(),
    )

    # The denominator for both tables is the element count of `predictions`.
    total = predictions.size
    print(f"\n–†–∞—Å–ø—Ä–µ–¥–µ–ª–µ–Ω–∏–µ –∑–Ω–∞–∫–æ–≤ predictions:")
    print(f"  Positive: {pred_pos}/{total} ({100*pred_pos/total:.1f}%)")
    print(f"  Negative: {pred_neg}/{total} ({100*pred_neg/total:.1f}%)")
    print(f"  Zero:     {pred_zero}/{total} ({100*pred_zero/total:.1f}%)")

    print(f"\n–†–∞—Å–ø—Ä–µ–¥–µ–ª–µ–Ω–∏–µ –∑–Ω–∞–∫–æ–≤ targets:")
    print(f"  Positive: {target_pos}/{total} ({100*target_pos/total:.1f}%)")
    print(f"  Negative: {target_neg}/{total} ({100*target_neg/total:.1f}%)")
    print(f"  Zero:     {target_zero}/{total} ({100*target_zero/total:.1f}%)")

    # Per-column spread: count columns that are (numerically) almost constant.
    flat_std_preds = sum(1 for c in column_ids if predictions[:, c].std() < 1e-6)
    flat_std_targets = sum(1 for c in column_ids if targets[:, c].std() < 1e-6)

    if flat_std_preds > 0:
        print(f"\n‚ö†Ô∏è Predictions —Å –Ω–∏–∑–∫–æ–π –¥–∏—Å–ø–µ—Ä—Å–∏–µ–π (std < 1e-6): {flat_std_preds}/{predictions.shape[1]}")

    if flat_std_targets > 0:
        print(f"\n‚ö†Ô∏è Targets —Å –Ω–∏–∑–∫–æ–π –¥–∏—Å–ø–µ—Ä—Å–∏–µ–π (std < 1e-6): {flat_std_targets}/{targets.shape[1]}")

    print(rule + "\n")


def calculate_kaggle_score_correct(predictions, targets):
    """
    Competition metric as implemented in this notebook: Modified Sharpe Ratio.

    Score = (Mean Spearman Correlation / Std Spearman Correlation) * 100,000

    One Spearman rank correlation is computed per column ``i`` between
    ``predictions[:, i]`` and ``targets[:, i]``. A column is counted as failed
    (and excluded from the statistics) when either side has fewer than two
    distinct values, when the correlation is NaN/Inf, or when ``spearmanr``
    raises. The standard deviation is the population one (``np.std`` default).

    NOTE(review): correlations are taken per target column across rows;
    confirm this matches the official competition metric definition.

    Args:
        predictions: 2-D array, indexed as ``predictions[:, i]``.
        targets: 2-D array; its column count drives the loop.

    Returns:
        dict with the same keys on every path:
        ``kaggle_score``, ``sharpe_ratio``, ``mean_correlation``,
        ``std_correlation``, ``median_correlation``, ``min_correlation``,
        ``max_correlation``, ``correlations`` (1-D array),
        ``successful_targets`` and ``failed_targets``.
        When no correlation can be computed, all numeric fields are 0.0 and
        ``correlations`` is empty.
    """
    from scipy.stats import spearmanr

    print("\n" + "="*70)
    print("–í–´–ß–ò–°–õ–ï–ù–ò–ï KAGGLE SCORE (Modified Sharpe Ratio)")
    print("="*70)

    correlations = []
    failed_targets = 0

    # One Spearman correlation per target column.
    for i in range(targets.shape[1]):
        pred_col = predictions[:, i]
        true_col = targets[:, i]

        # Rank correlation is undefined when either side is constant.
        if len(np.unique(pred_col)) < 2 or len(np.unique(true_col)) < 2:
            failed_targets += 1
            continue

        try:
            corr, _ = spearmanr(pred_col, true_col)
        except Exception:
            # Best-effort: a column that cannot be scored is only counted.
            failed_targets += 1
            continue

        if np.isnan(corr) or np.isinf(corr):
            failed_targets += 1
        else:
            correlations.append(corr)

    if len(correlations) == 0:
        print("\n‚ö†Ô∏è –ö–†–ò–¢–ò–ß–ï–°–ö–ê–Ø –û–®–ò–ë–ö–ê: –ù–µ —É–¥–∞–ª–æ—Å—å –≤—ã—á–∏—Å–ª–∏—Ç—å –Ω–∏ –æ–¥–Ω–æ–π –∫–æ—Ä—Ä–µ–ª—è—Ü–∏–∏!")
        print(f"   –ü—Ä–æ–≤–∞–ª–µ–Ω–æ targets: {failed_targets}/{targets.shape[1]}")
        print("\n   –ü—Ä–∏—á–∏–Ω—ã:")
        print("   - –í—Å–µ predictions –∫–æ–Ω—Å—Ç–∞–Ω—Ç–Ω—ã–µ –¥–ª—è –∫–∞–∂–¥–æ–≥–æ target")
        print("   - –í—Å–µ targets –∫–æ–Ω—Å—Ç–∞–Ω—Ç–Ω—ã–µ")
        print("   - –ù–µ–¥–æ—Å—Ç–∞—Ç–æ—á–Ω–æ –≤–∞—Ä–∏–∞—Ü–∏–∏ –≤ –¥–∞–Ω–Ω—ã—Ö")

        # Same key set as the regular return below, so callers can read
        # min/max without a KeyError in the degenerate case.
        return {
            'kaggle_score': 0.0,
            'sharpe_ratio': 0.0,
            'mean_correlation': 0.0,
            'std_correlation': 0.0,
            'median_correlation': 0.0,
            'min_correlation': 0.0,
            'max_correlation': 0.0,
            'correlations': np.array([]),
            'successful_targets': 0,
            'failed_targets': failed_targets
        }

    correlations = np.array(correlations)

    # Summary statistics over the per-target correlations.
    mean_corr = np.mean(correlations)
    std_corr = np.std(correlations)
    median_corr = np.median(correlations)
    min_corr = np.min(correlations)
    max_corr = np.max(correlations)

    # Scaled Sharpe-style ratio; a (near-)zero spread yields a score of 0.0
    # instead of dividing by zero.
    if std_corr > 1e-8:
        sharpe_ratio = mean_corr / std_corr
        kaggle_score = sharpe_ratio * 100000
    else:
        sharpe_ratio = 0.0
        kaggle_score = 0.0

    print(f"\n–°—Ç–∞—Ç–∏—Å—Ç–∏–∫–∞ Spearman Correlations:")
    print(f"  –£—Å–ø–µ—à–Ω–æ: {len(correlations)}/{targets.shape[1]} targets")
    if failed_targets > 0:
        print(f"  –ü—Ä–æ–≤–∞–ª–µ–Ω–æ: {failed_targets} targets (–∫–æ–Ω—Å—Ç–∞–Ω—Ç–Ω—ã–µ –∑–Ω–∞—á–µ–Ω–∏—è)")
    print(f"  Mean:    {mean_corr:.6f}")
    print(f"  Median:  {median_corr:.6f}")
    print(f"  Std:     {std_corr:.6f}")
    print(f"  Min:     {min_corr:.6f}")
    print(f"  Max:     {max_corr:.6f}")

    # Distribution of correlation signs.
    positive = (correlations > 0).sum()
    negative = (correlations < 0).sum()
    near_zero = (np.abs(correlations) < 0.01).sum()

    print(f"\n  Positive correlations: {positive}/{len(correlations)} ({100*positive/len(correlations):.1f}%)")
    print(f"  Negative correlations: {negative}/{len(correlations)} ({100*negative/len(correlations):.1f}%)")
    print(f"  Near zero (|r| < 0.01): {near_zero}/{len(correlations)} ({100*near_zero/len(correlations):.1f}%)")

    print(f"\n{'='*70}")
    print(f"KAGGLE SCORE (Modified Sharpe Ratio):")
    print(f"{'='*70}")
    print(f"  Sharpe Ratio:        {sharpe_ratio:.6f}")
    print(f"  KAGGLE SCORE:        {kaggle_score:.2f}")

    # Interpretation bands.
    # NOTE(review): the final branch also fires for a score of exactly 0
    # (e.g. zero spread), although its message speaks of a negative score.
    if kaggle_score > 100000:
        print(f"\n  ‚úÖ –û–¢–õ–ò–ß–ù–´–ô —Ä–µ–∑—É–ª—å—Ç–∞—Ç! (> 100,000)")
    elif kaggle_score > 50000:
        print(f"\n  ‚úì –•–æ—Ä–æ—à–∏–π —Ä–µ–∑—É–ª—å—Ç–∞—Ç (> 50,000)")
    elif kaggle_score > 0:
        print(f"\n  ‚ö† –°–ª–∞–±—ã–π —Ä–µ–∑—É–ª—å—Ç–∞—Ç (> 0, –Ω–æ < 50,000)")
    else:
        print(f"\n  ‚ùå –ü–õ–û–•–û–ô —Ä–µ–∑—É–ª—å—Ç–∞—Ç (–æ—Ç—Ä–∏—Ü–∞—Ç–µ–ª—å–Ω—ã–π score)")
        print(f"     –ú–æ–¥–µ–ª—å –ø—Ä–µ–¥—Å–∫–∞–∑—ã–≤–∞–µ—Ç –≤ –ø—Ä–æ—Ç–∏–≤–æ–ø–æ–ª–æ–∂–Ω–æ–º –Ω–∞–ø—Ä–∞–≤–ª–µ–Ω–∏–∏!")

    print(f"{'='*70}\n")

    return {
        'kaggle_score': kaggle_score,
        'sharpe_ratio': sharpe_ratio,
        'mean_correlation': mean_corr,
        'std_correlation': std_corr,
        'median_correlation': median_corr,
        'min_correlation': min_corr,
        'max_correlation': max_corr,
        'correlations': correlations,
        'successful_targets': len(correlations),
        'failed_targets': failed_targets
    }

In [None]:
def _predict_weighted_ensemble(X_scaled, weights):
    """Average the predictions of every model in ``models`` with ``weights``.

    The blended output is clipped to [-0.1, 0.1] and any remaining NaN/Inf is
    replaced by 0.0, in that order.
    """
    X_tensor = torch.FloatTensor(X_scaled).to(device)

    per_model_preds = []
    with torch.no_grad():
        for model in models:
            model.eval()
            per_model_preds.append(model(X_tensor).cpu().numpy())

    blended = np.average(per_model_preds, axis=0, weights=weights)
    blended = np.clip(blended, -0.1, 0.1)
    return np.nan_to_num(blended, nan=0.0, posinf=0.0, neginf=0.0)


def full_evaluation_corrected(data_dir='/home/nicolaedrabcinski/sd_kaggle/data/raw'):
    """
    Full evaluation of the trained ensemble with the corrected metric.

    Reads ``train.csv`` and ``train_labels.csv`` (and, if possible,
    ``target_pairs.csv``) from ``data_dir``, rebuilds the features, and scores
    the inverse-validation-loss weighted ensemble on two slices:

    * validation: the last 10% of rows of the rebuilt training features;
    * test: the last 90 rows of ``train.csv``, with features produced by
      ``prepare_features``.

    Both slices are tails of the same file, so they overlap.
    NOTE(review): if the models were fitted on these rows the reported scores
    are optimistic; confirm against the split used during training.

    Args:
        data_dir: directory holding the CSV files. Defaults to the path that
            was previously hard-coded.

    Returns:
        ``{'validation': <metric dict>, 'test': <metric dict>}`` as produced by
        ``calculate_kaggle_score_correct``, or ``None`` when the models are
        not initialized.
    """
    if not is_initialized:
        print("–ú–æ–¥–µ–ª–∏ –Ω–µ –∏–Ω–∏—Ü–∏–∞–ª–∏–∑–∏—Ä–æ–≤–∞–Ω—ã!")
        return

    print("\n" + "üéØ"*35)
    print("–ü–û–õ–ù–ê–Ø –û–¶–ï–ù–ö–ê –ú–û–î–ï–õ–ò (–ü–†–ê–í–ò–õ–¨–ù–ê–Ø –ú–ï–¢–†–ò–ö–ê)")
    print("üéØ"*35)

    # Load data.
    train = pd.read_csv(os.path.join(data_dir, 'train.csv'))
    train_labels = pd.read_csv(os.path.join(data_dir, 'train_labels.csv'))

    # Build features.
    train_features = create_enhanced_features(train, base_cols_ref=base_cols)

    try:
        target_pairs = pd.read_csv(os.path.join(data_dir, 'target_pairs.csv'))
        train_features = add_spread_features(train, train_features, target_pairs)
    except Exception as exc:
        # Spread features stay optional, but say why they were skipped instead
        # of hiding the error (the old bare `except` also caught Ctrl-C).
        print(f"Warning: spread features skipped ({type(exc).__name__}: {exc})")

    target_cols = [f'target_{i}' for i in range(NUM_TARGET_COLUMNS)]

    X_train = train_features[feature_cols].fillna(0).values
    y_train = train_labels[target_cols].fillna(0).values

    X_train = np.nan_to_num(X_train, nan=0.0, posinf=0.0, neginf=0.0)
    y_train = np.nan_to_num(y_train, nan=0.0, posinf=0.0, neginf=0.0)

    # Validation set (last 10% of rows).
    split_idx = int(len(X_train) * 0.9)
    X_val = X_train[split_idx:]
    y_val = y_train[split_idx:]

    print(f"\n1Ô∏è‚É£  VALIDATION SCORE:")
    print(f"Validation set size: {len(X_val)} samples")

    # Ensemble weights: inverse validation loss, normalized to sum to 1.
    weights = 1.0 / np.array(model_val_losses)
    weights = weights / weights.sum()

    val_preds = _predict_weighted_ensemble(scaler.transform(X_val), weights)

    # Diagnostics for the validation predictions.
    diagnose_predictions(val_preds, y_val)

    # Corrected Kaggle score on the validation slice.
    val_results = calculate_kaggle_score_correct(val_preds, y_val)

    # Test set (last 90 rows of the training file).
    print(f"\n2Ô∏è‚É£  TEST SCORE:")
    test_train = train.tail(90).reset_index(drop=True)
    test_labels = train_labels.tail(90).reset_index(drop=True)

    X_test = prepare_features(test_train)
    y_test = test_labels[target_cols].fillna(0).values
    y_test = np.nan_to_num(y_test, nan=0.0, posinf=0.0, neginf=0.0)

    test_preds = _predict_weighted_ensemble(scaler.transform(X_test), weights)

    # Diagnostics for the test predictions.
    diagnose_predictions(test_preds, y_test)

    test_results = calculate_kaggle_score_correct(test_preds, y_test)

    # Side-by-side comparison.
    print("\n" + "="*70)
    print("üìä –°–†–ê–í–ù–ï–ù–ò–ï –†–ï–ó–£–õ–¨–¢–ê–¢–û–í:")
    print("="*70)
    print(f"{'–ú–µ—Ç—Ä–∏–∫–∞':<30} {'Validation':<20} {'Test':<20}")
    print("-"*70)
    print(f"{'KAGGLE SCORE':<30} {val_results['kaggle_score']:<20.2f} {test_results['kaggle_score']:<20.2f}")
    print(f"{'Sharpe Ratio':<30} {val_results['sharpe_ratio']:<20.6f} {test_results['sharpe_ratio']:<20.6f}")
    print(f"{'Mean Correlation':<30} {val_results['mean_correlation']:<20.6f} {test_results['mean_correlation']:<20.6f}")
    print(f"{'Std Correlation':<30} {val_results['std_correlation']:<20.6f} {test_results['std_correlation']:<20.6f}")
    print("="*70)

    # The verdict is based on the validation score only.
    if val_results['kaggle_score'] > 100000:
        print("‚úÖ –û–¢–õ–ò–ß–ù–´–ô –†–ï–ó–£–õ–¨–¢–ê–¢! Score > 100,000")
    elif val_results['kaggle_score'] > 50000:
        print("‚úì –•–æ—Ä–æ—à–∏–π —Ä–µ–∑—É–ª—å—Ç–∞—Ç. Score > 50,000")
    elif val_results['kaggle_score'] > 0:
        print("‚ö† –°–ª–∞–±—ã–π —Ä–µ–∑—É–ª—å—Ç–∞—Ç. Score > 0, –Ω–æ < 50,000")
    else:
        print("‚ùå –ü–õ–û–•–û–ô —Ä–µ–∑—É–ª—å—Ç–∞—Ç. –û—Ç—Ä–∏—Ü–∞—Ç–µ–ª—å–Ω—ã–π score!")

    print("\n" + "üéØ"*35 + "\n")

    return {
        'validation': val_results,
        'test': test_results
    }

In [None]:
# Pipeline entry point (includes the corrected evaluation step)
def main():
    """Run the pipeline: train if needed, evaluate, then write the submission.

    Steps:
      1. Call ``initialize_models()`` when ``is_initialized`` is falsy.
      2. Run ``full_evaluation_corrected()``.
      3. Call ``create_submission_file()``.
      4. Print the validation score summary.

    ``full_evaluation_corrected()`` returns ``None`` when the models are not
    initialized; in that case the summary is skipped with a message instead of
    failing with ``TypeError`` on ``results[...]``.
    """
    print("–ó–ê–ü–£–°–ö –ü–ê–ô–ü–õ–ê–ô–ù–ê")
    print("="*50)

    if not is_initialized:
        print("–ù–∞—á–∞–ª–æ –æ–±—É—á–µ–Ω–∏—è –º–æ–¥–µ–ª–µ–π...")
        initialize_models()

    # Evaluation with the corrected metric.
    print("\n" + "="*50)
    print("–í–´–ß–ò–°–õ–ï–ù–ò–ï KAGGLE SCORE")
    print("="*50)

    results = full_evaluation_corrected()

    # Create the submission file.
    print("\n–°–æ–∑–¥–∞–Ω–∏–µ submission —Ñ–∞–π–ª–∞...")
    create_submission_file()

    if results is None:
        # Evaluation bailed out early, so there are no scores to report.
        print("\nEvaluation returned no results (models not initialized); score summary skipped.")
        return

    validation = results['validation']
    print("\n‚úÖ –í–°–ï –ì–û–¢–û–í–û!")
    print(f"üèÜ –í–∞—à –æ–∂–∏–¥–∞–µ–º—ã–π Kaggle Score: {validation['kaggle_score']:.2f}")
    print(f"üìä Sharpe Ratio: {validation['sharpe_ratio']:.6f}")
    print(f"üìÅ Submission —Ñ–∞–π–ª: submission.parquet")
    print(f"\nüöÄ –û–¢–ü–†–ê–í–¨–¢–ï –ù–ê KAGGLE!")

# Run the pipeline when this code is executed directly (as a script or as a
# notebook cell, where __name__ is "__main__"); importing it does not start it.
if __name__ == "__main__":
    main()