In [22]:
"""
MITSUI - 7-Model Ensemble with Feature Alignment
"""

import os
import pandas as pd
import polars as pl
import numpy as np
import torch
import torch.nn as nn
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, r2_score
# import kaggle_evaluation.mitsui_inference_server

import warnings
warnings.filterwarnings('ignore')

import torch

# Pick the compute device once; every model and tensor below is moved onto it.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Используется устройство: {device}")
print(f"Доступно GPU: {torch.cuda.device_count()}")
if torch.cuda.is_available():
    print(f"Название GPU: {torch.cuda.get_device_name(0)}")

# Number of target_<i> columns produced by every model and written to the output frames.
NUM_TARGET_COLUMNS = 424

# Module-level state shared by the training and inference functions below.
models = []              # trained networks, appended by initialize_models()
scaler = None            # StandardScaler fitted in initialize_models()
feature_cols = None      # ordered feature names that define the model input layout
base_cols = None         # raw numeric columns the engineered features are derived from
is_initialized = False   # set to True once initialize_models() has finished
model_val_losses = []    # best validation loss per model, same order as `models`

Используется устройство: cuda
Доступно GPU: 1
Название GPU: NVIDIA L4


In [23]:
# class Model1_Deep(nn.Module):
#     def __init__(self, input_size, output_size):
#         super().__init__()
#         self.net = nn.Sequential(
#             nn.Linear(input_size, 512),
#             nn.BatchNorm1d(512),
#             nn.ReLU(),
#             nn.Dropout(0.3),
#             nn.Linear(512, 256),
#             nn.BatchNorm1d(256),
#             nn.ReLU(),
#             nn.Dropout(0.2),
#             nn.Linear(256, 128),
#             nn.ReLU(),
#             nn.Linear(128, output_size)
#         )
    
#     def forward(self, x):
#         return self.net(x)

# class Model2_Wide(nn.Module):
#     def __init__(self, input_size, output_size):
#         super().__init__()
#         self.net = nn.Sequential(
#             nn.Linear(input_size, 1024),
#             nn.ReLU(),
#             nn.Dropout(0.3),
#             nn.Linear(1024, 512),
#             nn.ReLU(),
#             nn.Dropout(0.2),
#             nn.Linear(512, output_size)
#         )
    
#     def forward(self, x):
#         return self.net(x)

# class Model3_Residual(nn.Module):
#     def __init__(self, input_size, output_size):
#         super().__init__()
#         self.input_proj = nn.Linear(input_size, 256)
#         self.block1 = nn.Sequential(
#             nn.Linear(256, 256),
#             nn.ReLU(),
#             nn.Dropout(0.2)
#         )
#         self.block2 = nn.Sequential(
#             nn.Linear(256, 256),
#             nn.ReLU(),
#             nn.Dropout(0.2)
#         )
#         self.output = nn.Linear(256, output_size)
    
#     def forward(self, x):
#         x = self.input_proj(x)
#         x = x + self.block1(x)
#         x = x + self.block2(x)
#         return self.output(x)

# class Model4_DeepWide(nn.Module):
#     def __init__(self, input_size, output_size):
#         super().__init__()
#         self.net = nn.Sequential(
#             nn.Linear(input_size, 768),
#             nn.BatchNorm1d(768),
#             nn.ReLU(),
#             nn.Dropout(0.3),
#             nn.Linear(768, 512),
#             nn.BatchNorm1d(512),
#             nn.ReLU(),
#             nn.Dropout(0.2),
#             nn.Linear(512, 256),
#             nn.ReLU(),
#             nn.Dropout(0.1),
#             nn.Linear(256, output_size)
#         )
    
#     def forward(self, x):
#         return self.net(x)

# class Model5_Bottleneck(nn.Module):
#     def __init__(self, input_size, output_size):
#         super().__init__()
#         self.encoder = nn.Sequential(
#             nn.Linear(input_size, 512),
#             nn.ReLU(),
#             nn.Dropout(0.2),
#             nn.Linear(512, 128),
#             nn.ReLU()
#         )
#         self.decoder = nn.Sequential(
#             nn.Linear(128, 256),
#             nn.ReLU(),
#             nn.Dropout(0.2),
#             nn.Linear(256, output_size)
#         )
    
#     def forward(self, x):
#         x = self.encoder(x)
#         x = self.decoder(x)
#         return x

In [None]:
# ==========================================
# УЛУЧШЕННЫЕ АРХИТЕКТУРЫ МОДЕЛЕЙ
# ==========================================

class Model1_Deep(nn.Module):
    """Глубокая сеть с Layer Normalization и лучшей регуляризацией"""
    def __init__(self, input_size, output_size):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_size, 512),
            nn.LayerNorm(512),  # LayerNorm вместо BatchNorm
            nn.ReLU(),
            nn.Dropout(0.3),
            
            nn.Linear(512, 384),
            nn.LayerNorm(384),
            nn.ReLU(),
            nn.Dropout(0.25),
            
            nn.Linear(384, 256),
            nn.LayerNorm(256),
            nn.ReLU(),
            nn.Dropout(0.2),
            
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.15),
            
            nn.Linear(128, output_size)
        )
    
    def forward(self, x):
        return self.net(x)

class Model2_Wide(nn.Module):
    """Wide three-stage MLP with a linear skip from the raw input to the last hidden width.

    The ``bn*`` attributes hold ``LayerNorm`` modules; the names are kept so
    that state_dict keys stay the same.
    """

    def __init__(self, input_size, output_size):
        super().__init__()

        # (in features, out features, dropout probability) per stage.
        stage_specs = (
            (input_size, 768, 0.4),
            (768, 512, 0.3),
            (512, 384, 0.2),
        )
        for idx, (n_in, n_out, drop_p) in enumerate(stage_specs, start=1):
            setattr(self, f'fc{idx}', nn.Linear(n_in, n_out))
            setattr(self, f'bn{idx}', nn.LayerNorm(n_out))
            setattr(self, f'drop{idx}', nn.Dropout(drop_p))

        self.output = nn.Linear(384, output_size)
        self.skip = nn.Linear(input_size, 384)
        self.activation = nn.ReLU()

    def forward(self, x):
        """Run the three stages, add the projected input, and apply the output layer."""
        shortcut = self.skip(x)

        hidden = x
        stages = (
            (self.fc1, self.bn1, self.drop1),
            (self.fc2, self.bn2, self.drop2),
            (self.fc3, self.bn3, self.drop3),
        )
        for linear, norm, dropout in stages:
            hidden = dropout(self.activation(norm(linear(hidden))))

        return self.output(hidden + shortcut)

class Model3_Residual(nn.Module):
    """Residual MLP with three pre-activation (BatchNorm -> ReLU -> Dropout -> Linear) blocks."""

    def __init__(self, input_size, output_size):
        super().__init__()
        width = 384

        def build_block():
            # One residual branch: the BN/ReLU/Dropout/Linear group repeated twice.
            parts = []
            for _ in range(2):
                parts.extend([
                    nn.BatchNorm1d(width),
                    nn.ReLU(),
                    nn.Dropout(0.3),
                    nn.Linear(width, width),
                ])
            return nn.Sequential(*parts)

        self.input_proj = nn.Linear(input_size, width)
        self.bn_input = nn.BatchNorm1d(width)

        self.block1 = build_block()
        self.block2 = build_block()
        self.block3 = build_block()

        self.output_proj = nn.Sequential(
            nn.BatchNorm1d(width),
            nn.ReLU(),
            nn.Linear(width, 256),
            nn.ReLU(),
            nn.Linear(256, output_size),
        )

    def forward(self, x):
        """Project the input, apply the three residual blocks in order, then the output head."""
        hidden = self.bn_input(self.input_proj(x))
        for block in (self.block1, self.block2, self.block3):
            hidden = hidden + block(hidden)
        return self.output_proj(hidden)

class Model4_DeepWide(nn.Module):
    """Deep & Wide network: two parallel paths over the same input, concatenated and combined."""

    def __init__(self, input_size, output_size):
        super().__init__()

        # Deep path: three BatchNorm-ed stages, then a 128-unit projection.
        deep_layers = []
        fan_in = input_size
        for width, drop_p in ((512, 0.3), (384, 0.25), (256, 0.2)):
            deep_layers.extend([
                nn.Linear(fan_in, width),
                nn.BatchNorm1d(width),
                nn.ReLU(),
                nn.Dropout(drop_p),
            ])
            fan_in = width
        deep_layers.extend([nn.Linear(fan_in, 128), nn.ReLU()])
        self.deep = nn.Sequential(*deep_layers)

        # Wide path: a single 512-unit hidden layer, also ending at 128 units.
        self.wide = nn.Sequential(
            nn.Linear(input_size, 512),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(512, 128),
            nn.ReLU(),
        )

        # Head over the concatenated paths (128 + 128 = 256 inputs).
        self.combine = nn.Sequential(
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.15),
            nn.Linear(128, output_size),
        )

    def forward(self, x):
        """Concatenate the deep and wide path outputs on the feature axis and apply the head."""
        joined = torch.cat([self.deep(x), self.wide(x)], dim=1)
        return self.combine(joined)

class Model5_Bottleneck(nn.Module):
    """Encoder/decoder MLP whose 128-unit bottleneck is re-weighted by a softmax gate."""

    def __init__(self, input_size, output_size):
        super().__init__()

        def dense_bn(n_in, n_out, drop_p=None):
            # Linear -> BatchNorm -> ReLU, optionally followed by Dropout.
            group = [nn.Linear(n_in, n_out), nn.BatchNorm1d(n_out), nn.ReLU()]
            if drop_p is not None:
                group.append(nn.Dropout(drop_p))
            return group

        # Encoder: input -> 512 -> 256 -> 128 (no dropout on the bottleneck itself).
        self.encoder = nn.Sequential(
            *dense_bn(input_size, 512, 0.25),
            *dense_bn(512, 256, 0.2),
            *dense_bn(256, 128),
        )

        # Feature gate: per-sample weights over the 128 bottleneck units (softmax on dim=1).
        self.attention = nn.Sequential(
            nn.Linear(128, 64),
            nn.Tanh(),
            nn.Linear(64, 128),
            nn.Softmax(dim=1),
        )

        # Decoder: 128 -> 256 -> 384 -> output.
        self.decoder = nn.Sequential(
            *dense_bn(128, 256, 0.2),
            *dense_bn(256, 384, 0.15),
            nn.Linear(384, output_size),
        )

    def forward(self, x):
        """Encode, scale the code element-wise by its gate weights, then decode."""
        code = self.encoder(x)
        gated = code * self.attention(code)
        return self.decoder(gated)

class Model6_Transformer(nn.Module):
    """Single transformer-style block (self-attention + feed-forward) applied to a projected input.

    NOTE(review): each sample is treated as a sequence of length 1, so the
    attention has only one key to attend to - confirm this is intended.
    """

    def __init__(self, input_size, output_size, num_heads=8):
        super().__init__()
        d_model = 512

        self.input_proj = nn.Linear(input_size, d_model)

        # Multi-head self-attention over inputs laid out as [batch, seq, features].
        self.attention = nn.MultiheadAttention(
            embed_dim=d_model,
            num_heads=num_heads,
            dropout=0.2,
            batch_first=True,
        )

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)

        # Position-wise feed-forward sub-layer.
        self.ffn = nn.Sequential(
            nn.Linear(d_model, 1024),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(1024, d_model),
        )

        # Regression head.
        self.output = nn.Sequential(
            nn.Linear(d_model, 256),
            nn.ReLU(),
            nn.Dropout(0.15),
            nn.Linear(256, output_size),
        )

    def forward(self, x):
        """Project, run attention and feed-forward with post-norm residuals, then the head."""
        # [batch, features] -> [batch, 1, 512]
        tokens = self.input_proj(x).unsqueeze(1)

        # Residual self-attention followed by LayerNorm.
        attended = self.attention(tokens, tokens, tokens)[0]
        tokens = self.norm1(tokens + attended)

        # Residual feed-forward followed by LayerNorm.
        tokens = self.norm2(tokens + self.ffn(tokens))

        # Drop the length-1 sequence axis again: [batch, 512].
        return self.output(tokens.squeeze(1))

class Model7_EnsembleBlock(nn.Module):
    """Internal ensemble: three parallel MLP paths mixed by an input-dependent softmax gate."""

    def __init__(self, input_size, output_size):
        super().__init__()

        # Three parallel paths that differ in hidden width and dropout; all end at 256 units.
        path_specs = ((384, 0.25), (512, 0.3), (640, 0.35))
        for idx, (width, drop_p) in enumerate(path_specs, start=1):
            setattr(self, f'path{idx}', nn.Sequential(
                nn.Linear(input_size, width),
                nn.ReLU(),
                nn.Dropout(drop_p),
                nn.Linear(width, 256),
                nn.ReLU(),
            ))

        # Gate: one weight per path for every sample (softmax on dim=1).
        self.gate = nn.Sequential(
            nn.Linear(input_size, 128),
            nn.ReLU(),
            nn.Linear(128, 3),
            nn.Softmax(dim=1),
        )

        # Head applied to the gated mixture.
        self.output = nn.Sequential(
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.15),
            nn.Linear(128, output_size),
        )

    def forward(self, x):
        """Evaluate all paths, blend them with the gate weights, and apply the head."""
        branch_a = self.path1(x)
        branch_b = self.path2(x)
        branch_c = self.path3(x)

        # gate output is [batch, 3]; split it into three [batch, 1] column weights.
        w_a, w_b, w_c = self.gate(x).split(1, dim=1)

        mixture = w_a * branch_a + w_b * branch_b + w_c * branch_c
        return self.output(mixture)

In [25]:
def _add_indicator_features(series, col, out):
    """Compute the 46 technical indicators for one base column and store them in ``out``.

    Args:
        series: the base column (``df[col]``).
        col: base column name, used as the prefix of every generated feature.
        out: dict that receives ``feature name -> Series``; insertion order
            defines the final column order.
    """
    # ===== Basic returns =====
    for horizon in (1, 5, 20, 60):
        out[f'{col}_return_{horizon}d'] = series.pct_change(horizon)

    # ===== Simple moving averages (SMA) =====
    sma = {w: series.rolling(w, min_periods=1).mean() for w in (5, 10, 20, 60)}
    for w, values in sma.items():
        out[f'{col}_ma_{w}'] = values

    # ===== Exponential moving averages (EMA) =====
    ema = {s: series.ewm(span=s, adjust=False).mean() for s in (5, 10, 20)}
    for s, values in ema.items():
        out[f'{col}_ema_{s}'] = values

    # ===== MA crossovers =====
    out[f'{col}_ma_5_20_diff'] = sma[5] - sma[20]
    out[f'{col}_ma_10_60_diff'] = sma[10] - sma[60]
    out[f'{col}_ema_5_20_diff'] = ema[5] - ema[20]

    # ===== Relative distance of the value to its moving averages =====
    out[f'{col}_to_ma_5'] = (series - sma[5]) / sma[5]
    out[f'{col}_to_ma_20'] = (series - sma[20]) / sma[20]
    out[f'{col}_to_ema_10'] = (series - ema[10]) / ema[10]

    # ===== Volatility (rolling standard deviation) =====
    vol = {w: series.rolling(w, min_periods=1).std() for w in (5, 20, 60)}
    for w, values in vol.items():
        out[f'{col}_std_{w}'] = values

    # ===== Bollinger bands (20-period mean +/- 2 std) =====
    bb_upper = sma[20] + 2 * vol[20]
    bb_lower = sma[20] - 2 * vol[20]
    band_span = bb_upper - bb_lower
    out[f'{col}_bb_upper'] = bb_upper
    out[f'{col}_bb_lower'] = bb_lower
    out[f'{col}_bb_width'] = band_span / sma[20]
    out[f'{col}_bb_position'] = (series - bb_lower) / band_span

    # ===== RSI (14-period, simple rolling means of gains and losses) =====
    delta = series.diff()
    gain = delta.where(delta > 0, 0).rolling(window=14, min_periods=1).mean()
    loss = -delta.where(delta < 0, 0).rolling(window=14, min_periods=1).mean()
    out[f'{col}_rsi_14'] = 100 - (100 / (1 + gain / loss))

    # ===== MACD (12/26 EMA difference with a 9-period signal line) =====
    macd = series.ewm(span=12, adjust=False).mean() - series.ewm(span=26, adjust=False).mean()
    macd_signal = macd.ewm(span=9, adjust=False).mean()
    out[f'{col}_macd'] = macd
    out[f'{col}_macd_signal'] = macd_signal
    out[f'{col}_macd_diff'] = macd - macd_signal

    # Shifted copies are shared by the momentum, ROC and lag features.
    shifted = {lag: series.shift(lag) for lag in (1, 2, 3, 5, 10, 20)}

    # ===== Momentum (absolute change over the lag) =====
    momentum = {lag: series - shifted[lag] for lag in (5, 10, 20)}
    for lag, values in momentum.items():
        out[f'{col}_momentum_{lag}'] = values

    # ===== Rate of change (percentage change over the lag) =====
    for lag in (5, 10, 20):
        out[f'{col}_roc_{lag}'] = (momentum[lag] / shifted[lag]) * 100

    # ===== Lag features =====
    for lag in (1, 2, 3, 5):
        out[f'{col}_lag_{lag}'] = shifted[lag]

    # ===== Rolling extrema =====
    rolling_max = {}
    rolling_min = {}
    for w in (5, 20):
        window = series.rolling(w, min_periods=1)
        rolling_max[w] = window.max()
        rolling_min[w] = window.min()
        out[f'{col}_max_{w}'] = rolling_max[w]
        out[f'{col}_min_{w}'] = rolling_min[w]

    # Relative distance to the recent high / low.
    out[f'{col}_dist_to_max_20'] = (series - rolling_max[20]) / rolling_max[20]
    out[f'{col}_dist_to_min_20'] = (series - rolling_min[20]) / rolling_min[20]

    # ===== Acceleration (second difference) =====
    out[f'{col}_acceleration'] = delta.diff()

    # ===== Z-score against the 20-period rolling mean and std =====
    out[f'{col}_zscore'] = (series - sma[20]) / vol[20]


def create_enhanced_features(df, base_cols_ref=None):
    """Return a copy of ``df`` extended with technical-indicator features.

    For every base column, 46 derived series (returns, moving averages,
    volatility, Bollinger bands, RSI, MACD, momentum, ROC, lags, rolling
    extrema, acceleration, z-score) are appended under the name
    ``<column>_<indicator>``. ``df`` itself is not modified.

    The generated series are collected first and attached with a single
    ``pd.concat`` instead of being inserted one by one, which avoids building
    a heavily fragmented frame when there are many base columns.

    Args:
        df: input frame with one row per time step.
        base_cols_ref: optional list of base column names. Only names present
            in ``df`` are used. When ``None``, every numeric column except
            ``date_id`` and ``is_scored`` is used.

    Returns:
        A new DataFrame: the original columns followed by the generated
        features, in a deterministic order.

    Notes:
        A column whose indicators cannot be computed (for example a
        non-numeric column named in ``base_cols_ref``) is skipped rather than
        aborting the whole run; features stored before the failure are kept,
        and the skipped columns are reported once on stdout.
    """
    if base_cols_ref is not None:
        numeric_cols = [c for c in base_cols_ref if c in df.columns]
    else:
        numeric_cols = [c for c in df.columns
                        if c not in ['date_id', 'is_scored']
                        and pd.api.types.is_numeric_dtype(df[c])]

    new_cols = {}
    failed = []
    for col in numeric_cols:
        try:
            with np.errstate(divide='ignore', invalid='ignore'):
                _add_indicator_features(df[col], col, new_cols)
        except Exception as exc:
            # Best effort per column, but never swallow KeyboardInterrupt/SystemExit
            # and never fail silently.
            failed.append(f"{col} ({type(exc).__name__})")

    if failed:
        preview = ", ".join(failed[:5])
        suffix = ", ..." if len(failed) > 5 else ""
        print(f"create_enhanced_features: skipped {len(failed)} column(s): {preview}{suffix}")

    features = df.copy()
    if not new_cols:
        return features

    # A generated name that already exists in the input replaces that column
    # in place, exactly as a plain column assignment would.
    for name in [n for n in new_cols if n in features.columns]:
        features[name] = new_cols.pop(name)
    if not new_cols:
        return features

    generated = pd.concat(list(new_cols.values()), axis=1, keys=list(new_cols.keys()))
    return pd.concat([features, generated], axis=1)

In [26]:
def prepare_features(df):
    """Build the model input matrix for ``df``, aligned to the global ``feature_cols`` layout.

    Features are generated with ``create_enhanced_features`` and written into
    a zero-initialised ``(len(df), len(feature_cols))`` array, so a feature
    that is missing from ``df`` stays at 0. NaN and infinite values are
    replaced by 0.

    Side effect: when the global ``feature_cols`` is still ``None`` it is set
    from the numeric columns (except ``date_id``) of the generated frame.
    """
    global base_cols, feature_cols

    # Before training has fixed base_cols, derive the base columns from df itself.
    if base_cols is None:
        reference_cols = [c for c in df.columns
                          if c not in ['date_id', 'is_scored']
                          and pd.api.types.is_numeric_dtype(df[c])]
    else:
        reference_cols = base_cols
    test_features = create_enhanced_features(df, base_cols_ref=reference_cols)

    # Without a trained layout, fall back to whatever numeric columns are available.
    if feature_cols is None:
        feature_cols = [c for c in test_features.columns
                        if c != 'date_id' and pd.api.types.is_numeric_dtype(test_features[c])]

    # Copy each known feature into its slot; unknown ones keep their zero fill.
    X_test = np.zeros((len(df), len(feature_cols)))
    for position, name in enumerate(feature_cols):
        if name not in test_features.columns:
            continue
        X_test[:, position] = test_features[name].fillna(0).values

    return np.nan_to_num(X_test, nan=0.0, posinf=0.0, neginf=0.0)

In [27]:
def initialize_models():
    """Train every ensemble member once and publish the results in the module globals.

    Steps: load the training CSVs, build the engineered features, fit a
    ``StandardScaler``, split chronologically 90/10 into train/validation,
    then train each architecture full-batch with Huber loss, AdamW,
    ``ReduceLROnPlateau`` and early stopping. For every model the weights of
    the epoch with the lowest validation loss are restored before the model
    is stored.

    Globals written: ``base_cols``, ``feature_cols``, ``scaler``, ``models``,
    ``model_val_losses`` and ``is_initialized``. The function returns
    immediately when ``is_initialized`` is already ``True``.

    NOTE(review): the scaler is fitted on all rows, including the validation
    tail, so validation metrics are slightly optimistic.
    NOTE(review): the inverse-loss ensemble weights computed at the end are
    only printed; the prediction functions in this file use a plain mean.
    """
    global models, scaler, feature_cols, base_cols, model_val_losses, is_initialized, device

    if is_initialized:
        return

    # Start from empty containers so that a re-run after an interrupted or
    # failed attempt does not stack duplicate models into the ensemble.
    models = []
    model_val_losses = []

    print("="*70)
    print(f"ИНИЦИАЛИЗАЦИЯ И ОБУЧЕНИЕ МОДЕЛЕЙ НА {device}")
    print("="*70)

    train = pd.read_csv('/home/nicolaedrabcinski/sd_kaggle/data/raw/train.csv')
    train_labels = pd.read_csv('/home/nicolaedrabcinski/sd_kaggle/data/raw/train_labels.csv')

    base_cols = [c for c in train.columns
                 if c not in ['date_id']
                 and pd.api.types.is_numeric_dtype(train[c])]

    print(f"\nБазовых колонок: {len(base_cols)}")
    print("Создание признаков...")

    train_features = create_enhanced_features(train, base_cols_ref=base_cols)

    feature_cols = [c for c in train_features.columns
                    if c != 'date_id' and pd.api.types.is_numeric_dtype(train_features[c])]

    print(f"Создано {len(feature_cols)} признаков")

    target_cols = [f'target_{i}' for i in range(NUM_TARGET_COLUMNS)]

    X_train = train_features[feature_cols].fillna(0).values
    y_train = train_labels[target_cols].fillna(0).values

    X_train = np.nan_to_num(X_train, nan=0.0, posinf=0.0, neginf=0.0)
    y_train = np.nan_to_num(y_train, nan=0.0, posinf=0.0, neginf=0.0)

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)

    X_tensor = torch.FloatTensor(X_train_scaled).to(device)
    y_tensor = torch.FloatTensor(y_train).to(device)

    # Chronological split: the last 10% of rows are held out for validation.
    split_idx = int(len(X_train_scaled) * 0.9)
    X_train_t, X_val_t = X_tensor[:split_idx], X_tensor[split_idx:]
    y_train_t, y_val_t = y_tensor[:split_idx], y_tensor[split_idx:]

    print(f"Train: {len(X_train_t)}, Validation: {len(X_val_t)}")

    # (model class, display name, maximum number of epochs)
    model_configs = [
        (Model1_Deep, "Deep+LayerNorm", 800),
        (Model2_Wide, "Wide+Skip", 800),
        (Model3_Residual, "Residual+PreAct", 800),
        (Model4_DeepWide, "DeepWide+Parallel", 800),
        (Model5_Bottleneck, "Bottleneck+Attention", 800),
        (Model6_Transformer, "Transformer", 800),
        (Model7_EnsembleBlock, "EnsembleBlock+Gating", 800),
    ]

    for i, (ModelClass, name, epochs) in enumerate(model_configs):
        print(f"\n{'='*70}")
        print(f"МОДЕЛЬ {i+1}/{len(model_configs)}: {name}")
        print(f"{'='*70}")

        model = ModelClass(X_train_scaled.shape[1], NUM_TARGET_COLUMNS).to(device)

        criterion = nn.HuberLoss(delta=1.0)
        optimizer = torch.optim.AdamW(model.parameters(), lr=0.001, weight_decay=1e-5)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=5, factor=0.5)

        patience = 15
        best_val_loss = float('inf')
        best_r2 = -float('inf')
        best_model_state = None
        epochs_no_improve = 0
        early_stop = False

        model.train()
        for epoch in range(epochs):
            if early_stop:
                print(f"Early stopping на эпохе {epoch+1}")
                break

            # One full-batch optimisation step.
            optimizer.zero_grad()
            train_outputs = model(X_train_t)
            train_loss = criterion(train_outputs, y_train_t)
            train_loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

            model.eval()
            with torch.no_grad():
                val_outputs = model(X_val_t)
                # Keep the loss as a plain float so bookkeeping never holds on to tensors.
                val_loss = criterion(val_outputs, y_val_t).item()

                val_preds = val_outputs.cpu().numpy()
                val_true = y_val_t.cpu().numpy()

                mae = mean_absolute_error(val_true.flatten(), val_preds.flatten())
                direction_correct = np.mean(np.sign(val_preds) == np.sign(val_true))
                r2 = r2_score(val_true.flatten(), val_preds.flatten())

                train_preds = train_outputs.detach().cpu().numpy()
                train_true = y_train_t.cpu().numpy()
                train_r2 = r2_score(train_true.flatten(), train_preds.flatten())

            if val_loss < best_val_loss:
                best_val_loss = val_loss
                best_r2 = r2
                # state_dict() hands out references to the live parameter tensors and
                # dict.copy() is shallow, so each tensor must be cloned; otherwise the
                # "best" snapshot silently follows every later optimizer step.
                best_model_state = {key: value.detach().clone()
                                    for key, value in model.state_dict().items()}
                epochs_no_improve = 0
                improvement_msg = "✓ УЛУЧШЕНИЕ"
            else:
                epochs_no_improve += 1
                improvement_msg = f"NO IMPROVE ({epochs_no_improve}/{patience})"

                if epochs_no_improve >= patience:
                    early_stop = True

            model.train()
            scheduler.step(val_loss)

            if (epoch + 1) % 5 == 0 or epoch == 0 or early_stop:
                lr = optimizer.param_groups[0]['lr']
                print(f"Ep {epoch+1:3d}/{epochs} | "
                      f"TrL: {train_loss.item():.6f} | "
                      f"VaL: {val_loss:.6f} | "
                      f"MAE: {mae:.6f} | "
                      f"R²_tr: {train_r2:7.4f} | "
                      f"R²_val: {r2:7.4f} | "
                      f"Dir: {direction_correct:.4f} | "
                      f"LR: {lr:.6f} | {improvement_msg}")

        # Roll back to the best validation checkpoint before storing the model.
        if best_model_state is not None:
            model.load_state_dict(best_model_state)

        model.eval()
        models.append(model)
        model_val_losses.append(best_val_loss)

        print(f"Завершена. Best Val Loss: {best_val_loss:.6f}, Best R²: {best_r2:.4f}")

        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    # Inverse-validation-loss weights, normalised to sum to 1 (reported only).
    weights = 1.0 / np.array(model_val_losses)
    weights = weights / weights.sum()
    print(f"\n{'='*70}")
    print(f"ВЕСА МОДЕЛЕЙ ДЛЯ АНСАМБЛЯ:")
    for i, (config, weight) in enumerate(zip(model_configs, weights)):
        print(f"  Модель {i+1} ({config[1]}): {weight:.4f}")
    print(f"{'='*70}\n")

    is_initialized = True

In [28]:
def predict(test, label_lags_1_batch, label_lags_2_batch, label_lags_3_batch, label_lags_4_batch):
    """Return a one-row polars frame with the ensemble prediction for the last row of ``test``.

    The prediction is the unweighted mean over all trained models, clipped to
    [-0.1, 0.1]. An all-zero row is returned when the models have not been
    initialised or when anything raises during inference. The
    ``label_lags_*`` arguments are accepted but not used.
    """
    global models, scaler, feature_cols, is_initialized, device

    def zero_frame():
        # Fallback answer: a single row of zeros for every target column.
        return pl.DataFrame({f'target_{i}': [0.0] for i in range(NUM_TARGET_COLUMNS)})

    if not is_initialized:
        print("Модели не инициализированы!")
        return zero_frame()

    try:
        # Build aligned features for the whole batch, then keep only the newest row.
        aligned = prepare_features(test.to_pandas())
        latest_scaled = scaler.transform(aligned[-1:])

        # Move the input onto the same device as the models.
        batch = torch.FloatTensor(latest_scaled).to(device)

        # Collect one prediction vector per model, brought back to the CPU.
        with torch.no_grad():
            per_model = [model(batch).cpu().numpy()[0] for model in models]

        predictions = np.mean(per_model, axis=0)
        predictions = np.clip(predictions, -0.1, 0.1)
        predictions = np.nan_to_num(predictions, nan=0.0, posinf=0.0, neginf=0.0)

        return pl.DataFrame({f'target_{i}': [float(predictions[i])] for i in range(NUM_TARGET_COLUMNS)})

    except Exception as e:
        print(f"Ошибка predict: {e}")
        return zero_frame()

In [29]:
def create_submission_file():
    """Predict every row of the test CSV with the ensemble and write ``submission.parquet``.

    The prediction is the unweighted mean over all trained models, clipped to
    [-0.1, 0.1]. When the test file has an ``is_scored`` column, only rows
    where it is ``True`` are written.

    Raises:
        RuntimeError: if the models have not been initialised.
    """
    global models, scaler, feature_cols, is_initialized, device

    print("\nСоздание submission.parquet...")

    if not is_initialized:
        raise RuntimeError("Модели не инициализированы!")

    test = pd.read_csv('/home/nicolaedrabcinski/sd_kaggle/data/raw/test.csv')

    # Feature preparation, aligned to the training layout.
    X_test = prepare_features(test)
    X_test_scaled = scaler.transform(X_test)

    # Move the input onto the same device as the models.
    X_test_tensor = torch.FloatTensor(X_test_scaled).to(device)

    print("Генерация предсказаний на GPU...")
    all_preds = []
    with torch.no_grad():
        for i, model in enumerate(models):
            print(f"   Модель {i+1}/{len(models)}...")
            all_preds.append(model(X_test_tensor).cpu().numpy())

    predictions = np.mean(all_preds, axis=0)
    predictions = np.clip(predictions, -0.1, 0.1)
    predictions = np.nan_to_num(predictions, nan=0.0, posinf=0.0, neginf=0.0)

    # Build all target columns in one step; inserting them one by one
    # produces a heavily fragmented frame.
    target_names = [f'target_{i}' for i in range(NUM_TARGET_COLUMNS)]
    submission = pd.DataFrame(predictions[:, :NUM_TARGET_COLUMNS], columns=target_names)
    submission.insert(0, 'date_id', test['date_id'].values)

    if 'is_scored' in test.columns:
        submission = submission[test['is_scored'] == True].reset_index(drop=True)

    submission = submission.fillna(0).replace([np.inf, -np.inf], 0)
    submission.to_parquet('submission.parquet', index=False, engine='pyarrow')

    print(f"Готово: {submission.shape}")

    # Release cached GPU memory.
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

In [None]:
def main():
    """Run the whole pipeline: train the ensemble, then write the submission file."""
    print("ЗАПУСК ПАЙПЛАЙНА")
    print("="*50)

    # Step 1: training. initialize_models() returns immediately if it already ran.
    initialize_models()

    # Step 2: build the submission file.
    print("Создание submission файла...")
    create_submission_file()


# Run the pipeline when this cell is executed.
main()

ЗАПУСК ПАЙПЛАЙНА
ИНИЦИАЛИЗАЦИЯ И ОБУЧЕНИЕ МОДЕЛЕЙ НА cuda

Базовых колонок: 557
Создание признаков...
Создано 26179 признаков
Train: 1764, Validation: 197

МОДЕЛЬ 1/7: Deep+LayerNorm
Ep   1/800 | TrL: 0.022790 | VaL: 0.005886 | MAE: 0.087069 | R²_tr: -48.6976 | R²_val: -15.4246 | Dir: 0.4418 | LR: 0.001000 | ✓ УЛУЧШЕНИЕ
Ep   5/800 | TrL: 0.001885 | VaL: 0.001689 | MAE: 0.049009 | R²_tr: -3.1109 | R²_val: -3.7131 | Dir: 0.4432 | LR: 0.001000 | ✓ УЛУЧШЕНИЕ
Ep  10/800 | TrL: 0.001660 | VaL: 0.001528 | MAE: 0.046240 | R²_tr: -2.6208 | R²_val: -3.2641 | Dir: 0.4431 | LR: 0.001000 | ✓ УЛУЧШЕНИЕ
Ep  15/800 | TrL: 0.001499 | VaL: 0.001367 | MAE: 0.043282 | R²_tr: -2.2677 | R²_val: -2.8154 | Dir: 0.4433 | LR: 0.001000 | ✓ УЛУЧШЕНИЕ
Ep  20/800 | TrL: 0.001346 | VaL: 0.001217 | MAE: 0.040348 | R²_tr: -1.9351 | R²_val: -2.3967 | Dir: 0.4430 | LR: 0.001000 | ✓ УЛУЧШЕНИЕ
Ep  25/800 | TrL: 0.001207 | VaL: 0.001082 | MAE: 0.037546 | R²_tr: -1.6329 | R²_val: -2.0190 | Dir: 0.4435 | LR: 0.001000 | ✓ УЛУ