In [1]:
# --------------------------------------------------------------------------
# | БЛОК 1: Импорт необходимых библиотек                                   |
# --------------------------------------------------------------------------

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.optim import Adam
from tqdm import tqdm
import gc 

In [2]:
# --------------------------------------------------------------------------
# | БЛОК 2: Конфигурация проекта                                           |
# --------------------------------------------------------------------------

VER = 20
CONFIG = {
    'train_path': f'C:/Users/Николай/PycharmProjects/FlightRank_2025/mydata/1/1_train.parquet',
    'test_path': f'C:/Users/Николай/PycharmProjects/FlightRank_2025/mydata/1/1_test.parquet',
    'sample_submission_path': f'C:/Users/Николай/PycharmProjects/FlightRank_2025/data/sample_submission.parquet',

    'DEVICE': 'cuda' if torch.cuda.is_available() else 'cpu', 
    'SEED': 42,
    'BATCH_SIZE': 8192*4, 
    'LR': 0.001, 
    'EPOCHS': 3, 
    'WEIGHT_DECAY': 1e-5, 

    'embedding_dims': {},
    'dropout_rate': 0.1, 
    'mlp_dims': [1024, 512, 256], 
    'num_cross_layers': 4,
    'output_dim': 1 
}

np.random.seed(CONFIG['SEED'])
torch.manual_seed(CONFIG['SEED'])
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(CONFIG['SEED'])

In [3]:
# --------------------------------------------------------------------------
# | БЛОК 3: Загрузка данных и определение размеров эмбеддингов             |
# --------------------------------------------------------------------------

train_df = pd.read_parquet(CONFIG['train_path'], engine='pyarrow')
test_df = pd.read_parquet(CONFIG['test_path'], engine='pyarrow')

data_cols= [
    'legs0_arrivalAt', 'legs0_departureAt', 'legs1_arrivalAt', 'legs1_departureAt', 'requestDate'
]

num_cols = [
    'legs0_duration', 'legs0_segments0_duration', 'legs0_segments1_duration', 'legs0_segments2_duration', 'legs0_segments3_duration', 'legs1_duration', 'legs1_segments0_duration', 'legs1_segments1_duration', 'legs1_segments2_duration', 'legs1_segments3_duration', 'miniRules0_monetaryAmount', 'miniRules1_monetaryAmount', 'taxes', 'totalPrice', 
]

bool_cols = ['isAccess3D', 'isVip', 'sex']

cat_cols = [col for col in train_df.columns if col not in data_cols and col not in num_cols and col not in bool_cols and col not in  ['ranker_id', 'selected', 'frequentFlyer']]

frequentFlyer_col = 'frequentFlyer'

ranker_id_col = 'ranker_id'

selected_col = 'selected'


for col in cat_cols:
    num_unique_values = train_df[col].nunique() + 1
    embedding_dim = int(np.sqrt(num_unique_values))
    CONFIG['embedding_dims'][col] = (num_unique_values, embedding_dim)
    

# 1.1 Создание словаря для frequentFlyer и обновление конфига
all_ff_codes = train_df[frequentFlyer_col].str.split('/').explode().dropna().unique()
ff_code_to_idx = {code: i for i, code in enumerate(all_ff_codes)}
ff_unknown_idx = len(ff_code_to_idx)
ff_embedding_dim = int(np.sqrt(len(all_ff_codes) + 1))
CONFIG['embedding_dims'][frequentFlyer_col] = (len(ff_code_to_idx) + 1, ff_embedding_dim)

In [4]:
# --------------------------------------------------------------------------
# | БЛОК 4: Определение модели (с новыми признаками-разницами)              |
# --------------------------------------------------------------------------

class FlightRankModel(nn.Module):
    def __init__(self, config, num_cols, cat_cols, bool_cols, data_cols, frequentFlyer_col):
        super().__init__()
        self.config = config
        self.num_cols = num_cols
        self.cat_cols = cat_cols
        self.bool_cols = bool_cols
        self.data_cols = data_cols
        self.frequentFlyer_col = frequentFlyer_col

        # --- 1. ВХОДНАЯ ЧАСТЬ ---
        self.embedding_layers = nn.ModuleDict({
            col: nn.Embedding(num_embeddings=dims[0], embedding_dim=dims[1])
            for col, dims in config['embedding_dims'].items() if col != frequentFlyer_col
        })
        
        ff_dims = config['embedding_dims'][frequentFlyer_col]
        self.ff_embedding_layer = nn.Embedding(num_embeddings=ff_dims[0], embedding_dim=ff_dims[1])

        cat_embedding_dim = sum(dims[1] for col, dims in config['embedding_dims'].items() if col != frequentFlyer_col)
        ff_embedding_dim = ff_dims[1]
        numerical_dim = len(self.num_cols)
        boolean_dim = len(self.bool_cols)
        cyclical_dim = len(self.data_cols) * 8
        time_to_departure_dim = 1
        
        # ИЗМЕНЕНИЕ: Добавляем размер для новых признаков (3 diff-фичи на каждую числовую колонку)
        diff_features_dim = len(self.num_cols) * 3
        
        self.input_dim = (cat_embedding_dim + ff_embedding_dim + numerical_dim + 
                          boolean_dim + cyclical_dim + time_to_departure_dim + diff_features_dim)
        
        # ИЗМЕНЕНИЕ: Обновляем размер BatchNorm
        self.all_numerical_batch_norm = nn.BatchNorm1d(
            numerical_dim + boolean_dim + cyclical_dim + time_to_departure_dim + diff_features_dim
        )
        
        # --- 2. CROSS & DEEP сети (остаются без изменений, т.к. input_dim уже обновлен) ---
        self.cross_net = nn.ModuleList([
            nn.Linear(self.input_dim, self.input_dim) 
            for _ in range(config['num_cross_layers'])
        ])

        deep_layers = []
        layer_dims = [self.input_dim] + config['mlp_dims']
        for i in range(len(layer_dims) - 1):
            deep_layers.append(nn.Linear(layer_dims[i], layer_dims[i+1]))
            deep_layers.append(nn.ReLU())
        self.deep_net = nn.Sequential(*deep_layers)
        
        self.final_layer = nn.Linear(config['mlp_dims'][-1], config['output_dim'])

    def forward(self, x_dict):
        # --- 1. Формирование x_0 ---
        embedded_features = [self.embedding_layers[col](x_dict[col]) for col in self.cat_cols]
        
        # ... (логика для frequentFlyer без изменений) ...
        list_of_indices = x_dict[self.frequentFlyer_col]
        avg_ff_embeddings_list = []
        for indices in list_of_indices:
            if not indices:
                avg_embedding = torch.zeros(self.ff_embedding_layer.embedding_dim, device=self.config['DEVICE'])
            else:
                indices_tensor = torch.tensor(indices, dtype=torch.long, device=self.config['DEVICE'])
                embeddings = self.ff_embedding_layer(indices_tensor)
                avg_embedding = embeddings.mean(dim=0)
            avg_ff_embeddings_list.append(avg_embedding)
        avg_ff_embedding_batch = torch.stack(avg_ff_embeddings_list, dim=0)
        embedded_features.append(avg_ff_embedding_batch)
        concatenated_embeddings = torch.cat(embedded_features, dim=1)

        # Собираем все числоподобные признаки
        numerical_inputs = [x_dict['numerical'], x_dict['boolean']]
        for col in self.data_cols:
            date_tensor = x_dict[f'{col}_components']
            numerical_inputs.append(torch.sin(2 * np.pi * date_tensor[:, 0] / 59.0).unsqueeze(1))
            numerical_inputs.append(torch.cos(2 * np.pi * date_tensor[:, 0] / 59.0).unsqueeze(1))
            numerical_inputs.append(torch.sin(2 * np.pi * date_tensor[:, 1] / 23.0).unsqueeze(1))
            numerical_inputs.append(torch.cos(2 * np.pi * date_tensor[:, 1] / 23.0).unsqueeze(1))
            numerical_inputs.append(torch.sin(2 * np.pi * date_tensor[:, 2] / 6.0).unsqueeze(1))
            numerical_inputs.append(torch.cos(2 * np.pi * date_tensor[:, 2] / 6.0).unsqueeze(1))
            numerical_inputs.append(torch.sin(2 * np.pi * date_tensor[:, 3] / 365.0).unsqueeze(1))
            numerical_inputs.append(torch.cos(2 * np.pi * date_tensor[:, 3] / 365.0).unsqueeze(1))
        
        time_to_departure = (x_dict['legs0_departureAt_unix'] - x_dict['requestDate_unix']) / 60.0
        numerical_inputs.append(time_to_departure.unsqueeze(1))
        
        # ИЗМЕНЕНИЕ: Добавляем новые признаки-разницы
        numerical_inputs.append(x_dict['diff_features'])
        
        processed_numerical_all = torch.cat(numerical_inputs, dim=1)
        processed_numerical_all = self.all_numerical_batch_norm(processed_numerical_all)

        x_0 = torch.cat([concatenated_embeddings, processed_numerical_all], dim=1)
        
        # ... (Cross-сеть, Deep-сеть и final_layer без изменений) ...
        x_cross = x_0
        for layer in self.cross_net:
            x_cross = x_0 * torch.sigmoid(layer(x_cross)) + x_cross
        deep_output = self.deep_net(x_cross)
        final_output = self.final_layer(deep_output)
        
        return final_output

In [5]:
# --------------------------------------------------------------------------
# | БЛОК 5: Предобработка, обучение и предсказание (с новыми фичами)        |
# --------------------------------------------------------------------------

# --- 1. Подготовительный этап ---
print("--- Начало предобработки для обучения ---")

# 1.1 Расчет Min/Max для нормализации
combined_df = pd.concat([train_df, test_df], ignore_index=True)
num_min = torch.tensor(combined_df[num_cols].astype(np.float32).min().values, dtype=torch.float32)
num_max = torch.tensor(combined_df[num_cols].astype(np.float32).max().values, dtype=torch.float32)
num_range = num_max - num_min
num_range[num_range == 0] = 1e-9
print("Статистика Min-Max для нормализации рассчитана.")

# 1.2 ИЗМЕНЕНИЕ: Предварительный расчет групповых статистик
print("Предварительный расчет групповых статистик...")
stat_cols_mean = []
stat_cols_max = []
stat_cols_min = []

for col in tqdm(num_cols, desc="Создание статистических фич"):
    mean_col_name = f'{col}_mean_by_ranker'
    max_col_name = f'{col}_max_by_ranker'
    min_col_name = f'{col}_min_by_ranker'
    
    combined_df[mean_col_name] = combined_df.groupby('ranker_id')[col].transform('mean')
    combined_df[max_col_name] = combined_df.groupby('ranker_id')[col].transform('max')
    combined_df[min_col_name] = combined_df.groupby('ranker_id')[col].transform('min')
    
    stat_cols_mean.append(mean_col_name)
    stat_cols_max.append(max_col_name)
    stat_cols_min.append(min_col_name)

# Разделяем обратно на train и test
train_df = combined_df.iloc[:len(train_df)].copy()
test_df = combined_df.iloc[len(train_df):].copy()
del combined_df; gc.collect()
print("Статистические фичи добавлены в DataFrame'ы.")


# --- 2. Функция для подготовки батчей ---
def get_batch(df, indices, device):
    batch_df = df.iloc[indices]
    x = {}

    for col in cat_cols:
        x[col] = torch.tensor(batch_df[col].values.astype(np.int64), dtype=torch.long, device=device)
    
    # Нормализуем оригинальные числовые колонки
    numerical_tensor = torch.tensor(batch_df[num_cols].values.astype(np.float32), device=device)
    normalized_numerical = (numerical_tensor - num_min.to(device)) / num_range.to(device)
    x['numerical'] = normalized_numerical
    
    x['boolean'] = torch.tensor(batch_df[bool_cols].values.astype(np.float32), device=device)
    
    for col in data_cols:
        # ... (обработка дат без изменений) ...
        dt_series = pd.to_datetime(batch_df[col], errors='coerce')
        x[f'{col}_components'] = torch.tensor(np.vstack([dt_series.dt.minute.fillna(0), dt_series.dt.hour.fillna(0), dt_series.dt.dayofweek.fillna(0), dt_series.dt.dayofyear.fillna(0)]).T, dtype=torch.float32, device=device)
        x[f'{col}_unix'] = torch.tensor(dt_series.astype(np.int64).values // 10**9, dtype=torch.float32, device=device)
        
    # ИЗМЕНЕНИЕ: Вычисляем признаки-разницы
    # Нормализуем групповые статистики, используя те же min/max, что и для оригинальных колонок
    stats_mean_tensor = (torch.tensor(batch_df[stat_cols_mean].values.astype(np.float32), device=device) - num_min.to(device)) / num_range.to(device)
    stats_max_tensor = (torch.tensor(batch_df[stat_cols_max].values.astype(np.float32), device=device) - num_min.to(device)) / num_range.to(device)
    stats_min_tensor = (torch.tensor(batch_df[stat_cols_min].values.astype(np.float32), device=device) - num_min.to(device)) / num_range.to(device)
    
    # Вычисляем разницы
    diff_mean = normalized_numerical - stats_mean_tensor
    diff_max = normalized_numerical - stats_max_tensor
    diff_min = normalized_numerical - stats_min_tensor
    
    # Объединяем все разницы в один тензор
    x['diff_features'] = torch.cat([diff_mean, diff_max, diff_min], dim=1)
    
    # ... (обработка frequentFlyer и y без изменений) ...
    ff_str_list = batch_df[frequentFlyer_col].fillna('').tolist()
    list_of_indices = [[ff_code_to_idx.get(code, ff_unknown_idx) for code in s.split('/') if code] for s in ff_str_list]
    x[frequentFlyer_col] = list_of_indices
    y = None
    if 'selected' in batch_df.columns:
        y = torch.tensor(batch_df['selected'].values.astype(float), dtype=torch.float32, device=device).unsqueeze(1)
        
    return x, y

# --- 3. Инициализация и цикл обучения ---
model = FlightRankModel(CONFIG, num_cols, cat_cols, bool_cols, data_cols, frequentFlyer_col).to(CONFIG['DEVICE'])
criterion = nn.BCEWithLogitsLoss()
optimizer = Adam(model.parameters(), lr=CONFIG['LR'])

print(f"\n--- Начало обучения на {CONFIG['EPOCHS']} эпох ---")
for epoch in range(CONFIG['EPOCHS']):
    model.train()
    running_loss = 0.0
    shuffled_indices = np.random.permutation(len(train_df))
    num_batches = (len(train_df) + CONFIG['BATCH_SIZE'] - 1) // CONFIG['BATCH_SIZE']
   
    progress_bar = tqdm(range(num_batches), desc=f"Эпоха {epoch + 1}/{CONFIG['EPOCHS']}")
    
    for i in progress_bar:
        batch_indices = shuffled_indices[i * CONFIG['BATCH_SIZE'] : (i + 1) * CONFIG['BATCH_SIZE']]
        x_batch, y_batch = get_batch(train_df, batch_indices, CONFIG['DEVICE'])
        
        optimizer.zero_grad()
        outputs = model(x_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
        
        current_avg_loss = running_loss / (i + 1)
        progress_bar.set_postfix(avg_loss=f'{current_avg_loss:.4f}')
        
    print(f"Итоговый средний лосс за эпоху {epoch + 1}: {running_loss / num_batches:.4f}")

# --- 4. Обработка эмбеддингов для неизвестных категорий ---
print("\n--- Обновление эмбеддингов для неизвестных категорий (взвешенное усреднение) ---")
with torch.no_grad():
    for col, layer in model.embedding_layers.items():
        if col not in CONFIG['embedding_dims']:
            continue
            
        print(f"Обработка колонки: {col}")
        counts = train_df[col].value_counts()
        
        known_indices = torch.tensor(counts.index.values, dtype=torch.long, device=CONFIG['DEVICE'])
        weights = torch.tensor(counts.values, dtype=torch.float32, device=CONFIG['DEVICE'])
        
        known_embeddings = layer.weight.data[known_indices]
        
        weighted_sum_of_vectors = (known_embeddings * weights.unsqueeze(1)).sum(dim=0)
        
        total_weight = weights.sum()
        
        if total_weight > 0:
            mean_embedding = weighted_sum_of_vectors / total_weight
            unknown_idx = CONFIG['embedding_dims'][col][0] - 1
            layer.weight.data[unknown_idx] = mean_embedding

    print("Обработка колонки: frequentFlyer")
    ff_counts = train_df[frequentFlyer_col].str.split('/').explode().dropna().value_counts()
    
    ff_known_indices_list = []
    ff_weights_list = []
    for code_str, count in ff_counts.items():
        if code_str in ff_code_to_idx:
            ff_known_indices_list.append(ff_code_to_idx[code_str])
            ff_weights_list.append(count)

    if ff_known_indices_list:
        ff_known_indices = torch.tensor(ff_known_indices_list, dtype=torch.long, device=CONFIG['DEVICE'])
        ff_weights = torch.tensor(ff_weights_list, dtype=torch.float32, device=CONFIG['DEVICE'])
        
        ff_known_embeddings = model.ff_embedding_layer.weight.data[ff_known_indices]
        
        weighted_sum_ff = (ff_known_embeddings * ff_weights.unsqueeze(1)).sum(dim=0)
        total_weight_ff = ff_weights.sum()

        if total_weight_ff > 0:
            mean_ff_embedding = weighted_sum_ff / total_weight_ff
            model.ff_embedding_layer.weight.data[ff_unknown_idx] = mean_ff_embedding

# --- 5. Цикл предсказания ---
print("\n--- Генерация предсказаний для теста ---")
model.eval()
test_preds = []
num_test_batches = (len(test_df) + CONFIG['BATCH_SIZE'] - 1) // CONFIG['BATCH_SIZE']

with torch.no_grad():
    for i in tqdm(range(num_test_batches), desc="Предсказание"):
        test_indices = list(range(i * CONFIG['BATCH_SIZE'], min((i + 1) * CONFIG['BATCH_SIZE'], len(test_df))))
        x_batch, _ = get_batch(test_df, test_indices, CONFIG['DEVICE'])
        
        outputs = model(x_batch)
        preds = torch.sigmoid(outputs).cpu().numpy().flatten()
        test_preds.extend(preds)

test_df['score'] = test_preds

--- Начало предобработки для обучения ---
Статистика Min-Max для нормализации рассчитана.
Предварительный расчет групповых статистик...


Создание статистических фич: 100%|██████████| 14/14 [01:53<00:00,  8.12s/it]


Статистические фичи добавлены в DataFrame'ы.

--- Начало обучения на 3 эпох ---


Эпоха 1/3: 100%|██████████| 554/554 [37:33<00:00,  4.07s/it, avg_loss=0.0257] 


Итоговый средний лосс за эпоху 1: 0.0257


Эпоха 2/3: 100%|██████████| 554/554 [33:47<00:00,  3.66s/it, avg_loss=0.0211]


Итоговый средний лосс за эпоху 2: 0.0211


Эпоха 3/3: 100%|██████████| 554/554 [33:41<00:00,  3.65s/it, avg_loss=0.0199]


Итоговый средний лосс за эпоху 3: 0.0199

--- Обновление эмбеддингов для неизвестных категорий (взвешенное усреднение) ---
Обработка колонки: companyID
Обработка колонки: corporateTariffCode
Обработка колонки: nationality
Обработка колонки: legs0_segments0_aircraft_code
Обработка колонки: legs0_segments0_arrivalTo_airport_city_iata
Обработка колонки: legs0_segments0_arrivalTo_airport_iata
Обработка колонки: legs0_segments0_baggageAllowance_quantity
Обработка колонки: legs0_segments0_baggageAllowance_weightMeasurementType
Обработка колонки: legs0_segments0_cabinClass
Обработка колонки: legs0_segments0_departureFrom_airport_iata
Обработка колонки: legs0_segments0_flightNumber
Обработка колонки: legs0_segments0_marketingCarrier_code
Обработка колонки: legs0_segments0_operatingCarrier_code
Обработка колонки: legs0_segments0_seatsAvailable
Обработка колонки: legs0_segments1_aircraft_code
Обработка колонки: legs0_segments1_arrivalTo_airport_city_iata
Обработка колонки: legs0_segments1_arriva

Предсказание: 100%|██████████| 211/211 [06:07<00:00,  1.74s/it]


In [6]:
# --------------------------------------------------------------------------
# | БЛОК 6: Формирование файла для отправки (submission)                   |
# --------------------------------------------------------------------------
# --------------------------------------------------------------------------

print("\n--- Формирование файла для отправки ---")

CONFIG['submission_path'] = f'C:/Users/Николай/PycharmProjects/FlightRank_2025/submissions/submission_{VER}.csv'

sample_submission_df = pd.read_parquet(CONFIG['sample_submission_path'])
test_df['Id'] = sample_submission_df['Id'].values

test_df['selected'] = test_df.groupby('ranker_id')['score'].rank(method='first', ascending=False).astype(int)

submission_df = test_df[['Id', 'ranker_id', 'selected']]

submission_df = submission_df.set_index('Id').loc[sample_submission_df['Id']].reset_index()

submission_df.to_csv(CONFIG['submission_path'], index=False)

print(f"\nГотово! Файл для отправки сохранен в: {CONFIG['submission_path']}")
print("Пример содержимого submission файла:")
print(submission_df.head())


--- Формирование файла для отправки ---

Готово! Файл для отправки сохранен в: C:/Users/Николай/PycharmProjects/FlightRank_2025/submissions/submission_20.csv
Пример содержимого submission файла:
         Id                         ranker_id  selected
0  18144679  c9373e5f772e43d593dd6ad2fa90f67a        26
1  18144680  c9373e5f772e43d593dd6ad2fa90f67a        53
2  18144681  c9373e5f772e43d593dd6ad2fa90f67a       224
3  18144682  c9373e5f772e43d593dd6ad2fa90f67a        80
4  18144683  c9373e5f772e43d593dd6ad2fa90f67a        97


In [8]:
test_df['score']

18145372    0.018461
18145373    0.009058
18145374    0.000156
18145375    0.003248
18145376    0.002061
              ...   
25043143    0.015423
25043144    0.230324
25043145    0.016983
25043146    0.272953
25043147    0.006084
Name: score, Length: 6897776, dtype: float32