# AeroClub RecSys 2025 - XGBoost Ranking Baseline

This notebook implements an improved ranking approach using XGBoost for the AeroClub recommendation challenge.

In [7]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import GroupShuffleSplit
from sklearn.metrics import log_loss
import matplotlib.pyplot as plt
# from kaggle.api.kaggle_api_extended import KaggleApi

# # Set display options for better readability
# pd.set_option('display.max_columns', 50)

## 1. Configuration

In [8]:
# Global parameters
TRAIN_SAMPLE_FRAC = 0.5  # Sample 50% of data for faster iteration
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Initialize Kaggle API
# api = KaggleApi()
# api.authenticate()

## 2. Load Data

In [9]:
# Load parquet files
train = pd.read_parquet('aeroclub-recsys-2025/train.parquet')
test = pd.read_parquet('aeroclub-recsys-2025/test.parquet')

ImportError: Unable to find a usable engine; tried using: 'pyarrow', 'fastparquet'.
A suitable version of pyarrow or fastparquet is required for parquet support.
Trying to import the above resulted in these errors:
 - Missing optional dependency 'pyarrow'. pyarrow is required for parquet support. Use pip or conda to install pyarrow.
 - Missing optional dependency 'fastparquet'. fastparquet is required for parquet support. Use pip or conda to install fastparquet.

In [4]:
print(f"Train shape: {train.shape}, Test shape: {test.shape}")
print(f"Unique ranker_ids in train: {train['ranker_id'].nunique():,}")
print(f"Selected rate: {train['selected'].mean():.3f}")

Train shape: (18145372, 126), Test shape: (6897776, 125)


Unique ranker_ids in train: 105,539


Selected rate: 0.006


## 3. Data Sampling & Preprocessing

In [5]:
# Sample by ranker_id to keep groups intact
if TRAIN_SAMPLE_FRAC < 1.0:
    unique_rankers = train['ranker_id'].unique()
    n_sample = int(len(unique_rankers) * TRAIN_SAMPLE_FRAC)
    sampled_rankers = np.random.RandomState(RANDOM_STATE).choice(
        unique_rankers, size=n_sample, replace=False
    )
    train = train[train['ranker_id'].isin(sampled_rankers)]
    print(f"Sampled train to {len(train):,} rows ({train['ranker_id'].nunique():,} groups)")

Sampled train to 9,123,530 rows (52,769 groups)


In [6]:
# Convert ranker_id to string for CatBoost
train['ranker_id'] = train['ranker_id'].astype(str)
test['ranker_id'] = test['ranker_id'].astype(str)

## 4. Feature Engineering

In [7]:
cat_features = [
    'nationality', 'searchRoute', 'corporateTariffCode',
    # Leg 0 segments 0-1
    'legs0_segments0_aircraft_code', 'legs0_segments0_arrivalTo_airport_city_iata',
    'legs0_segments0_arrivalTo_airport_iata', 'legs0_segments0_departureFrom_airport_iata',
    'legs0_segments0_marketingCarrier_code', 'legs0_segments0_operatingCarrier_code',
    'legs0_segments0_flightNumber',
    'legs0_segments1_aircraft_code', 'legs0_segments1_arrivalTo_airport_city_iata',
    'legs0_segments1_arrivalTo_airport_iata', 'legs0_segments1_departureFrom_airport_iata',
    'legs0_segments1_marketingCarrier_code', 'legs0_segments1_operatingCarrier_code',
    'legs0_segments1_flightNumber',
    # Leg 1 segments 0-1
    'legs1_segments0_aircraft_code', 'legs1_segments0_arrivalTo_airport_city_iata',
    'legs1_segments0_arrivalTo_airport_iata', 'legs1_segments0_departureFrom_airport_iata',
    'legs1_segments0_marketingCarrier_code', 'legs1_segments0_operatingCarrier_code',
    'legs1_segments0_flightNumber',
    'legs1_segments1_aircraft_code', 'legs1_segments1_arrivalTo_airport_city_iata',
    'legs1_segments1_arrivalTo_airport_iata', 'legs1_segments1_departureFrom_airport_iata',
    'legs1_segments1_marketingCarrier_code', 'legs1_segments1_operatingCarrier_code',
    'legs1_segments1_flightNumber'
]

In [8]:
# TODO: add time profiling
def create_features(df):
    """
    Return a copy of df enriched with engineered features.
    Fixed issues with zero-importance features.
    """
    df = df.copy()

    def hms_to_minutes(s: pd.Series) -> np.ndarray:
        """Vectorised 'HH:MM:SS' → minutes (seconds ignored)."""
        mask = s.notna()
        out = np.zeros(len(s), dtype=float)
        if mask.any():
            parts = s[mask].astype(str).str.split(':', expand=True)
            out[mask] = (
                pd.to_numeric(parts[0], errors="coerce").fillna(0) * 60
                + pd.to_numeric(parts[1], errors="coerce").fillna(0)
            )
        return out

    # Duration columns
    dur_cols = (
        ["legs0_duration", "legs1_duration"]
        + [f"legs{l}_segments{s}_duration" for l in (0, 1) for s in (0, 1)]
    )
    for col in dur_cols:
        if col in df.columns:
            df[col] = hms_to_minutes(df[col])

    # Feature container
    feat = {}

    # Price features
    feat["price_per_tax"] = df["totalPrice"] / (df["taxes"] + 1)
    feat["tax_rate"] = df["taxes"] / (df["totalPrice"] + 1)
    feat["log_price"] = np.log1p(df["totalPrice"])

    # Duration features
    df["total_duration"] = df["legs0_duration"].fillna(0) + df["legs1_duration"].fillna(0)
    feat["duration_ratio"] = np.where(
        df["legs1_duration"].fillna(0) > 0,
        df["legs0_duration"] / (df["legs1_duration"] + 1),
        1.0,
    )

    # Fix segment count features
    # Count actual segments based on non-null duration values
    for leg in (0, 1):
        seg_count = 0
        for seg in range(4):  # Check up to 4 segments
            col = f"legs{leg}_segments{seg}_duration"
            if col in df.columns:
                seg_count += df[col].notna().astype(int)
            else:
                break
        feat[f"n_segments_leg{leg}"] = seg_count
    
    feat["total_segments"] = feat["n_segments_leg0"] + feat["n_segments_leg1"]

    # Fix trip type detection
    # is_one_way should be 1 when there's no return leg
    feat["is_one_way"] = (
        df["legs1_duration"].isna() | 
        (df["legs1_duration"] == 0) |
        df["legs1_segments0_departureFrom_airport_iata"].isna()
    ).astype(int)
    
    feat["has_return"] = (1 - feat["is_one_way"]).astype(int)

    # Rank features
    grp = df.groupby("ranker_id")
    feat["price_rank"] = grp["totalPrice"].rank()
    feat["price_pct_rank"] = grp["totalPrice"].rank(pct=True)
    feat["duration_rank"] = grp["total_duration"].rank()
    feat["is_cheapest"] = (grp["totalPrice"].transform("min") == df["totalPrice"]).astype(int)
    feat["is_most_expensive"] = (grp["totalPrice"].transform("max") == df["totalPrice"]).astype(int)
    feat["price_from_median"] = grp["totalPrice"].transform(
        lambda x: (x - x.median()) / (x.std() + 1)
    )

    # Frequent-flyer features - only for airlines actually present in data
    ff = df["frequentFlyer"].fillna("").astype(str)
    feat["n_ff_programs"] = ff.str.count("/") + (ff != "")
    
    # Check which airlines are actually in the data
    carrier_cols = ["legs0_segments0_marketingCarrier_code", "legs1_segments0_marketingCarrier_code"]
    present_airlines = set()
    for col in carrier_cols:
        if col in df.columns:
            present_airlines.update(df[col].dropna().unique())
    
    # Only create ff features for airlines present in data
    for al in ["SU", "S7", "U6", "TK"]:  # Keep only major Russian/Turkish airlines
        if al in present_airlines:
            feat[f"ff_{al}"] = ff.str.contains(rf"\b{al}\b").astype(int)
    
    # Check if FF matches carrier
    feat["ff_matches_carrier"] = 0
    for al in ["SU", "S7", "U6", "TK"]:
        if f"ff_{al}" in feat and "legs0_segments0_marketingCarrier_code" in df.columns:
            feat["ff_matches_carrier"] |= (
                (feat.get(f"ff_{al}", 0) == 1) & 
                (df["legs0_segments0_marketingCarrier_code"] == al)
            ).astype(int)

    # Binary flags
    feat["is_vip_freq"] = ((df["isVip"] == 1) | (feat["n_ff_programs"] > 0)).astype(int)
    feat["has_corporate_tariff"] = (~df["corporateTariffCode"].isna()).astype(int)

    # Baggage and fees
    feat["baggage_total"] = (
        df["legs0_segments0_baggageAllowance_quantity"].fillna(0)
        + df["legs1_segments0_baggageAllowance_quantity"].fillna(0)
    )
    feat["has_baggage"] = (feat["baggage_total"] > 0).astype(int)
    feat["total_fees"] = (
        df["miniRules0_monetaryAmount"].fillna(0) + df["miniRules1_monetaryAmount"].fillna(0)
    )
    feat["has_fees"] = (feat["total_fees"] > 0).astype(int)
    feat["fee_rate"] = feat["total_fees"] / (df["totalPrice"] + 1)

    # Time-of-day features
    for col in ("legs0_departureAt", "legs0_arrivalAt", "legs1_departureAt", "legs1_arrivalAt"):
        if col in df.columns:
            dt = pd.to_datetime(df[col], errors="coerce")
            feat[f"{col}_hour"] = dt.dt.hour.fillna(12)
            feat[f"{col}_weekday"] = dt.dt.weekday.fillna(0)
            h = dt.dt.hour.fillna(12)
            feat[f"{col}_business_time"] = (((6 <= h) & (h <= 9)) | ((17 <= h) & (h <= 20))).astype(int)

    # Fix direct flight detection
    feat["is_direct_leg0"] = (feat["n_segments_leg0"] == 1).astype(int)
    feat["is_direct_leg1"] = np.where(
        feat["is_one_way"] == 1,
        0,  # One-way flights don't have leg1
        (feat["n_segments_leg1"] == 1).astype(int)
    )
    feat["both_direct"] = (feat["is_direct_leg0"] & feat["is_direct_leg1"]).astype(int)

    # Cheapest direct flight
    df["_is_direct"] = feat["is_direct_leg0"] == 1
    direct_groups = df[df["_is_direct"]].groupby("ranker_id")["totalPrice"]
    if len(direct_groups) > 0:
        direct_min_price = direct_groups.min()
        feat["is_direct_cheapest"] = (
            df["_is_direct"] & 
            (df["totalPrice"] == df["ranker_id"].map(direct_min_price))
        ).astype(int)
    else:
        feat["is_direct_cheapest"] = 0
    df.drop(columns="_is_direct", inplace=True)

    # Other features
    feat["has_access_tp"] = (df["pricingInfo_isAccessTP"] == 1).astype(int)
    feat["group_size"] = df.groupby("ranker_id")["Id"].transform("count")
    feat["group_size_log"] = np.log1p(feat["group_size"])
    
    # Check if major carrier
    if "legs0_segments0_marketingCarrier_code" in df.columns:
        feat["is_major_carrier"] = df["legs0_segments0_marketingCarrier_code"].isin(["SU", "S7", "U6"]).astype(int)
    else:
        feat["is_major_carrier"] = 0
    
    # Popular routes
    popular_routes = {"MOWLED/LEDMOW", "LEDMOW/MOWLED", "MOWLED", "LEDMOW", "MOWAER/AERMOW"}
    feat["is_popular_route"] = df["searchRoute"].isin(popular_routes).astype(int)
    
    # Cabin class features
    feat["avg_cabin_class"] = df[["legs0_segments0_cabinClass", "legs1_segments0_cabinClass"]].mean(axis=1)
    feat["cabin_class_diff"] = (
        df["legs0_segments0_cabinClass"].fillna(0) - df["legs1_segments0_cabinClass"].fillna(0)
    )

    # Merge new features
    df = pd.concat([df, pd.DataFrame(feat, index=df.index)], axis=1)

    # Final NaN handling
    for col in df.select_dtypes(include="number").columns:
        df[col] = df[col].fillna(0)
    for col in df.select_dtypes(include="object").columns:
        df[col] = df[col].fillna("missing")

    return df

In [9]:
# Apply feature engineering
train = create_features(train)
test = create_features(test)

## 5. Feature Selection

In [10]:
# Categorical features
cat_features = [
    'nationality', 'searchRoute', 'corporateTariffCode',
    # Leg 0 segments 0-1
    'legs0_segments0_aircraft_code', 'legs0_segments0_arrivalTo_airport_city_iata',
    'legs0_segments0_arrivalTo_airport_iata', 'legs0_segments0_departureFrom_airport_iata',
    'legs0_segments0_marketingCarrier_code', 'legs0_segments0_operatingCarrier_code',
    'legs0_segments0_flightNumber',
    'legs0_segments1_aircraft_code', 'legs0_segments1_arrivalTo_airport_city_iata',
    'legs0_segments1_arrivalTo_airport_iata', 'legs0_segments1_departureFrom_airport_iata',
    'legs0_segments1_marketingCarrier_code', 'legs0_segments1_operatingCarrier_code',
    'legs0_segments1_flightNumber',
    # Leg 1 segments 0-1
    'legs1_segments0_aircraft_code', 'legs1_segments0_arrivalTo_airport_city_iata',
    'legs1_segments0_arrivalTo_airport_iata', 'legs1_segments0_departureFrom_airport_iata',
    'legs1_segments0_marketingCarrier_code', 'legs1_segments0_operatingCarrier_code',
    'legs1_segments0_flightNumber',
    'legs1_segments1_aircraft_code', 'legs1_segments1_arrivalTo_airport_city_iata',
    'legs1_segments1_arrivalTo_airport_iata', 'legs1_segments1_departureFrom_airport_iata',
    'legs1_segments1_marketingCarrier_code', 'legs1_segments1_operatingCarrier_code',
    'legs1_segments1_flightNumber'
]

# Columns to exclude (uninformative or problematic)
exclude_cols = [
    'Id', 'ranker_id', 'selected', 'profileId', 'requestDate',
    'legs0_departureAt', 'legs0_arrivalAt', 'legs1_departureAt', 'legs1_arrivalAt',
    'miniRules0_percentage', 'miniRules1_percentage',  # >90% missing
    'frequentFlyer',  # Already processed
    # Exclude constant or near-constant columns
    'bySelf', 'pricingInfo_passengerCount',
    # Exclude baggageAllowance_weightMeasurementType columns (likely constant)
    'legs0_segments0_baggageAllowance_weightMeasurementType',
    'legs0_segments1_baggageAllowance_weightMeasurementType',
    'legs1_segments0_baggageAllowance_weightMeasurementType',
    'legs1_segments1_baggageAllowance_weightMeasurementType',
    # Exclude ff features for airlines not in data
    'ff_DP', 'ff_UT', 'ff_EK', 'ff_N4', 'ff_5N', 'ff_LH'
]


# Exclude segment 2-3 columns (>98% missing)
for leg in [0, 1]:
    for seg in [2, 3]:
        for suffix in ['aircraft_code', 'arrivalTo_airport_city_iata', 'arrivalTo_airport_iata',
                      'baggageAllowance_quantity', 'baggageAllowance_weightMeasurementType',
                      'cabinClass', 'departureFrom_airport_iata', 'duration', 'flightNumber',
                      'marketingCarrier_code', 'operatingCarrier_code', 'seatsAvailable']:
            exclude_cols.append(f'legs{leg}_segments{seg}_{suffix}')

feature_cols = [col for col in train.columns if col not in exclude_cols]
cat_features_final = [col for col in cat_features if col in feature_cols]

print(f"Using {len(feature_cols)} features ({len(cat_features_final)} categorical)")

Using 112 features (31 categorical)


## 6. Train/Validation Split

In [11]:
# Prepare data
X_train = train[feature_cols]
y_train = train['selected']
groups_train = train['ranker_id']

X_test = test[feature_cols]
groups_test = test['ranker_id']

# Group-based split
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=RANDOM_STATE)
train_idx, val_idx = next(gss.split(X_train, y_train, groups_train))

X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
groups_tr, groups_val = groups_train.iloc[train_idx], groups_train.iloc[val_idx]

print(f"Train: {len(X_tr):,} rows, Val: {len(X_val):,} rows, Test: {len(X_test):,} rows")

Train: 7,292,940 rows, Val: 1,830,590 rows, Test: 6,897,776 rows


## 7. Model Training

In [12]:
%%capture
pip install -U xgboost

In [13]:
import xgboost as xgb

In [14]:
# Prepare data for XGBoost
# Convert categorical features to numeric codes for XGBoost
X_tr_xgb = X_tr.copy()
X_val_xgb = X_val.copy()
X_test_xgb = X_test.copy()

# Label encode categorical features
for col in cat_features_final:
    if col in X_tr_xgb.columns:
        # Create a mapping from train data
        unique_vals = pd.concat([X_tr_xgb[col], X_val_xgb[col], X_test_xgb[col]]).unique()
        mapping = {val: idx for idx, val in enumerate(unique_vals)}
        
        X_tr_xgb[col] = X_tr_xgb[col].map(mapping).fillna(-1).astype(int)
        X_val_xgb[col] = X_val_xgb[col].map(mapping).fillna(-1).astype(int)
        X_test_xgb[col] = X_test_xgb[col].map(mapping).fillna(-1).astype(int)

In [15]:
# Create group sizes for XGBoost
group_sizes_tr = pd.DataFrame(groups_tr).groupby('ranker_id').size().values
group_sizes_val = pd.DataFrame(groups_val).groupby('ranker_id').size().values

# Create XGBoost DMatrix
dtrain = xgb.DMatrix(X_tr_xgb, label=y_tr, group=group_sizes_tr)
dval = xgb.DMatrix(X_val_xgb, label=y_val, group=group_sizes_val)

# XGBoost parameters
xgb_params = {
    'objective': 'rank:pairwise',
    'eval_metric': 'ndcg@3',
    'max_depth': 8,
    'min_child_weight': 10,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'lambda': 10.0,
    'learning_rate': 0.05,
    'seed': RANDOM_STATE,
    'n_jobs': -1
}

# Train XGBoost model
print("Training XGBoost model...")
xgb_model = xgb.train(
    xgb_params,
    dtrain,
    num_boost_round=1500,
    evals=[(dtrain, 'train'), (dval, 'val')],
    early_stopping_rounds=100,
    verbose_eval=50
)

Training XGBoost model...


[0]	train-ndcg@3:0.78096	val-ndcg@3:0.77434




[50]	train-ndcg@3:0.82747	val-ndcg@3:0.80814




[100]	train-ndcg@3:0.83781	val-ndcg@3:0.81239




[150]	train-ndcg@3:0.84603	val-ndcg@3:0.81651




[200]	train-ndcg@3:0.85469	val-ndcg@3:0.81706




[250]	train-ndcg@3:0.86304	val-ndcg@3:0.81836




[300]	train-ndcg@3:0.86988	val-ndcg@3:0.82033


[350]	train-ndcg@3:0.87568	val-ndcg@3:0.82167




[400]	train-ndcg@3:0.88061	val-ndcg@3:0.82274


[450]	train-ndcg@3:0.88505	val-ndcg@3:0.82347




[500]	train-ndcg@3:0.88959	val-ndcg@3:0.82489


[550]	train-ndcg@3:0.89300	val-ndcg@3:0.82564




[600]	train-ndcg@3:0.89647	val-ndcg@3:0.82663


[650]	train-ndcg@3:0.90063	val-ndcg@3:0.82746




[700]	train-ndcg@3:0.90394	val-ndcg@3:0.82798


[750]	train-ndcg@3:0.90739	val-ndcg@3:0.82919




[800]	train-ndcg@3:0.91099	val-ndcg@3:0.82939




[850]	train-ndcg@3:0.91405	val-ndcg@3:0.82969


[900]	train-ndcg@3:0.91753	val-ndcg@3:0.83013


[950]	train-ndcg@3:0.92086	val-ndcg@3:0.83114




[1000]	train-ndcg@3:0.92364	val-ndcg@3:0.83156


[1050]	train-ndcg@3:0.92673	val-ndcg@3:0.83177




[1100]	train-ndcg@3:0.92973	val-ndcg@3:0.83172




[1150]	train-ndcg@3:0.93264	val-ndcg@3:0.83230




[1200]	train-ndcg@3:0.93521	val-ndcg@3:0.83256


[1250]	train-ndcg@3:0.93756	val-ndcg@3:0.83305




[1300]	train-ndcg@3:0.93980	val-ndcg@3:0.83342


[1350]	train-ndcg@3:0.94175	val-ndcg@3:0.83398




[1400]	train-ndcg@3:0.94386	val-ndcg@3:0.83396




[1450]	train-ndcg@3:0.94577	val-ndcg@3:0.83423




[1499]	train-ndcg@3:0.94737	val-ndcg@3:0.83487


## 8. Model Evaluation

In [16]:
# Convert scores to probabilities using sigmoid
def sigmoid(x):
    return 1 / (1 + np.exp(-x / 10))

# HitRate@3 calculation
def calculate_hitrate_at_k(df, k=3):
    """Calculate HitRate@k for groups with >10 options"""
    hits = []
    for ranker_id, group in df.groupby('ranker_id'):
        if len(group) > 10:
            top_k = group.nlargest(k, 'pred')
            hit = (top_k['selected'] == 1).any()
            hits.append(hit)
    return np.mean(hits) if hits else 0.0

def evaluate_model(y_true, y_pred, groups, model_name="Model"):
    """Evaluate model performance"""
    df = pd.DataFrame({
        'ranker_id': groups,
        'pred': y_pred,
        'selected': y_true
    })
    
    # Get top prediction per group
    top_preds = df.loc[df.groupby('ranker_id')['pred'].idxmax()]
    top_preds['prob'] = sigmoid(top_preds['pred'])
    
    # Calculate metrics
    logloss = log_loss(top_preds['selected'], top_preds['prob'])
    hitrate_at_3 = calculate_hitrate_at_k(df, k=3)
    accuracy = (top_preds['selected'] == 1).mean()
    
    print(f"{model_name} Validation Metrics:")
    print(f"HitRate@3 (groups >10): {hitrate_at_3:.4f}")
    print(f"LogLoss:                {logloss:.4f}")
    print(f"Top-1 Accuracy:         {accuracy:.4f}")
    
    return df, hitrate_at_3

In [17]:
# Evaluate XGBoost
xgb_val_preds = xgb_model.predict(dval)
xgb_val_df, xgb_hr3 = evaluate_model(y_val, xgb_val_preds, groups_val, "XGBoost")

XGBoost Validation Metrics:
HitRate@3 (groups >10): 0.5042
LogLoss:                0.6871
Top-1 Accuracy:         0.3520


In [18]:
# Get XGBoost feature importance
xgb_importance = xgb_model.get_score(importance_type='gain')

# XGBoost уже возвращает словарь с именами фич, просто конвертируем в DataFrame
xgb_importance_df = pd.DataFrame([
    {'feature': k, 'xgb_importance': v} 
    for k, v in xgb_importance.items()
]).sort_values('xgb_importance', ascending=False)

print(xgb_importance_df.iloc[:30].to_string())

                                         feature  xgb_importance
102                             is_popular_route       45.019257
67                                    price_rank       23.592730
18   legs0_segments1_arrivalTo_airport_city_iata       19.792063
9      legs0_segments0_baggageAllowance_quantity       16.547651
10                    legs0_segments0_cabinClass       14.647137
53                     miniRules1_monetaryAmount       14.383418
48         legs1_segments1_marketingCarrier_code       13.401310
103                              avg_cabin_class       13.284999
22    legs0_segments1_departureFrom_airport_iata       12.149506
69                                 duration_rank       11.658153
101                             is_major_carrier       11.405441
98                                 has_access_tp       11.266839
40                 legs1_segments1_aircraft_code       11.097654
4                                          isVip       10.994371
17                 legs0_

## 9. Generate Predictions

In [19]:
# Generate predictions for test set with XGBoost
group_sizes_test = test.groupby('ranker_id').size().values
dtest = xgb.DMatrix(X_test_xgb, group=group_sizes_test)
xgb_test_preds = xgb_model.predict(dtest)

submission_xgb = test[['Id', 'ranker_id']].copy()
submission_xgb['pred_score'] = xgb_test_preds
submission_xgb['selected'] = submission_xgb.groupby('ranker_id')['pred_score'].rank(
    ascending=False, method='first'
).astype(int)

# Save submissions
submission_xgb[['Id', 'ranker_id', 'selected']].to_csv('submission.csv', index=False)

print(f"XGBoost submission saved. Shape: {submission_xgb.shape}")

XGBoost submission saved. Shape: (6897776, 4)


## Submit to Competition with API

In [20]:
# # Submit to competition
# api.competition_submit(
#     file_name="submission.parquet", 
#     competition="aeroclub-recsys-2025", 
#     message="CatBoost Ranking Baseline"
# )