In [1]:
%%capture
!pip install -U xgboost
!pip install -U polars
!pip install -U optuna
!pip install -U catboost
!pip install -U lightgbm

In [2]:
import polars as pl
import numpy as np
import matplotlib.pyplot as plt
import time
import xgboost as xgb
import catboost
import lightgbm as lgb
import optuna
import pandas as pd
from scipy.stats import spearmanr
from scipy.optimize import minimize

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

In [3]:
# Load data
train = pl.read_parquet('/kaggle/input/aeroclub-recsys-2025/train.parquet').drop('__index_level_0__')
test = pl.read_parquet('/kaggle/input/aeroclub-recsys-2025/test.parquet').drop('__index_level_0__').with_columns(pl.lit(0, dtype=pl.Int64).alias("selected"))

data_raw = pl.concat((train, test))

FileNotFoundError: 系统找不到指定的路径。 (os error 3): /kaggle/input/aeroclub-recsys-2025/train.parquet

This error occurred with the following context stack:
	[1] 'parquet scan'
	[2] 'sink'


## Helpers

In [4]:
def hitrate_at_3(y_true, y_pred, groups):
    df = pl.DataFrame({
        'group': groups,
        'pred': y_pred,
        'true': y_true
    })
    
    return (
        df.filter(pl.col("group").count().over("group") > 10)
        .sort(["group", "pred"], descending=[False, True])
        .group_by("group", maintain_order=True)
        .head(3)
        .group_by("group")
        .agg(pl.col("true").max())
        .select(pl.col("true").mean())
        .item()
    )

## Feature Engineering

In [5]:
df = data_raw.clone()

# More efficient duration to minutes converter
def dur_to_min(col):
    # Extract days and time parts in one pass
    days = col.str.extract(r"^(\d+)\.", 1).cast(pl.Int64).fill_null(0) * 1440
    time_str = pl.when(col.str.contains(r"^\d+\.")).then(col.str.replace(r"^\d+\.", "")).otherwise(col)
    hours = time_str.str.extract(r"^(\d+):", 1).cast(pl.Int64).fill_null(0) * 60
    minutes = time_str.str.extract(r":(\d+):", 1).cast(pl.Int64).fill_null(0)
    return (days + hours + minutes).fill_null(0)

# Process duration columns
dur_cols = ["legs0_duration", "legs1_duration"] + [f"legs{l}_segments{s}_duration" for l in (0, 1) for s in (0, 1)]
dur_exprs = [dur_to_min(pl.col(c)).alias(c) for c in dur_cols if c in df.columns]

# Apply duration transformations first
if dur_exprs:
    df = df.with_columns(dur_exprs)

# Precompute marketing carrier columns check
mc_cols = [f'legs{l}_segments{s}_marketingCarrier_code' for l in (0, 1) for s in range(4)]
mc_exists = [col for col in mc_cols if col in df.columns]

# Combine all initial transformations
df = df.with_columns([
        # Price features
        (pl.col("totalPrice") / (pl.col("taxes") + 1)).alias("price_per_tax"),
        (pl.col("taxes") / (pl.col("totalPrice") + 1)).alias("tax_rate"),
        pl.col("totalPrice").log1p().alias("log_price"),
        
        # Duration features
        (pl.col("legs0_duration").fill_null(0) + pl.col("legs1_duration").fill_null(0)).alias("total_duration"),
        pl.when(pl.col("legs1_duration").fill_null(0) > 0)
            .then(pl.col("legs0_duration") / (pl.col("legs1_duration") + 1))
            .otherwise(1.0).alias("duration_ratio"),
        
        # Trip type
        (pl.col("legs1_duration").is_null() | 
         (pl.col("legs1_duration") == 0) | 
         pl.col("legs1_segments0_departureFrom_airport_iata").is_null()).cast(pl.Int32).alias("is_one_way"),
        
        # Total segments count
        (pl.sum_horizontal(pl.col(col).is_not_null().cast(pl.UInt8) for col in mc_exists) 
         if mc_exists else pl.lit(0)).alias("l0_seg"),
        
        # FF features
        (pl.col("frequentFlyer").fill_null("").str.count_matches("/") + 
         (pl.col("frequentFlyer").fill_null("") != "").cast(pl.Int32)).alias("n_ff_programs"),
        
        # Binary features
        pl.col("corporateTariffCode").is_not_null().cast(pl.Int32).alias("has_corporate_tariff"),
        (pl.col("pricingInfo_isAccessTP") == 1).cast(pl.Int32).alias("has_access_tp"),
        
        # Baggage & fees
        (pl.col("legs0_segments0_baggageAllowance_quantity").fill_null(0) + 
         pl.col("legs1_segments0_baggageAllowance_quantity").fill_null(0)).alias("baggage_total"),
        (pl.col("miniRules0_monetaryAmount").fill_null(0) + 
         pl.col("miniRules1_monetaryAmount").fill_null(0)).alias("total_fees"),
        
        # Routes & carriers
        pl.col("searchRoute").is_in(["MOWLED/LEDMOW", "LEDMOW/MOWLED", "MOWLED", "LEDMOW", "MOWAER/AERMOW"])
            .cast(pl.Int32).alias("is_popular_route"),
        
        # Cabin
        pl.mean_horizontal(["legs0_segments0_cabinClass", "legs1_segments0_cabinClass"]).alias("avg_cabin_class"),
        (pl.col("legs0_segments0_cabinClass").fill_null(0) - 
         pl.col("legs1_segments0_cabinClass").fill_null(0)).alias("cabin_class_diff"),
])

# Segment counts - more efficient
seg_exprs = []
for leg in (0, 1):
    seg_cols = [f"legs{leg}_segments{s}_duration" for s in range(4) if f"legs{leg}_segments{s}_duration" in df.columns]
    if seg_cols:
        seg_exprs.append(
            pl.sum_horizontal(pl.col(c).is_not_null() for c in seg_cols)
                .cast(pl.Int32).alias(f"n_segments_leg{leg}")
        )
    else:
        seg_exprs.append(pl.lit(0).cast(pl.Int32).alias(f"n_segments_leg{leg}"))

# Add segment-based features
# First create segment counts
df = df.with_columns(seg_exprs)

# Then use them for derived features
df = df.with_columns([
    (pl.col("n_segments_leg0") + pl.col("n_segments_leg1")).alias("total_segments"),
    (pl.col("n_segments_leg0") == 1).cast(pl.Int32).alias("is_direct_leg0"),
    pl.when(pl.col("is_one_way") == 1).then(0)
        .otherwise((pl.col("n_segments_leg1") == 1).cast(pl.Int32)).alias("is_direct_leg1"),
])

# More derived features
df = df.with_columns([
    (pl.col("is_direct_leg0") & pl.col("is_direct_leg1")).cast(pl.Int32).alias("both_direct"),
    ((pl.col("isVip") == 1) | (pl.col("n_ff_programs") > 0)).cast(pl.Int32).alias("is_vip_freq"),
    (pl.col("baggage_total") > 0).cast(pl.Int32).alias("has_baggage"),
    (pl.col("total_fees") > 0).cast(pl.Int32).alias("has_fees"),
    (pl.col("total_fees") / (pl.col("totalPrice") + 1)).alias("fee_rate"),
    pl.col("Id").count().over("ranker_id").alias("group_size"),
])

# Add major carrier flag if column exists
if "legs0_segments0_marketingCarrier_code" in df.columns:
    df = df.with_columns(
        pl.col("legs0_segments0_marketingCarrier_code").is_in(["SU", "S7", "U6"])
            .cast(pl.Int32).alias("is_major_carrier")
    )
else:
    df = df.with_columns(pl.lit(0).alias("is_major_carrier"))

df = df.with_columns(pl.col("group_size").log1p().alias("group_size_log"))

# Time features - batch process
time_exprs = []
for col in ("legs0_departureAt", "legs0_arrivalAt", "legs1_departureAt", "legs1_arrivalAt"):
    if col in df.columns:
        dt = pl.col(col).str.to_datetime(strict=False)
        h = dt.dt.hour().fill_null(12)
        time_exprs.extend([
            h.alias(f"{col}_hour"),
            dt.dt.weekday().fill_null(0).alias(f"{col}_weekday"),
            (((h >= 6) & (h <= 9)) | ((h >= 17) & (h <= 20))).cast(pl.Int32).alias(f"{col}_business_time")
        ])
if time_exprs:
    df = df.with_columns(time_exprs)

# Batch rank computations - more efficient with single pass
# First apply the columns that will be used for ranking
df = df.with_columns([
    pl.col("group_size").log1p().alias("group_size_log"),
])

# Price and duration basic ranks
rank_exprs = []
for col, alias in [("totalPrice", "price"), ("total_duration", "duration")]:
    rank_exprs.append(pl.col(col).rank().over("ranker_id").alias(f"{alias}_rank"))

# Price-specific features
price_exprs = [
    (pl.col("totalPrice").rank("average").over("ranker_id") / 
     pl.col("totalPrice").count().over("ranker_id")).alias("price_pct_rank"),
    (pl.col("totalPrice") == pl.col("totalPrice").min().over("ranker_id")).cast(pl.Int32).alias("is_cheapest"),
    ((pl.col("totalPrice") - pl.col("totalPrice").median().over("ranker_id")) / 
     (pl.col("totalPrice").std().over("ranker_id") + 1)).alias("price_from_median"),
    (pl.col("l0_seg") == pl.col("l0_seg").min().over("ranker_id")).cast(pl.Int32).alias("is_min_segments"),
]

# Apply initial ranks
df = df.with_columns(rank_exprs + price_exprs)

# Cheapest direct - more efficient
direct_cheapest = (
    df.filter(pl.col("is_direct_leg0") == 1)
    .group_by("ranker_id")
    .agg(pl.col("totalPrice").min().alias("min_direct"))
)

df = df.join(direct_cheapest, on="ranker_id", how="left").with_columns(
    ((pl.col("is_direct_leg0") == 1) & 
     (pl.col("totalPrice") == pl.col("min_direct"))).cast(pl.Int32).fill_null(0).alias("is_direct_cheapest")
).drop("min_direct")

In [None]:
# Advanced Feature Engineering - Focus on Business Traveler Preferences
print("Adding advanced business traveler features...")

# 1. Enhanced price and policy features
df = df.with_columns([
    # Corporate policy compliance
    (pl.col("pricingInfo_isAccessTP") == 1).cast(pl.Int32).alias("policy_compliant"),
    
    # Enhanced price buckets with controlled group stats
    pl.when(pl.col("price_pct_rank") <= 0.15).then(1)
    .when(pl.col("price_pct_rank") <= 0.3).then(2)
    .when(pl.col("price_pct_rank") <= 0.5).then(3)
    .when(pl.col("price_pct_rank") <= 0.7).then(4)
    .when(pl.col("price_pct_rank") <= 0.85).then(5)
    .otherwise(6).alias("price_bucket"),
    
    # Price competitiveness (keep essential group features)
    (pl.col("totalPrice") <= pl.col("totalPrice").quantile(0.25).over("ranker_id")).cast(pl.Int32).alias("is_cheap_quartile"),
    (pl.col("totalPrice") == pl.col("totalPrice").min().over("ranker_id")).cast(pl.Int32).alias("is_absolute_cheapest"),
    
    # Tax efficiency for business
    pl.when(pl.col("taxes") > 0).then(pl.col("totalPrice") / pl.col("taxes")).otherwise(0).alias("price_tax_efficiency"),
])

# 2. Advanced time features for business travelers
time_features = []
for prefix in ["legs0_departureAt", "legs0_arrivalAt", "legs1_departureAt", "legs1_arrivalAt"]:
    if f"{prefix}_hour" in df.columns:
        time_features.extend([
            # Premium business hours
            ((pl.col(f"{prefix}_hour") >= 7) & (pl.col(f"{prefix}_hour") <= 9)).cast(pl.Int32).alias(f"{prefix}_morning_business"),
            ((pl.col(f"{prefix}_hour") >= 17) & (pl.col(f"{prefix}_hour") <= 19)).cast(pl.Int32).alias(f"{prefix}_evening_business"),
            # Avoid red-eye flights
            ((pl.col(f"{prefix}_hour") >= 23) | (pl.col(f"{prefix}_hour") <= 5)).cast(pl.Int32).alias(f"{prefix}_red_eye"),
            # Premium time slots
            (pl.col(f"{prefix}_hour").is_in([7, 8, 9, 18, 19, 20])).cast(pl.Int32).alias(f"{prefix}_premium_time"),
            # Weekend patterns
            ((pl.col(f"{prefix}_weekday") >= 5) & (pl.col(f"{prefix}_hour") >= 10)).cast(pl.Int32).alias(f"{prefix}_weekend_leisure"),
        ])

if time_features:
    df = df.with_columns(time_features)

# 3. Seat availability and scarcity features
seat_features = []
for leg in [0, 1]:
    for seg in [0, 1]:
        seat_col = f"legs{leg}_segments{seg}_seatsAvailable"
        if seat_col in df.columns:
            seat_features.extend([
                # Seat scarcity (low availability premium)
                (pl.col(seat_col).fill_null(0) <= 5).cast(pl.Int32).alias(f"{seat_col}_scarce"),
                # High availability
                (pl.col(seat_col).fill_null(0) >= 20).cast(pl.Int32).alias(f"{seat_col}_abundant"),
                # Normalized seat availability within group
                (pl.col(seat_col) / (pl.col(seat_col).mean().over("ranker_id") + 1)).alias(f"{seat_col}_relative"),
            ])

if seat_features:
    df = df.with_columns(seat_features)

# 4. Cancellation and change policy features (critical for business)
policy_features = []
has_cancellation = False
has_exchange = False

# Check if cancellation features exist
if "miniRules0_monetaryAmount" in df.columns:
    has_cancellation = True
    policy_features.extend([
        # Flexible cancellation
        (pl.col("miniRules0_monetaryAmount").fill_null(999999) == 0).cast(pl.Int32).alias("free_cancellation"),
        # Low cancellation fee
        (pl.col("miniRules0_monetaryAmount").fill_null(999999) <= pl.col("totalPrice") * 0.1).cast(pl.Int32).alias("low_cancel_fee"),
        # Cancellation fee rate
        (pl.col("miniRules0_monetaryAmount") / (pl.col("totalPrice") + 1)).alias("cancel_fee_rate"),
    ])

if "miniRules1_monetaryAmount" in df.columns:
    has_exchange = True
    policy_features.extend([
        # Flexible exchange
        (pl.col("miniRules1_monetaryAmount").fill_null(999999) == 0).cast(pl.Int32).alias("free_exchange"),
        # Low exchange fee
        (pl.col("miniRules1_monetaryAmount").fill_null(999999) <= pl.col("totalPrice") * 0.1).cast(pl.Int32).alias("low_exchange_fee"),
        # Exchange fee rate
        (pl.col("miniRules1_monetaryAmount") / (pl.col("totalPrice") + 1)).alias("exchange_fee_rate"),
    ])

if policy_features:
    df = df.with_columns(policy_features)

# 5. Route and carrier sophistication
route_features = []
has_airports = False
has_carriers = False

if "legs0_segments0_departureFrom_airport_iata" in df.columns:
    has_airports = True
    route_features.extend([
        # Major hub airports (Moscow, St. Petersburg)
        pl.col("legs0_segments0_departureFrom_airport_iata").is_in(["SVO", "DME", "VKO", "LED", "PKC"])
        .cast(pl.Int32).alias("departs_from_major_hub"),
        
        # International airports
        pl.col("legs0_segments0_arrivalTo_airport_iata").is_in(["LED", "SVO", "DME", "VKO"])
        .cast(pl.Int32).alias("arrives_to_major_hub"),
        
        # Airport consistency (same departure/arrival airports for round trips)
        (pl.col("legs0_segments0_departureFrom_airport_iata") == 
         pl.col("legs1_segments0_arrivalTo_airport_iata").fill_null("")).cast(pl.Int32).alias("consistent_airports"),
    ])

if "legs0_segments0_marketingCarrier_code" in df.columns:
    has_carriers = True
    route_features.extend([
        # Carrier consistency across legs
        (pl.col("legs0_segments0_marketingCarrier_code") == 
         pl.col("legs1_segments0_marketingCarrier_code").fill_null("")).cast(pl.Int32).alias("same_carrier_both_legs"),
        
        # Premium carriers (Aeroflot, S7, etc.)
        pl.col("legs0_segments0_marketingCarrier_code").is_in(["SU", "S7", "U6", "DP"])
        .cast(pl.Int32).alias("is_premium_carrier"),
        
        # Marketing vs Operating carrier alignment
        (pl.col("legs0_segments0_marketingCarrier_code") == 
         pl.col("legs0_segments0_operatingCarrier_code").fill_null("")).cast(pl.Int32).alias("aligned_carriers_leg0"),
    ])

if route_features:
    df = df.with_columns(route_features)

# 6. Aircraft and service quality features
aircraft_features = []
has_aircraft = False

if "legs0_segments0_aircraft_code" in df.columns:
    has_aircraft = True
    aircraft_features.extend([
        # Wide-body aircraft (better for long flights)
        pl.col("legs0_segments0_aircraft_code").is_in(["330", "777", "787", "320", "321"])
        .cast(pl.Int32).alias("wide_body_leg0"),
        
        # Modern aircraft
        pl.col("legs0_segments0_aircraft_code").is_in(["787", "350", "320", "321", "737"])
        .cast(pl.Int32).alias("modern_aircraft_leg0"),
    ])

if aircraft_features:
    df = df.with_columns(aircraft_features)

# 7. Cabin class optimization
cabin_features = []
has_cabin = False

if "legs0_segments0_cabinClass" in df.columns:
    has_cabin = True
    cabin_features.extend([
        # Business class upgrade availability
        (pl.col("legs0_segments0_cabinClass").fill_null(1) >= 2).cast(pl.Int32).alias("business_class_leg0"),
        # Premium economy or better
        (pl.col("legs0_segments0_cabinClass").fill_null(1) >= 1.5).cast(pl.Int32).alias("premium_class_leg0"),
        # Cabin class consistency
        (pl.col("legs0_segments0_cabinClass") == pl.col("legs1_segments0_cabinClass")).cast(pl.Int32).alias("consistent_cabin"),
    ])

if cabin_features:
    df = df.with_columns(cabin_features)

# 8. Competitive analysis within group (basic features first)
competitive_features = [
    # Direct flight premium
    pl.col("both_direct").sum().over("ranker_id").alias("n_direct_options"),
    
    # Premium combinations
    ((pl.col("both_direct") == 1) & (pl.col("is_cheap_quartile") == 1)).cast(pl.Int32).alias("is_cheap_direct"),
    ((pl.col("policy_compliant") == 1) & (pl.col("is_cheap_quartile") == 1)).cast(pl.Int32).alias("compliant_and_cheap"),
    
    # Business convenience
    ((pl.col("legs0_departureAt_hour").fill_null(12) >= 7) & 
     (pl.col("legs0_departureAt_hour").fill_null(12) <= 9)).cast(pl.Int32).alias("morning_departure_business"),
    
    # Fast and convenient
    (pl.col("total_duration") <= pl.col("total_duration").quantile(0.3).over("ranker_id")).cast(pl.Int32).alias("is_fast_option"),
]

df = df.with_columns(competitive_features)

# 8b. Advanced competitive features (requires previous features to exist)
advanced_competitive = []

if has_carriers:
    advanced_competitive.extend([
        # VIP treatment with premium carriers
        ((pl.col("is_vip_freq") == 1) & (pl.col("is_premium_carrier") == 1)).cast(pl.Int32).alias("vip_premium_carrier"),
    ])

if has_cancellation and has_carriers:
    advanced_competitive.extend([
        # Flexible and premium
        ((pl.col("free_cancellation") == 1) & (pl.col("is_premium_carrier") == 1)).cast(pl.Int32).alias("flexible_premium"),
    ])

if advanced_competitive:
    df = df.with_columns(advanced_competitive)

# 9. High-impact interaction features (safe with all previous features defined)
interaction_features = [
    # Price-policy compliance
    (pl.col("price_bucket") * pl.col("policy_compliant")).alias("price_policy_interaction"),
    
    # Business convenience score
    (pl.col("morning_departure_business") * pl.col("is_cheap_quartile") * pl.col("both_direct")).alias("business_value_score"),
    
    # Hub efficiency (basic version)
    (pl.col("both_direct") * pl.col("is_fast_option")).alias("hub_efficiency_score"),
    
    # Corporate optimization (basic version)
    (pl.col("policy_compliant") * pl.col("compliant_and_cheap")).alias("corporate_optimization"),
]

# Add premium service interaction if features exist
if has_carriers and has_cabin:
    interaction_features.append(
        (pl.col("is_premium_carrier") * pl.col("business_class_leg0")).alias("premium_service_score")
    )

# Add enhanced hub efficiency if airport features exist
if has_airports:
    interaction_features.append(
        (pl.col("departs_from_major_hub") * pl.col("both_direct") * pl.col("is_fast_option")).alias("enhanced_hub_efficiency_score")
    )

# Add enhanced corporate optimization if carrier features exist
if has_carriers:
    interaction_features.append(
        (pl.col("policy_compliant") * pl.col("compliant_and_cheap") * pl.col("same_carrier_both_legs")).alias("enhanced_corporate_optimization")
    )

df = df.with_columns(interaction_features)

print("Advanced business traveler features added successfully!")
print("Key improvements:")
print("- Corporate policy compliance and tax efficiency")
print("- Advanced time preferences for business travelers")
print("- Seat scarcity and availability features")
print("- Cancellation/exchange policy flexibility")
print("- Premium carrier and aircraft preferences")
print("- Hub airport and route optimization")
print("- Strategic business value interactions")
print("- Safe feature creation with proper dependency ordering")
print("- Targeting 0.5+ Kaggle score with business insights")

In [6]:
# Fill nulls
data = df.with_columns(
    [pl.col(c).fill_null(0) for c in df.select(pl.selectors.numeric()).columns] +
    [pl.col(c).fill_null("missing") for c in df.select(pl.selectors.string()).columns]
)

## Feature Selection

In [None]:
# Categorical features
cat_features = [
    'nationality', 'searchRoute', 'corporateTariffCode',
    'bySelf', 'sex', 'companyID',
    # Leg 0 segments 0-1
    'legs0_segments0_aircraft_code', 'legs0_segments0_arrivalTo_airport_city_iata',
    'legs0_segments0_arrivalTo_airport_iata', 'legs0_segments0_departureFrom_airport_iata',
    'legs0_segments0_marketingCarrier_code', 'legs0_segments0_operatingCarrier_code',
    'legs0_segments0_flightNumber',
    'legs0_segments1_aircraft_code', 'legs0_segments1_arrivalTo_airport_city_iata',
    'legs0_segments1_arrivalTo_airport_iata', 'legs0_segments1_departureFrom_airport_iata',
    'legs0_segments1_marketingCarrier_code', 'legs0_segments1_operatingCarrier_code',
    'legs0_segments1_flightNumber',
    # Leg 1 segments 0-1
    'legs1_segments0_aircraft_code', 'legs1_segments0_arrivalTo_airport_city_iata',
    'legs1_segments0_arrivalTo_airport_iata', 'legs1_segments0_departureFrom_airport_iata',
    'legs1_segments0_marketingCarrier_code', 'legs1_segments0_operatingCarrier_code',
    'legs1_segments0_flightNumber',
    'legs1_segments1_aircraft_code', 'legs1_segments1_arrivalTo_airport_city_iata',
    'legs1_segments1_arrivalTo_airport_iata', 'legs1_segments1_departureFrom_airport_iata',
    'legs1_segments1_marketingCarrier_code', 'legs1_segments1_operatingCarrier_code',
    'legs1_segments1_flightNumber',
    # Only add price_bucket as new categorical
    'price_bucket'
]

# Columns to exclude (uninformative or problematic)
exclude_cols = [
    'Id', 'ranker_id', 'selected', 'profileId', 'requestDate',
    'legs0_departureAt', 'legs0_arrivalAt', 'legs1_departureAt', 'legs1_arrivalAt',
    'miniRules0_percentage', 'miniRules1_percentage',  # >90% missing
    'frequentFlyer',  # Already processed
    # Exclude constant columns
    'pricingInfo_passengerCount'
]

# Exclude segment 2-3 columns (>98% missing)
for leg in [0, 1]:
    for seg in [2, 3]:
        for suffix in ['aircraft_code', 'arrivalTo_airport_city_iata', 'arrivalTo_airport_iata',
                      'baggageAllowance_quantity', 'baggageAllowance_weightMeasurementType',
                      'cabinClass', 'departureFrom_airport_iata', 'duration', 'flightNumber',
                      'marketingCarrier_code', 'operatingCarrier_code', 'seatsAvailable']:
            exclude_cols.append(f'legs{leg}_segments{seg}_{suffix}')

feature_cols = [col for col in data.columns if col not in exclude_cols]
cat_features_final = [col for col in cat_features if col in feature_cols]

# Create CatBoost categorical feature indices (column positions in feature_cols)
catboost_cat_indices = [i for i, col in enumerate(feature_cols) if col in cat_features_final]

print(f"Using {len(feature_cols)} features ({len(cat_features_final)} categorical)")
print(f"CatBoost categorical indices: {len(catboost_cat_indices)} features")

X = data.select(feature_cols)
y = data.select('selected')
groups = data.select('ranker_id')

Using 112 features (34 categorical)


## Model Training and Tuning

### 🔍 Early CatBoost Validation (Preventing Late Errors)

In [None]:
# FIXED Early CatBoost Validation (Using SAME logic as actual training)
print("Performing FIXED Early CatBoost Validation...")

# Check 1: Verify catboost_cat_indices exists and is properly defined
try:
    if 'catboost_cat_indices' not in locals():
        print("ERROR: catboost_cat_indices is not defined!")
        print("   This variable is required for CatBoost Pool creation.")
        print("   Expected: List of column indices for categorical features")
        raise NameError("catboost_cat_indices is not defined in Feature Selection section")
    else:
        print(f"catboost_cat_indices found: {len(catboost_cat_indices)} categorical features")
        print(f"   Indices: {catboost_cat_indices[:10]}{'...' if len(catboost_cat_indices) > 10 else ''}")
except NameError as e:
    print(f"CRITICAL ERROR: {e}")
    print("This error would occur during CatBoost training after hours of waiting!")
    print("   Solution: Define catboost_cat_indices in Feature Selection section")
    raise

# Check 2: Verify categorical features exist in dataset
try:
    missing_cat_features = [col for col in cat_features_final if col not in feature_cols]
    if missing_cat_features:
        print(f"WARNING: {len(missing_cat_features)} categorical features missing from feature_cols")
        print(f"   Missing: {missing_cat_features[:5]}{'...' if len(missing_cat_features) > 5 else ''}")
    else:
        print(f"All {len(cat_features_final)} categorical features found in dataset")
except Exception as e:
    print(f"ERROR in categorical feature validation: {e}")
    raise

# Check 3: Test CatBoost Pool creation with EXACT SAME logic as actual training
try:
    print("Testing CatBoost Pool creation with EXACT SAME logic as actual training...")
    
    # Take a small sample for testing
    sample_size = min(1000, len(X))
    
    # CRITICAL: Use EXACT SAME unified categorical encoding as actual training
    print("   Applying EXACT SAME unified categorical encoding as actual training...")
    
    # Step 1: Get sample data (same as actual training)
    X_sample_test = X.head(sample_size).clone()
    y_sample = y.head(sample_size).to_numpy().flatten()
    groups_sample = groups.head(sample_size).to_numpy().flatten()
    
    # Step 2: Apply EXACT SAME unified categorical encoding as actual training
    # CRITICAL: Use EXACT SAME logic as the actual CatBoost training cell
    for col in cat_features_final:
        print(f"      Processing categorical feature: {col}")
        
        # FIXED: Use the EXACT SAME unified approach as actual training
        # Create mapping from ALL unique values in the sample (simulating full dataset)
        all_values = X_sample_test.select(col).unique().sort(col)
        
        # Create EXACT SAME mapping: unique values -> integers (0, 1, 2, ...)
        mapping_dict = {
            val: idx for idx, val in enumerate(all_values[col].to_list())
        }
        
        # Apply EXACT SAME mapping with map_elements and explicit Int32
        X_sample_test = X_sample_test.with_columns(
            pl.col(col).map_elements(lambda x: mapping_dict.get(x, -1), return_dtype=pl.Int32).alias(col)
        )
    
    # Step 3: Convert numerical features to float32 (SAME AS TRAINING)
    non_cat_features = [col for col in feature_cols if col not in cat_features_final]
    for col in non_cat_features:
        X_sample_test = X_sample_test.with_columns(pl.col(col).cast(pl.Float32).alias(col))
    
    # Step 4: Convert to numpy (SAME AS TRAINING)
    sample_data = X_sample_test.to_numpy()
    
    # Verify data types (should match training)
    print(f"   Sample data shape: {sample_data.shape}")
    print(f"   Sample data dtype: {sample_data.dtype}")
    print(f"   Categorical indices: {catboost_cat_indices[:5]}...")
    
    # Check categorical columns data types
    for i, cat_idx in enumerate(catboost_cat_indices[:3]):
        col_data = sample_data[:, cat_idx]
        print(f"   Cat feature {i} (col {cat_idx}): dtype={col_data.dtype}, sample={col_data[:3]}")
    
    # Step 5: Test Pool creation with EXACT SAME logic as actual training
    from catboost import Pool
    test_pool = Pool(
        data=sample_data,
        label=y_sample,
        group_id=groups_sample,
        cat_features=catboost_cat_indices
    )
    
    print(f"CatBoost Pool created successfully with EXACT SAME logic as training!")
    print(f"   Sample size: {sample_size} rows")
    print(f"   Features: {sample_data.shape[1]} columns")
    print(f"   Categorical features: {len(catboost_cat_indices)} indices")
    print(f"   Data types verified: mixed int32/float32 array compatible with CatBoost")
    
    # Step 6: Test quick model creation with SAME parameters as actual training
    from catboost import CatBoostRanker
    test_model = CatBoostRanker(
        loss_function='YetiRank',
        iterations=10,  # Just a few iterations for testing
        verbose=False,
        random_seed=42
    )
    
    test_model.fit(test_pool)
    test_preds = test_model.predict(test_pool)
    
    print(f"CatBoost model training test successful with EXACT SAME logic!")
    print(f"   Predictions shape: {test_preds.shape}")
    print(f"   Sample predictions: {test_preds[:5]}")
    print(f"   No data type conflicts detected!")
    
except Exception as e:
    print(f"CRITICAL ERROR in CatBoost validation: {e}")
    print(f"   Error type: {type(e).__name__}")
    print("This error would occur during actual training after hours!")
    print("   Fix required before proceeding with full training")
    if "floating point numerical type" in str(e):
        print("   IDENTIFIED: Data type mismatch - categorical features in float array")
        print("   SOLUTION: Use unified categorical encoding with proper type control")
    raise

print("\nEarly CatBoost Validation PASSED with FIXED unified encoding!")
print("All CatBoost components verified and working")
print("Data type processing EXACTLY matches actual training")
print("No late-stage errors expected during actual training")
print("Safe to proceed with full model training pipeline")
print("=" * 60)

### 1. XGBoost Model

In [8]:
data_xgb = X.with_columns([(pl.col(c).rank("dense") - 1).fill_null(-1).cast(pl.Int32) for c in cat_features_final])

n1 = 16487352 # split train to train and val (10%) in time
n2 = train.height
data_xgb_tr, data_xgb_va, data_xgb_te = data_xgb[:n1], data_xgb[n1:n2], data_xgb[n2:]
y_tr, y_va, y_te = y[:n1], y[n1:n2], y[n2:]
groups_tr, groups_va, groups_te = groups[:n1], groups[n1:n2], groups[n2:]

group_sizes_tr = groups_tr.group_by('ranker_id').agg(pl.len()).sort('ranker_id')['len'].to_numpy()
group_sizes_va = groups_va.group_by('ranker_id').agg(pl.len()).sort('ranker_id')['len'].to_numpy()
group_sizes_te = groups_te.group_by('ranker_id').agg(pl.len()).sort('ranker_id')['len'].to_numpy()
dtrain = xgb.DMatrix(data_xgb_tr, label=y_tr, group=group_sizes_tr, feature_names=data_xgb.columns)
dval   = xgb.DMatrix(data_xgb_va, label=y_va, group=group_sizes_va, feature_names=data_xgb.columns)
dtest  = xgb.DMatrix(data_xgb_te, label=y_te, group=group_sizes_te, feature_names=data_xgb.columns)

In [None]:
# Optimized XGBoost parameters for business traveler features (v2)
final_xgb_params = {
    'objective': 'rank:pairwise', 
    'eval_metric': 'ndcg@3', 
    'max_depth': 8,              # 从9减少到8，防止轻微过拟合
    'min_child_weight': 10,      # 从12减少到10，更灵活
    'subsample': 0.92,           # 从0.9增加到0.92，更好的泛化
    'colsample_bytree': 0.9,     # 从0.88增加到0.9，更多特征利用
    'lambda': 3.0,              # 从3.5减少到3.0，减少正则化
    'alpha': 0.12,              # 从0.15减少到0.12，更灵活
    'learning_rate': 0.055,     # 从0.052增加到0.055，稍快收敛
    'gamma': 0.06,              # 从0.08减少到0.06，更多分裂
    'seed': RANDOM_STATE, 
    'n_jobs': -1,
    'tree_method': 'hist',
    'grow_policy': 'lossguide'
}

print("\nTraining XGBoost with v2 business-optimized parameters...")
print("V2 optimizations for better convergence:")
print("- Reduced depth (8) to prevent slight overfitting")
print("- Improved sampling rates for better generalization")
print("- Fine-tuned regularization for optimal expressiveness")
print("- Adjusted learning rate for faster convergence")

xgb_model = xgb.train(
    final_xgb_params, dtrain,
    num_boost_round=1600,        # Increased for complex features
    evals=[(dtrain, 'train'), (dval, 'val')],
    early_stopping_rounds=110,   # Increased patience for complex learning
    verbose_eval=50
)


Training final XGBoost model with optimized parameters...


[0]	train-ndcg@3:0.78524	val-ndcg@3:0.80685


[50]	train-ndcg@3:0.81400	val-ndcg@3:0.83248


[100]	train-ndcg@3:0.82217	val-ndcg@3:0.83573


[150]	train-ndcg@3:0.82861	val-ndcg@3:0.83820


[200]	train-ndcg@3:0.83513	val-ndcg@3:0.83957


[250]	train-ndcg@3:0.84131	val-ndcg@3:0.84052




[300]	train-ndcg@3:0.84615	val-ndcg@3:0.84221




[350]	train-ndcg@3:0.85105	val-ndcg@3:0.84242


[400]	train-ndcg@3:0.85546	val-ndcg@3:0.84368




[450]	train-ndcg@3:0.85942	val-ndcg@3:0.84365




[500]	train-ndcg@3:0.86289	val-ndcg@3:0.84426




[550]	train-ndcg@3:0.86569	val-ndcg@3:0.84448


[600]	train-ndcg@3:0.86851	val-ndcg@3:0.84505




[650]	train-ndcg@3:0.87145	val-ndcg@3:0.84537




[700]	train-ndcg@3:0.87429	val-ndcg@3:0.84528


[750]	train-ndcg@3:0.87703	val-ndcg@3:0.84504




[778]	train-ndcg@3:0.87830	val-ndcg@3:0.84488




### 3. LightGBM Model

In [10]:
# CODE CELL
# LightGBM requires its own Dataset object. We can reuse the rank-encoded data from XGBoost.
print("Creating LightGBM Datasets...")
lgb_train = lgb.Dataset(
    data=data_xgb_tr, 
    label=y_tr.to_numpy().flatten(), 
    group=group_sizes_tr,
    feature_name=feature_cols,
    free_raw_data=False
)

lgb_val = lgb.Dataset(
    data=data_xgb_va, 
    label=y_va.to_numpy().flatten(), 
    group=group_sizes_va,
    feature_name=feature_cols,
    reference=lgb_train,
    free_raw_data=False
)
print("LightGBM Datasets created successfully.")

Creating LightGBM Datasets...


LightGBM Datasets created successfully.


In [None]:
# Optimized LightGBM parameters for business traveler features (v2)
final_lgb_params = {
    'objective': 'lambdarank', 
    'metric': 'ndcg', 
    'boosting_type': 'gbdt',
    'eval_at': [3],
    'num_leaves': 140,           # 从160减少到140，防止过拟合
    'learning_rate': 0.15,       # 从0.18减少到0.15，更稳定
    'min_child_samples': 60,     # 从55增加到60，更保守
    'lambda_l1': 0.006,          # 从0.003增加到0.006，更多L1正则化
    'lambda_l2': 7.5,            # 从6.5增加到7.5，更多L2正则化
    'feature_fraction': 0.75,    # 从0.78减少到0.75，防止过拟合
    'bagging_fraction': 0.84,    # 从0.87减少到0.84，更保守采样
    'bagging_freq': 5,           
    'min_gain_to_split': 0.005,  # 从0.003增加到0.005，更严格分裂
    'max_depth': 10,             # 从12减少到10，控制复杂度
    'force_row_wise': True,      
    'n_jobs': -1, 
    'random_state': RANDOM_STATE, 
    'label_gain': [0, 1],
    'verbosity': -1
}

print("\nTraining LightGBM model with v2 business-optimized parameters...")
print("V2 business feature optimizations:")
print("- Reduced leaves (140) and depth (10) to prevent overfitting")
print("- Lower learning rate (0.15) for better stability")
print("- Increased regularization (L1=0.006, L2=7.5) for generalization")
print("- Conservative sampling for better model robustness")

lgb_model = lgb.train(
    final_lgb_params,
    lgb_train,
    num_boost_round=2000,        # Increased for complex features
    valid_sets=[lgb_train, lgb_val],
    callbacks=[lgb.early_stopping(130), lgb.log_evaluation(50)]  # Increased patience
)


Training final LightGBM model with optimized parameters...


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.978894 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.


[LightGBM] [Info] Total Bins 10050


[LightGBM] [Info] Number of data points in the train set: 16487352, number of used features: 108


Training until validation scores don't improve for 100 rounds


[50]	training's ndcg@3: 0.842894	valid_1's ndcg@3: 0.839999




[100]	training's ndcg@3: 0.864782	valid_1's ndcg@3: 0.842916


[150]	training's ndcg@3: 0.879591	valid_1's ndcg@3: 0.842223




[200]	training's ndcg@3: 0.892081	valid_1's ndcg@3: 0.842145




Early stopping, best iteration is:
[100]	training's ndcg@3: 0.864782	valid_1's ndcg@3: 0.842916




In [1]:
print("\n--- Training LightGBM DART Model ---")

# Keep original DART parameters but with slight optimization (v2)
dart_params = {
    'objective': 'lambdarank', 
    'metric': 'ndcg', 
    'eval_at': [3],
    'boosting_type': 'dart', 
    'n_estimators': 2500,      # 保持训练轮数
    'learning_rate': 0.045,    # 从0.042增加到0.045，更好学习能力
    'num_leaves': 60,          # 从55增加到60，更多模型容量
    'drop_rate': 0.075,        # 从0.08减少到0.075，更稳定dropout
    'subsample': 0.85,         # 从0.82增加到0.85，更好采样
    'skip_drop': 0.48,         # 从0.45增加到0.48，更平衡的保留
    'max_depth': 12,           # 添加深度限制，控制复杂度
    'min_child_samples': 45,   # 添加最小样本数，防止过拟合
    'lambda_l1': 0.001,        # 添加L1正则化
    'lambda_l2': 2.5,          # 添加L2正则化
    'n_jobs': -1,
    'random_state': RANDOM_STATE,
    'label_gain': [0, 1]
}

print("\nTraining Enhanced LightGBM DART model (v2)...")
print("V2 DART optimizations:")
print("- Refined learning rate (0.045) for better convergence")
print("- Increased capacity (60 leaves) with depth control (12)")
print("- Optimized DART dropout balance (0.075 rate, 0.48 skip)")
print("- Added regularization for better generalization")

lgb_model_dart = lgb.train(
    dart_params,
    lgb_train, 
    num_boost_round=dart_params['n_estimators'], 
    valid_sets=[lgb_val],
    callbacks=[lgb.early_stopping(110), lgb.log_evaluation(50)]  # Increased patience
)

print("\n--- Training CatBoost Model (After Validation) ---")
print("🔧 Now training CatBoost with full dataset in original sequence...")

# Prepare data for CatBoost (needs specific format)
from catboost import CatBoostRanker, Pool

# 🔧 FIXED: Create CatBoost data with proper type handling
# CatBoost has specific requirements for categorical features - they must be integers
# But when converting polars DataFrame to numpy, the entire array becomes float64
# Solution: Use the ORIGINAL data (X) instead of rank-encoded data for CatBoost

print("🔧 Preparing CatBoost data with original categorical features...")

# Use original data X (not rank-encoded data_xgb) for CatBoost
# This preserves the original categorical values as intended
X_catboost_tr = X[:n1]  # Same split as training data
X_catboost_va = X[n1:n2]  # Same split as validation data  
X_catboost_te = X[n2:]  # Same split as test data

# Ensure all categorical features are properly encoded as integers
print(f"📊 Converting {len(cat_features_final)} categorical features to integers...")
for col in cat_features_final:
    # Convert categorical columns to proper integer encoding for CatBoost
    X_catboost_tr = X_catboost_tr.with_columns(
        pl.col(col).cast(pl.String).rank("dense").cast(pl.Int32).alias(col)
    )
    X_catboost_va = X_catboost_va.with_columns(
        pl.col(col).cast(pl.String).rank("dense").cast(pl.Int32).alias(col)
    )
    X_catboost_te = X_catboost_te.with_columns(
        pl.col(col).cast(pl.String).rank("dense").cast(pl.Int32).alias(col)
    )

# Convert non-categorical features to float32 for efficiency
non_cat_features = [col for col in feature_cols if col not in cat_features_final]
print(f"📊 Converting {len(non_cat_features)} numerical features to float32...")
for col in non_cat_features:
    X_catboost_tr = X_catboost_tr.with_columns(pl.col(col).cast(pl.Float32).alias(col))
    X_catboost_va = X_catboost_va.with_columns(pl.col(col).cast(pl.Float32).alias(col))
    X_catboost_te = X_catboost_te.with_columns(pl.col(col).cast(pl.Float32).alias(col))

print("✅ CatBoost data preparation completed with proper data types")

# Create CatBoost pools with properly typed data
print("🏗️ Creating CatBoost Pool objects...")
train_pool = Pool(
    data=X_catboost_tr.to_numpy(),
    label=y_tr.to_numpy().flatten(),
    group_id=groups_tr.to_numpy().flatten(),
    cat_features=catboost_cat_indices
)

val_pool = Pool(
    data=X_catboost_va.to_numpy(),
    label=y_va.to_numpy().flatten(),
    group_id=groups_va.to_numpy().flatten(),
    cat_features=catboost_cat_indices
)

print("✅ CatBoost Pool objects created successfully!")
print(f"   Training pool: {train_pool.num_row()} rows x {train_pool.num_col()} cols")
print(f"   Validation pool: {val_pool.num_row()} rows x {val_pool.num_col()} cols")
print(f"   Categorical features: {len(catboost_cat_indices)} indices")

# CatBoost parameters optimized for business features
catboost_params = {
    'loss_function': 'YetiRank',
    'custom_metric': ['NDCG:top=3'],
    'iterations': 2000,
    'learning_rate': 0.12,
    'depth': 8,
    'l2_leaf_reg': 4.0,
    'bootstrap_type': 'Bayesian',
    'bagging_temperature': 0.8,
    'subsample': 0.85,
    'random_strength': 0.8,
    'one_hot_max_size': 10,
    'max_ctr_complexity': 3,
    'random_seed': RANDOM_STATE,
    'thread_count': -1,
    'verbose': 50
}

catboost_model = CatBoostRanker(**catboost_params)
catboost_model.fit(
    train_pool, 
    eval_set=val_pool,
    early_stopping_rounds=100,
    verbose=50
)

print("All models trained successfully!")


--- Training LightGBM DART Model ---


NameError: name 'RANDOM_STATE' is not defined

## 4. Blending and Final Evaluation

In [None]:
print("\n--- Training CatBoost Model (FIXED) ---")
print("Training CatBoost with CORRECTED data type handling...")

# Prepare data for CatBoost (needs specific format)
from catboost import CatBoostRanker, Pool

# CRITICAL FIX: Use unified categorical encoding approach
# The problem: Polars DataFrame to numpy conversion always results in float64 arrays
# Solution: Create separate data processing pipeline for CatBoost with explicit type control

print("Preparing CatBoost data with FIXED categorical encoding...")

# Step 1: Get data splits using the same indices as other models
X_catboost_tr = X[:n1].clone()  # Training split
X_catboost_va = X[n1:n2].clone()  # Validation split 
X_catboost_te = X[n2:].clone()  # Test split

print(f"CatBoost data splits: Train={len(X_catboost_tr)}, Val={len(X_catboost_va)}, Test={len(X_catboost_te)}")

# Step 2: CRITICAL FIX - Apply proper categorical encoding
# Convert categorical features to integers using a unified approach
print(f"Converting {len(cat_features_final)} categorical features to integers...")

# Create a unified string-to-integer mapping for each categorical feature
for col in cat_features_final:
    print(f"   Processing categorical feature: {col}")
    
    # Combine all data to create consistent encoding across splits
    all_values = pl.concat([
        X_catboost_tr.select(col),
        X_catboost_va.select(col), 
        X_catboost_te.select(col)
    ]).unique().sort(col)
    
    # Create mapping: unique values -> integers (0, 1, 2, ...)
    mapping_dict = {
        val: idx for idx, val in enumerate(all_values[col].to_list())
    }
    
    # Apply mapping to all splits consistently
    X_catboost_tr = X_catboost_tr.with_columns(
        pl.col(col).map_elements(lambda x: mapping_dict.get(x, -1), return_dtype=pl.Int32).alias(col)
    )
    X_catboost_va = X_catboost_va.with_columns(
        pl.col(col).map_elements(lambda x: mapping_dict.get(x, -1), return_dtype=pl.Int32).alias(col)
    )
    X_catboost_te = X_catboost_te.with_columns(
        pl.col(col).map_elements(lambda x: mapping_dict.get(x, -1), return_dtype=pl.Int32).alias(col)
    )

# Step 3: Convert numerical features to float32 for memory efficiency
non_cat_features = [col for col in feature_cols if col not in cat_features_final]
print(f"Converting {len(non_cat_features)} numerical features to float32...")

for col in non_cat_features:
    X_catboost_tr = X_catboost_tr.with_columns(pl.col(col).cast(pl.Float32).alias(col))
    X_catboost_va = X_catboost_va.with_columns(pl.col(col).cast(pl.Float32).alias(col))
    X_catboost_te = X_catboost_te.with_columns(pl.col(col).cast(pl.Float32).alias(col))

print("CatBoost data preparation completed with FIXED data types")

# Step 4: Verify data types before Pool creation
print("Verifying data types before Pool creation...")
sample_data = X_catboost_tr.head(100).to_numpy()
print(f"   Sample data shape: {sample_data.shape}")
print(f"   Sample data dtype: {sample_data.dtype}")

# Check categorical columns specifically
for i, cat_idx in enumerate(catboost_cat_indices[:3]):
    col_data = sample_data[:, cat_idx]
    print(f"   Cat feature {i} (col {cat_idx}): dtype={col_data.dtype}, sample={col_data[:3]}")

# Step 5: Create CatBoost pools with properly typed data
print("Creating CatBoost Pool objects...")

train_pool = Pool(
    data=X_catboost_tr.to_numpy(),
    label=y_tr.to_numpy().flatten(),
    group_id=groups_tr.to_numpy().flatten(),
    cat_features=catboost_cat_indices
)

val_pool = Pool(
    data=X_catboost_va.to_numpy(),
    label=y_va.to_numpy().flatten(),
    group_id=groups_va.to_numpy().flatten(),
    cat_features=catboost_cat_indices
)

print("CatBoost Pool objects created successfully!")
print(f"   Training pool: {train_pool.num_row()} rows x {train_pool.num_col()} cols")
print(f"   Validation pool: {val_pool.num_row()} rows x {val_pool.num_col()} cols")
print(f"   Categorical features: {len(catboost_cat_indices)} indices")

# Step 6: Train CatBoost model with optimized parameters
print("Training CatBoost model...")

catboost_params = {
    'loss_function': 'YetiRank',
    'custom_metric': ['NDCG:top=3'],
    'iterations': 2000,
    'learning_rate': 0.12,
    'depth': 8,
    'l2_leaf_reg': 4.0,
    'bootstrap_type': 'Bayesian',
    'bagging_temperature': 0.8,
    'subsample': 0.85,
    'random_strength': 0.8,
    'one_hot_max_size': 10,
    'max_ctr_complexity': 3,
    'random_seed': RANDOM_STATE,
    'thread_count': -1,
    'verbose': 50
}

catboost_model = CatBoostRanker(**catboost_params)
catboost_model.fit(
    train_pool, 
    eval_set=val_pool,
    early_stopping_rounds=100,
    verbose=50
)

print("CatBoost model trained successfully with FIXED data types!")
print("All models training completed successfully!")

Evaluating all models on the validation set...




------------------------------
XGBoost HitRate@3:     0.5045
LGBM GBDT HitRate@3:   0.4844
LGBM DART HitRate@3:   0.4932
------------------------------
3-Model Blend HitRate@3: 0.5039
------------------------------


## 5. Submission

In [None]:
# Validation and Ensemble Optimization (Required Variables)
print("Setting up ensemble strategies...")

# Get validation predictions for ensemble optimization
xgb_val_preds = xgb_model.predict(dval)
lgb_gbdt_val_preds = lgb_model.predict(data_xgb_va)
lgb_dart_val_preds = lgb_model_dart.predict(data_xgb_va)

# CatBoost validation predictions
catboost_val_preds = catboost_model.predict(val_pool)

# Calculate individual model performance on validation set
val_hitrates = {}
val_hitrates['XGBoost'] = hitrate_at_3(y_va.to_numpy().flatten(), xgb_val_preds, groups_va.to_numpy().flatten())
val_hitrates['LightGBM_GBDT'] = hitrate_at_3(y_va.to_numpy().flatten(), lgb_gbdt_val_preds, groups_va.to_numpy().flatten())
val_hitrates['LightGBM_DART'] = hitrate_at_3(y_va.to_numpy().flatten(), lgb_dart_val_preds, groups_va.to_numpy().flatten())
val_hitrates['CatBoost'] = hitrate_at_3(y_va.to_numpy().flatten(), catboost_val_preds, groups_va.to_numpy().flatten())

print("Individual model validation performance:")
for model, hr in val_hitrates.items():
    print(f"  {model}: {hr:.4f}")

# Define ensemble strategies
strategies = {
    "Static Balanced": np.array([0.35, 0.25, 0.10, 0.30]),  # XGB, LGB-GBDT, LGB-DART, CatBoost
    "DART Focused": np.array([0.25, 0.25, 0.35, 0.15]),
    "Performance Weighted": np.array([0.4, 0.3, 0.1, 0.2]),
    "Static XGBoost Focused": np.array([0.45, 0.15, 0.05, 0.35])
}

# Test strategies on validation data
val_submission_df = data_xgb_va.select(['ranker_id']).with_columns([
    pl.Series('xgb_score', xgb_val_preds),
    pl.Series('lgb_gbdt_score', lgb_gbdt_val_preds),
    pl.Series('lgb_dart_score', lgb_dart_val_preds),
    pl.Series('catboost_score', catboost_val_preds)
])

# Convert scores to ranks
val_submission_df = val_submission_df.with_columns([
    pl.col("xgb_score").rank(method="average", descending=True).over("ranker_id").alias("xgb_rank"),
    pl.col("lgb_gbdt_score").rank(method="average", descending=True).over("ranker_id").alias("lgb_gbdt_rank"),
    pl.col("lgb_dart_score").rank(method="average", descending=True).over("ranker_id").alias("lgb_dart_rank"),
    pl.col("catboost_score").rank(method="average", descending=True).over("ranker_id").alias("catboost_rank")
])

# Test ensemble strategies
best_strategy_hr3 = 0
best_strategy_name = "Static Balanced"
strategy_results = {}

for strategy_name, weights in strategies.items():
    # Create ensemble
    ensemble_score = (weights[0] * val_submission_df.get_column("xgb_rank") + 
                     weights[1] * val_submission_df.get_column("lgb_gbdt_rank") + 
                     weights[2] * val_submission_df.get_column("lgb_dart_rank") + 
                     weights[3] * val_submission_df.get_column("catboost_rank"))
    
    # Get best option per group (lowest ensemble rank)
    val_ensemble_df = val_submission_df.with_columns([
        pl.Series("ensemble_rank", ensemble_score)
    ])
    
    # Rank ensemble scores within each group
    val_ensemble_df = val_ensemble_df.with_columns([
        pl.col("ensemble_rank").rank(method="ordinal", descending=False).over("ranker_id").alias("final_rank")
    ])
    
    # Select top option per group
    selected = val_ensemble_df.filter(pl.col("final_rank") == 1)
    
    # Create predictions (1 for selected, 0 for others)
    val_preds = np.zeros(len(val_ensemble_df))
    selected_indices = selected.select(pl.int_range(pl.len()).over("ranker_id")).to_numpy().flatten()
    # val_preds[selected_indices] = 1
    
    # Calculate hit rate using ensemble ranking
    val_ensemble_preds = val_ensemble_df.get_column("ensemble_rank").to_numpy()
    strategy_hr3 = hitrate_at_3(y_va.to_numpy().flatten(), -val_ensemble_preds, groups_va.to_numpy().flatten())
    
    strategy_results[strategy_name] = strategy_hr3
    print(f"Strategy '{strategy_name}': {strategy_hr3:.4f}")
    
    if strategy_hr3 > best_strategy_hr3:
        best_strategy_hr3 = strategy_hr3
        best_strategy_name = strategy_name

# Set the best strategy weights
if best_strategy_name in strategies:
    optimized_weights = strategies[best_strategy_name]
    dart_focused_weights = strategies.get("DART Focused", strategies["Static Balanced"])
    performance_weights = strategies.get("Performance Weighted", strategies["Static Balanced"])
    balanced_weights = strategies["Static Balanced"]
else:
    optimized_weights = strategies["Static Balanced"]
    dart_focused_weights = strategies["Static Balanced"]
    performance_weights = strategies["Static Balanced"]
    balanced_weights = strategies["Static Balanced"]

print(f"\nBest strategy: {best_strategy_name} (HR@3: {best_strategy_hr3:.4f})")
print("Ensemble optimization completed successfully!")

In [None]:
print("Generating predictions on test set...")

# XGBoost test predictions
xgb_test_preds = xgb_model.predict(dtest)

# LightGBM test predictions  
lgb_gbdt_test_preds = lgb_model.predict(data_xgb_te)
lgb_dart_test_preds = lgb_model_dart.predict(data_xgb_te)

# CatBoost test predictions with SAME encoding as training
print("Preparing CatBoost test predictions with FIXED encoding...")

# CatBoost test data is already properly encoded in the training cell
# X_catboost_te was processed with the same categorical encoding pipeline
# So we can directly create the test pool and predict

test_pool = Pool(
    data=X_catboost_te.to_numpy(),
    group_id=groups_te.to_numpy().flatten(),
    cat_features=catboost_cat_indices
)

print("CatBoost test pool created successfully")
catboost_test_preds = catboost_model.predict(test_pool)
print("CatBoost test predictions generated successfully")

# Create comprehensive submission dataframe with all models and strategies
submission_df = test.select(['Id', 'ranker_id']).with_columns([
    pl.Series('xgb_score', xgb_test_preds),
    pl.Series('lgb_gbdt_score', lgb_gbdt_test_preds),
    pl.Series('lgb_dart_score', lgb_dart_test_preds),
    pl.Series('catboost_score', catboost_test_preds)
])

# Convert scores to ranks within each group
submission_df = submission_df.with_columns([
    pl.col("xgb_score").rank(method="average", descending=True).over("ranker_id").alias("xgb_rank"),
    pl.col("lgb_gbdt_score").rank(method="average", descending=True).over("ranker_id").alias("lgb_gbdt_rank"),
    pl.col("lgb_dart_score").rank(method="average", descending=True).over("ranker_id").alias("lgb_dart_rank"),
    pl.col("catboost_score").rank(method="average", descending=True).over("ranker_id").alias("catboost_rank")
])

print("All test predictions generated successfully!")
print(f"Test predictions shape: {submission_df.shape}")
print(f"XGBoost predictions range: {xgb_test_preds.min():.4f} - {xgb_test_preds.max():.4f}")
print(f"LightGBM GBDT predictions range: {lgb_gbdt_test_preds.min():.4f} - {lgb_gbdt_test_preds.max():.4f}")
print(f"LightGBM DART predictions range: {lgb_dart_test_preds.min():.4f} - {lgb_dart_test_preds.max():.4f}")
print(f"CatBoost predictions range: {catboost_test_preds.min():.4f} - {catboost_test_preds.max():.4f}")

# STEP 1: STRATEGY SELECTION AND ENSEMBLE OPTIMIZATION
print("\nStep 1: Applying Best Strategy to Test Data...")

# Apply the best strategy from validation to test data
if best_strategy_name == "DART Focused":
    submission_df = submission_df.with_columns([
        (dart_focused_weights[0] * pl.col("xgb_rank") + dart_focused_weights[1] * pl.col("lgb_gbdt_rank") + 
         dart_focused_weights[2] * pl.col("lgb_dart_rank") + dart_focused_weights[3] * pl.col("catboost_rank")).alias("best_ensemble")
    ])
    strategy_weights = dart_focused_weights
    
elif best_strategy_name == "Dynamic Optimized":
    submission_df = submission_df.with_columns([
        (optimized_weights[0] * pl.col("xgb_rank") + optimized_weights[1] * pl.col("lgb_gbdt_rank") + 
         optimized_weights[2] * pl.col("lgb_dart_rank") + optimized_weights[3] * pl.col("catboost_rank")).alias("best_ensemble")
    ])
    strategy_weights = optimized_weights
    
elif best_strategy_name == "Performance Weighted":
    submission_df = submission_df.with_columns([
        (performance_weights[0] * pl.col("xgb_rank") + performance_weights[1] * pl.col("lgb_gbdt_rank") + 
         performance_weights[2] * pl.col("lgb_dart_rank") + performance_weights[3] * pl.col("catboost_rank")).alias("best_ensemble")
    ])
    strategy_weights = performance_weights
    
elif best_strategy_name == "Adaptive Balanced":
    submission_df = submission_df.with_columns([
        (balanced_weights[0] * pl.col("xgb_rank") + balanced_weights[1] * pl.col("lgb_gbdt_rank") + 
         balanced_weights[2] * pl.col("lgb_dart_rank") + balanced_weights[3] * pl.col("catboost_rank")).alias("best_ensemble")
    ])
    strategy_weights = balanced_weights
    
elif best_strategy_name == "Static XGBoost Focused":
    submission_df = submission_df.with_columns([
        (0.45 * pl.col("xgb_rank") + 0.35 * pl.col("catboost_rank") + 
         0.15 * pl.col("lgb_gbdt_rank") + 0.05 * pl.col("lgb_dart_rank")).alias("best_ensemble")
    ])
    strategy_weights = np.array([0.45, 0.15, 0.05, 0.35])  # XGB, LGB-GBDT, LGB-DART, CatBoost
    
else:  # Static Balanced as fallback
    submission_df = submission_df.with_columns([
        (0.35 * pl.col("xgb_rank") + 0.3 * pl.col("catboost_rank") + 
         0.25 * pl.col("lgb_gbdt_rank") + 0.1 * pl.col("lgb_dart_rank")).alias("best_ensemble")
    ])
    strategy_weights = np.array([0.35, 0.25, 0.10, 0.30])  # XGB, LGB-GBDT, LGB-DART, CatBoost

print(f"Applied Strategy: {best_strategy_name}")
print(f"Strategy Weights: XGB={strategy_weights[0]:.3f}, LGB-GBDT={strategy_weights[1]:.3f}, LGB-DART={strategy_weights[2]:.3f}, CatBoost={strategy_weights[3]:.3f}")

# STEP 2: FINAL SUBMISSION RANKING
print("\nStep 2: Creating Final Submission...")

# Convert ensemble scores to final ranks
final_submission = submission_df.select(['Id']).with_columns([
    pl.col('Id').alias('Id'),
    pl.lit(1).alias('selected')  # All flights ranked as 1 initially
])

# Rank flights within each ranker group based on best ensemble score
submission_with_ranks = submission_df.with_columns([
    pl.col("best_ensemble").rank(method="ordinal", descending=False).over("ranker_id").alias("final_rank")
])

# Keep only the top-ranked flight for each ranker (rank 1)
final_submission = submission_with_ranks.filter(pl.col("final_rank") == 1).select(['Id']).with_columns([
    pl.lit(1).alias('selected')
])

print(f"Final submission created!")
print(f"Selected flights: {len(final_submission)}")
print(f"Expected rankers: ~{len(test.select('ranker_id').unique())}")

# Save submission file
final_submission.write_csv('submission.csv')
print("Submission saved to 'submission.csv'")

print("\nENSEMBLE OPTIMIZATION COMPLETE!")
print("=" * 60)
print(f"Best Strategy: {best_strategy_name}")
print(f"Validation HitRate@3: {best_strategy_hr3:.4f}")
print(f"Strategy Weights: XGB={strategy_weights[0]:.3f}, LGB-GBDT={strategy_weights[1]:.3f}, LGB-DART={strategy_weights[2]:.3f}, CatBoost={strategy_weights[3]:.3f}")
print("Ready for Kaggle submission!")

Generating predictions for the test set with all three models...



Submission file 'submission.csv' created successfully.


shape: (5, 3)
┌──────────┬─────────────────────────────────┬──────────┐
│ Id       ┆ ranker_id                       ┆ selected │
│ ---      ┆ ---                             ┆ ---      │
│ i64      ┆ str                             ┆ i32      │
╞══════════╪═════════════════════════════════╪══════════╡
│ 18144679 ┆ c9373e5f772e43d593dd6ad2fa90f6… ┆ 18       │
│ 18144680 ┆ c9373e5f772e43d593dd6ad2fa90f6… ┆ 76       │
│ 18144681 ┆ c9373e5f772e43d593dd6ad2fa90f6… ┆ 267      │
│ 18144682 ┆ c9373e5f772e43d593dd6ad2fa90f6… ┆ 102      │
│ 18144683 ┆ c9373e5f772e43d593dd6ad2fa90f6… ┆ 82       │
└──────────┴─────────────────────────────────┴──────────┘
