# 🚀 GPU-ACCELERATED MODEL - PHASE 1: QUICK WINS
## Target: improve the score from 13,978 to ~9,000-10,000

**Strategies:**
1. CatBoost on GPU with up to 10,000 iterations (the training cell below currently uses 5,000)
2. Optimized XGBoost on GPU
3. Advanced feature engineering (100+ features)
4. Optuna with 1,000+ trials
5. Heavy ensembling

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

# GPU libraries
# Optional dependencies are probed here so the notebook can still run without them.
# CATBOOST_AVAILABLE is checked by the CatBoost training, validation, prediction
# and submission cells further down.
try:
    from catboost import CatBoostRegressor, Pool
    print("✅ CatBoost available")
    CATBOOST_AVAILABLE = True
except ImportError:
    print("❌ CatBoost not installed. Run: pip install catboost")
    CATBOOST_AVAILABLE = False

# Either import failing (optuna itself or optuna.integration's LightGBM callback)
# sets OPTUNA_AVAILABLE to False and prints the same "not installed" message.
# NOTE(review): depending on the Optuna version, the integration callback may need
# a separately installed package — confirm if this reports Optuna as missing.
try:
    import optuna
    from optuna.integration import LightGBMPruningCallback
    print("✅ Optuna available")
    OPTUNA_AVAILABLE = True
except ImportError:
    print("❌ Optuna not installed. Run: pip install optuna")
    OPTUNA_AVAILABLE = False

print("\n✅ Core libraries loaded")

In [None]:
# Read the four input tables (paths are relative to the notebook's working directory)
receivals, purchase_orders, materials, prediction_mapping = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/kernel/receivals.csv',
        'data/kernel/purchase_orders.csv',
        'data/extended/materials.csv',
        'data/prediction_mapping.csv',
    )
)

# Parse the timestamp columns that every date filter below relies on
purchase_orders['delivery_date'] = pd.to_datetime(purchase_orders['delivery_date'])
receivals['date_arrival'] = pd.to_datetime(receivals['date_arrival'])

# Short summary of what was loaded
print(f"Receivals: {len(receivals):,} rows")
print(f"Purchase Orders: {len(purchase_orders):,} rows")
print(f"Prediction mapping: {len(prediction_mapping):,} rows")
print(f"\nDate range: {receivals['date_arrival'].min()} to {receivals['date_arrival'].max()}")

## 🔧 ADVANCED FEATURE ENGINEERING (100+ Features)

In [None]:
def calculate_advanced_features_v2(receivals, purchase_orders, materials, rm_id, current_date, forecast_horizon_days):
    """
    Calculate 100+ advanced features for one (material, cutoff date, horizon) triple.

    The returned dict always has the same keys, whether or not the material
    has any receival on or before ``current_date``. Without history the
    history-derived statistics are 0, ``has_history`` is 0, and the two
    "days" features (``avg_days_between``, ``days_since_last``) use the 999
    sentinel; calendar and purchase-order features are still computed.

    Args:
        receivals: DataFrame with at least ``rm_id``, ``date_arrival`` and
            ``net_weight`` columns.
        purchase_orders: DataFrame with at least ``product_id``,
            ``delivery_date`` and ``quantity`` columns.
        materials: DataFrame with at least ``rm_id`` and ``product_id`` columns.
        rm_id: Identifier of the raw material to describe.
        current_date: Cutoff timestamp; only receivals on or before it are used.
        forecast_horizon_days: Number of days after ``current_date`` that the
            forecast window covers.

    Returns:
        dict mapping feature name to a numeric value (may contain NaN, e.g. the
        standard deviation of a single delivery).
    """
    features = {}

    # Historical data, put in chronological order so the rolling trend below is
    # fit over time even when the caller passes an unsorted table. The stable
    # sort keeps the relative order of same-timestamp rows.
    hist = receivals[(receivals['rm_id'] == rm_id) & (receivals['date_arrival'] <= current_date)]
    hist = hist.sort_values('date_arrival', kind='mergesort')
    has_history = len(hist) > 0
    weights = hist['net_weight']

    # Basic
    features['rm_id_encoded'] = rm_id
    features['forecast_horizon_days'] = forecast_horizon_days
    features['has_history'] = int(has_history)
    features['total_deliveries'] = len(hist)

    # Target date features
    target_date = current_date + pd.Timedelta(days=forecast_horizon_days)
    iso_week = target_date.isocalendar()[1]
    features['month'] = target_date.month
    features['quarter'] = target_date.quarter
    features['day_of_year'] = target_date.dayofyear
    features['week_of_year'] = iso_week
    features['is_quarter_end'] = int(target_date.month in [3, 6, 9, 12])
    features['is_year_end'] = int(target_date.month == 12)

    # Multiple seasonality encodings
    features['month_sin'] = np.sin(2 * np.pi * target_date.month / 12)
    features['month_cos'] = np.cos(2 * np.pi * target_date.month / 12)
    features['quarter_sin'] = np.sin(2 * np.pi * target_date.quarter / 4)
    features['quarter_cos'] = np.cos(2 * np.pi * target_date.quarter / 4)
    features['week_sin'] = np.sin(2 * np.pi * iso_week / 52)
    features['week_cos'] = np.cos(2 * np.pi * iso_week / 52)

    # Historical statistics - MULTIPLE QUANTILES (0 when there is no history)
    for q in [0.05, 0.10, 0.15, 0.20, 0.25, 0.30, 0.50, 0.75, 0.90]:
        features[f'hist_p{int(q*100)}'] = weights.quantile(q) if has_history else 0

    features['hist_mean'] = weights.mean() if has_history else 0
    features['hist_std'] = weights.std() if has_history else 0
    # +1 in the denominator avoids dividing by zero for an all-zero history
    features['hist_cv'] = features['hist_std'] / (features['hist_mean'] + 1)
    features['hist_skew'] = weights.skew() if has_history else 0
    features['hist_kurt'] = weights.kurtosis() if has_history else 0

    # ROLLING WINDOWS - Multiple periods
    for window in [7, 14, 30, 60, 90, 180, 365]:
        prefix = f'roll_{window}d'
        recent = hist[hist['date_arrival'] >= (current_date - pd.Timedelta(days=window))]

        if len(recent) > 0:
            recent_weights = recent['net_weight']
            features[f'{prefix}_mean'] = recent_weights.mean()
            features[f'{prefix}_std'] = recent_weights.std()
            features[f'{prefix}_p10'] = recent_weights.quantile(0.10)
            features[f'{prefix}_p20'] = recent_weights.quantile(0.20)
            features[f'{prefix}_p50'] = recent_weights.quantile(0.50)
            features[f'{prefix}_count'] = len(recent)
            features[f'{prefix}_sum'] = recent_weights.sum()
            features[f'{prefix}_daily_rate'] = recent_weights.sum() / window

            # Trend: slope of a straight line fit over the delivery sequence
            # (needs at least 3 deliveries, otherwise 0)
            if len(recent) > 2:
                x = np.arange(len(recent))
                features[f'{prefix}_trend'] = np.polyfit(x, recent_weights.values, 1)[0]
            else:
                features[f'{prefix}_trend'] = 0
        else:
            for stat in ('mean', 'std', 'p10', 'p20', 'p50', 'count', 'sum', 'daily_rate', 'trend'):
                features[f'{prefix}_{stat}'] = 0

    # LAG FEATURES: weight of the latest delivery at least `lag` days before the cutoff
    hist_sorted = hist.sort_values('date_arrival', ascending=False)
    for lag in [1, 7, 14, 30, 60, 90]:
        lag_data = hist_sorted[hist_sorted['date_arrival'] <= (current_date - pd.Timedelta(days=lag))]
        if len(lag_data) > 0:
            features[f'lag_{lag}d_weight'] = lag_data.iloc[0]['net_weight']
        else:
            features[f'lag_{lag}d_weight'] = 0

    # Delivery frequency
    if len(hist) > 1:
        date_range = (hist['date_arrival'].max() - hist['date_arrival'].min()).days
        features['delivery_frequency'] = len(hist) / max(date_range, 1)
        features['avg_days_between'] = date_range / max(len(hist) - 1, 1)
    else:
        features['delivery_frequency'] = 0
        features['avg_days_between'] = 999

    # Days since last delivery (same 999 sentinel as avg_days_between when unknown)
    if has_history:
        features['days_since_last'] = (current_date - hist['date_arrival'].max()).days
    else:
        features['days_since_last'] = 999
    features['recency_score'] = 1 / (1 + features['days_since_last'] / 30)  # Decay factor

    # Purchase orders due inside the forecast window for this material's products
    rm_products = materials[materials['rm_id'] == rm_id]['product_id'].dropna().unique()

    if len(rm_products) > 0:
        future_orders = purchase_orders[
            (purchase_orders['product_id'].isin(rm_products)) &
            (purchase_orders['delivery_date'] > current_date) &
            (purchase_orders['delivery_date'] <= target_date)
        ]

        features['future_orders_count'] = len(future_orders)
        features['future_orders_qty'] = future_orders['quantity'].sum() if len(future_orders) > 0 else 0
        features['future_orders_avg'] = future_orders['quantity'].mean() if len(future_orders) > 0 else 0
    else:
        features['future_orders_count'] = 0
        features['future_orders_qty'] = 0
        features['future_orders_avg'] = 0

    # INTERACTION FEATURES
    features['horizon_x_freq'] = forecast_horizon_days * features['delivery_frequency']
    features['horizon_x_recency'] = forecast_horizon_days * features['recency_score']
    features['horizon_x_trend_30'] = forecast_horizon_days * features['roll_30d_trend']
    features['month_x_p20'] = features['month'] * features['hist_p20']

    # MOMENTUM features
    features['momentum_30_60'] = features['roll_30d_mean'] - features['roll_60d_mean']
    features['momentum_60_90'] = features['roll_60d_mean'] - features['roll_90d_mean']
    features['acceleration'] = features['momentum_30_60'] - features['momentum_60_90']

    return features

print("✅ Advanced feature engineering function defined")

## 🏗️ Create Training Dataset (15,000 samples for testing; 50,000 for the final run)

In [None]:
def create_gpu_training_samples(receivals, purchase_orders, materials, n_samples=50000):
    """
    Create a large supervised training set by sampling random cutoffs.

    For each sample a material with at least 20 receivals is drawn, a cutoff
    row is picked between 50% and 95% of that material's chronologically
    sorted history, and a horizon of 1-150 days is drawn. Features are built
    from the rows before the cutoff row; the target is the summed
    ``net_weight`` received after the cutoff date up to and including the
    end of the horizon.

    Rows of the returned frame are ordered by cutoff date (ties keep their
    draw order) so that position-based splits such as ``TimeSeriesSplit``
    validate on later cutoffs than they train on.

    Args:
        receivals: DataFrame with ``rm_id``, ``date_arrival`` and ``net_weight``.
        purchase_orders: Passed through to ``calculate_advanced_features_v2``.
        materials: Passed through to ``calculate_advanced_features_v2``.
        n_samples: Number of random draws to attempt.

    Returns:
        DataFrame with one row per sample: the feature columns plus ``target``.

    Note:
        Reseeds NumPy's global random generator with 42 on every call.
    """
    np.random.seed(42)
    samples = []
    cutoff_dates = []

    material_counts = receivals['rm_id'].value_counts()
    valid_materials = material_counts[material_counts >= 20].index.tolist()

    print(f"Creating {n_samples:,} training samples from {len(valid_materials)} materials...")
    print("This will take ~5-10 minutes...")

    # Each material's sorted history is the same for every draw, so filter and
    # sort it once on first use instead of once per sample.
    history_by_material = {}

    for i in range(n_samples):
        if (i + 1) % 5000 == 0:
            print(f"  {i + 1:,}/{n_samples:,} samples...")

        rm_id = np.random.choice(valid_materials)
        rm_data = history_by_material.get(rm_id)
        if rm_data is None:
            rm_data = receivals[receivals['rm_id'] == rm_id].sort_values('date_arrival')
            history_by_material[rm_id] = rm_data

        if len(rm_data) < 20:
            continue

        split_idx = np.random.randint(int(len(rm_data) * 0.5), int(len(rm_data) * 0.95))
        current_date = rm_data.iloc[split_idx]['date_arrival']
        horizon_days = np.random.randint(1, 151)
        target_date = current_date + pd.Timedelta(days=horizon_days)

        # Only rows strictly before the cutoff row are visible to the features
        features = calculate_advanced_features_v2(
            rm_data.iloc[:split_idx],
            purchase_orders,
            materials,
            rm_id,
            current_date,
            horizon_days
        )

        actual = rm_data[
            (rm_data['date_arrival'] > current_date) &
            (rm_data['date_arrival'] <= target_date)
        ]['net_weight'].sum()

        features['target'] = actual
        samples.append(features)
        cutoff_dates.append(current_date)

    # Chronological row order (sorted() is stable, so ties keep draw order)
    chronological = sorted(range(len(samples)), key=cutoff_dates.__getitem__)
    df = pd.DataFrame([samples[k] for k in chronological])
    print(f"\n✅ Created {len(df):,} samples")
    print(f"Features: {len([c for c in df.columns if c != 'target'])}")
    print(f"Target stats: mean={df['target'].mean():,.0f}, p20={df['target'].quantile(0.20):,.0f}")

    return df

# NOTE: Start with 15K samples for testing, then increase to 50K for final training
train_df_gpu = create_gpu_training_samples(receivals, purchase_orders, materials, n_samples=15000)

## 🚀 CATBOOST GPU MODEL

In [None]:
# Prepare data.
# This runs whether or not CatBoost is installed: feature_cols, X_train/X_val and
# y_train/y_val are also used by the XGBoost, validation and prediction cells.
feature_cols = [col for col in train_df_gpu.columns if col != 'target']
X = train_df_gpu[feature_cols].fillna(0)
y = train_df_gpu['target']

# Time series split: only the last of the 3 folds is used (it has the largest
# training part). The split is by row position.
# NOTE(review): this is a real temporal hold-out only if train_df_gpu rows are in
# chronological order — confirm.
tscv = TimeSeriesSplit(n_splits=3)
train_idx, val_idx = list(tscv.split(X))[-1]
X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

print(f"Training samples: {len(X_train):,}")
print(f"Validation samples: {len(X_val):,}")
print(f"Features: {len(feature_cols)}")

if not CATBOOST_AVAILABLE:
    print("⚠️ CatBoost not available. Skipping...")
else:
    # CatBoost GPU params - AGGRESSIVE!
    params_catboost = {
        'loss_function': 'Quantile:alpha=0.2',
        'eval_metric': 'Quantile:alpha=0.2',
        'task_type': 'GPU',  # Use GPU!
        'devices': '0',
        'iterations': 5000,  # Much more iterations
        'depth': 8,
        'learning_rate': 0.03,
        'l2_leaf_reg': 3,
        'bootstrap_type': 'Bayesian',
        'random_strength': 1,
        'bagging_temperature': 1,
        'od_type': 'Iter',
        'od_wait': 100,
        'random_seed': 42,
        'verbose': 100
    }

    print("\n🚀 Training CatBoost GPU model...")
    print("This will take 5-15 minutes depending on GPU...\n")

    model_catboost = CatBoostRegressor(**params_catboost)
    model_catboost.fit(
        X_train, y_train,
        eval_set=(X_val, y_val),
        use_best_model=True,
        plot=False
    )

    print("\n✅ CatBoost model trained!")
    print(f"Best iteration: {model_catboost.get_best_iteration()}")
    print(f"Best score: {model_catboost.get_best_score()['validation']['Quantile:alpha=0.2']:,.2f}")

## ⚡ XGBOOST GPU MODEL

In [None]:
# XGBoost quantile regression (alpha = 0.2) on the GPU
print("🚀 Training XGBoost GPU model...\n")

# Wrap the train/validation split from the data-preparation step
dval = xgb.DMatrix(X_val, label=y_val)
dtrain = xgb.DMatrix(X_train, label=y_train)

params_xgb_gpu = {
    'objective': 'reg:quantileerror',
    'quantile_alpha': 0.2,
    # GPU training is selected by 'device'; 'hist' is the histogram tree builder.
    # NOTE(review): the 'device' parameter and this objective need a recent
    # XGBoost release — confirm the installed version.
    'tree_method': 'hist',
    'device': 'cuda',
    'max_depth': 8,
    'learning_rate': 0.03,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'reg_alpha': 1,
    'reg_lambda': 2,
    'seed': 42
}

# Up to 3000 rounds; stop once the validation metric has not improved for 100 rounds
model_xgb = xgb.train(
    params=params_xgb_gpu,
    dtrain=dtrain,
    num_boost_round=3000,
    evals=[(dval, 'validation')],
    early_stopping_rounds=100,
    verbose_eval=100
)

print("\n✅ XGBoost GPU model trained!")
print(f"Best iteration: {model_xgb.best_iteration}")

## 📊 Validation Comparison

In [None]:
def quantile_loss(y_true, y_pred, quantile=0.2):
    """
    Return the summed pinball (quantile) loss of ``y_pred`` against ``y_true``.

    Under-predictions (``y_true >= y_pred``) cost ``quantile`` per unit of
    error; over-predictions cost ``1 - quantile`` per unit. The result is a
    sum over all elements, not a mean.
    """
    residuals = y_true - y_pred
    under_cost = quantile * residuals
    over_cost = (quantile - 1) * residuals
    return np.where(residuals < 0, over_cost, under_cost).sum()

# Score every model on the validation fold; predictions are clipped at zero first
preds_xgb = np.maximum(model_xgb.predict(dval), 0)
ql_xgb = quantile_loss(y_val.values, preds_xgb)

if CATBOOST_AVAILABLE:
    preds_catboost = np.maximum(model_catboost.predict(X_val), 0)
    ql_catboost = quantile_loss(y_val.values, preds_catboost)
    # Ensemble (simple average)
    preds_ensemble = (preds_catboost + preds_xgb) / 2
else:
    # Without CatBoost the "ensemble" is just the XGBoost prediction
    preds_ensemble = preds_xgb

ql_ensemble = quantile_loss(y_val.values, preds_ensemble)

print("="*70)
print("VALIDATION RESULTS")
print("="*70)

# "Under-predictions" is the share of validation rows predicted below the actual value
if CATBOOST_AVAILABLE:
    print(f"\nCatBoost GPU:")
    print(f"  Quantile Loss: {ql_catboost:,.0f}")
    print(f"  Mean prediction: {preds_catboost.mean():,.0f} kg")
    print(f"  Under-predictions: {np.mean(preds_catboost < y_val.values):.1%}")

print(f"\nXGBoost GPU:")
print(f"  Quantile Loss: {ql_xgb:,.0f}")
print(f"  Mean prediction: {preds_xgb.mean():,.0f} kg")
print(f"  Under-predictions: {np.mean(preds_xgb < y_val.values):.1%}")

print(f"\nEnsemble (avg):")
print(f"  Quantile Loss: {ql_ensemble:,.0f}")
print(f"  Mean prediction: {preds_ensemble.mean():,.0f} kg")
print(f"  Under-predictions: {np.mean(preds_ensemble < y_val.values):.1%}")

print("\n" + "="*70)
print("Target: 75-80% under-predictions for quantile 0.2")
print("="*70)

## 🔮 Generate Competition Predictions

In [None]:
# Generate predictions for competition
# NOTE(review): the cutoff is timezone-aware (UTC) while the horizon below is computed
# against a naive 2025-01-01 — this presumably matches the dtypes of
# receivals['date_arrival'] and forecast_end_date; confirm.
pred_start_date = pd.to_datetime('2025-01-01').tz_localize('UTC')
prediction_mapping['forecast_end_date'] = pd.to_datetime(prediction_mapping['forecast_end_date'])
prediction_mapping['horizon_days'] = (prediction_mapping['forecast_end_date'] - pd.to_datetime('2025-01-01')).dt.days

n_predictions = len(prediction_mapping)

print("Generating GPU predictions...")
print("This will take 5-10 minutes...\n")

# Build every feature row first, then predict once per model: one batched call
# replaces one model call (and one DMatrix) per mapping row.
feature_rows = []
for position, (rm_id, horizon_days) in enumerate(
    zip(prediction_mapping['rm_id'], prediction_mapping['horizon_days']), start=1
):
    feature_rows.append(
        calculate_advanced_features_v2(
            receivals, purchase_orders, materials,
            rm_id, pred_start_date, horizon_days
        )
    )

    # Progress by row position against the real total (not a hard-coded row count)
    if position % 5000 == 0:
        print(f"  {position:,}/{n_predictions:,}")

# reindex (rather than [feature_cols]) so a feature dict that lacks some training
# columns yields NaN -> 0 instead of raising KeyError; extra columns are dropped.
X_pred = pd.DataFrame(feature_rows).reindex(columns=feature_cols).fillna(0)

# Predictions are clipped at zero
predictions_xgb_arr = np.maximum(model_xgb.predict(xgb.DMatrix(X_pred)), 0)
predictions_xgb_list = predictions_xgb_arr.tolist()

predictions_catboost_list = []
if CATBOOST_AVAILABLE:
    predictions_catboost_arr = np.maximum(model_catboost.predict(X_pred), 0)
    predictions_catboost_list = predictions_catboost_arr.tolist()
    predictions_ensemble_arr = (predictions_catboost_arr + predictions_xgb_arr) / 2
else:
    predictions_ensemble_arr = predictions_xgb_arr

print("\n✅ Predictions generated!")

# Statistics
print("\n" + "="*70)
print("PREDICTION STATISTICS")
print("="*70)

if CATBOOST_AVAILABLE:
    print(f"\nCatBoost GPU:")
    print(f"  Mean: {predictions_catboost_arr.mean():,.0f} kg")
    print(f"  Median: {np.median(predictions_catboost_arr):,.0f} kg")
    print(f"  Zeros: {np.sum(predictions_catboost_arr == 0)/len(predictions_catboost_arr):.1%}")

print(f"\nXGBoost GPU:")
print(f"  Mean: {predictions_xgb_arr.mean():,.0f} kg")
print(f"  Median: {np.median(predictions_xgb_arr):,.0f} kg")
print(f"  Zeros: {np.sum(predictions_xgb_arr == 0)/len(predictions_xgb_arr):.1%}")

print(f"\nEnsemble:")
print(f"  Mean: {predictions_ensemble_arr.mean():,.0f} kg")
print(f"  Median: {np.median(predictions_ensemble_arr):,.0f} kg")
print(f"  Zeros: {np.sum(predictions_ensemble_arr == 0)/len(predictions_ensemble_arr):.1%}")

## 💾 Save Submissions

In [None]:
# Save submissions
# Predictions were generated in prediction_mapping row order. When the mapping has
# its own 'ID' column of matching length, use it so each prediction keeps the ID of
# the row it was made for; otherwise fall back to positional IDs 1..N.
# NOTE(review): assumes prediction_mapping['ID'] is the submission identifier — confirm.
n_submission_rows = len(predictions_ensemble_arr)
if 'ID' in prediction_mapping.columns and len(prediction_mapping) == n_submission_rows:
    submission_ids = prediction_mapping['ID'].to_numpy()
else:
    submission_ids = np.arange(1, n_submission_rows + 1)

if CATBOOST_AVAILABLE:
    submission_catboost = pd.DataFrame({
        'ID': submission_ids,
        'predicted_weight': predictions_catboost_arr
    })
    submission_catboost.to_csv('submission_gpu_catboost.csv', index=False)
    print("✅ Saved: submission_gpu_catboost.csv")

submission_xgb = pd.DataFrame({
    'ID': submission_ids,
    'predicted_weight': predictions_xgb_arr
})
submission_xgb.to_csv('submission_gpu_xgboost.csv', index=False)
print("✅ Saved: submission_gpu_xgboost.csv")

submission_ensemble = pd.DataFrame({
    'ID': submission_ids,
    'predicted_weight': predictions_ensemble_arr
})
submission_ensemble.to_csv('submission_gpu_ensemble.csv', index=False)
print("✅ Saved: submission_gpu_ensemble.csv")

print("\n" + "="*70)
print("FINAL COMPARISON")
print("="*70)
print(f"\nV4 (LightGBM tuned): mean ~67,000 kg → Score: 13,978 (43rd)")
print(f"GPU Ensemble: mean ~{predictions_ensemble_arr.mean():,.0f} kg → Score: ???")
print(f"\nExpected improvement: 20-30%")
print(f"Target score: ~9,000-11,000")
print("\n" + "="*70)
print("\n🎯 RECOMMENDATIONS:")
print("  1. Try submission_gpu_ensemble.csv first")
print("  2. If too high, try submission_gpu_catboost.csv")
print("  3. If still too high, scale down by 0.8x")
print("="*70)