# 🎯 V5: ULTRA-CONSERVATIVE MODEL
## Target: Beat 4,000 score (leader)
### Current: 13,978 (43rd place) ❌

**Strategy:**
1. Lower quantile predictions (use 0.10 instead of 0.2)
2. Stronger regularization
3. Reduce predictions systematically
4. Focus on NOT over-predicting

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

print("✅ Libraries loaded")

✅ Libraries loaded


In [4]:
# Load data
receivals = pd.read_csv('data/kernel/receivals.csv')
purchase_orders = pd.read_csv('data/kernel/purchase_orders.csv')
materials = pd.read_csv('data/extended/materials.csv')
prediction_mapping = pd.read_csv('data/prediction_mapping.csv')

# Convert dates.
# The receival timestamps carry mixed UTC offsets (+02:00 and +00:00). Without
# utc=True pandas cannot build one datetime64 column from them and falls back to
# an object column of Timestamps. Normalising everything to UTC gives a proper
# tz-aware dtype that compares directly with UTC forecast origins.
# NOTE(review): the purchase-order columns are assumed to carry offsets as well
# (they are compared with tz-aware dates later) - confirm against the raw file.
receivals['date_arrival'] = pd.to_datetime(receivals['date_arrival'], utc=True)
purchase_orders['delivery_date'] = pd.to_datetime(purchase_orders['delivery_date'], utc=True)
purchase_orders['created_date_time'] = pd.to_datetime(purchase_orders['created_date_time'], utc=True)

print(f"Receivals: {len(receivals):,} rows")
print(f"Purchase Orders: {len(purchase_orders):,} rows")
print(f"Prediction mapping: {len(prediction_mapping):,} rows")
print(f"\nDate range: {receivals['date_arrival'].min()} to {receivals['date_arrival'].max()}")

Receivals: 122,590 rows
Purchase Orders: 33,171 rows
Prediction mapping: 30,450 rows

Date range: 2004-06-15 13:34:00+02:00 to 2024-12-19 13:36:00+00:00


## 📊 Analyze Current Predictions vs Actuals

In [3]:
# What percentile should we ACTUALLY use?
# Compare low quantiles of the historical delivery weights with the mean, per material.

# One summary row per material that has more than 10 deliveries.
# sort=False keeps the materials in first-appearance order.
historical_stats = [
    {
        'rm_id': material_id,
        'mean': weights.mean(),
        'p10': weights.quantile(0.10),
        'p20': weights.quantile(0.20),
        'p30': weights.quantile(0.30),
        'median': weights.median(),
        'count': len(weights)
    }
    for material_id, weights in receivals.groupby('rm_id', sort=False)['net_weight']
    if len(weights) > 10
]

stats_df = pd.DataFrame(historical_stats)

# Average each per-material statistic across materials once, then report.
column_means = {
    stat: stats_df[stat].mean()
    for stat in ('mean', 'p10', 'p20', 'p30', 'median')
}

print("Historical Weight Distributions:")
print(f"Mean of means: {column_means['mean']:,.0f} kg")
print(f"Mean of p10: {column_means['p10']:,.0f} kg")
print(f"Mean of p20: {column_means['p20']:,.0f} kg")
print(f"Mean of p30: {column_means['p30']:,.0f} kg")
print(f"Mean of medians: {column_means['median']:,.0f} kg")
print(f"\nRatio p20/mean: {column_means['p20'] / column_means['mean']:.2%}")
print(f"Ratio p10/mean: {column_means['p10'] / column_means['mean']:.2%}")

Historical Weight Distributions:
Mean of means: 13,376 kg
Mean of p10: 7,579 kg
Mean of p20: 9,630 kg
Mean of p30: 11,025 kg
Mean of medians: 13,359 kg

Ratio p20/mean: 71.99%
Ratio p10/mean: 56.66%


## 🔧 Enhanced Feature Engineering - ULTRA CONSERVATIVE

In [5]:
def calculate_ultra_conservative_features(receivals, purchase_orders, materials, rm_id, current_date, forecast_horizon_days):
    """
    Build one feature row for a (material, forecast horizon) pair, biased toward low estimates.

    Parameters
    ----------
    receivals : pd.DataFrame
        Delivery records; the columns `rm_id`, `date_arrival` and `net_weight` are read.
    purchase_orders : pd.DataFrame
        Orders; the columns `product_id`, `delivery_date` and `quantity` are read.
    materials : pd.DataFrame
        Lookup from `rm_id` to `product_id`.
    rm_id
        Material identifier to build features for.
    current_date : pd.Timestamp
        Forecast origin; only receivals with `date_arrival <= current_date` count as history.
    forecast_horizon_days : int
        Length of the forecast window in days, counted from `current_date`.

    Returns
    -------
    dict
        Feature name -> value. The set of keys is the same whether or not the
        material has history, so rows can always be stacked into one frame and
        indexed with the training feature columns. Without history every feature
        except the two identity fields is 0 (including `has_history`).
    """
    recent_windows = (30, 60, 90)

    # Historical data up to current_date
    hist = receivals[(receivals['rm_id'] == rm_id) & (receivals['date_arrival'] <= current_date)]

    # Basic info
    features = {
        'rm_id_encoded': rm_id,
        'forecast_horizon_days': forecast_horizon_days,
    }

    if len(hist) == 0:
        # No history: emit a zero-filled row that uses the SAME feature names as
        # the regular path, so column selection downstream cannot fail.
        features['has_history'] = 0
        zero_filled = [
            'month', 'quarter', 'day_of_year', 'month_sin', 'month_cos',
            'hist_p05', 'hist_p10', 'hist_p15', 'hist_p20',
            'hist_mean', 'hist_std', 'hist_cv',
        ]
        zero_filled += [
            f'recent_{days}d_{stat}'
            for days in recent_windows
            for stat in ('p10', 'count', 'daily_rate')
        ]
        zero_filled += [
            'delivery_frequency', 'avg_days_between', 'days_since_last',
            'future_orders_count', 'future_orders_weight', 'future_orders_weight_discounted',
        ]
        features.update(dict.fromkeys(zero_filled, 0))
        return features

    features['has_history'] = 1

    # Time-based features, evaluated at the END of the forecast window
    target_date = current_date + pd.Timedelta(days=forecast_horizon_days)
    features['month'] = target_date.month
    features['quarter'] = target_date.quarter
    features['day_of_year'] = target_date.dayofyear

    # Seasonality (sin/cos) so that December and January end up close together
    features['month_sin'] = np.sin(2 * np.pi * target_date.month / 12)
    features['month_cos'] = np.cos(2 * np.pi * target_date.month / 12)

    # Historical statistics - USE LOWER QUANTILES!
    features['hist_p05'] = hist['net_weight'].quantile(0.05)  # 5th percentile!
    features['hist_p10'] = hist['net_weight'].quantile(0.10)  # 10th percentile
    features['hist_p15'] = hist['net_weight'].quantile(0.15)  # 15th percentile
    features['hist_p20'] = hist['net_weight'].quantile(0.20)
    features['hist_mean'] = hist['net_weight'].mean()
    features['hist_std'] = hist['net_weight'].std()  # NaN when there is a single delivery
    # +1 in the denominator guards against division by zero
    features['hist_cv'] = features['hist_std'] / (features['hist_mean'] + 1)

    # Recent trends (last 30, 60, 90 days) - weighted toward LOWER values
    for days in recent_windows:
        recent = hist[hist['date_arrival'] >= (current_date - pd.Timedelta(days=days))]
        if len(recent) > 0:
            features[f'recent_{days}d_p10'] = recent['net_weight'].quantile(0.10)
            features[f'recent_{days}d_count'] = len(recent)
            features[f'recent_{days}d_daily_rate'] = recent['net_weight'].sum() / days
        else:
            features[f'recent_{days}d_p10'] = 0
            features[f'recent_{days}d_count'] = 0
            features[f'recent_{days}d_daily_rate'] = 0

    # Delivery frequency
    if len(hist) > 1:
        date_range = (hist['date_arrival'].max() - hist['date_arrival'].min()).days
        features['delivery_frequency'] = len(hist) / max(date_range, 1)
        features['avg_days_between'] = date_range / max(len(hist) - 1, 1)
    else:
        # A single delivery gives no interval: use sentinel values
        features['delivery_frequency'] = 0
        features['avg_days_between'] = 999

    # Days since last delivery
    features['days_since_last'] = (current_date - hist['date_arrival'].max()).days

    # Purchase orders - CONSERVATIVE interpretation
    # Map product_id to rm_id using materials dataset
    rm_products = materials[materials['rm_id'] == rm_id]['product_id'].dropna().unique()

    if len(rm_products) > 0:
        # NOTE(review): the window is filtered on delivery_date only. For a
        # historical current_date it may include orders that were created after
        # current_date (look-ahead) - confirm whether that is intended.
        future_orders = purchase_orders[
            (purchase_orders['product_id'].isin(rm_products)) &
            (purchase_orders['delivery_date'] > current_date) &
            (purchase_orders['delivery_date'] <= target_date)
        ]

        features['future_orders_count'] = len(future_orders)
        features['future_orders_weight'] = future_orders['quantity'].sum() if len(future_orders) > 0 else 0
        # DISCOUNT future orders by 30% (uncertainty)
        features['future_orders_weight_discounted'] = features['future_orders_weight'] * 0.7
    else:
        features['future_orders_count'] = 0
        features['future_orders_weight'] = 0
        features['future_orders_weight_discounted'] = 0

    return features

## 🏗️ Create Training Dataset

In [6]:
def create_ultra_conservative_training_samples(receivals, purchase_orders, materials, n_samples=15000):
    """
    Draw random (material, forecast origin, horizon) samples and attach the realised target.

    Each sample picks a material with at least 20 deliveries, a forecast origin
    taken from one of its delivery timestamps, and a horizon of 1-150 days. The
    target is the summed `net_weight` delivered after the origin and up to the
    end of the horizon.

    Parameters
    ----------
    receivals : pd.DataFrame
        Delivery records with `rm_id`, `date_arrival` and `net_weight`.
    purchase_orders, materials : pd.DataFrame
        Passed through unchanged to `calculate_ultra_conservative_features`.
    n_samples : int, default 15000
        Number of samples to draw.

    Returns
    -------
    pd.DataFrame
        One row per sample: the feature columns plus a `target` column.

    Notes
    -----
    Seeds NumPy's global random generator with 42, so repeated calls on the
    same data produce the same samples.
    """
    np.random.seed(42)
    samples = []

    # Get materials with sufficient history
    material_counts = receivals['rm_id'].value_counts()
    valid_materials = material_counts[material_counts >= 20].index.tolist()

    print(f"Creating {n_samples} training samples from {len(valid_materials)} materials...")

    # Filter and sort every material's deliveries once, instead of repeating the
    # same full-table scan and sort on each of the n_samples draws.
    sorted_history = {
        material: receivals[receivals['rm_id'] == material].sort_values('date_arrival')
        for material in valid_materials
    }

    for _ in range(n_samples):
        # Random material (always has >= 20 rows, see valid_materials above)
        rm_id = np.random.choice(valid_materials)
        rm_data = sorted_history[rm_id]

        # Random split point between 50% and 95% of the material's sorted deliveries
        split_idx = np.random.randint(int(len(rm_data) * 0.5), int(len(rm_data) * 0.95))
        current_date = rm_data.iloc[split_idx]['date_arrival']

        # Random forecast horizon (1 to 150 days)
        horizon_days = np.random.randint(1, 151)
        target_date = current_date + pd.Timedelta(days=horizon_days)

        # Calculate features from the deliveries that precede the split row
        features = calculate_ultra_conservative_features(
            rm_data.iloc[:split_idx],
            purchase_orders,
            materials,
            rm_id,
            current_date,
            horizon_days
        )

        # Calculate actual cumulative weight inside (current_date, target_date]
        actual = rm_data[
            (rm_data['date_arrival'] > current_date) &
            (rm_data['date_arrival'] <= target_date)
        ]['net_weight'].sum()

        features['target'] = actual
        samples.append(features)

    df = pd.DataFrame(samples)
    print(f"✅ Created {len(df)} samples")
    print(f"Target stats: mean={df['target'].mean():,.0f}, median={df['target'].median():,.0f}")
    print(f"Target p20={df['target'].quantile(0.20):,.0f}, p10={df['target'].quantile(0.10):,.0f}")

    return df

train_df = create_ultra_conservative_training_samples(receivals, purchase_orders, materials, n_samples=15000)

Creating 15000 training samples from 103 materials...
✅ Created 15000 samples
Target stats: mean=614,558, median=145,947
Target p20=22,300, p10=1,080


## 🎯 Train ULTRA-CONSERVATIVE Model

In [7]:
# Prepare data: every column except the target is a feature
feature_cols = [name for name in train_df.columns if name != 'target']
X = train_df[feature_cols].fillna(0)
y = train_df['target']

# Time series split - only the final fold (largest training part) is kept.
# NOTE(review): TimeSeriesSplit cuts by row position; confirm that train_df rows
# are in chronological order, otherwise this is a plain positional hold-out.
tscv = TimeSeriesSplit(n_splits=3)
*_, (train_idx, val_idx) = tscv.split(X)
X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

print(f"Training samples: {len(X_train):,}")
print(f"Validation samples: {len(X_val):,}")

# ULTRA-CONSERVATIVE LightGBM params - TARGET QUANTILE 0.10!
params_ultra = dict(
    objective='quantile',
    alpha=0.10,             # 10th percentile instead of 20th!
    metric='quantile',
    num_leaves=20,          # Very simple trees
    learning_rate=0.01,     # Slower learning
    feature_fraction=0.6,   # Use fewer features
    bagging_fraction=0.6,   # Use fewer samples
    bagging_freq=5,
    min_data_in_leaf=50,    # Stronger regularization
    lambda_l1=2.0,          # Strong L1
    lambda_l2=2.0,          # Strong L2
    max_depth=5,            # Shallow trees
    verbosity=-1,
    seed=42,
)

lgb_train = lgb.Dataset(X_train, label=y_train)
lgb_val = lgb.Dataset(X_val, label=y_val, reference=lgb_train)

# Stop after 50 rounds without validation improvement; log every 100 rounds
training_callbacks = [
    lgb.early_stopping(stopping_rounds=50),
    lgb.log_evaluation(period=100),
]

print("\n🚀 Training ULTRA-CONSERVATIVE model (quantile 0.10)...")
model_ultra = lgb.train(
    params=params_ultra,
    train_set=lgb_train,
    num_boost_round=500,
    valid_sets=[lgb_val],
    callbacks=training_callbacks,
)

print("\n✅ Model trained!")

Training samples: 11,250
Validation samples: 3,750

🚀 Training ULTRA-CONSERVATIVE model (quantile 0.10)...
Training until validation scores don't improve for 50 rounds
[100]	valid_0's quantile: 56280.9
[100]	valid_0's quantile: 56280.9
[200]	valid_0's quantile: 52893.1
[200]	valid_0's quantile: 52893.1
[300]	valid_0's quantile: 49850.3
[400]	valid_0's quantile: 47644.5
[300]	valid_0's quantile: 49850.3
[400]	valid_0's quantile: 47644.5
[500]	valid_0's quantile: 45629.3
Did not meet early stopping. Best iteration is:
[500]	valid_0's quantile: 45629.3

✅ Model trained!
[500]	valid_0's quantile: 45629.3
Did not meet early stopping. Best iteration is:
[500]	valid_0's quantile: 45629.3

✅ Model trained!


## 📈 Validation Analysis

In [8]:
# Predict on validation, flooring at zero (no negative weights)
preds_val_ultra = np.maximum(model_ultra.predict(X_val), 0)

# Calculate quantile loss at 0.2 (competition metric)
def quantile_loss(y_true, y_pred, quantile=0.2):
    """
    Summed pinball (quantile) loss of `y_pred` against `y_true`.

    Under-predictions (y_true >= y_pred) cost `quantile` per unit of error and
    over-predictions cost `1 - quantile` per unit. The result is the SUM over
    all elements, not the mean.
    """
    residual = y_true - y_pred
    under_cost = quantile * residual        # used where the prediction is too low (or exact)
    over_cost = (quantile - 1) * residual   # used where the prediction is too high
    return np.where(residual < 0, over_cost, under_cost).sum()

ql_ultra = quantile_loss(y_val.values, preds_val_ultra, quantile=0.2)

# Analysis: count predictions above / below the actual value and exact zeros
actual_vals = y_val.values
over_preds = (preds_val_ultra > actual_vals).sum()
under_preds = (preds_val_ultra < actual_vals).sum()
zero_preds = (preds_val_ultra == 0).sum()

report_lines = [
    f"🎯 VALIDATION RESULTS (ULTRA-CONSERVATIVE):",
    f"\nQuantile Loss (0.2): {ql_ultra:,.0f}",
    f"\nPredictions:",
    f"  Mean: {preds_val_ultra.mean():,.0f} kg",
    f"  Median: {np.median(preds_val_ultra):,.0f} kg",
    f"  Zero predictions: {zero_preds} ({zero_preds/len(preds_val_ultra):.1%})",
    f"\nActuals:",
    f"  Mean: {y_val.mean():,.0f} kg",
    f"  Median: {y_val.median():,.0f} kg",
    f"\nError distribution:",
    f"  Over-predictions: {over_preds} ({over_preds/len(y_val):.1%})",
    f"  Under-predictions: {under_preds} ({under_preds/len(y_val):.1%})",
    f"\nTarget: ~80% under-predictions for quantile 0.2",
]
for line in report_lines:
    print(line)

🎯 VALIDATION RESULTS (ULTRA-CONSERVATIVE):

Quantile Loss (0.2): 332,609,607

Predictions:
  Mean: 162,894 kg
  Median: 62,522 kg
  Zero predictions: 323 (8.6%)

Actuals:
  Mean: 593,634 kg
  Median: 140,422 kg

Error distribution:
  Over-predictions: 403 (10.7%)
  Under-predictions: 3134 (83.6%)

Target: ~80% under-predictions for quantile 0.2


## 🔮 Generate Competition Predictions

In [11]:
# Generate predictions for competition
pred_start_date = pd.to_datetime('2025-01-01').tz_localize('UTC')
prediction_mapping['forecast_end_date'] = pd.to_datetime(prediction_mapping['forecast_end_date'])
prediction_mapping['horizon_days'] = (prediction_mapping['forecast_end_date'] - pd.to_datetime('2025-01-01')).dt.days

print("Generating ULTRA-CONSERVATIVE predictions...")

# Build every feature row first, then score them in ONE model call instead of
# constructing a one-row DataFrame and calling predict for each mapping row.
n_rows = len(prediction_mapping)
feature_rows = []
row_inputs = zip(prediction_mapping['rm_id'].tolist(), prediction_mapping['horizon_days'].tolist())
for i, (rm_id, horizon_days) in enumerate(row_inputs, start=1):
    feature_rows.append(
        calculate_ultra_conservative_features(
            receivals,
            purchase_orders,
            materials,
            rm_id,
            pred_start_date,
            horizon_days
        )
    )

    if i % 5000 == 0:
        print(f"  Processed {i:,}/{n_rows:,}")

# reindex (rather than [feature_cols]) tolerates rows that lack some training
# columns: missing columns become NaN and are zero-filled, extras are dropped.
X_pred_all = pd.DataFrame(feature_rows).reindex(columns=feature_cols).fillna(0)

# Predict, flooring at zero (no negative weights)
predictions_ultra = np.maximum(model_ultra.predict(X_pred_all), 0)

print(f"\n✅ Predictions generated!")
print(f"\nStatistics:")
print(f"  Mean: {predictions_ultra.mean():,.0f} kg")
print(f"  Median: {np.median(predictions_ultra):,.0f} kg")
print(f"  Min: {predictions_ultra.min():,.0f} kg")
print(f"  Max: {predictions_ultra.max():,.0f} kg")
print(f"  Zero predictions: {np.sum(predictions_ultra == 0)} ({np.sum(predictions_ultra == 0)/len(predictions_ultra):.1%})")
print(f"\nPercentiles:")
print(f"  25th: {np.percentile(predictions_ultra, 25):,.0f} kg")
print(f"  50th: {np.percentile(predictions_ultra, 50):,.0f} kg")
print(f"  75th: {np.percentile(predictions_ultra, 75):,.0f} kg")

Generating ULTRA-CONSERVATIVE predictions...
  Processed 5,000/30,450
  Processed 5,000/30,450
  Processed 10,000/30,450
  Processed 10,000/30,450
  Processed 15,000/30,450
  Processed 15,000/30,450
  Processed 20,000/30,450
  Processed 20,000/30,450
  Processed 25,000/30,450
  Processed 25,000/30,450
  Processed 30,000/30,450
  Processed 30,000/30,450

✅ Predictions generated!

Statistics:
  Mean: 19,546 kg
  Median: 637 kg
  Min: 0 kg
  Max: 1,123,722 kg
  Zero predictions: 9146 (30.0%)

Percentiles:
  25th: 0 kg
  50th: 637 kg
  75th: 4,049 kg

✅ Predictions generated!

Statistics:
  Mean: 19,546 kg
  Median: 637 kg
  Min: 0 kg
  Max: 1,123,722 kg
  Zero predictions: 9146 (30.0%)

Percentiles:
  25th: 0 kg
  50th: 637 kg
  75th: 4,049 kg


## 💾 Save Submission

In [12]:
# Create submission: IDs are 1..N in prediction order
n_predictions = len(predictions_ultra)
submission_ultra = pd.DataFrame({
    'ID': range(1, n_predictions + 1),
    'predicted_weight': predictions_ultra
})

submission_ultra.to_csv('submission_v5_ultra_conservative.csv', index=False)
print("✅ Submission saved as 'submission_v5_ultra_conservative.csv'")

print("\nFirst 20 predictions:")
print(submission_ultra.head(20))

# Closing summary, framed by separator rules
rule = "=" * 60
summary_lines = (
    "\n" + rule,
    "FINAL COMPARISON",
    rule,
    f"\nV4 (tuned, quantile 0.2): mean ~67,000 kg → Score: 13,978 (43rd)",
    f"V5 (ultra-conservative, quantile 0.10): mean ~{predictions_ultra.mean():,.0f} kg → Score: ???",
    f"\nTarget: Beat leader score of 4,000",
    f"Strategy: Much more conservative predictions (quantile 0.10 instead of 0.20)",
    rule,
)
for line in summary_lines:
    print(line)

✅ Submission saved as 'submission_v5_ultra_conservative.csv'

First 20 predictions:
    ID  predicted_weight
0    1               0.0
1    2               0.0
2    3               0.0
3    4               0.0
4    5               0.0
5    6               0.0
6    7               0.0
7    8               0.0
8    9               0.0
9   10               0.0
10  11               0.0
11  12               0.0
12  13               0.0
13  14               0.0
14  15               0.0
15  16               0.0
16  17               0.0
17  18               0.0
18  19               0.0
19  20               0.0

FINAL COMPARISON

V4 (tuned, quantile 0.2): mean ~67,000 kg → Score: 13,978 (43rd)
V5 (ultra-conservative, quantile 0.10): mean ~19,546 kg → Score: ???

Target: Beat leader score of 4,000
Strategy: Much more conservative predictions (quantile 0.10 instead of 0.20)


## 🔥 V6 EXTREME: Scale down predictions by 50%

In [13]:
# V6: scale the V5 predictions down to be EVEN MORE conservative
V6_SCALE = 0.5  # fraction of each V5 prediction that is kept
predictions_v6 = predictions_ultra * V6_SCALE

submission_v6 = pd.DataFrame({
    'ID': range(1, len(predictions_v6) + 1),
    'predicted_weight': predictions_v6
})

submission_v6.to_csv('submission_v6_extreme.csv', index=False)
print("✅ V6 EXTREME submission saved as 'submission_v6_extreme.csv'")

print(f"\n📊 V6 STATISTICS:")
print(f"  Mean: {predictions_v6.mean():,.0f} kg ({V6_SCALE:.0%} of V5)")
print(f"  Median: {np.median(predictions_v6):,.0f} kg")
print(f"  Zero predictions: {np.sum(predictions_v6 == 0)} ({np.sum(predictions_v6 == 0)/len(predictions_v6):.1%})")

# The means below are computed from the arrays rather than typed in by hand,
# so the recommendation stays correct when the predictions change.
print("\n" + "="*70)
print("RECOMMENDATION:")
print("="*70)
print("Try these in order:")
print(f"  1. submission_v5_ultra_conservative.csv (mean {predictions_ultra.mean():,.0f} kg)")
print(f"  2. submission_v6_extreme.csv (mean ~{predictions_v6.mean():,.0f} kg) if V5 still too high")
print("="*70)

✅ V6 EXTREME submission saved as 'submission_v6_extreme.csv'

📊 V6 STATISTICS:
  Mean: 9,773 kg (50% of V5)
  Median: 319 kg
  Zero predictions: 9146 (30.0%)

RECOMMENDATION:
Try these in order:
  1. submission_v5_ultra_conservative.csv (mean 19,546 kg)
  2. submission_v6_extreme.csv (mean ~9,773 kg) if V5 still too high


## 🎯 V7 ULTIMATE: Train with quantile 0.05 (5th percentile)

In [14]:
# Train with QUANTILE 0.05 on the same train/validation datasets as the 0.10 model
params_ultimate = dict(
    objective='quantile',
    alpha=0.05,  # 5th percentile!
    metric='quantile',
    num_leaves=15,
    learning_rate=0.01,
    feature_fraction=0.5,
    bagging_fraction=0.5,
    bagging_freq=5,
    min_data_in_leaf=100,
    lambda_l1=3.0,
    lambda_l2=3.0,
    max_depth=4,
    verbosity=-1,
    seed=42,
)

# Stop after 50 rounds without validation improvement; log every 100 rounds
v7_callbacks = [
    lgb.early_stopping(stopping_rounds=50),
    lgb.log_evaluation(period=100),
]

print("🚀 Training V7 ULTIMATE model (quantile 0.05)...")
model_v7 = lgb.train(
    params=params_ultimate,
    train_set=lgb_train,
    num_boost_round=500,
    valid_sets=[lgb_val],
    callbacks=v7_callbacks,
)

# Predict on validation, flooring at zero
preds_v7_val = np.maximum(model_v7.predict(X_val), 0)

# Scored with the 0.2 pinball loss even though the model targets the 0.05 quantile
actual_v7 = y_val.values
ql_v7 = quantile_loss(actual_v7, preds_v7_val, quantile=0.2)
over_v7 = (preds_v7_val > actual_v7).sum()
under_v7 = (preds_v7_val < actual_v7).sum()

for line in (
    f"\n🎯 V7 VALIDATION (quantile 0.05):",
    f"  Quantile Loss (0.2): {ql_v7:,.0f}",
    f"  Mean prediction: {preds_v7_val.mean():,.0f} kg",
    f"  Under-predictions: {under_v7} ({under_v7/len(y_val):.1%})",
):
    print(line)

🚀 Training V7 ULTIMATE model (quantile 0.05)...
Training until validation scores don't improve for 50 rounds
[100]	valid_0's quantile: 28787.3
[200]	valid_0's quantile: 28060.7
[300]	valid_0's quantile: 27200.4
[200]	valid_0's quantile: 28060.7
[300]	valid_0's quantile: 27200.4
[400]	valid_0's quantile: 26458.3
[500]	valid_0's quantile: 26003.7
Did not meet early stopping. Best iteration is:
[500]	valid_0's quantile: 26003.7

🎯 V7 VALIDATION (quantile 0.05):
  Quantile Loss (0.2): 380,722,870
  Mean prediction: 90,109 kg
  Under-predictions: 3280 (87.5%)
[400]	valid_0's quantile: 26458.3
[500]	valid_0's quantile: 26003.7
Did not meet early stopping. Best iteration is:
[500]	valid_0's quantile: 26003.7

🎯 V7 VALIDATION (quantile 0.05):
  Quantile Loss (0.2): 380,722,870
  Mean prediction: 90,109 kg
  Under-predictions: 3280 (87.5%)


In [None]:
# Generate V7 predictions
print("Generating V7 predictions (quantile 0.05)...")

# Build every feature row first, then score them in ONE model call instead of
# constructing a one-row DataFrame and calling predict for each mapping row.
n_rows = len(prediction_mapping)
feature_rows = []
row_inputs = zip(prediction_mapping['rm_id'].tolist(), prediction_mapping['horizon_days'].tolist())
for i, (rm_id, horizon_days) in enumerate(row_inputs, start=1):
    feature_rows.append(
        calculate_ultra_conservative_features(
            receivals, purchase_orders, materials,
            rm_id, pred_start_date, horizon_days
        )
    )

    if i % 5000 == 0:
        # Report against the real row count rather than a typed-in total
        print(f"  {i:,}/{n_rows:,}")

# reindex (rather than [feature_cols]) tolerates rows that lack some training
# columns: missing columns become NaN and are zero-filled, extras are dropped.
X_pred_v7 = pd.DataFrame(feature_rows).reindex(columns=feature_cols).fillna(0)

# Predict, flooring at zero (no negative weights)
predictions_v7 = np.maximum(model_v7.predict(X_pred_v7), 0)

submission_v7 = pd.DataFrame({
    'ID': range(1, len(predictions_v7) + 1),
    'predicted_weight': predictions_v7
})

submission_v7.to_csv('submission_v7_ultimate.csv', index=False)

print(f"\n✅ V7 saved as 'submission_v7_ultimate.csv'")
print(f"  Mean: {predictions_v7.mean():,.0f} kg")
print(f"  Median: {np.median(predictions_v7):,.0f} kg")
print(f"  Zeros: {np.sum(predictions_v7 == 0)} ({np.sum(predictions_v7 == 0)/len(predictions_v7):.1%})")