In [None]:
# Combines main2.ipynb feature engineering with Kaggle API template


"""
This notebook includes:
1. All helper functions from main2.ipynb
2. Complete feature engineering pipeline
3. XGBoost model training with all features
4. Kaggle API prediction loop with state management

USAGE: Copy the entire code into ONE cell of a Kaggle notebook.
"""

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
from collections import deque
import enefit

# HELPER FUNCTIONS 

def upsample_daily_to_hourly(df: pd.DataFrame, date_col: str) -> pd.DataFrame:
    """
    Expand daily records into hourly records.

    Every input row is repeated 24 times; the copies receive a 'datetime'
    column holding the row's date plus 0..23 hours. The original date column
    is removed and the input frame is left untouched.

    Args:
        df: Frame with one row per day (per entity).
        date_col: Name of the column holding the day; parsed with
            pd.to_datetime.

    Returns:
        A new frame with 24 rows per input row, the input's remaining columns
        followed by 'datetime'. Each input row's index label is repeated on
        its 24 copies.
    """
    hours_per_day = 24

    # Vectorised expansion: avoids building a Python list of 24 Timestamps per
    # row and guarantees that 'datetime' ends up with a datetime dtype.
    day_starts = pd.DatetimeIndex(pd.to_datetime(df[date_col]))
    row_positions = np.repeat(np.arange(len(df)), hours_per_day)
    hour_offsets = pd.to_timedelta(
        np.tile(np.arange(hours_per_day), len(df)), unit='h'
    )
    hourly_stamps = day_starts.repeat(hours_per_day) + hour_offsets

    # Positional take keeps this correct even when df has duplicate index labels.
    df_hourly = df.iloc[row_positions].drop(columns=[date_col])
    return df_hourly.assign(datetime=hourly_stamps)


def process_forecast_weather(df_forecast: pd.DataFrame, location_map: dict) -> pd.DataFrame:
    """
    Clean, aggregate, and pivot forecast weather data.

    Steps:
    1. Map each (latitude, longitude) pair to a county via location_map;
       rows whose coordinates are not in the map are dropped.
    2. Reset every origin time to 02:00 of its calendar date and add
       'hours_ahead' to obtain the time the forecast applies to.
    3. Assign a batch number: hours 1-24 -> 1, hours 25-48 -> 2, and so on.
    4. Average every remaining column per county / target time / batch.
    5. Pivot the batches into columns (e.g. temperature_1, temperature_2).

    Args:
        df_forecast: Raw forecast rows; must contain 'latitude', 'longitude',
            'origin_datetime' and 'hours_ahead'. Not modified.
        location_map: Mapping of (latitude, longitude) -> county id.

    Returns:
        One row per (county, datetime) with '<feature>_<batch>' columns.
        Combinations missing from a batch are filled with 0.
    """
    # NOTE(review): the lookup uses the raw coordinates; generate_features
    # builds location_map from coordinates rounded to 1 decimal - confirm the
    # forecast coordinates carry the same precision.
    counties = [
        location_map.get((lat, lon), -1)
        for lat, lon in zip(df_forecast['latitude'], df_forecast['longitude'])
    ]
    df = df_forecast.assign(county=counties)
    # Explicit copy: the columns added below must not be written into a
    # filtered slice (that pattern triggers SettingWithCopyWarning).
    df = df[df['county'] != -1].copy()

    # Target time = 02:00 on the origin date + forecast horizon.
    origin = pd.to_datetime(df['origin_datetime'])
    origin = pd.to_datetime(origin.dt.date.astype(str) + ' 02:00:00')
    df['forecast_datetime'] = origin + pd.to_timedelta(df['hours_ahead'], unit='h')
    df = df.drop(columns=['origin_datetime'])

    # Batch number of the forecast (1 = first 24 hours, 2 = next 24 hours, ...).
    df['cumcount'] = (df['hours_ahead'] - 1) // 24 + 1

    # Everything that is neither a key nor bookkeeping is averaged.
    exclude_cols = ['latitude', 'longitude', 'hours_ahead', 'forecast_datetime',
                    'cumcount', 'county', 'data_block_id']
    agg_dict = {col: 'mean' for col in df.columns if col not in exclude_cols}
    df_grouped = df.groupby(['county', 'forecast_datetime', 'cumcount']).agg(agg_dict)

    # Move the batch level into the column names: (feature, batch) -> feature_batch.
    df_pivoted = df_grouped.unstack(level=-1)
    df_pivoted.columns = [f'{col[0]}_{col[1]}' for col in df_pivoted.columns]
    df_pivoted = (
        df_pivoted
        .reset_index()
        .rename(columns={'forecast_datetime': 'datetime'})
        .fillna(0)
    )

    return df_pivoted


def process_historical_weather(df_historical: pd.DataFrame, location_map: dict) -> pd.DataFrame:
    """
    Average historical weather observations per county and timestamp.

    Each row is assigned a county by looking its (latitude, longitude) pair up
    in location_map; rows without a match are discarded. All columns other
    than the coordinates, 'datetime', 'county' and 'data_block_id' are then
    averaged per (county, datetime). The input frame is not modified.

    Returns:
        One row per (county, datetime) with the averaged weather columns.
    """
    group_keys = ['county', 'datetime']
    non_feature_cols = {'latitude', 'longitude', 'data_block_id', *group_keys}

    stations = df_historical.copy(deep=True)
    stations['datetime'] = pd.to_datetime(stations['datetime'])
    stations['county'] = [
        location_map.get(coords, -1)
        for coords in zip(stations['latitude'], stations['longitude'])
    ]
    # -1 marks coordinates that are absent from the mapping.
    mapped = stations[stations['county'] != -1]

    mean_spec = {
        name: 'mean' for name in mapped.columns if name not in non_feature_cols
    }
    return mapped.groupby(group_keys).agg(mean_spec).reset_index()


def generate_features(
    df_data,
    df_client,
    df_gas_prices,
    df_electricity_prices,
    df_historical_weather,
    df_forecast_weather,
    df_weather_station_to_county_mapping
):
    """
    Build the model feature table by left-joining every data source onto df_data.

    Merges (each one is skipped when its frame has no rows):
    - Client data (daily -> hourly), on county / product_type / is_business / datetime
    - Gas prices (daily -> hourly), on datetime
    - Electricity prices, on datetime
    - Forecast weather (per-batch columns such as temperature_1), on county / datetime
    - Historical weather, on county / datetime

    Args:
        df_data: Base rows (train or test); needs a 'datetime' column.
            Not modified.
        df_client, df_gas_prices, df_electricity_prices,
        df_historical_weather, df_forecast_weather: Raw source frames.
        df_weather_station_to_county_mapping: Frame with 'latitude',
            'longitude' and 'county' columns.

    Returns:
        A new frame with the rows of df_data plus the merged columns. Missing
        values are replaced by 0 in every column except 'target': a missing
        target is a missing label, and turning it into 0 would stop callers
        from filtering unlabeled rows with notna().
    """
    # (latitude, longitude) rounded to 1 decimal -> county id
    location_map = dict(zip(
        zip(
            round(df_weather_station_to_county_mapping['latitude'], 1),
            round(df_weather_station_to_county_mapping['longitude'], 1)
        ),
        df_weather_station_to_county_mapping['county']
    ))

    combined_df = df_data.copy()
    combined_df['datetime'] = pd.to_datetime(combined_df['datetime'])

    # Merge client data (daily -> hourly)
    if len(df_client) > 0:
        df_client_hourly = upsample_daily_to_hourly(df_client, date_col='date')
        df_client_hourly = df_client_hourly.drop(
            columns=['data_block_id'], errors='ignore'
        )
        combined_df = combined_df.merge(
            df_client_hourly,
            on=['county', 'product_type', 'is_business', 'datetime'],
            how='left'
        )

    # Merge gas prices (daily -> hourly)
    if len(df_gas_prices) > 0:
        df_gas_hourly = upsample_daily_to_hourly(df_gas_prices, date_col='forecast_date')
        df_gas_hourly = df_gas_hourly.drop(
            columns=['origin_date', 'data_block_id'], errors='ignore'
        )
        combined_df = combined_df.merge(df_gas_hourly, on=['datetime'], how='left')

    # Merge electricity prices
    if len(df_electricity_prices) > 0:
        df_elec = (
            df_electricity_prices
            .drop(columns=['origin_date', 'data_block_id'], errors='ignore')
            .rename(columns={"forecast_date": "datetime"})
        )
        df_elec['datetime'] = pd.to_datetime(df_elec['datetime'])
        combined_df = combined_df.merge(df_elec, on=['datetime'], how='left')

    # Merge forecast weather
    if len(df_forecast_weather) > 0:
        forecast_processed = process_forecast_weather(df_forecast_weather, location_map)
        combined_df = combined_df.merge(
            forecast_processed,
            on=['county', 'datetime'],
            how='left'
        )

    # Merge historical weather
    if len(df_historical_weather) > 0:
        weather_processed = process_historical_weather(df_historical_weather, location_map)
        combined_df = combined_df.merge(
            weather_processed,
            on=['county', 'datetime'],
            how='left'
        )

    # Fill NaN in features only; the label column keeps its missing values.
    fill_values = {col: 0 for col in combined_df.columns if col != 'target'}
    if fill_values:
        combined_df = combined_df.fillna(fill_values)

    return combined_df


# LOAD TRAINING DATA
# Read every competition CSV from the Kaggle input directory and report row counts.

DATA_DIR = "/kaggle/input/predict-energy-behavior-of-prosumers/"

df_data = pd.read_csv(f"{DATA_DIR}train.csv")
df_client = pd.read_csv(f"{DATA_DIR}client.csv")
df_historical_weather = pd.read_csv(f"{DATA_DIR}historical_weather.csv")
df_forecast_weather = pd.read_csv(f"{DATA_DIR}forecast_weather.csv")
df_electricity_prices = pd.read_csv(f"{DATA_DIR}electricity_prices.csv")
df_gas_prices = pd.read_csv(f"{DATA_DIR}gas_prices.csv")
df_weather_station_to_county_mapping = pd.read_csv(
    f"{DATA_DIR}weather_station_to_county_mapping.csv"
)

# The station-to-county mapping is loaded above but not included in the summary.
for _label, _frame in (
    ("Train", df_data),
    ("Client", df_client),
    ("Historical weather", df_historical_weather),
    ("Forecast weather", df_forecast_weather),
    ("Electricity prices", df_electricity_prices),
    ("Gas prices", df_gas_prices),
):
    print(f"   {_label}: {len(_frame):,} rows")


# FEATURE ENGINEERING
# Builds combined_df (merged sources + time features + lagged targets) and the
# list of model input columns, improved_features.

print("\n1. Generating base features (merging data sources)...")
combined_df = generate_features(
    df_data,
    df_client,
    df_gas_prices,
    df_electricity_prices,
    df_historical_weather,
    df_forecast_weather,
    df_weather_station_to_county_mapping
)
print(f"   Combined shape: {combined_df.shape}")

print("\n2. Adding time features...")
# Parse the timestamp column once and derive all calendar features from it.
timestamps = pd.to_datetime(combined_df['datetime'])
combined_df['hour'] = timestamps.dt.hour
combined_df['day_of_week'] = timestamps.dt.dayofweek
combined_df['month'] = timestamps.dt.month
combined_df['is_weekend'] = combined_df['day_of_week'].isin([5, 6]).astype(int)
print("Added: hour, day_of_week, month, is_weekend")

print("\n3. Adding lag features...")
# A series is identified by all four keys. 'is_consumption' must be part of
# the key: without it, rows that differ only in is_consumption are shifted
# together as one series and the lag offsets no longer step back within a
# single series. The prediction-time state in this file uses the same
# four-part key.
series_keys = ['county', 'product_type', 'is_business', 'is_consumption']
combined_df = combined_df.sort_values(series_keys + ['datetime'])
target_by_series = combined_df.groupby(series_keys)['target']
# NOTE(review): shift() counts rows, so these equal 24 h / 168 h only when a
# series has exactly one row per hour with no gaps - confirm for the data.
combined_df['target_lag_24h'] = target_by_series.shift(24)
combined_df['target_lag_168h'] = target_by_series.shift(168)
print("Added: target_lag_24h, target_lag_168h")

# Define feature list
improved_features = [
    # Identifiers
    'county', 'is_business', 'product_type', 'is_consumption',
    # Client info
    'eic_count', 'installed_capacity',
    # Prices
    'euros_per_mwh', 'lowest_price_per_mwh', 'highest_price_per_mwh',
    # Weather (historical)
    'temperature', 'cloudcover_total', 'rain',
    # Weather (forecast)
    'temperature_1', 'temperature_2',
    # Time
    'hour', 'day_of_week', 'month', 'is_weekend',
    # Lags
    'target_lag_24h', 'target_lag_168h',
]

# Keep only the features that are actually present after merging.
improved_features = [f for f in improved_features if f in combined_df.columns]



# PREPARE TRAINING DATA
# Keep labeled rows with both lag features, subsample them, and split the
# sample into training and validation sets.
has_target = combined_df['target'].notna()
train_df = combined_df.loc[has_target].copy()
print(f"\n1. Rows with target: {len(train_df):,}")

lag_columns = ['target_lag_24h', 'target_lag_168h']
train_df = train_df.dropna(subset=lag_columns)
print(f"2. After dropping NaN lags: {len(train_df):,}")

# Train on a 30% random sample to keep training fast.
train_sample = train_df.sample(frac=0.3, random_state=42)
print(f"3. After sampling (30%): {len(train_sample):,}")

X = train_sample[improved_features].copy()
y = train_sample['target'].copy()

# NOTE(review): this is a random (not chronological) split of hourly series -
# confirm that is the intended validation scheme.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print(f"   Training: {len(X_train):,} rows")
print(f"   Validation: {len(X_val):,} rows")


# TRAIN XGBOOST MODEL
# Fit the regressor on the training split, report MAE / RMSE on both splits,
# and list the ten features with the highest importance.

params = dict(
    objective='reg:squarederror',
    max_depth=6,
    learning_rate=0.05,
    n_estimators=150,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_weight=3,
    gamma=0.1,
    random_state=42,
    n_jobs=-1,
    tree_method='hist',
)

model = xgb.XGBRegressor(**params)
model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)

# Evaluate on the data the model was fit on and on the held-out split.
y_pred_train = model.predict(X_train)
y_pred_val = model.predict(X_val)

train_mae = mean_absolute_error(y_train, y_pred_train)
train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
val_mae = mean_absolute_error(y_val, y_pred_val)
val_rmse = np.sqrt(mean_squared_error(y_val, y_pred_val))

print(f"Training MAE:    {train_mae:.2f}")
print(f"Validation MAE:  {val_mae:.2f}")
print(f"Training RMSE:   {train_rmse:.2f}")
print(f"Validation RMSE: {val_rmse:.2f}")

# Feature importance, highest first.
feature_importance = pd.DataFrame({
    'feature': improved_features,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)

print("\nTop 10 Most Important Features:")
top_features = feature_importance.head(10)
for idx, (name, score) in enumerate(
    zip(top_features['feature'], top_features['importance']), 1
):
    print(f"   {idx:2d}. {name:30s} : {score*100:.2f}%")


# Most recent revealed target per series, keyed by
# (county, product_type, is_business, is_consumption).
last_y = {}
# Set to True once the prediction loop has printed its one-time debug line.
printed_debug = False

def update_state(revealed_targets: pd.DataFrame):
    """
    Record the latest revealed target of every series in last_y.

    Rows are processed in frame order, so when a series appears several times
    the last row wins. Rows whose target is missing are skipped so that a NaN
    never replaces a previously stored value. The input frame is not modified.

    Args:
        revealed_targets: Frame with 'county', 'product_type', 'is_business',
            'is_consumption' and 'target' columns; None or an empty frame is
            a no-op.
    """
    if revealed_targets is None or len(revealed_targets) == 0:
        return

    known = revealed_targets.loc[revealed_targets["target"].notna()]

    # Flags are cast to int so keys match regardless of bool/int input dtype.
    series_keys = zip(
        known["county"],
        known["product_type"],
        known["is_business"].astype(int),
        known["is_consumption"].astype(int),
    )
    for key, value in zip(series_keys, known["target"]):
        last_y[key] = float(value)


def add_lag1_from_state(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of df with a 'target_lag_1' column read from last_y.

    'is_business' and 'is_consumption' are cast to int in the copy, then each
    row's (county, product_type, is_business, is_consumption) key is looked up
    in last_y. Series that have no stored value get 0.0.
    """
    out = df.copy()
    for flag in ("is_business", "is_consumption"):
        out[flag] = out[flag].astype(int)

    series_keys = zip(
        out["county"], out["product_type"], out["is_business"], out["is_consumption"]
    )
    out["target_lag_1"] = [last_y.get(key, 0.0) for key in series_keys]
    return out


# Initialize API
env = enefit.make_env()
iter_test = env.iter_test()

counter = 0

# Main prediction loop: for every batch served by the API, refresh the lag
# state, rebuild the training-time features, predict, and submit.
for (test, revealed_targets, client, historical_weather,
     forecast_weather, electricity_prices, gas_prices, sample_prediction) in iter_test:

    counter += 1
    if counter % 100 == 0:
        print(f" Iteration {counter}...")

    # Fold the newly revealed targets into the per-series state first, so the
    # lag lookup below sees them.
    update_state(revealed_targets)

    # The training frame calls the timestamp column 'datetime'.
    if 'prediction_datetime' in test.columns:
        test = test.rename(columns={'prediction_datetime': 'datetime'})

    # Same merge pipeline as in training.
    feat = generate_features(
        df_data=test.copy(),
        df_client=client,
        df_gas_prices=gas_prices,
        df_electricity_prices=electricity_prices,
        df_historical_weather=historical_weather,
        df_forecast_weather=forecast_weather,
        df_weather_station_to_county_mapping=df_weather_station_to_county_mapping
    )

    # Calendar features, derived from a single parsed timestamp series.
    stamps = pd.to_datetime(feat['datetime'])
    feat['hour'] = stamps.dt.hour
    feat['day_of_week'] = stamps.dt.dayofweek
    feat['month'] = stamps.dt.month
    feat['is_weekend'] = feat['day_of_week'].isin([5, 6]).astype(int)

    # NOTE(review): both lag features receive the single most recent revealed
    # target per series, whereas training fills them with shift(24) and
    # shift(168) values - confirm this approximation is intended.
    feat = add_lag1_from_state(feat)
    for lag_name in ('target_lag_24h', 'target_lag_168h'):
        feat[lag_name] = feat['target_lag_1']

    # One-time sanity print.
    if not printed_debug:
        print(f"First batch: {len(test)} rows → {len(feat)} features\n")
        printed_debug = True

    # Put the feature rows in the order of sample_prediction: by row_id when
    # both frames have it, otherwise by index.
    can_join_on_row_id = (
        "row_id" in sample_prediction.columns and "row_id" in feat.columns
    )
    if can_join_on_row_id:
        feat_aligned = sample_prediction[["row_id"]].merge(feat, on="row_id", how="left")
    else:
        feat_aligned = feat.reindex(sample_prediction.index)

    # Model input: exactly the training columns; anything missing becomes 0.
    X_test = feat_aligned.reindex(columns=improved_features, fill_value=0).fillna(0)

    # Predict and clamp negative predictions to zero.
    predictions = np.maximum(model.predict(X_test), 0)

    # Submit
    sample_prediction['target'] = predictions
    env.predict(sample_prediction)


print(f"Processed {counter} iterations")
print(f"State size: {len(last_y)} unique series tracked")