<a href="https://colab.research.google.com/github/machiwao/CCTHESS1-CCTHESS2-Dev-and-Docs/blob/jessy/rf_heat_index.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [47]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from tabulate import tabulate
import glob
import os
from datetime import datetime, timedelta
import warnings
import sys

# Suppress warnings
warnings.filterwarnings('ignore')

# Set max rows/cols for better console display
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', 50)

In [48]:
def get_philippines_season(month):
    """
    Philippines seasons: 0=Dry (Nov-Apr), 1=Wet (May-Oct)
    """
    return 0 if month in [11, 12, 1, 2, 3, 4] else 1

def load_and_clean_data(data_directory="/content/"):
    """
    Load, combine, clean, and feature engineer data from all PAGASA stations.
    """
    print("Starting data loading and cleaning...")

    all_data = []
    required_date_cols = ['YEAR', 'MONTH', 'DAY']

    # Find all CSV files based on expected patterns
    csv_patterns = ["*Daily Data.csv", "*Daily_Data.csv"]
    csv_files = []
    for pattern in csv_patterns:
        csv_files.extend(glob.glob(os.path.join(data_directory, pattern)))

    if not csv_files:
        print("Error: No station data CSV files found. Please ensure files are in the expected directory.")
        sys.exit(1)

    for file_path in csv_files:
        try:
            filename = os.path.basename(file_path)
            station_name = filename.replace(' Daily Data.csv', '').replace(' Daily_Data.csv', '')

            df = pd.read_csv(file_path)

            # Check for required date columns
            if not all(col in df.columns for col in required_date_cols):
                 print(f"Warning: Skipping {station_name}. Missing required date columns.")
                 continue

            df['STATION'] = station_name
            all_data.append(df)

        except Exception as e:
            print(f"Error loading {filename}: {str(e)}")

    if not all_data:
        raise ValueError("No valid station data could be loaded! Check file paths and content.")

    df_combined = pd.concat(all_data, ignore_index=True)

    # Rename 'HI' to 'HI_observed' if it exists
    if 'HI' in df_combined.columns:
        df_combined.rename(columns={'HI': 'HI_observed'}, inplace=True)

    df_clean = df_combined.copy()


    # ------------------
    # Data Cleaning
    # ------------------
    target_cols = ['HI_observed', 'TMAX', 'RH'] # Update target_cols to reflect the renamed column
    df_clean = df_clean.dropna(subset=target_cols, how='all')

    # Fill missing values: first by station median, then by overall median
    numeric_cols = df_clean.select_dtypes(include=[np.number]).columns.drop(['YEAR', 'MONTH', 'DAY'], errors='ignore')

    for station in df_clean['STATION'].unique():
        station_mask = df_clean['STATION'] == station
        station_data = df_clean[station_mask]
        for col in numeric_cols:
            if col in df_clean.columns:
                station_median = station_data[col].median()
                if pd.notna(station_median):
                    df_clean.loc[station_mask, col] = df_clean.loc[station_mask, col].fillna(station_median)

    for col in numeric_cols:
        if col in df_clean.columns and df_clean[col].isnull().sum() > 0:
            overall_median = df_clean[col].median()
            df_clean[col].fillna(overall_median, inplace=True)

    # ------------------
    # Feature Engineering
    # ------------------

    # Temporal Features
    df_clean['DATE'] = pd.to_datetime(df_clean[['YEAR', 'MONTH', 'DAY']])
    df_clean['DAY_OF_YEAR'] = df_clean['DATE'].dt.dayofyear

    df_clean['SEASON'] = df_clean['MONTH'].apply(get_philippines_season)
    df_clean['IS_DRY_SEASON'] = (df_clean['SEASON'] == 0).astype(int)
    df_clean['IS_WET_SEASON'] = (df_clean['SEASON'] == 1).astype(int)
    df_clean['IS_SOUTHWEST_MONSOON'] = df_clean['MONTH'].apply(lambda x: 1 if x in [6, 7, 8, 9] else 0)
    df_clean['IS_NORTHEAST_MONSOON'] = df_clean['MONTH'].apply(lambda x: 1 if x in [12, 1, 2, 3] else 0)

    # Derived Meteorological and GEE Features (if columns exist)
    if 'TMAX' in df_clean.columns and 'TMIN' in df_clean.columns:
        df_clean['TEMP_RANGE'] = df_clean['TMAX'] - df_clean['TMIN']
        df_clean['TEMP_MEAN'] = (df_clean['TMAX'] + df_clean['TMIN']) / 2

    if 'WIND_SPEED' in df_clean.columns and 'WIND_DIRECTION' in df_clean.columns:
        df_clean['WIND_U'] = -df_clean['WIND_SPEED'] * np.sin(np.radians(df_clean['WIND_DIRECTION']))
        df_clean['WIND_V'] = -df_clean['WIND_SPEED'] * np.cos(np.radians(df_clean['WIND_DIRECTION']))

    # Create vegetation/urban ratios from Google Earth Engine indices
    if 'NDVI_original' in df_clean.columns and 'NDBI_linear' in df_clean.columns:
        df_clean['URBAN_VEG_RATIO'] = df_clean['NDBI_linear'] / (df_clean['NDVI_original'] + 0.001)

    # Sort data chronologically for time series split
    df_clean = df_clean.sort_values(by='DATE').reset_index(drop=True)

    print(f"Data processing complete. Final records: {len(df_clean):,}")
    print(f"Date Range: {df_clean['DATE'].min().date()} to {df_clean['DATE'].max().date()}")
    return df_clean

# Execute data loading and cleaning
try:
    df_processed = load_and_clean_data()
except Exception as e:
    print(f"FATAL SETUP ERROR: {e}")
    sys.exit(1)

Starting data loading and cleaning...
Data processing complete. Final records: 87,587
Date Range: 2014-01-01 to 2023-12-31


## Create hi calculation function

### Subtask:
Define a new function to calculate 'HI_calculated' using the Rothfusz formula based on 'TMAX' and 'RH'.


**Reasoning**:
Define the Python function `calculate_rothfusz_hi` to compute the Heat Index using the provided formula.



In [49]:
def calculate_rothfusz_hi(T, R):
    """
    Calculates the Heat Index using the Rothfusz formula.

    Args:
        T (float or pandas.Series): Temperature in Fahrenheit.
        R (float or pandas.Series): Relative Humidity in percentage.

    Returns:
        float or pandas.Series: Calculated Heat Index value(s).
    """
    HI = (-42.379 + 2.04901523 * T + 10.14333127 * R - 0.22475541 * T * R -
          6.83783e-03 * T**2 - 5.481717e-02 * R**2 + 1.22874e-03 * T**2 * R +
          8.5282e-04 * T * R**2 - 1.99e-06 * T**2 * R**2)
    return HI

## Calculate hi and residuals

### Subtask:
Apply the new calculation function to the dataframe and calculate the 'HI_residuals' ('HI_observed' - 'HI_calculated').


**Reasoning**:
Apply the Rothfusz formula to calculate the Heat Index using the converted TMAX (Fahrenheit) and RH, then calculate the residuals between the observed HI and the calculated HI. Also, convert TMAX back to Celsius after the calculation to maintain consistency.



In [50]:
# Convert TMAX from Celsius to Fahrenheit for HI calculation
df_processed['TMAX_F'] = (df_processed['TMAX'] * 9/5) + 32

# Apply the Rothfusz formula to get HI in Fahrenheit
df_processed['HI_calculated_F'] = calculate_rothfusz_hi(df_processed['TMAX_F'], df_processed['RH'])

# Convert HI_calculated from Fahrenheit to Celsius for consistency with HI_observed
df_processed['HI_calculated'] = (df_processed['HI_calculated_F'] - 32) * 5/9

# Calculate residuals using HI_observed (Celsius) and HI_calculated (Celsius)
df_processed['HI_residuals'] = df_processed['HI_observed'] - df_processed['HI_calculated']

# Display the new columns and a few relevant original columns
# Displaying both the F and C calculated HI for clarity
display(df_processed[['DATE', 'STATION', 'TMAX', 'RH', 'HI_observed', 'TMAX_F', 'HI_calculated_F', 'HI_calculated', 'HI_residuals']].head())

Unnamed: 0,DATE,STATION,TMAX,RH,HI_observed,TMAX_F,HI_calculated_F,HI_calculated,HI_residuals
0,2014-01-01,Clark,30.4,71.0,30.8,86.72,97.339751,36.299862,-5.499862
1,2014-01-01,Cabanatuan_CLSU,32.0,83.0,38.5,89.6,114.220788,45.678215,-7.178215
2,2014-01-01,San Jose,34.0,82.0,46.1,93.2,128.096675,53.387041,-7.287041
3,2014-01-01,Daet,29.4,81.0,35.9,84.92,97.021583,36.123101,-0.223101
4,2014-01-01,Baler Radar,30.0,81.0,35.0,86.0,100.315771,37.953206,-2.953206


In [51]:
def create_lag_features(df, lag_cols, lag_periods):
    """
    Generates lag features for specified columns and periods, grouped by station.

    Args:
        df (pd.DataFrame): The input DataFrame containing the original data.
        lag_cols (list): A list of column names for which to create lag features.
        lag_periods (list): A list of integers representing the lag periods (in days).

    Returns:
        pd.DataFrame: The DataFrame with added lag features.
    """
    print(f"Creating lag features for columns {lag_cols} with periods {lag_periods}...")

    df_lagged = df.copy()
    stations = df_lagged['STATION'].unique()

    for station in stations:
        station_mask = df_lagged['STATION'] == station
        for col in lag_cols:
            if col in df_lagged.columns:
                for lag in lag_periods:
                    new_col_name = f'{col}_lag_{lag}'
                    # Ensure data is sorted by date for correct lagging within each station
                    df_lagged.loc[station_mask, new_col_name] = df_lagged.loc[station_mask, col].shift(lag)
            else:
                 print(f"Warning: Column '{col}' not found in DataFrame.")

    print("Lag feature creation complete.")
    return df_lagged

# Define columns and periods for lagging
lag_columns = ['TMAX', 'RH', 'HI_observed', 'HI_calculated', 'HI_residuals', 'TEMP_MEAN', 'TEMP_RANGE']
lag_periods_list = list(range(7, 15)) # Lags from 7 to 14 days

# Apply the function to create lag features
df_lagged = create_lag_features(df_processed, lag_columns, lag_periods_list)

# Display the first few rows with some original and new lag columns
display(df_lagged[['DATE', 'STATION', 'TMAX', 'TMAX_lag_7', 'TMAX_lag_14', 'RH', 'RH_lag_7', 'RH_lag_14', 'HI_residuals', 'HI_residuals_lag_7']].head())

Creating lag features for columns ['TMAX', 'RH', 'HI_observed', 'HI_calculated', 'HI_residuals', 'TEMP_MEAN', 'TEMP_RANGE'] with periods [7, 8, 9, 10, 11, 12, 13, 14]...
Lag feature creation complete.


Unnamed: 0,DATE,STATION,TMAX,TMAX_lag_7,TMAX_lag_14,RH,RH_lag_7,RH_lag_14,HI_residuals,HI_residuals_lag_7
0,2014-01-01,Clark,30.4,,,71.0,,,-5.499862,
1,2014-01-01,Cabanatuan_CLSU,32.0,,,83.0,,,-7.178215,
2,2014-01-01,San Jose,34.0,,,82.0,,,-7.287041,
3,2014-01-01,Daet,29.4,,,81.0,,,-0.223101,
4,2014-01-01,Baler Radar,30.0,,,81.0,,,-2.953206,


In [52]:
def prepare_features_and_targets(df_lagged):
    """
    Prepare feature matrix (X) and target variables (Y) including new lag features.
    """
    print("Preparing features and targets with lag features...")

    # Target variables for the Multi-Task Learning model are TMAX and RH
    target_cols = ['TMAX', 'RH']

    # Create station dummy variables
    station_dummies = pd.get_dummies(df_lagged['STATION'], prefix='STATION', dummy_na=False)
    df_with_stations = pd.concat([df_lagged, station_dummies], axis=1)

    # Define columns to exclude from features
    # Exclude original targets (TMAX, RH - they are also targets here)
    # Exclude the primary target for the overall task (HI_observed)
    # Exclude date/time related original columns and season
    exclude_cols = ['HI_observed', 'DATE', 'STATION', 'YEAR', 'MONTH', 'DAY', 'SEASON', 'TMAX_F'] + target_cols # Exclude TMAX and RH as they are targets

    # Select feature columns
    # Include all columns except those in exclude_cols and the target columns TMAX and RH
    feature_cols = [col for col in df_with_stations.columns if col not in exclude_cols and col not in target_cols]

    # Prepare features and targets
    X = df_with_stations[feature_cols].copy()
    y = df_with_stations[target_cols].copy()

    # Remove any remaining non-numeric columns (should be minimal after exclusions, but safe)
    X = X.select_dtypes(include=[np.number])

    print(f"Features shape: {X.shape}")
    print(f"Targets shape: {y.shape}")
    return X, y

# Prepare features and targets using the lagged DataFrame
# Check if df_lagged was successfully created before proceeding
if 'df_lagged' in locals() and df_lagged is not None:
    X, y = prepare_features_and_targets(df_lagged)
    display(X.head())
    display(y.head())
else:
    print("df_lagged was not created successfully. Cannot prepare features and targets.")


Preparing features and targets with lag features...
Features shape: (87587, 83)
Targets shape: (87587, 2)


Unnamed: 0,TMIN,WIND_SPEED,WIND_DIRECTION,Albedo_linear,Albedo_spline,skin_temperature_min_C,skin_temperature_max_C,NDBaI_linear,NDBaI_spline,NDBI_linear,NDBI_spline,NDVI_original,NDWI_linear,NDWI_spline,DAY_OF_YEAR,IS_DRY_SEASON,IS_WET_SEASON,IS_SOUTHWEST_MONSOON,IS_NORTHEAST_MONSOON,TEMP_RANGE,TEMP_MEAN,WIND_U,WIND_V,URBAN_VEG_RATIO,HI_calculated_F,...,HI_calculated_lag_14,HI_residuals_lag_7,HI_residuals_lag_8,HI_residuals_lag_9,HI_residuals_lag_10,HI_residuals_lag_11,HI_residuals_lag_12,HI_residuals_lag_13,HI_residuals_lag_14,TEMP_MEAN_lag_7,TEMP_MEAN_lag_8,TEMP_MEAN_lag_9,TEMP_MEAN_lag_10,TEMP_MEAN_lag_11,TEMP_MEAN_lag_12,TEMP_MEAN_lag_13,TEMP_MEAN_lag_14,TEMP_RANGE_lag_7,TEMP_RANGE_lag_8,TEMP_RANGE_lag_9,TEMP_RANGE_lag_10,TEMP_RANGE_lag_11,TEMP_RANGE_lag_12,TEMP_RANGE_lag_13,TEMP_RANGE_lag_14
0,19.7,2.0,300.0,0.152315,0.265997,16.754805,31.164641,0.195498,0.202554,-0.195498,-0.198983,0.593783,-0.582388,-0.46688,1,1,0,0,1,10.7,25.05,1.732051,-1.0,-0.328688,97.339751,...,,,,,,,,,,,,,,,,,,,,,,,,,
1,20.5,1.0,360.0,0.133434,0.241356,16.697752,31.077206,0.179793,0.194508,-0.179793,-0.202867,0.472413,-0.499149,-0.448831,1,1,0,0,1,11.5,26.25,2.449294e-16,-1.0,-0.379781,114.220788,...,,,,,,,,,,,,,,,,,,,,,,,,,
2,22.9,2.0,80.0,0.11326,0.230437,23.918974,29.832453,0.065932,0.100102,-0.065932,-0.116196,0.405936,-0.433001,-0.393333,1,1,0,0,1,11.1,28.45,-1.969616,-0.3472964,-0.162021,128.096675,...,,,,,,,,,,,,,,,,,,,,,,,,,
3,25.0,4.0,90.0,0.339162,0.237252,23.367488,27.691225,0.206976,-0.20518,-0.206976,0.198623,0.435625,-0.093535,-0.105917,1,1,0,0,1,4.4,27.2,-4.0,-2.449294e-16,-0.474037,97.021583,...,,,,,,,,,,,,,,,,,,,,,,,,,
4,21.2,2.0,360.0,0.476787,0.159834,19.501772,28.114169,0.198947,-0.269199,-0.198947,0.352417,0.734685,-0.147645,0.093225,1,1,0,0,1,8.8,25.6,4.898587e-16,-2.0,-0.270425,100.315771,...,,,,,,,,,,,,,,,,,,,,,,,,,


Unnamed: 0,TMAX,RH
0,30.4,71.0
1,32.0,83.0
2,34.0,82.0
3,29.4,81.0
4,30.0,81.0


In [59]:
def create_time_series_split(X, y, train_pct=0.7, val_pct=0.1, test_pct=0.2):
    """
    Create chronological 70-10-20 split (Train -> Validation -> Test).
    Assumes X and y are already sorted by date.
    """
    print("Creating chronological Train-Validation-Test split (70/10/20)...")

    total_len = len(X)
    train_end = int(total_len * train_pct)
    val_end = int(total_len * (train_pct + val_pct))

    X_train, y_train = X[:train_end], y[:train_end]

    X_val, y_val = X[train_end:val_end], y[train_end:val_end]

    X_test, y_test = X[val_end:], y[val_end:]

    print(f"Train samples: {len(X_train):,}")
    print(f"Validation samples: {len(X_val):,}")
    print(f"Test samples: {len(X_test):,}")

    return X_train, X_val, X_test, y_train, y_val, y_test

# Perform the chronological split
X_train, X_val, X_test, y_train, y_val, y_test = create_time_series_split(X, y)

Creating chronological Train-Validation-Test split (70/10/20)...
Train samples: 61,310
Validation samples: 8,759
Test samples: 17,518


In [60]:
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor # Re-import needed as it's the first time used in this cell

# Instantiate the base model (RandomForestRegressor)
# Using appropriate parameters for RandomForestRegressor
base_model = RandomForestRegressor(
    n_estimators=500,
    max_depth=8,
    random_state=42,
    n_jobs=-1
)

# Wrap the base model in MultiOutputRegressor
mtl_model = MultiOutputRegressor(base_model)

print("Multi-Task Learning (MTL) model framework defined successfully.")
print(mtl_model)

Multi-Task Learning (MTL) model framework defined successfully.
MultiOutputRegressor(estimator=RandomForestRegressor(max_depth=8,
                                                     n_estimators=500,
                                                     n_jobs=-1,
                                                     random_state=42))


## Hyperparameter tuning and early stopping

### Subtask:
Integrate hyperparameter tuning and early stopping within the multi-task learning model training process.


**Reasoning**:
Implement a custom early stopping mechanism for the `MultiOutputRegressor` by training the model iteratively and evaluating performance on the validation set. Define the parameter grid and instantiate the base estimator within the training function.



In [None]:
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import ParameterGrid

def train_mtlrf_with_early_stopping(X_train, y_train, X_val, y_val,
                                    param_grid, n_estimators=500,
                                    early_stopping_rounds=50,
                                    eval_interval=10, random_state=42):
    """
    Train MultiOutput RandomForest with pseudo-early stopping.
    Instead of refitting each round, train once with max trees and
    evaluate using subsets of the trained estimators.

    Args:
        X_train, y_train: Training data
        X_val, y_val: Validation data
        param_grid: Hyperparameters (only first combination used here)
        n_estimators: Max trees
        early_stopping_rounds: Patience for early stopping
        eval_interval: How often to evaluate (every N trees)
        random_state: RNG seed

    Returns:
        final_model: Trained MultiOutputRegressor
        validation_scores: list of validation MAE at each step
        best_n_estimators: Best number of trees
    """

    # Take first param config
    base_params = list(ParameterGrid(param_grid))[0]

    # Fit model with max trees
    base_rf = RandomForestRegressor(
        **base_params,
        n_estimators=n_estimators,
        random_state=random_state,
        n_jobs=-1,
        warm_start=False # train once
    )
    mtl_model = MultiOutputRegressor(base_rf)
    mtl_model.fit(X_train, y_train)

    # Evaluate staged performance
    validation_scores = []
    best_val_score = float('inf')
    best_n_estimators = n_estimators
    rounds_without_improvement = 0

    # Each target has its own forest inside MultiOutputRegressor
    for n_trees in range(eval_interval, n_estimators+1, eval_interval):
        preds = []
        for est, target in zip(mtl_model.estimators_, y_train.columns):
            # Average predictions over first n_trees
            tree_preds = np.mean([tree.predict(X_val) for tree in est.estimators_[:n_trees]], axis=0)
            preds.append(tree_preds)
        preds = np.vstack(preds).T  # shape (n_samples, n_targets)

        # Validation metric (avg MAE across targets)
        maes = [mean_absolute_error(y_val.iloc[:, j], preds[:, j]) for j in range(y_val.shape[1])]
        current_val_score = np.mean(maes)
        validation_scores.append((n_trees, current_val_score))

        print(f"Trees: {n_trees}, Validation MAE: {current_val_score:.4f}")

        if current_val_score < best_val_score:
            best_val_score = current_val_score
            best_n_estimators = n_trees
            rounds_without_improvement = 0
        else:
            rounds_without_improvement += eval_interval

        if rounds_without_improvement >= early_stopping_rounds:
            print(f"Early stopping at {n_trees} trees.")
            break

    # Retrain final model with best number of trees
    final_rf = RandomForestRegressor(
        **base_params,
        n_estimators=best_n_estimators,
        random_state=random_state,
        n_jobs=-1
    )
    final_mtl_model = MultiOutputRegressor(final_rf)
    final_mtl_model.fit(X_train, y_train)

    return final_mtl_model, validation_scores, best_n_estimators

In [None]:
param_grid = {
    "max_depth": [None, 10, 20],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "max_features": ["sqrt", "log2", None],
}

trained_mtl_model, val_scores, best_estimators = train_mtlrf_with_early_stopping(
    X_train, y_train, X_val, y_val,
    param_grid=param_grid,
    n_estimators=500,
    early_stopping_rounds=50,
    eval_interval=10
)

print(f"\nBest number of estimators: {best_estimators}")

In [None]:
if 'trained_mtl_model' in locals() and trained_mtl_model is not None:
    print("\nEvaluating trained Multi-Task Learning model on the test set...")

    # Generate predictions
    y_pred = trained_mtl_model.predict(X_test)

    # Extract true vs predicted
    y_true_tmax, y_pred_tmax = y_test['TMAX'], y_pred[:, 0]
    y_true_rh,   y_pred_rh   = y_test['RH'],   y_pred[:, 1]

    # Metrics
    rmse_tmax = np.sqrt(mean_squared_error(y_true_tmax, y_pred_tmax))
    mae_tmax  = mean_absolute_error(y_true_tmax, y_pred_tmax)
    r2_tmax   = r2_score(y_true_tmax, y_pred_tmax)

    rmse_rh = np.sqrt(mean_squared_error(y_true_rh, y_pred_rh))
    mae_rh  = mean_absolute_error(y_true_rh, y_pred_rh)
    r2_rh   = r2_score(y_true_rh, y_pred_rh)

    # Format results
    table_data = [
        ["TMAX", f"{rmse_tmax:.4f}", f"{mae_tmax:.4f}", f"{r2_tmax:.4f}"],
        ["RH",   f"{rmse_rh:.4f}",   f"{mae_rh:.4f}",   f"{r2_rh:.4f}"]
    ]

    print("\nMulti-Task Learning Model Performance Metrics (Test Set):")
    print(tabulate(table_data, headers=['Target', 'RMSE', 'MAE', 'RÂ²'], tablefmt='pipe'))

else:
    print("trained_mtl_model is not available. Evaluation cannot be performed.")

In [None]:
# Check if the trained_mtl_model variable exists and is not None
if 'trained_mtl_model' not in locals() or trained_mtl_model is None:
    # Keep error message
    print("trained_mtl_model is not available. Cannot make future predictions.")
else:
    # Identify the most recent date in the original processed data
    # Assumes df_processed is available from a previous successful execution
    if 'df_processed' in locals() and df_processed is not None:
        last_date = df_processed['DATE'].max()
        # Removed: print(f"Most recent date in historical data: {last_date.date()}")

        # Create future dates
        tomorrow = last_date + timedelta(days=1)
        day_after_tomorrow = last_date + timedelta(days=2)
        future_dates = [tomorrow, day_after_tomorrow]

        # Removed: print(f"Predicting for: {tomorrow.date()} and {day_after_tomorrow.date()}")

        # Get the last available row for each station to use as a base for future features
        # Assumes df_lagged is available from a previous successful execution
        if 'df_lagged' in locals() and df_lagged is not None:
            last_data_per_station = df_lagged.groupby('STATION').tail(1).copy()

            # Create a list to hold future data rows
            future_data_rows = []

            # Iterate through each station and future date to construct feature rows
            for station in last_data_per_station['STATION']:
                last_row = last_data_per_station[last_data_per_station['STATION'] == station].iloc[0].copy()

                for date_to_predict in future_dates:
                    new_row = last_row.copy()
                    new_row['DATE'] = date_to_predict
                    new_row['YEAR'] = date_to_predict.year
                    new_row['MONTH'] = date_to_predict.month
                    new_row['DAY'] = date_to_predict.day
                    new_row['DAY_OF_YEAR'] = date_to_predict.dayofyear
                    new_row['SEASON'] = get_philippines_season(date_to_predict.month)
                    new_row['IS_DRY_SEASON'] = (new_row['SEASON'] == 0).astype(int)
                    new_row['IS_WET_SEASON'] = (new_row['SEASON'] == 1).astype(int)
                    new_row['IS_SOUTHWEST_MONSOON'] = 1 if new_row['MONTH'] in [6, 7, 8, 9] else 0
                    new_row['IS_NORTHEAST_MONSOON'] = 1 if new_row['MONTH'] in [12, 1, 2, 3] else 0

                    # Shift lag features for future dates
                    for col in lag_columns:
                        for lag in lag_periods_list:
                            lag_date = date_to_predict - timedelta(days=lag)
                            lag_col_name = f'{col}_lag_{lag}'

                            # Find the historical value for the specific station and lag_date
                            historical_value = df_lagged[(df_lagged['STATION'] == station) & (df_lagged['DATE'] == lag_date)][col]

                            if not historical_value.empty:
                                new_row[lag_col_name] = historical_value.iloc[0]
                            else:
                                new_row[lag_col_name] = np.nan

                    # Set the non-lagged future values for TMAX, RH, etc. to NaN as they are unknown
                    for col in lag_columns:
                         new_row[col] = np.nan
                    new_row['TMAX_F'] = np.nan
                    new_row['HI_calculated'] = np.nan
                    new_row['HI_residuals'] = np.nan

                    future_data_rows.append(new_row)

            # Create the future features DataFrame
            df_future = pd.DataFrame(future_data_rows)

            # Convert DATE column to datetime
            df_future['DATE'] = pd.to_datetime(df_future['DATE'])

            # Recreate station dummy variables for future data
            future_station_dummies = pd.get_dummies(df_future['STATION'], prefix='STATION', dummy_na=False)

            # Align the future station dummies with the training station dummies
            # Assumes X_train is available from a previous successful execution
            if 'X_train' in locals() and X_train is not None:
                train_station_cols = [col for col in X_train.columns if col.startswith('STATION_')]
                future_station_dummies = future_station_dummies.reindex(columns=train_station_cols, fill_value=0)

                # Combine future features with aligned station dummies
                df_future = df_future.drop(columns=['STATION'])
                df_future_with_dummies = pd.concat([df_future, future_station_dummies], axis=1)

                # Prepare the final future feature matrix (X_future)
                X_future = df_future_with_dummies[X_train.columns].copy()

                # Impute any remaining missing values in X_future
                # Removed: print("Imputing missing values in future features using training data medians...")
                for col in X_future.columns:
                    if X_future[col].isnull().sum() > 0:
                        if col in X_train.columns:
                            # Assumes X_train is available
                            train_median = X_train[col].median()
                            X_future[col].fillna(train_median, inplace=True)
                        else:
                             # Keep warning message
                             print(f"Warning: Column '{col}' in X_future not found in X_train. Cannot impute.")

                # Ensure X_future is numeric
                X_future = X_future.select_dtypes(include=[np.number])

                # Removed: print(f"Future features shape: {X_future.shape}")
                # Keep display() call
                # display(X_future.head())

                # Make predictions for TMAX and RH
                # Removed: print("\nMaking predictions for TMAX and RH...")
                future_predictions = trained_mtl_model.predict(X_future)

                # Store the predictions with their corresponding dates and stations
                future_predictions_list = []
                row_index = 0
                for station in last_data_per_station['STATION']:
                     for date_to_predict in future_dates:
                         future_predictions_list.append({
                             'DATE': date_to_predict,
                             'STATION': station,
                             'Predicted_TMAX': future_predictions[row_index, 0],
                             'Predicted_RH': future_predictions[row_index, 1]
                         })
                         row_index += 1

                df_future_predictions = pd.DataFrame(future_predictions_list)

                # Removed: print("\nFuture TMAX and RH predictions:")
                # Keep display() call
                # display(df_future_predictions)
            else:
                print("X_train is not available. Cannot align future station dummies or impute missing values.")
        else:
            print("df_lagged is not available. Cannot generate future features.")
    else:
        print("df_processed is not available. Cannot determine last date for future predictions.")

In [None]:
# Check if required dataframes are available
if 'df_future_predictions' not in locals() or df_future_predictions is None:
    print("Error: df_future_predictions is not available. Cannot forecast HI.")
elif 'df_processed' not in locals() or df_processed is None:
     print("Error: df_processed is not available. Cannot forecast HI.")
else:
    print("Dataframes available. Proceeding with HI forecasting.")

    # 1. Calculate the average 'HI_residuals' per station from df_processed
    if 'HI_residuals' in df_processed.columns:
        avg_residuals_per_station = df_processed.groupby('STATION')['HI_residuals'].mean().reset_index()
        avg_residuals_per_station.rename(columns={'HI_residuals': 'Avg_HI_Residuals'}, inplace=True)
        print("\nAverage HI Residuals per station calculated.")
        display(avg_residuals_per_station.head())

        # 2. Merge the calculated average 'HI_residuals' back into df_future_predictions
        df_future_predictions = pd.merge(df_future_predictions, avg_residuals_per_station, on='STATION', how='left')
        print("\nAverage HI Residuals merged into future predictions dataframe.")

        # 3. Convert the 'Predicted_TMAX' from Celsius to Fahrenheit
        df_future_predictions['Predicted_TMAX_F'] = (df_future_predictions['Predicted_TMAX'] * 9/5) + 32
        print("Predicted TMAX converted to Fahrenheit.")

        # 4. Apply the calculate_rothfusz_hi function to get 'HI_calculated_future'
        # Ensure the function is available (assuming it was defined in a previous cell)
        if 'calculate_rothfusz_hi' in globals():
             df_future_predictions['HI_calculated_future'] = calculate_rothfusz_hi(
                 df_future_predictions['Predicted_TMAX_F'],
                 df_future_predictions['Predicted_RH']
             )
             print("Future HI calculated using Rothfusz formula.")
        else:
             print("Error: calculate_rothfusz_hi function not found. Cannot calculate future HI.")
             df_future_predictions['HI_calculated_future'] = np.nan


        # 5. Add the average 'HI_residuals' to 'HI_calculated_future' to get the final 'Forecasted_HI'
        if 'HI_calculated_future' in df_future_predictions.columns and 'Avg_HI_Residuals' in df_future_predictions.columns:
            df_future_predictions['Forecasted_HI'] = df_future_predictions['HI_calculated_future'] + df_future_predictions['Avg_HI_Residuals']
            print("Forecasted HI calculated by adding residuals.")
        else:
             print("Error: Cannot calculate Forecasted_HI. Missing 'HI_calculated_future' or 'Avg_HI_Residuals'.")
             df_future_predictions['Forecasted_HI'] = np.nan


        # 6. Display the df_future_predictions DataFrame
        print("\nFuture Heat Index Forecast:")
        display(df_future_predictions[['DATE', 'STATION', 'Predicted_TMAX', 'Predicted_RH', 'Forecasted_HI']])

    else:
        print("Error: 'HI_residuals' column not found in df_processed. Cannot forecast HI using residuals.")

In [None]:
# Check if the df_future_predictions DataFrame exists and contains the 'Forecasted_HI' column.
if 'df_future_predictions' in locals() and df_future_predictions is not None and 'Forecasted_HI' in df_future_predictions.columns:
    print("Presenting forecasted HI values:")

    # Select the 'DATE', 'STATION', and 'Forecasted_HI' columns.
    df_forecasted_hi = df_future_predictions[['DATE', 'STATION', 'Forecasted_HI']].copy()

    # Sort the selected data first by 'STATION' and then by 'DATE'.
    df_forecasted_hi = df_forecasted_hi.sort_values(by=['STATION', 'DATE']).reset_index(drop=True)

    # Display the sorted DataFrame.
    display(df_forecasted_hi)
else:
    # Print an informative message indicating that the results are not available.
    print("Forecasted HI results are not available. Please ensure previous steps completed successfully.")
