In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from tabulate import tabulate
import glob
import os
from datetime import datetime, timedelta
import warnings
import sys

# Suppress ALL warnings for the remainder of the session.
# NOTE(review): this blanket filter also hides deprecation/behavior-change
# warnings from pandas and scikit-learn; consider narrowing it to specific
# warning categories.
warnings.filterwarnings('ignore')

# Cap DataFrame reprs at 10 rows / 50 columns to keep cell output readable
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', 50)

In [None]:
def get_philippines_season(month):
    """
    Map a calendar month number to the Philippine season code.

    Returns 0 (dry) for November through April and 1 (wet) for any other
    value, i.e. May through October.
    """
    dry_months = (11, 12, 1, 2, 3, 4)
    if month in dry_months:
        return 0
    return 1

def load_and_clean_data(data_directory="C:/Users/jessy/CCTHESS1-CCTHESS2-Dev-and-Docs/merged_datasets"):
    """
    Load, combine, clean, and feature engineer data from all PAGASA stations.

    Parameters
    ----------
    data_directory : str
        Folder searched for files matching ``*Daily Data.csv`` or
        ``*Daily_Data.csv``. The station name is the file name with the
        `` Daily Data.csv`` / `` Daily_Data.csv`` suffix removed.
        NOTE(review): the default is a machine-specific absolute path; pass
        the directory explicitly when running elsewhere.

    Returns
    -------
    pandas.DataFrame
        One row per station-day with missing numeric values imputed, the
        temporal/derived feature columns added, sorted by DATE (then STATION)
        with a fresh 0..n-1 index.

    Raises
    ------
    FileNotFoundError
        If no file in ``data_directory`` matches the expected patterns.
        (Previously this path called ``sys.exit(1)``; raising lets callers
        handle the failure like any other loading error.)
    ValueError
        If no matching file could be read with YEAR/MONTH/DAY columns.
    """
    print("Starting data loading and cleaning...")

    required_date_cols = ['YEAR', 'MONTH', 'DAY']
    target_cols = ['HI', 'TMAX', 'RH']

    # Find all CSV files based on expected patterns
    csv_patterns = ["*Daily Data.csv", "*Daily_Data.csv"]
    csv_files = []
    for pattern in csv_patterns:
        csv_files.extend(glob.glob(os.path.join(data_directory, pattern)))

    if not csv_files:
        raise FileNotFoundError(
            "No station data CSV files found. Please ensure files are in the expected directory."
        )

    all_data = []
    # sorted(): glob order is filesystem dependent; a fixed order keeps runs reproducible
    for file_path in sorted(csv_files):
        filename = os.path.basename(file_path)
        station_name = filename.replace(' Daily Data.csv', '').replace(' Daily_Data.csv', '')

        try:
            df = pd.read_csv(file_path)
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole load
            print(f"Error loading {filename}: {str(e)}")
            continue

        # Check for required date columns
        if not all(col in df.columns for col in required_date_cols):
            print(f"Warning: Skipping {station_name}. Missing required date columns.")
            continue

        df['STATION'] = station_name
        all_data.append(df)

    if not all_data:
        raise ValueError("No valid station data could be loaded! Check file paths and content.")

    df_combined = pd.concat(all_data, ignore_index=True)

    # ------------------
    # Data Cleaning
    # ------------------
    # Drop rows where every target is missing; .copy() so later column
    # assignments operate on an independent frame.
    df_clean = df_combined.dropna(subset=target_cols, how='all').copy()

    # Fill missing values: first by station median, then by overall median.
    # Assigning the filled column back (instead of a chained
    # `df[col].fillna(..., inplace=True)`) is required for the fill to take
    # effect under pandas Copy-on-Write.
    # NOTE(review): the target columns are numeric too, so partially missing
    # targets are imputed here, and the medians are computed on the full
    # data set before any train/test split.
    numeric_cols = df_clean.select_dtypes(include=[np.number]).columns.drop(
        required_date_cols, errors='ignore'
    )
    for col in numeric_cols:
        station_median = df_clean.groupby('STATION')[col].transform('median')
        df_clean[col] = df_clean[col].fillna(station_median)
        if df_clean[col].isna().any():
            # Stations with no observations for this column have a NaN median
            df_clean[col] = df_clean[col].fillna(df_clean[col].median())

    # ------------------
    # Feature Engineering
    # ------------------

    # Temporal Features
    df_clean['DATE'] = pd.to_datetime(df_clean[required_date_cols])
    df_clean['DAY_OF_YEAR'] = df_clean['DATE'].dt.dayofyear

    df_clean['SEASON'] = df_clean['MONTH'].apply(get_philippines_season)
    df_clean['IS_DRY_SEASON'] = (df_clean['SEASON'] == 0).astype(int)
    df_clean['IS_WET_SEASON'] = (df_clean['SEASON'] == 1).astype(int)
    df_clean['IS_SOUTHWEST_MONSOON'] = df_clean['MONTH'].isin([6, 7, 8, 9]).astype(int)
    df_clean['IS_NORTHEAST_MONSOON'] = df_clean['MONTH'].isin([12, 1, 2, 3]).astype(int)

    # Derived Meteorological and GEE Features (if columns exist)
    if 'TMAX' in df_clean.columns and 'TMIN' in df_clean.columns:
        df_clean['TEMP_RANGE'] = df_clean['TMAX'] - df_clean['TMIN']
        df_clean['TEMP_MEAN'] = (df_clean['TMAX'] + df_clean['TMIN']) / 2

    if 'WIND_SPEED' in df_clean.columns and 'WIND_DIRECTION' in df_clean.columns:
        # Decompose speed/direction (degrees) into U and V components
        wind_dir_rad = np.radians(df_clean['WIND_DIRECTION'])
        df_clean['WIND_U'] = -df_clean['WIND_SPEED'] * np.sin(wind_dir_rad)
        df_clean['WIND_V'] = -df_clean['WIND_SPEED'] * np.cos(wind_dir_rad)

    # Create vegetation/urban ratios from Google Earth Engine indices
    if 'NDVI_original' in df_clean.columns and 'NDBI_linear' in df_clean.columns:
        # +0.001 keeps the denominator away from zero when NDVI is 0
        df_clean['URBAN_VEG_RATIO'] = df_clean['NDBI_linear'] / (df_clean['NDVI_original'] + 0.001)

    # Sort chronologically for the time series split; STATION breaks ties so
    # rows sharing a date always come out in the same order.
    df_clean = df_clean.sort_values(by=['DATE', 'STATION']).reset_index(drop=True)

    print(f"Data processing complete. Final records: {len(df_clean):,}")
    print(f"Date Range: {df_clean['DATE'].min().date()} to {df_clean['DATE'].max().date()}")
    return df_clean

# Execute data loading and cleaning
try:
    df_processed = load_and_clean_data()
except Exception as e:
    # Print a short summary, then re-raise: the traceback is preserved and a
    # "Run All" stops here instead of continuing without df_processed.
    print(f"FATAL SETUP ERROR: {e}")
    raise

In [None]:
# Preview the first rows of the cleaned, feature-engineered frame
df_processed.head()

In [None]:
def prepare_features_and_targets(df):
    """
    Prepare feature matrix (X) and target variables (y).

    Parameters
    ----------
    df : pandas.DataFrame
        Processed data containing a 'STATION' column and the target columns
        'HI', 'TMAX' and 'RH'.

    Returns
    -------
    (X, y) : tuple of pandas.DataFrame
        X holds every numeric column of ``df`` except the targets and the
        date/season bookkeeping columns, plus one integer 0/1 ``STATION_<name>``
        indicator per station. y holds the three target columns.
    """
    print("Preparing features and targets...")

    target_cols = ['HI', 'TMAX', 'RH']

    # Columns that must not be used as model inputs.
    # NOTE(review): TEMP_RANGE / TEMP_MEAN (when present) are derived from TMAX
    # during feature engineering, so together with TMIN they reveal the TMAX
    # target exactly; confirm whether they should be excluded as well.
    exclude_cols = set(target_cols + ['DATE', 'STATION', 'YEAR', 'MONTH', 'DAY', 'SEASON'])

    # dtype=int is essential: pandas >= 2.0 returns boolean dummies by default,
    # and the numeric-only filter below would silently drop every station
    # indicator from X.
    station_dummies = pd.get_dummies(df['STATION'], prefix='STATION', dummy_na=False, dtype=int)
    df_with_stations = pd.concat([df, station_dummies], axis=1)

    feature_cols = [col for col in df_with_stations.columns if col not in exclude_cols]

    # Keep numeric features only (drops any leftover text/datetime columns)
    X = df_with_stations[feature_cols].select_dtypes(include=[np.number]).copy()
    y = df_with_stations[target_cols].copy()

    print(f"Features shape: {X.shape}")
    return X, y

# Prepare features and targets
# X: numeric model inputs; y: the HI / TMAX / RH target columns
X, y = prepare_features_and_targets(df_processed)

In [None]:
def create_time_series_split(X, y, train_pct=0.8, val_pct=0.1, test_pct=0.1):
    """
    Create a chronological Train -> Validation -> Test split (default 80/10/10).

    Assumes X and y are already sorted by date; rows are split by position,
    never shuffled.

    Parameters
    ----------
    X, y : sliceable, same length
        Features and targets (e.g. pandas DataFrames).
    train_pct, val_pct : float
        Fractions of the rows for the training and validation sets.
    test_pct : float
        Kept for backward compatibility. The test set always receives the
        rows remaining after the train and validation sets, so this value
        does not influence the split.

    Returns
    -------
    X_train, X_val, X_test, y_train, y_val, y_test

    Raises
    ------
    ValueError
        If X and y differ in length, a fraction is negative, or
        ``train_pct + val_pct`` exceeds 1.
    """
    total_len = len(X)
    if total_len != len(y):
        raise ValueError(f"X and y must have the same length (got {total_len} and {len(y)}).")

    # Small tolerance so that binary float error (e.g. 0.7 + 0.1 ==
    # 0.7999999999999999) cannot truncate a boundary one row too low.
    eps = 1e-9
    if train_pct < 0 or val_pct < 0 or train_pct + val_pct > 1 + eps:
        raise ValueError("train_pct and val_pct must be non-negative and sum to at most 1.")

    remainder_pct = max(0.0, 1.0 - train_pct - val_pct)
    print(
        "Creating chronological Train-Validation-Test split "
        f"({train_pct * 100:g}/{val_pct * 100:g}/{remainder_pct * 100:g})..."
    )

    train_end = int(total_len * train_pct + eps)
    val_end = min(total_len, int(total_len * (train_pct + val_pct) + eps))

    # Train set (first block of rows)
    X_train, y_train = X[:train_end], y[:train_end]

    # Validation set (next block)
    X_val, y_val = X[train_end:val_end], y[train_end:val_end]

    # Test set (everything that remains)
    X_test, y_test = X[val_end:], y[val_end:]

    print(f"Train samples: {len(X_train):,}")
    print(f"Validation samples: {len(X_val):,}")
    print(f"Test samples: {len(X_test):,}")

    return X_train, X_val, X_test, y_train, y_val, y_test

# Perform the chronological split
# (default proportions: first 80% train, next 10% validation, final 10% test)
X_train, X_val, X_test, y_train, y_val, y_test = create_time_series_split(X, y)

In [None]:
def train_xgboost_models(X_train, X_val, y_train, y_val):
    """
    Train one regression model per target (HI, TMAX, RH).

    The function name is kept for backward compatibility with existing
    callers. The estimator is the ``RandomForestRegressor`` this notebook
    imports; the previous body passed XGBoost-only arguments to it
    (``learning_rate``, ``eval_set``, ``early_stopping_rounds``, ...) and read
    ``best_iteration``, none of which RandomForestRegressor supports, so the
    call failed before any model was trained.
    NOTE(review): if XGBoost was the intended model, import
    ``xgboost.XGBRegressor`` and restore the boosting/early-stopping settings.

    Parameters
    ----------
    X_train, y_train : training features / targets (y_train indexed by target name)
    X_val, y_val : validation features / targets, used only to report RMSE

    Returns
    -------
    dict
        Maps 'HI', 'TMAX' and 'RH' to the fitted model for that target.
    """
    print("\nTraining Random Forest Models...")

    rf_params = {
        'n_estimators': 500,
        'max_depth': 8,
        'max_samples': 0.8,      # row subsampling per tree (was 'subsample')
        'max_features': 0.8,     # feature subsampling (was 'colsample_bytree')
        'min_samples_leaf': 3,   # minimum leaf size (was 'min_child_weight')
        'random_state': 42,
        'n_jobs': -1,
    }

    models = {}
    target_cols = ['HI', 'TMAX', 'RH']

    for target in target_cols:
        model = RandomForestRegressor(**rf_params)
        model.fit(X_train, y_train[target])
        models[target] = model

        if len(X_val) > 0:
            # Random forests have no early stopping; the validation set is
            # used purely as a sanity check on generalisation.
            val_rmse = np.sqrt(mean_squared_error(y_val[target], model.predict(X_val)))
            print(f"Trained {target} model. Validation RMSE: {val_rmse:.4f}")
        else:
            print(f"Trained {target} model. (no validation samples)")

    return models

In [None]:
def evaluate_models(models, X_test, y_test):
    """
    Score every per-target model on the test set and print a metrics table.

    For each of HI, TMAX and RH the model's predictions on ``X_test`` are
    compared with ``y_test[target]``; RMSE, MAE and R² are printed with four
    decimals. Nothing is returned.
    """
    print("\nEvaluating models on chronological test set...")

    def _metric_row(target):
        # One table row: target name followed by the formatted metrics
        predicted = models[target].predict(X_test)
        actual = y_test[target]
        rmse = np.sqrt(mean_squared_error(actual, predicted))
        mae = mean_absolute_error(actual, predicted)
        r2 = r2_score(actual, predicted)
        return [target] + [f"{value:.4f}" for value in (rmse, mae, r2)]

    table_rows = [_metric_row(name) for name in ('HI', 'TMAX', 'RH')]

    print("\nModel Performance Metrics (Test Set):")
    report = tabulate(table_rows, headers=['Target', 'RMSE', 'MAE', 'R²'], tablefmt='pipe')
    print(report)

# Train and evaluate models
# One model per target is fitted using the train/validation splits, then
# scored on the held-out test split (the metrics table is printed).
models = train_xgboost_models(X_train, X_val, y_train, y_val)
evaluate_models(models, X_test, y_test)

In [None]:
# %% [markdown]
# ## 5. Station-Specific Future Weather Prediction (Final Robust Logic)

# %%
def predict_next_days_by_station(models, df_processed, X_train, n_days=3, start_date=None):
    """
    Predict HI, TMAX and RH for the next ``n_days`` days for every station.

    Each forecast row is built from the station's historical median of every
    non-temporal feature, the calendar features of the forecast date, and the
    station's one-hot indicator. All rows are predicted in one batch per
    target model.

    Parameters
    ----------
    models : dict
        Maps 'HI', 'TMAX', 'RH' to fitted models exposing ``predict``.
    df_processed : pandas.DataFrame
        Historical data with a 'STATION' column; used for station medians.
    X_train : pandas.DataFrame
        Training features; only its column names and order are used.
    n_days : int
        Number of consecutive days to forecast, starting at ``start_date``.
    start_date : date-like, optional
        First forecast day. Defaults to today's date (``datetime.now()``);
        pass a fixed value for reproducible output.

    Returns
    -------
    pandas.DataFrame
        Columns Date, Day, Station, HI, TMAX, RH sorted by Date and Station.
        Empty if no station has a matching ``STATION_<name>`` column in
        ``X_train`` (a diagnostic message is printed in that case).
    """
    print(f"\nPredicting Weather for Next {n_days} Days (By Station)...")

    target_cols = ['HI', 'TMAX', 'RH']
    non_feature_cols = set(target_cols) | {'DATE', 'STATION', 'YEAR', 'MONTH', 'DAY', 'SEASON'}

    if start_date is None:
        first_day = datetime.now().date()
    else:
        first_day = pd.Timestamp(start_date).date()

    # ---------------------------------------------
    # 1. Pre-calculate station-specific median features
    # ---------------------------------------------
    # numeric_only=True: any leftover text column would otherwise make
    # groupby().median() raise on recent pandas versions.
    median_cols = [col for col in df_processed.columns if col not in non_feature_cols]
    station_medians = df_processed.groupby('STATION')[median_cols].median(numeric_only=True)
    station_medians_dict = station_medians.to_dict(orient='index')

    model_columns = list(X_train.columns)
    station_cols = {col for col in model_columns if col.startswith('STATION_')}

    # ---------------------------------------------
    # 2. Build one feature row per (day, station)
    # ---------------------------------------------
    meta_rows = []
    feature_rows = []
    for day_offset in range(n_days):
        target_date = first_day + timedelta(days=day_offset)
        month = target_date.month
        is_wet = get_philippines_season(month) == 1

        temporal_features = {
            'DAY_OF_YEAR': target_date.timetuple().tm_yday,
            'IS_DRY_SEASON': int(not is_wet),
            'IS_WET_SEASON': int(is_wet),
            'IS_SOUTHWEST_MONSOON': int(month in (6, 7, 8, 9)),
            'IS_NORTHEAST_MONSOON': int(month in (12, 1, 2, 3)),
        }

        for station, medians in station_medians_dict.items():
            station_dummy_col = f'STATION_{station}'

            # Stations the models never saw (no dummy column) cannot be encoded
            if station_dummy_col not in station_cols:
                continue

            # Station medians first, then the forecast date's calendar features
            base_features = {**medians, **temporal_features}

            # Every model column gets a value: station dummies start at 0 and
            # any feature absent from the medians falls back to 0.
            row = {
                col: 0 if col in station_cols else base_features.get(col, 0)
                for col in model_columns
            }
            row[station_dummy_col] = 1

            feature_rows.append(row)
            meta_rows.append({
                'Date': target_date.strftime("%Y-%m-%d"),
                'Day': target_date.strftime("%A"),
                'Station': station,
            })

    if not feature_rows:
        print("FATAL ERROR: Prediction DataFrame is empty.")
        print("Reason: No station successfully matched its historical features to the model's feature set.")
        return pd.DataFrame()

    # ---------------------------------------------
    # 3. Predict all rows at once, one call per target
    # ---------------------------------------------
    X_future = pd.DataFrame(feature_rows, columns=model_columns)
    df_pred = pd.DataFrame(meta_rows)
    for target in target_cols:
        df_pred[target] = np.asarray(models[target].predict(X_future)).ravel()

    return df_pred.sort_values(by=['Date', 'Station']).reset_index(drop=True)

# Make 3-day predictions for all stations
future_predictions_by_station = predict_next_days_by_station(models, df_processed, X_train, n_days=3)

# ----------------------------------------------------------------------
# Forecast display: today's per-station table, then the per-day averages
# across all stations
# ----------------------------------------------------------------------

if not future_predictions_by_station.empty:
    # Pair each forecast date with a human-readable period label
    unique_dates = future_predictions_by_station['Date'].unique()
    period_labels = ['Today', 'Tomorrow', 'Day After Tomorrow']
    date_to_period = {date: label for date, label in zip(unique_dates, period_labels)}

    # --- Per-station forecast for the first forecast date, hottest first ---
    today_date = unique_dates[0]
    print(f"\n--- {date_to_period[today_date]}'S FORECAST ({today_date}) (By Station) ---")

    is_today = future_predictions_by_station['Date'] == today_date
    today_forecast = future_predictions_by_station.loc[is_today].sort_values(by='HI', ascending=False)

    today_forecast_display = today_forecast[['Station', 'HI', 'TMAX', 'RH']].copy()
    for column, unit in (('HI', '°C'), ('TMAX', '°C'), ('RH', '%')):
        # Round to one decimal and append the unit for display
        today_forecast_display[column] = today_forecast_display[column].round(1).astype(str) + unit

    print(tabulate(
        today_forecast_display,
        headers=['Station', 'Heat Index', 'Max Temp', 'Humidity'],
        tablefmt='pipe',
        showindex=False,
    ))

    # --- Per-day mean over all stations ---
    print("\n--- 3-Day National Average Summary (For Comparison) ---")
    daily_summary = (
        future_predictions_by_station
        .groupby('Date')
        .agg({'HI': 'mean', 'TMAX': 'mean', 'RH': 'mean', 'Day': 'first'})
        .reset_index()
    )
    daily_summary['Period'] = daily_summary['Date'].map(date_to_period)

    daily_summary_display = daily_summary[['Period', 'Day', 'HI', 'TMAX', 'RH']].copy()
    for column, unit in (('HI', '°C'), ('TMAX', '°C'), ('RH', '%')):
        daily_summary_display[column] = daily_summary_display[column].round(1).astype(str) + unit

    print(tabulate(
        daily_summary_display,
        headers=['Period', 'Day', 'Avg. Heat Index', 'Avg. Max Temp', 'Avg. Humidity'],
        tablefmt='pipe',
        showindex=False,
    ))
else:
    print("Cannot display forecast: Prediction DataFrame is empty. See FATAL ERROR message above.")

In [None]:
# Debug: Check data types of station dummies
# Diagnostic cell: prepare_features_and_targets keeps only np.number columns
# in X, so dummy columns with a non-numeric dtype would be filtered out there.
station_dummies = pd.get_dummies(df_processed['STATION'], prefix='STATION', dummy_na=False)
print("Station dummy data types:")
print(station_dummies.dtypes.iloc[:5])

print("\nSample values:")
print(station_dummies.iloc[0, :5])

# Check if they are numeric (True only if every dummy column has one of the listed int/float dtypes)
print(f"\nAre station dummies numeric? {station_dummies.dtypes.apply(lambda x: x in ['int64', 'float64', 'int32', 'float32']).all()}")

# Fix: Convert to numeric explicitly
# (this only affects the local copy below; X is not rebuilt by this cell)
station_dummies_numeric = station_dummies.astype(int)
print(f"\nAfter conversion - data types:")
print(station_dummies_numeric.dtypes.iloc[:5])

# Number of converted dummy columns that pass the same np.number filter
print(f"\nWould survive numeric filter: {len(station_dummies_numeric.select_dtypes(include=[np.number]).columns)}")

In [None]:
from datetime import timedelta

# Naive two-day outlook from the most recent feature row.
# Uses the notebook's own objects: `df_processed` for the last observed date,
# `models` (one fitted model per target) and the final row of `X_test`.
# The previous version referenced `data` and `model`, which no cell defines,
# and assigned two dates to a single-row result.
# NOTE(review): the same prediction is reported for both future dates because
# only one feature row is available; confirm this is the intended behavior.

last_date = pd.to_datetime(df_processed['DATE'].max())

# Tomorrow and the day after tomorrow, relative to the last observed date
future_dates = [last_date + timedelta(days=offset) for offset in (1, 2)]

latest_features = X_test.tail(1)

# Shape (1, 3): one row, columns ordered HI, TMAX, RH
preds = np.array([[models[target].predict(latest_features)[0] for target in ('HI', 'TMAX', 'RH')]])

# Repeat the single prediction row once per future date
results = pd.DataFrame(
    np.repeat(preds, len(future_dates), axis=0),
    columns=['Pred_HI', 'Pred_TMAX', 'Pred_RH'],
)
results['Date'] = future_dates

# Reorder columns
results = results[['Date', 'Pred_HI', 'Pred_TMAX', 'Pred_RH']]
print(results)
