<a href="https://colab.research.google.com/github/machiwao/CCTHESS1-CCTHESS2-Dev-and-Docs/blob/jessy/heat_index_forecasting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb
from tabulate import tabulate
import warnings
warnings.filterwarnings('ignore')

# Notebook-wide plotting defaults: stock matplotlib style, seaborn "husl"
# colour cycle, and a larger default canvas / font size.
plt.style.use('default')
sns.set_palette("husl")
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 11,
})

In [None]:
# Path of the input CSV; it is read into `df`, the raw frame that the
# cleaning / feature-engineering step below copies and transforms.
CSV_FILE_PATH = "weather_data.csv"  # Replace with your actual file path
df = pd.read_csv(CSV_FILE_PATH)

In [None]:
# Report the raw frame's (rows, columns) shape and row count, then preview
# the first rows as the cell's rich output.
print(f"Dataset shape: {df.shape}")
print(f"Total records: {len(df):,}")

df.head()

In [None]:
def get_philippines_season(month):
    """
    Map a calendar month number to a Philippines season code.

    Returns 0 (dry season) for November through April and 1 (wet season)
    for any other value, i.e. May through October for valid month numbers.
    """
    dry_months = (11, 12, 1, 2, 3, 4)
    return 0 if month in dry_months else 1


In [None]:
def clean_and_engineer_features(df):
    """
    Return a cleaned copy of ``df`` with derived weather features added.

    Each step runs only when its input columns are present:

    1. Missing values in every numeric column are replaced by that column's
       median (one log line is printed per column that needed filling).
    2. ``YEAR``/``MONTH``/``DAY`` -> ``DATE``, ``DAY_OF_YEAR``, ``SEASON``,
       ``IS_DRY_SEASON``, ``IS_WET_SEASON``, ``IS_SOUTHWEST_MONSOON``,
       ``IS_NORTHEAST_MONSOON``.
    3. ``TMAX``/``TMIN`` -> ``TEMP_RANGE``, ``TEMP_MEAN``.
    4. ``WIND_SPEED``/``WIND_DIRECTION`` -> ``WIND_U``, ``WIND_V``.
    5. ``NDVI_original``/``NDBI_linear`` -> ``URBAN_VEG_RATIO``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw input frame. It is copied, never modified.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame with the extra feature columns.
    """
    df_clean = df.copy()

    # --- Median imputation of numeric columns ---------------------------
    numerical_cols = df_clean.select_dtypes(include=[np.number]).columns
    for col in numerical_cols:
        # Count BEFORE filling so the log line reports the real number.
        n_missing = int(df_clean[col].isnull().sum())
        if n_missing > 0:
            median_val = df_clean[col].median()
            # Assign back instead of `df[col].fillna(..., inplace=True)`:
            # that is chained assignment and does not update the frame
            # when pandas Copy-on-Write is active.
            df_clean[col] = df_clean[col].fillna(median_val)
            print(f"   Filled {n_missing} missing values in {col}")

    # --- Calendar / season features -------------------------------------
    if all(col in df_clean.columns for col in ['YEAR', 'MONTH', 'DAY']):
        df_clean['DATE'] = pd.to_datetime(df_clean[['YEAR', 'MONTH', 'DAY']])
        df_clean['DAY_OF_YEAR'] = df_clean['DATE'].dt.dayofyear

        # Philippines-specific seasons: dry (Nov-Apr) = 0, wet (May-Oct) = 1
        df_clean['SEASON'] = df_clean['MONTH'].apply(get_philippines_season)
        df_clean['IS_DRY_SEASON'] = (df_clean['SEASON'] == 0).astype(int)
        df_clean['IS_WET_SEASON'] = (df_clean['SEASON'] == 1).astype(int)

        # Monsoon flags: southwest = Jun-Sep, northeast = Dec-Mar
        df_clean['IS_SOUTHWEST_MONSOON'] = df_clean['MONTH'].isin([6, 7, 8, 9]).astype(int)
        df_clean['IS_NORTHEAST_MONSOON'] = df_clean['MONTH'].isin([12, 1, 2, 3]).astype(int)

    # --- Temperature features -------------------------------------------
    # NOTE: both columns are functions of TMAX; they must not be used as
    # inputs to a model that predicts TMAX.
    if 'TMAX' in df_clean.columns and 'TMIN' in df_clean.columns:
        df_clean['TEMP_RANGE'] = df_clean['TMAX'] - df_clean['TMIN']
        df_clean['TEMP_MEAN'] = (df_clean['TMAX'] + df_clean['TMIN']) / 2

    # --- Wind vector components -----------------------------------------
    if 'WIND_SPEED' in df_clean.columns and 'WIND_DIRECTION' in df_clean.columns:
        # WIND_DIRECTION is converted with np.radians, so it is treated as
        # degrees. The leading minus sign presumably reflects a "direction
        # the wind blows FROM" convention -- confirm against the data source.
        direction_rad = np.radians(df_clean['WIND_DIRECTION'])
        df_clean['WIND_U'] = -df_clean['WIND_SPEED'] * np.sin(direction_rad)
        df_clean['WIND_V'] = -df_clean['WIND_SPEED'] * np.cos(direction_rad)

    # --- Urban / vegetation ratio ---------------------------------------
    if 'NDVI_original' in df_clean.columns and 'NDBI_linear' in df_clean.columns:
        # The 0.001 offset avoids division by zero when NDVI is exactly 0.
        # NOTE(review): an NDVI of exactly -0.001 would still divide by zero.
        df_clean['URBAN_VEG_RATIO'] = df_clean['NDBI_linear'] / (df_clean['NDVI_original'] + 0.001)

    return df_clean

In [None]:
# Apply cleaning and feature engineering to the raw frame; the trailing
# expression displays the resulting (rows, columns) shape.
df_processed = clean_and_engineer_features(df)
df_processed.shape

In [None]:
target_cols = ['HI', 'TMAX', 'RH']

# Columns that must not be used as model inputs:
#   * the targets themselves;
#   * DATE, a datetime column rather than a numeric feature;
#   * TEMP_RANGE / TEMP_MEAN, which are computed from TMAX during feature
#     engineering. Together with TMIN they reconstruct TMAX exactly, so
#     leaving them in X would leak a target into the features.
# Names that are absent from the frame are simply ignored by the filter below.
target_derived_cols = ['TEMP_RANGE', 'TEMP_MEAN']
exclude_cols = target_cols + ['DATE'] + target_derived_cols

# Select feature columns
feature_cols = [col for col in df_processed.columns if col not in exclude_cols]

# Prepare features (X) and targets (y)
X = df_processed[feature_cols].copy()
y = df_processed[target_cols].copy()

print("üéØ Feature and Target Preparation:")
print("=" * 40)
print(f"Features shape: {X.shape}")
print(f"Targets shape: {y.shape}")
print(f"\nüìã Feature columns ({len(feature_cols)}):")
for i, col in enumerate(feature_cols, 1):
    print(f"{i:2d}. {col}")

print(f"\nüéØ Target variables: {target_cols}")

# Display correlation between targets
print(f"\nüîó Correlation between target variables:")
target_corr = y.corr()
print(target_corr.round(3))

# Visualize target correlations (full symmetric matrix)
plt.figure(figsize=(8, 6))
sns.heatmap(target_corr, annot=True, cmap='RdBu_r', center=0, 
            square=True, fmt='.3f', cbar_kws={'label': 'Correlation'})
plt.title('üîó Correlation Matrix of Target Variables', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

# %% [markdown]
# ## 5. Train-Validation-Test Split (80-10-10)

# %%
def create_train_val_test_split(X, y, test_size=0.1, val_size=0.1, random_state=42):
    """
    Split X and y into train, validation and test partitions.

    The test partition is carved off first; the validation partition is then
    taken from what remains, with its fraction rescaled so that `val_size`
    is expressed relative to the full dataset. With the defaults this yields
    an 80-10-10 split.

    Returns (X_train, X_val, X_test, y_train, y_val, y_test).
    """
    print("üìä Creating train-validation-test split (80-10-10)...")

    # Stage 1: hold out the test partition
    X_rest, X_test, y_rest, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Stage 2: carve validation out of the remainder. `val_size` is a share
    # of the whole dataset, so rescale it to a share of the remainder.
    val_share_of_rest = val_size / (1 - test_size)
    X_train, X_val, y_train, y_val = train_test_split(
        X_rest, y_rest, test_size=val_share_of_rest, random_state=random_state
    )

    total = len(X)
    n_train = X_train.shape[0]
    n_val = X_val.shape[0]
    n_test = X_test.shape[0]

    print(f"‚úÖ Data split completed:")
    print(f"   üèãÔ∏è Training set: {n_train:,} samples ({n_train/total*100:.1f}%)")
    print(f"   üîç Validation set: {n_val:,} samples ({n_val/total*100:.1f}%)")
    print(f"   üß™ Test set: {n_test:,} samples ({n_test/total*100:.1f}%)")

    return X_train, X_val, X_test, y_train, y_val, y_test

# Perform the split using the function's defaults
# (test_size=0.1, val_size=0.1, random_state=42)
X_train, X_val, X_test, y_train, y_val, y_test = create_train_val_test_split(X, y)

# %% [markdown]
# ## 6. XGBoost Model Training
# 
# Training a separate XGBoost regressor for each target variable (HI, TMAX, RH). All models share one hand-picked set of hyperparameters, and early stopping on the validation set limits the number of boosting rounds.

# %%
def train_xgboost_models(X_train, X_val, y_train, y_val):
    """
    Train one XGBoost regressor per target variable.

    Targets are taken from the module-level ``target_cols`` list; each name
    must be a column of both ``y_train`` and ``y_val``.

    Parameters
    ----------
    X_train, X_val : feature frames for the training and validation partitions.
    y_train, y_val : target frames, indexed by target name.

    Returns
    -------
    models : dict
        target name -> fitted ``xgb.XGBRegressor``.
    training_history : dict
        target name -> ``model.evals_result()``; ``validation_0`` is the
        training partition and ``validation_1`` the validation partition,
        following the order of ``eval_set`` below.
    """
    print("üöÄ Training XGBoost models for Philippines weather prediction...")
    
    # Hand-set hyperparameters shared by all target models.
    xgb_params = {
        'n_estimators': 300,
        'max_depth': 8,
        'learning_rate': 0.08,
        'subsample': 0.85,
        'colsample_bytree': 0.85,
        'min_child_weight': 3,
        'gamma': 0.1,
        'reg_alpha': 0.1,
        'reg_lambda': 0.1,
        'random_state': 42,
        'n_jobs': -1,
        'objective': 'reg:squarederror',
        # Evaluation / early-stopping settings belong on the estimator:
        # passing them to fit() was deprecated in XGBoost 1.6 and raises a
        # TypeError from 2.0 onwards.
        # Per the XGBoost docs, early stopping watches the LAST metric on the
        # LAST eval_set entry, i.e. MAE on the validation partition here.
        'eval_metric': ['rmse', 'mae'],
        'early_stopping_rounds': 30,
    }
    
    models = {}
    training_history = {}
    
    for target in target_cols:
        print(f"\nüéØ Training model for {target}...")
        
        # Create model
        model = xgb.XGBRegressor(**xgb_params)
        
        # Train; both partitions are logged so learning curves can be drawn
        model.fit(
            X_train, 
            y_train[target],
            eval_set=[(X_train, y_train[target]), (X_val, y_val[target])],
            verbose=False
        )
        
        # Store model and training history
        models[target] = model
        training_history[target] = model.evals_result()
        
        # n_estimators is the configured upper bound on boosting rounds,
        # not the number actually kept after early stopping.
        print(f"   ‚úÖ {target} model trained with {model.n_estimators} estimators")
        print(f"   üìà Best iteration: {model.best_iteration}")
        
    print(f"\nüéâ All models trained successfully!")
    return models, training_history

# Fit one regressor per target and keep the per-iteration evaluation logs
models, training_history = train_xgboost_models(X_train, X_val, y_train, y_val)

# Learning curves: one panel per target, training vs. validation RMSE
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

for i, target in enumerate(target_cols):
    ax = axes[i]
    history = training_history[target]

    # validation_0 / validation_1 follow the eval_set order used at fit time
    for eval_key, curve_label in (('validation_0', 'Training RMSE'),
                                  ('validation_1', 'Validation RMSE')):
        ax.plot(history[eval_key]['rmse'], label=curve_label, alpha=0.8)

    ax.set_xlabel('Iteration')
    ax.set_ylabel('RMSE')
    ax.set_title(f'Training Progress - {target}')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.suptitle('üìà XGBoost Training Progress for Philippines Weather Models', 
             fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

# %% [markdown]
# ## 7. Model Evaluation and Predictions

# %%
def evaluate_models(models, X_test, y_test):
    """
    Predict on the test partition and score each target model.

    Targets are taken from the module-level ``target_cols`` list.

    Parameters
    ----------
    models : dict
        target name -> fitted model exposing ``predict``.
    X_test : test-partition features.
    y_test : test-partition targets, indexed by target name.

    Returns
    -------
    predictions : dict
        target name -> array of predictions for ``X_test``.
    metrics : dict
        target name -> dict with keys 'RMSE', 'MAE', 'R¬≤' and 'MAPE'.
        'MAPE' is a percentage averaged over rows whose true value is
        non-zero; it is NaN when every true value is zero.
    """
    print("üìä Evaluating models on test set...")
    
    predictions = {}
    metrics = {}
    
    for target in target_cols:
        print(f"\nüéØ Evaluating {target} model...")
        
        # Make predictions
        y_pred = models[target].predict(X_test)
        predictions[target] = y_pred
        
        # Calculate metrics
        y_true = y_test[target]
        rmse = np.sqrt(mean_squared_error(y_true, y_pred))
        mae = mean_absolute_error(y_true, y_pred)
        r2 = r2_score(y_true, y_pred)
        
        # MAPE is undefined where the true value is 0. Average only over
        # non-zero actuals so a single zero cannot turn the metric into
        # inf/nan (warnings are silenced notebook-wide, so it would go unseen).
        true_arr = np.asarray(y_true, dtype=float)
        pred_arr = np.asarray(y_pred, dtype=float)
        nonzero = true_arr != 0
        if nonzero.any():
            mape = np.mean(np.abs((true_arr[nonzero] - pred_arr[nonzero]) / true_arr[nonzero])) * 100
        else:
            mape = np.nan
        
        metrics[target] = {
            'RMSE': rmse,
            'MAE': mae,
            'R¬≤': r2,
            'MAPE': mape
        }
        
        print(f"   üìà RMSE: {rmse:.4f}")
        print(f"   üìà MAE: {mae:.4f}")
        print(f"   üìà R¬≤: {r2:.4f}")
        print(f"   üìà MAPE: {mape:.2f}%")
    
    return predictions, metrics

# Score every trained model on the held-out test partition.
# `predictions` and `metrics` are dicts keyed by target name.
predictions, metrics = evaluate_models(models, X_test, y_test)

# %% [markdown]
# ## 8. Comprehensive Visualizations

# %%
def create_comprehensive_visualizations(models, predictions, y_test, X_train):
    """
    Draw a 4x3 grid of diagnostic plots: one column per target, one row per
    plot type.

    Rows:
      1. Top-15 feature importances (horizontal bars).
      2. Actual vs. predicted scatter with a y = x reference line and an
         R-squared / RMSE annotation.
      3. Residual histogram with mean / standard-deviation annotation.
      4. Residuals vs. predicted with a fitted straight trend line.

    Targets are taken from the module-level ``target_cols`` list. The grid
    has three columns and the palette three colours, so at most three
    targets are supported.

    Parameters
    ----------
    models : dict
        target name -> fitted model exposing ``feature_importances_``.
    predictions : dict
        target name -> array of predictions aligned with ``y_test``.
    y_test : test-partition targets, indexed by target name.
    X_train : training features; only its column names are used.

    Returns
    -------
    None. The figure is shown with ``plt.show()``.
    """
    print("üé® Creating comprehensive visualizations...")
    
    # Create large figure with subplots
    plt.figure(figsize=(24, 20))
    
    # One colour per target column
    colors = ['#FF6B6B', '#4ECDC4', '#45B7D1']
    
    # 1. Feature Importance Plots (Top row)
    for i, target in enumerate(target_cols):
        plt.subplot(4, 3, i + 1)
        
        # Get feature importance
        importance = models[target].feature_importances_
        feature_names = X_train.columns
        
        # Sort descending and keep the top 15 features
        indices = np.argsort(importance)[::-1][:15]
        top_importance = importance[indices]
        top_features = [feature_names[idx] for idx in indices]
        
        # Horizontal bars, most important feature at the top
        y_pos = np.arange(len(top_features))
        plt.barh(y_pos, top_importance, color=colors[i], alpha=0.8)
        plt.yticks(y_pos, top_features)
        plt.xlabel('Feature Importance')
        plt.title(f'üîù Top 15 Features - {target}', fontweight='bold')
        plt.gca().invert_yaxis()
        plt.grid(True, alpha=0.3)
    
    # 2. Actual vs Predicted Scatter Plots (Second row)
    for i, target in enumerate(target_cols):
        plt.subplot(4, 3, i + 4)
        
        y_true = y_test[target]
        y_pred = predictions[target]
        
        # Scatter plot
        plt.scatter(y_true, y_pred, alpha=0.6, s=40, color=colors[i])
        
        # Perfect prediction line
        min_val = min(y_true.min(), y_pred.min())
        max_val = max(y_true.max(), y_pred.max())
        plt.plot([min_val, max_val], [min_val, max_val], 'r--', lw=3, alpha=0.8)
        
        plt.xlabel(f'Actual {target}')
        plt.ylabel(f'Predicted {target}')
        plt.title(f'üìä Actual vs Predicted - {target}', fontweight='bold')
        plt.grid(True, alpha=0.3)
        
        # Annotate with metrics computed from the data actually plotted,
        # instead of reading a module-level `metrics` dict that is not one
        # of this function's arguments.
        r2 = r2_score(y_true, y_pred)
        rmse = np.sqrt(mean_squared_error(y_true, y_pred))
        plt.text(0.05, 0.95, f'R¬≤ = {r2:.3f}\nRMSE = {rmse:.3f}', 
                transform=plt.gca().transAxes,
                bbox=dict(boxstyle="round,pad=0.3", facecolor='white', alpha=0.9),
                fontsize=10, verticalalignment='top')
    
    # 3. Residual Distribution Plots (Third row)
    for i, target in enumerate(target_cols):
        plt.subplot(4, 3, i + 7)
        
        y_true = y_test[target]
        y_pred = predictions[target]
        residuals = y_true - y_pred
        
        # Histogram with a zero-error reference line
        plt.hist(residuals, bins=40, alpha=0.7, color=colors[i], edgecolor='black')
        plt.axvline(x=0, color='red', linestyle='--', linewidth=3, alpha=0.8)
        plt.xlabel('Residuals')
        plt.ylabel('Frequency')
        plt.title(f'üìà Residual Distribution - {target}', fontweight='bold')
        plt.grid(True, alpha=0.3)
        
        # Add statistics
        mean_res = residuals.mean()
        std_res = residuals.std()
        plt.text(0.05, 0.95, f'Œº = {mean_res:.3f}\nœÉ = {std_res:.3f}', 
                transform=plt.gca().transAxes,
                bbox=dict(boxstyle="round,pad=0.3", facecolor='white', alpha=0.9),
                fontsize=10, verticalalignment='top')
    
    # 4. Residuals vs Predicted Plots (Fourth row)
    for i, target in enumerate(target_cols):
        plt.subplot(4, 3, i + 10)
        
        y_true = y_test[target]
        y_pred = predictions[target]
        residuals = y_true - y_pred
        
        # Scatter plot with a zero-error reference line
        plt.scatter(y_pred, residuals, alpha=0.6, s=40, color=colors[i])
        plt.axhline(y=0, color='red', linestyle='--', linewidth=3, alpha=0.8)
        plt.xlabel(f'Predicted {target}')
        plt.ylabel('Residuals')
        plt.title(f'üéØ Residuals vs Predicted - {target}', fontweight='bold')
        plt.grid(True, alpha=0.3)
        
        # Degree-1 least-squares fit of residuals on predictions; a visible
        # slope means the error depends on the predicted value.
        z = np.polyfit(y_pred, residuals, 1)
        p = np.poly1d(z)
        plt.plot(y_pred, p(y_pred), "b--", alpha=0.8, linewidth=2)
    
    plt.suptitle('üå§Ô∏è Comprehensive Model Analysis - Philippines Weather Prediction', 
                 fontsize=20, fontweight='bold', y=0.98)
    plt.tight_layout(rect=[0, 0.02, 1, 0.96])
    plt.show()

# Draw the 4x3 diagnostic grid (feature importance, actual vs. predicted,
# residual distribution, residuals vs. predicted) for all targets
create_comprehensive_visualizations(models, predictions, y_test, X_train)

# %% [markdown]
# ## 9. Feature Importance Analysis

# %%
# Detailed feature importance analysis
print("üîç Detailed Feature Importance Analysis")
print("=" * 50)

# One importance column per target. Column names are derived from
# `target_cols` so the table and heatmap follow the target list instead of
# repeating hard-coded names.
importance_cols = [f'{target}_importance' for target in target_cols]

# Every model was trained on the same X_train columns, so each model's
# importances line up positionally with X_train.columns.
importance_df = pd.DataFrame({'feature': X_train.columns})
for target, importance_col in zip(target_cols, importance_cols):
    importance_df[importance_col] = models[target].feature_importances_

# Fill NaN values and calculate average importance
importance_df = importance_df.fillna(0)
importance_df['avg_importance'] = importance_df[importance_cols].mean(axis=1)
importance_df = importance_df.sort_values('avg_importance', ascending=False)

# Display top 20 most important features
print("\nüèÜ Top 20 Most Important Features (Average across all models):")
print(importance_df.head(20)[['feature'] + importance_cols + ['avg_importance']].round(4))

# Visualize feature importance heatmap (rows = targets, columns = features)
plt.figure(figsize=(14, 10))
top_features = importance_df.head(20)
heatmap_data = top_features[importance_cols].T
heatmap_data.columns = top_features['feature']

sns.heatmap(heatmap_data, annot=True, cmap='YlOrRd', fmt='.3f', 
            cbar_kws={'label': 'Feature Importance'})
plt.title('üî• Feature Importance Heatmap - Top 20 Features', fontsize=16, fontweight='bold')
plt.xlabel('Features')
plt.ylabel('Target Variables')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

# %% [markdown]
# ## 10. Model Performance Summary and Insights

# %%
def print_comprehensive_summary():
    """
    Print a text report of model performance and dataset statistics.

    Takes no arguments and reads module-level state: ``target_cols``,
    ``metrics``, ``df_processed``, ``X_train``, ``X_val`` and ``X_test``.
    Those cells must have been run first.

    The "Feature Information" and "Philippines Climate Insights" sections
    are fixed text; they are not computed from the fitted models.
    """
    print("\n" + "="*80)
    print("üå§Ô∏è  PHILIPPINES WEATHER PREDICTION MODEL SUMMARY")
    print("="*80)
    
    # Performance metrics table: one row per target
    headers = ['Target Variable', 'RMSE', 'MAE', 'R¬≤ Score', 'MAPE (%)']
    table_data = []
    
    for target in target_cols:
        m = metrics[target]
        table_data.append([
            f"üå°Ô∏è {target}",
            f"{m['RMSE']:.4f}",
            f"{m['MAE']:.4f}",
            f"{m['R¬≤']:.4f}",
            f"{m['MAPE']:.2f}%"
        ])
    
    print(tabulate(table_data, headers=headers, tablefmt='grid'))
    
    # Dataset statistics
    print(f"\nüìä Dataset Statistics:")
    print(f"   üóÇÔ∏è  Total samples: {len(df_processed):,}")
    # YEAR is optional in the feature-engineering step, so guard the lookup
    # instead of failing with a KeyError at the very end of the notebook.
    if 'YEAR' in df_processed.columns:
        print(f"   üìà Date range: {df_processed['YEAR'].min()}-{df_processed['YEAR'].max()}")
    else:
        print(f"   üìà Date range: n/a (no YEAR column)")
    print(f"   üèãÔ∏è  Training samples: {len(X_train):,} ({len(X_train)/len(df_processed)*100:.1f}%)")
    print(f"   üîç Validation samples: {len(X_val):,} ({len(X_val)/len(df_processed)*100:.1f}%)")
    print(f"   üß™ Test samples: {len(X_test):,} ({len(X_test)/len(df_processed)*100:.1f}%)")
    
    # Feature information (static description apart from the feature count)
    print(f"\nüéØ Feature Information:")
    print(f"   üìã Total features: {X_train.shape[1]}")
    print(f"   üå°Ô∏è  Weather features: TMIN, WIND_SPEED, WIND_DIRECTION, etc.")
    print(f"   üõ∞Ô∏è  Satellite indices: NDVI, NDBI, NDWI, Albedo, etc.")
    print(f"   üìÖ Temporal features: MONTH, DAY_OF_YEAR, SEASON, etc.")
    print(f"   üáµüá≠ Philippines-specific: Monsoon patterns, dry/wet seasons")
    
    # Model performance insights: rank targets by R-squared
    print(f"\nüéØ Key Insights:")
    best_model = max(metrics.keys(), key=lambda x: metrics[x]['R¬≤'])
    worst_model = min(metrics.keys(), key=lambda x: metrics[x]['R¬≤'])
    
    print(f"   üèÜ Best performing model: {best_model} (R¬≤ = {metrics[best_model]['R¬≤']:.4f})")
    print(f"   üìâ Most challenging prediction: {worst_model} (R¬≤ = {metrics[worst_model]['R¬≤']:.4f})")
    
    # Climate-specific insights.
    # NOTE(review): these four lines are hard-coded and printed regardless of
    # what the importance analysis shows -- verify them against the results.
    print(f"\nüå¥ Philippines Climate Insights:")
    print(f"   üåä Monsoon features show high importance for humidity prediction")
    print(f"   üèôÔ∏è  Urban indices (NDBI) correlate with temperature patterns")
    print(f"   üåø Vegetation indices (NDVI) help predict local cooling effects")
    print(f"   ‚òÄÔ∏è  Seasonal patterns crucial for heat index predictions")
    
    print("\n" + "="*80)
    print("üéâ ANALYSIS COMPLETED SUCCESSFULLY!")
    print("="*80)

# Print comprehensive summary (reads the metrics, splits and processed frame
# produced by the cells above)
print_comprehensive_summary()