# Model Training for Rainfall Forecasting

This notebook implements and evaluates various machine learning models for rainfall forecasting in Selangor.

## Objectives:
- Train and compare multiple models (ARIMA and KNN are implemented below; ANN, RF, and XGBoost are planned — see Next Steps)
- Perform hyperparameter tuning
- Evaluate model performance using MAE, MSE, RMSE, R-squared
- Select the best-performing model

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Model evaluation
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, TimeSeriesSplit, GridSearchCV

# Models
from statsmodels.tsa.arima.model import ARIMA
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# Set plotting style.
# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8*";
# older releases only know the un-prefixed name. Pick whichever this install
# provides so the cell does not raise on an older Matplotlib.
PLOT_STYLE = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(PLOT_STYLE)
sns.set_palette("husl")

print("Libraries imported successfully!")

## 1. Load and Prepare Data

In [None]:
# Load engineered features
df = pd.read_csv("../data/processed/engineered_features.csv")
df['Date'] = pd.to_datetime(df['Date'])

# The split below is positional, so it is only a genuine "past vs. future"
# split when rows are in chronological order. Sort explicitly instead of
# trusting the CSV's row order; mergesort is stable, so rows sharing a date
# keep their original relative order. Resetting the index keeps it a clean
# 0..n-1 range after the reorder.
df = df.sort_values('Date', kind='mergesort').reset_index(drop=True)

# Split into features and target
X = df.drop(['Date', 'Precipitation_mm'], axis=1)
y = df['Precipitation_mm']

# Time-based train-test split (80-20 split): the earliest rows train the
# models, the most recent rows are held out for testing.
TRAIN_FRACTION = 0.8
split_idx = int(len(df) * TRAIN_FRACTION)
# Fail early with a clear message rather than with an IndexError in the
# date-range prints below (or an empty test set further downstream).
assert 0 < split_idx < len(df), "dataset is too small to form non-empty train and test sets"

X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]
dates_test = df['Date'].iloc[split_idx:]

print(f"Training set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")
print(f"Date range - Train: {df['Date'].iloc[0]} to {df['Date'].iloc[split_idx-1]}")
print(f"Date range - Test: {df['Date'].iloc[split_idx]} to {df['Date'].iloc[-1]}")

## 2. Model Training Functions

In [None]:
def evaluate_model(y_true, y_pred, model_name):
    """
    Score predictions against observed values and print a short report.

    The report is a blank line, a "<model_name> Performance:" header, and
    one "<metric>: <value>" line per metric, formatted to four decimals.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values to score against ``y_true``.
    model_name : str
        Label used in the printed header.

    Returns
    -------
    dict
        Metric values keyed by 'MAE', 'MSE', 'RMSE' and 'R2', in that order.
    """
    # RMSE is just the square root of MSE, so compute MSE once and reuse it.
    mse = mean_squared_error(y_true, y_pred)
    metrics = dict(
        MAE=mean_absolute_error(y_true, y_pred),
        MSE=mse,
        RMSE=np.sqrt(mse),
        R2=r2_score(y_true, y_pred),
    )

    report_lines = [f"{metric}: {value:.4f}" for metric, value in metrics.items()]
    print(f"\n{model_name} Performance:")
    print("\n".join(report_lines))

    return metrics

def train_arima(y_train, p_range=(0, 1, 2), d_range=(0, 1), q_range=(0, 1)):
    """
    Grid-search ARIMA orders on ``y_train`` and return the lowest-AIC fit.

    Every (p, d, q) combination drawn from the three ranges is fitted. An
    order whose fit raises is reported and skipped, so a single bad
    combination does not abort the whole search.

    Parameters
    ----------
    y_train : array-like
        Training series, passed unchanged to ``ARIMA``.
    p_range, d_range, q_range : iterable of int
        Candidate autoregressive, differencing and moving-average orders.
        Any iterable is accepted (lists, tuples, ranges, generators).

    Returns
    -------
    The fitted results object (the value returned by ``ARIMA(...).fit()``)
    with the smallest AIC among the candidates.

    Raises
    ------
    RuntimeError
        If no candidate produced a usable fit (the grid was empty, every fit
        failed, or no fit had an AIC below infinity). The most recent fitting
        error, if any, is chained as the cause.
    """
    best_aic = float('inf')
    best_order = None
    best_model = None
    last_error = None

    # Materialise the ranges up front: the inner loops iterate them repeatedly,
    # which would silently yield nothing for a one-shot iterator/generator.
    p_values, d_values, q_values = list(p_range), list(d_range), list(q_range)

    for p in p_values:
        for d in d_values:
            for q in q_values:
                order = (p, d, q)
                try:
                    results = ARIMA(y_train, order=order).fit()
                    aic = results.aic
                except Exception as exc:
                    # Catch `Exception` rather than using a bare `except:` so
                    # that KeyboardInterrupt/SystemExit still stop the search,
                    # and say why the order was skipped instead of hiding it.
                    last_error = exc
                    print(f"ARIMA{order} - skipped ({type(exc).__name__}: {exc})")
                    continue

                print(f"ARIMA{order} - AIC: {aic:.2f}")

                if aic < best_aic:
                    best_aic = aic
                    best_order = order
                    best_model = results

    if best_model is None:
        # Returning None here would only surface later as an opaque
        # AttributeError when the caller tries to forecast.
        raise RuntimeError(
            "No ARIMA order in the search grid produced a usable fit"
        ) from last_error

    print(f"\nBest ARIMA model: {best_order} with AIC: {best_aic:.2f}")
    return best_model

def train_knn(X_train, y_train):
    """
    Tune a K-nearest-neighbours regressor by grid search and return the winner.

    The search covers neighbour count, neighbour weighting and Minkowski
    power, scored by negative mean squared error over a 5-split
    ``TimeSeriesSplit``. The selected parameters are printed.

    Parameters
    ----------
    X_train : array-like
        Training feature matrix.
    y_train : array-like
        Training target values.

    Returns
    -------
    KNeighborsRegressor
        ``best_estimator_`` of the fitted grid search.
    """
    # Time-series splitter instead of the default K-fold, since the rows are
    # a chronological series.
    splitter = TimeSeriesSplit(n_splits=5)

    search_space = dict(
        n_neighbors=[3, 5, 7, 9],
        weights=['uniform', 'distance'],
        p=[1, 2],  # 1: manhattan, 2: euclidean
    )

    search = GridSearchCV(
        estimator=KNeighborsRegressor(),
        param_grid=search_space,
        scoring='neg_mean_squared_error',
        cv=splitter,
        n_jobs=-1,  # use all available cores
    )
    search.fit(X_train, y_train)

    chosen_params = search.best_params_
    print(f"Best KNN parameters: {chosen_params}")
    return search.best_estimator_

## 3. Train and Evaluate Models

In [None]:
# Test-set metrics for each trained model, keyed by model name
results = dict()

# --- ARIMA: fit on the training targets only, then forecast one step per test row
print("Training ARIMA model...")
arima_model = train_arima(y_train)
arima_pred = arima_model.forecast(steps=y_test.shape[0])
results.update(ARIMA=evaluate_model(y_test, arima_pred, "ARIMA"))

# --- KNN: tune on the training features, then predict from the test features
print("\nTraining KNN model...")
knn_model = train_knn(X_train, y_train)
knn_pred = knn_model.predict(X_test)
results.update(KNN=evaluate_model(y_test, knn_pred, "KNN"))

## 4. Results Analysis

In [None]:
# Compare model performance: one row per model, one column per metric
results_df = pd.DataFrame(results).transpose()
print("\nModel Performance Comparison:")
print(results_df)

# Visualize predictions vs actual over the held-out test period
fig, ax = plt.subplots(figsize=(12, 6))
plotted_series = [
    (y_test, dict(label='Actual', color='blue')),
    (arima_pred, dict(label='ARIMA', color='red', linestyle='--')),
    (knn_pred, dict(label='KNN', color='green', linestyle='-.')),
]
for series_values, line_kwargs in plotted_series:
    ax.plot(dates_test, series_values, **line_kwargs)
ax.set_title('Actual vs Predicted Rainfall')
ax.set_xlabel('Date')
ax.set_ylabel('Precipitation (mm)')
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()

## 5. Save Best Model

In [None]:
from pathlib import Path

import joblib

# Determine best model: the one with the lowest test-set RMSE
best_model_name = results_df['RMSE'].idxmin()
print(f"\nBest model based on RMSE: {best_model_name}")

# Make sure the output directory exists first: on a fresh checkout it may be
# missing, and saving into a missing directory fails.
save_dir = Path("../models/saved_models")
save_dir.mkdir(parents=True, exist_ok=True)

# Save best model
if best_model_name == 'ARIMA':
    arima_model.save(str(save_dir / "arima_model.pkl"))
elif best_model_name == 'KNN':
    joblib.dump(knn_model, save_dir / "knn_model.pkl")
else:
    # Without this branch the success message below would print even though
    # nothing was written (e.g. after a new model is added to `results`).
    raise ValueError(f"No save routine defined for model {best_model_name!r}")

print("Best model saved to models/saved_models/")

## 6. Next Steps

1. Implement additional models (Random Forest, XGBoost, ANN)
2. Add more sophisticated feature selection
3. Implement ensemble methods
4. Deploy best model in production pipeline