# Random Forest
To train, test, and evaluate our models, the data is split into training, testing, and validation sets.
 
2014-2022 will be used as training and testing while 2023-2024 will be used for validation.
 
## Number of models
The models to be made are:
- 1 week ahead
- 2 weeks ahead
- 3 weeks ahead
- 4 weeks ahead
- 12 weeks ahead

Two main types of model will be made:
- Municipal Level
- Provincial Level

There are 42 municipalities in the province + 1 main city, Iloilo City. Hence, there will be 10x43 = 430 Local Models and 1x5 = 5 Provincial Models, making a total of 435 models.

Each n-week-ahead model will use the same hyperparameters across all municipalities, to avoid having too many models and unnecessary complexity.


## 10-Fold Cross Validation
The training and testing data will be used for the 10-Fold Cross Validation.

## Utilities
Functions that serve as utilities for the models will be made in a separate file. These functions will be used to:
- Save the predicted and actual observations to a CSV file.
- Save the evaluation reports to a CSV file.


In [25]:
# import necessary modules
import pandas as pd
import sys
sys.path.append('..')
import models_utils
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Cross Validation with GridSearchCV
To find the optimal hyperparameters of the Random Forest model, GridSearchCV will be used with cv=10.

In [26]:
def get_best_hyperparameters(df, n_week_ahead, features=["Temperature", "Humidity", "Precipitation"], target="Cases", date_col="Year-Week"):
    """
    Find the best Random Forest hyperparameters for one dataset and horizon.

    The frame is turned into a modelling table by
    ``models_utils.prepare_dataframe`` and an exhaustive ``GridSearchCV`` with
    10 folds is run over a small grid of forest settings.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data; must contain the `features`, `target` and `date_col` columns.
    n_week_ahead : int
        Forecast horizon, forwarded to ``prepare_dataframe``.
    features : list of str
        Predictor column names. The default list is never mutated here; a
        copy is what gets forwarded.
    target : str
        Name of the column to predict.
    date_col : str
        Name of the date column; excluded from the predictors.

    Returns
    -------
    dict
        ``GridSearchCV.best_params_`` -- the winning value for each key of
        the parameter grid.
    """
    # Forward the caller's arguments instead of hard-coded values so that each
    # horizon / column configuration is actually tuned on its own data.
    # NOTE(review): assumes the fifth positional argument of prepare_dataframe
    # is the forecast horizon (the previous call passed a literal 1) -- confirm
    # against models_utils.
    data = models_utils.prepare_dataframe(df, list(features), target, date_col, n_week_ahead)
    X_train = data.drop(columns=[target, date_col])
    y_train = data[target]

    # Fixed seed (same value train_RF uses) so repeated searches are comparable.
    rf = RandomForestRegressor(random_state=42)

    # Create a parameter grid to search
    param_grid = {
        'n_estimators': [100, 200],
        'max_depth': [5, 10, 15],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2],
    }

    # No `scoring` is given, so GridSearchCV ranks candidates with the
    # estimator's own default scorer. n_jobs=-1 uses every available core.
    grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)

    # Fit the GridSearchCV object to the training data
    grid_search.fit(X_train, y_train)

    return grid_search.best_params_

In [27]:
# Load the municipality names: one name per line, surrounding whitespace removed.
municipals = []
with open("../municipals.txt", "r") as names_file:
    municipals.extend(map(str.strip, names_file))
municipals

['Ajuy',
 'Alimodian',
 'Anilao',
 'Badiangan',
 'Balasan',
 'Banate',
 'Barotac Nuevo',
 'Barotac Viejo',
 'Batad',
 'Bingawan',
 'Cabatuan',
 'Calinog',
 'Carles',
 'Passi City',
 'Concepcion',
 'Dingle',
 'Duenas',
 'Dumangas',
 'Estancia',
 'Guimbal',
 'Iloilo City',
 'Igbaras',
 'Janiuay',
 'Lambunao',
 'Leganes',
 'Lemery',
 'Leon',
 'Maasin',
 'Miagao',
 'Mina',
 'New Lucena',
 'Oton',
 'Pavia',
 'Pototan',
 'San Dionisio',
 'San Enrique',
 'San Joaquin',
 'San Rafael',
 'Santa Barbara',
 'Sara',
 'Tigbauan',
 'Tubungan',
 'Zarraga']

## For Municipal Level

In [29]:
# Run the hyperparameter search for every (horizon, municipality) pair and
# collect one record per pair. Records are appended one at a time so that
# whatever has finished is still available if the run is interrupted.
n_weeks_ahead = [1, 2, 3, 4, 8, 12]
best_parameters = []
for horizon in n_weeks_ahead:
    for town in municipals:
        town_frame = pd.read_csv(f"../../data/Merged Data/{town}_merged.csv")
        tuned_params = get_best_hyperparameters(df=town_frame, n_week_ahead=horizon)
        record = {
            "Municipal": town,
            "Weeks Ahead": horizon,
            "Best Hyperparameters": tuned_params,
        }
        best_parameters.append(record)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[f'{feature}_lag_{lag}'] = data[feature].shift(lag)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[f'{feature}_lag_{lag}'] = data[feature].shift(lag)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[f'{feature}_lag_{lag}'] = data[feature].shift(lag)
A value is trying to be set on a copy

KeyboardInterrupt: 

In [None]:
# Checking the count of best hyperparameters
from collections import Counter

# Each "Best Hyperparameters" entry is a dict, and dicts are unhashable, so
# they cannot be used as dictionary keys directly. Freeze every combination
# into a tuple of (name, value) pairs sorted by name; identical combinations
# then map to the same key regardless of insertion order.
best_parameters_count = Counter(
    tuple(sorted(param["Best Hyperparameters"].items()))
    for param in best_parameters
)

In [1]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import pandas as pd
import numpy as np

def train_RF(data, features, target, date_col, n_ahead, year_test_start, year_test_end):
    """
    Train a Random Forest on lagged features and score it on a held-out window.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the `features`, `target` and `date_col` columns. The
        caller's frame is not modified. Lags are built with ``shift``, so rows
        are assumed to be in chronological order with one row per period --
        TODO confirm against the merged CSVs.
    features : list of str
        Environmental predictor columns.
    target : str
        Column to predict.
    date_col : str
        Date column; converted with ``pd.to_datetime`` and used only for the
        train/test split.
    n_ahead : int
        Forecast horizon; the smallest target lag offered to the model.
    year_test_start, year_test_end
        Bounds of the test window: ``year_test_start <= date < year_test_end``.
        Every row dated before `year_test_start` is used for training.

    Returns
    -------
    tuple
        ``(predict_data, MAE, MSE, feature_importance_dict)`` where
        `predict_data` is a list of predictions rounded to int, `MAE` / `MSE`
        are computed on those rounded predictions, and
        `feature_importance_dict` maps each predictor column to its importance.

    Raises
    ------
    ValueError
        If the training or the test split is empty after lagging.
    """
    # Select relevant columns. The explicit copy makes the column assignments
    # below act on an independent frame (no SettingWithCopyWarning).
    rel_col = list(features) + [target, date_col]
    data = data[rel_col].copy()
    data[date_col] = pd.to_datetime(data[date_col])

    # Define lags
    env_lags = [1, 2, 3, 4]
    # Target lags run from n_ahead up to 11. For n_ahead >= 12 that range would
    # be empty and the model would see no case history at all, so always keep
    # at least the lag equal to the horizon itself.
    cases_lag = range(n_ahead, max(12, n_ahead + 1))

    # NOTE(review): the unlagged environmental columns and their lags 1-4 stay
    # in X regardless of n_ahead -- confirm these values are really known
    # n_ahead periods before the date being forecast.
    for lag in env_lags:
        for feature in features:
            data[f'{feature}_lag_{lag}'] = data[feature].shift(lag)

    for lag in cases_lag:
        data[f'{target}_lag_{lag}'] = data[target].shift(lag)

    # Remove any rows with missing values due to lagging
    data = data.dropna()

    # Split data into training and testing sets
    train_data = data[data[date_col] < year_test_start]
    test_data = data[(data[date_col] >= year_test_start) & (data[date_col] < year_test_end)]

    # Fail early with a readable message instead of an opaque estimator error.
    if train_data.empty or test_data.empty:
        raise ValueError(
            f"Empty split for n_ahead={n_ahead}: "
            f"{len(train_data)} training rows, {len(test_data)} test rows "
            f"(test window {year_test_start} to {year_test_end})."
        )

    X_train = train_data.drop(columns=[target, date_col])
    y_train = train_data[target]
    X_test = test_data.drop(columns=[target, date_col])
    y_test = test_data[target]

    # Initialize and train the Random Forest model (fixed seed, all cores)
    rf_model = RandomForestRegressor(
        n_estimators=100,
        max_depth=None,
        random_state=42,
        n_jobs=-1
    )
    rf_model.fit(X_train, y_train)

    # Predict on test data; predictions are rounded to whole numbers
    predict_data = [int(round(x)) for x in rf_model.predict(X_test)]
    actual_data = y_test.values

    # Calculate Mean Absolute Error and Mean Squared Error
    MAE = mean_absolute_error(actual_data, predict_data)
    MSE = mean_squared_error(actual_data, predict_data)

    # Feature importances keyed by predictor column name
    feature_importance_dict = dict(zip(X_train.columns, rf_model.feature_importances_))
    return predict_data, MAE, MSE, feature_importance_dict


In [2]:
# Build the list of municipality names from the text file (one name per line).
municipals = []
with open("../municipals.txt", "r") as handle:
    municipals.extend(entry.strip() for entry in handle)
municipals

['Ajuy',
 'Alimodian',
 'Anilao',
 'Badiangan',
 'Balasan',
 'Banate',
 'Barotac Nuevo',
 'Barotac Viejo',
 'Batad',
 'Bingawan',
 'Cabatuan',
 'Calinog',
 'Carles',
 'Passi City',
 'Concepcion',
 'Dingle',
 'Duenas',
 'Dumangas',
 'Estancia',
 'Guimbal',
 'Iloilo City',
 'Igbaras',
 'Janiuay',
 'Lambunao',
 'Leganes',
 'Lemery',
 'Leon',
 'Maasin',
 'Miagao',
 'Mina',
 'New Lucena',
 'Oton',
 'Pavia',
 'Pototan',
 'San Dionisio',
 'San Enrique',
 'San Joaquin',
 'San Rafael',
 'Santa Barbara',
 'Sara',
 'Tigbauan',
 'Tubungan',
 'Zarraga']

In [3]:
import sys
sys.path.append('..')
import models_utils
# Train, evaluate and save one Random Forest per (municipality, horizon) pair.
n_weeks_ahead = [1,2,3,4,8,12]
for municipal in municipals:
    # The merged CSV does not depend on the horizon, so read and parse it once
    # per municipality rather than once per (municipality, horizon) pair.
    municipal_source_df = pd.read_csv(f"../../data/Merged Data/{municipal}_merged.csv")
    municipal_source_df["Year-Week"] = pd.to_datetime(municipal_source_df["Year-Week"])
    for n in n_weeks_ahead:
        # Fresh copy per horizon: whatever the helpers below do to the frame
        # cannot carry over into the next horizon.
        municipal_df = municipal_source_df.copy()
        predicted, MAE, MSE, feature_importance_dict = train_RF(
            municipal_df,
            features=["Temperature", "Precipitation", "Humidity"],
            target="Cases",
            date_col="Year-Week",
            n_ahead=n,
            year_test_start="2023-01-01",
            year_test_end="2024-12-31",
        )
        models_utils.save_data(municipal, n, MSE, MAE, predicted, municipal_df)
        models_utils.save_feat_imp(municipal, n, feature_importance_dict)