# In [2]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import numpy as np
import pandas as pd
from joblib import dump
from math import sqrt
import os


def perform_random_forest_regression(
    X,
    y,
    *,
    results_path="random_forest_results.txt",
    test_size=0.2,
    n_estimators=100,
    random_state=42,
):
    """Train a random forest regressor and write hold-out metrics to a file.

    The data is split into train and test partitions first; missing-value
    imputation (column mean) and standardization are then fitted on the
    training partition only and applied to the test partition, so no
    statistics from the test rows leak into preprocessing.

    Args:
        X: Feature matrix (e.g. a DataFrame or 2-D array); NaN marks missing
            values.
        y: Target values aligned with the rows of ``X``.
        results_path: File that receives the MAE, MSE, RMSE and R^2 of the
            test partition. The file is overwritten.
        test_size: Fraction of rows held out for evaluation.
        n_estimators: Number of trees in the forest.
        random_state: Seed used for both the split and the forest.

    Returns:
        The fitted ``RandomForestRegressor``. Note that the fitted imputer
        and scaler are not returned: the model was trained on imputed,
        standardized features, so inputs passed to ``predict`` must be
        preprocessed the same way.
    """
    # Split BEFORE fitting any preprocessing step; fitting the imputer on the
    # full matrix would let test rows influence the imputation means.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
    X_train = imputer.fit_transform(X_train)
    X_test = imputer.transform(X_test)

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    rf_model = RandomForestRegressor(
        n_estimators=n_estimators, random_state=random_state
    )
    rf_model.fit(X_train, y_train)
    y_pred = rf_model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    # Explicit encoding keeps the report identical across platforms.
    with open(results_path, "w", encoding="utf-8") as file:
        file.write(
            f"Mean Absolute Error: {mae:.2f}\n"
            f"Mean Squared Error: {mse:.2f}\n"
            f"Root Mean Squared Error: {rmse:.4f}\n"
            f"R^2 Score: {r2:.4f}\n"
        )

    return rf_model


# Script section: load the dataset, train the model, and persist the
# artifacts (metrics file written by the training function, model via joblib).
csv_file = os.path.join("..", "data", "climate_change_AQI.csv")
target_var = "Ozone"
features = [
    "Longitude",
    "Latitude",
    "Sine",
    "Cosine",
    "Land_Surface_Temp(C)",
    "Sea_Ice_Extent(10^6 sq km)",
    "Sea_Surface_Temp(C)",
]
df = pd.read_csv(csv_file)

X = df[features]
# Use the configured target name rather than repeating the literal, so that
# changing `target_var` actually changes the column being predicted.
y = df[target_var]

rf_model = perform_random_forest_regression(X, y)
dump(rf_model, "random_forest_model.joblib")
print("Random forest model and results saved")