In [1]:
import numpy as np

import matplotlib
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [2]:
def train_df(oaqm, cond_raw, pred):
    """
    Feature engineers the training dataset from sensor data. Merges weather data, creates cyclical
    time features, and removes features not being trained on.
    oaqm: oaqm sensor data with a DatetimeIndex; must contain the columns 'temperature',
        'pressure', 'humidity', 'dB', 'mslPressure' and the target column named by `pred`
    cond_raw: weather data with a DatetimeIndex; must contain the columns 'rain', 'wdsp',
        'wddir', 'sun', 'vis', 'clht', 'clamt'
    pred: name of the target variable column in oaqm
    Returns: tuple (x, y) where x is the feature DataFrame (target column removed) and y is the
        target Series. Neither input frame is modified.
    """
    # currently handling missing target (e.g. PM2.5) data by just removing those rows
    sensor_columns = ['temperature', 'pressure', 'humidity', 'dB', 'mslPressure', pred]
    # .copy() so the columns added below are written to an independent frame rather than
    # to a slice of the caller's data (avoids SettingWithCopyWarning)
    data = oaqm.dropna(axis=0, subset=[pred])[sensor_columns].copy()

    cond = cond_raw[['rain', 'wdsp', 'wddir', 'sun', 'vis', 'clht', 'clamt']]

    # Creating the cyclical features: hour-of-day and day-of-week are mapped onto a circle
    # so that e.g. 23:00 and 00:00 end up close together
    hour_angle = 2 * np.pi * data.index.hour / 24
    day_angle = 2 * np.pi * data.index.dayofweek / 7
    data['hour_sin'] = np.sin(hour_angle)
    data['hour_cos'] = np.cos(hour_angle)
    data['day_of_week_sin'] = np.sin(day_angle)
    data['day_of_week_cos'] = np.cos(day_angle)

    # Keep only timestamps present in both the sensor and the weather data
    merged = pd.merge(data, cond, how='inner', left_index=True, right_index=True)

    # interpolate missing data
    columns_with_nans = merged.columns[merged.isnull().any()].tolist()
    for column in columns_with_nans:
        # Time-weighted interpolation first; leading/trailing NaNs that interpolation cannot
        # reach are then forward/backward filled. The result is assigned back explicitly:
        # an in-place fillna on `merged[column]` acts on a temporary object and is not
        # guaranteed to update `merged` (it never does under pandas Copy-on-Write).
        merged[column] = merged[column].interpolate(method='time').ffill().bfill()

    y = merged[pred]
    x = merged.drop(columns=pred)

    return x, y

In [3]:
def rf_train(X_train, y_train, n_estimators=100, max_features=4):
    """
    Fits a random forest regressor on the training data.
    X_train: training features
    y_train: training target
    n_estimators: number of trees in the forest
    max_features: max_features setting handed to the forest
    Returns: the fitted RandomForestRegressor (built with oob_score=True, so the out-of-bag
        score is available on the returned model)
    """
    # Fixed random_state so repeated fits with the same inputs give the same forest
    forest_settings = {
        'n_estimators': n_estimators,
        'max_features': max_features,
        'random_state': 42,
        'oob_score': True,
    }
    model = RandomForestRegressor(**forest_settings)
    model.fit(X_train, y_train)

    return model

In [4]:
def rf_parameter_tune(X_train, y_train, n_estimators_options=None, max_features_options=None):
    """
    Tunes the random forest hyperparameters with a grid search scored by the out-of-bag (OOB)
    score. A model is fitted with rf_train for every (n_estimators, max_features) combination;
    the combination with the highest OOB score wins (the first one tried wins ties).
    X_train: training features
    y_train: training target
    n_estimators_options: iterable of tree counts to try; defaults to [100, 200, 300, 500, 1000]
    max_features_options: iterable of max_features values to try; defaults to range(2, 8)
    Returns: best n_estimators, best max_features, best OOB score
    Raises: ValueError if either set of options is empty
    """
    if n_estimators_options is None:
        n_estimators_options = [100, 200, 300, 500, 1000]
    if max_features_options is None:
        max_features_options = range(2, 8)
    # Materialise both so one-shot iterables (generators) survive the nested loop
    n_estimators_options = list(n_estimators_options)
    max_features_options = list(max_features_options)
    if not n_estimators_options or not max_features_options:
        raise ValueError("n_estimators_options and max_features_options must not be empty")
    # TODO: also tune the maximum tree depth?

    # Store the performance results
    performance_results = {}

    # Loop over the parameters
    for n_estimators in n_estimators_options:
        for max_features in max_features_options:
            rf = rf_train(X_train, y_train, n_estimators=n_estimators, max_features=max_features)
            # Store the OOB score
            performance_results[(n_estimators, max_features)] = rf.oob_score_

    # Find the best parameters based on OOB score
    best_params = max(performance_results, key=performance_results.get)
    best_oob_score = performance_results[best_params]

    print("Best Parameters (n_estimators, max_features):", best_params)
    print("Best OOB Score:", best_oob_score)

    return best_params[0], best_params[1], best_oob_score

In [7]:
def rf_train_all(oaqms, pred, cond_raw=None):
    """
    Trains, hyperparameter tunes, and predicts for every sensor in oaqms.
    oaqms: dictionary mapping sensor name -> that sensor's oaqm data
    pred: name of the target variable column
    cond_raw: weather data handed to train_df. When omitted, the notebook-level variable
        `weather` is used, so it must already be defined in that case.
    Returns: dictionary of sensors with a nested dictionary of attributes: X_train, y_train,
    X_test, y_test, n_estimator, max_feature, obb_score, rf, feature_scores, y_preds
    """
    if cond_raw is None:
        # Backward-compatible fallback: earlier callers relied on a global `weather` frame.
        # Prefer passing cond_raw explicitly so the function does not depend on hidden state.
        cond_raw = weather

    scrf = dict()

    for sensor in oaqms:
        data = oaqms[sensor]
        X, y = train_df(data, cond_raw, pred)

        # Splitting the dataset (random 80/20 split with a fixed seed)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        print(f"{sensor}\n")
        # hyperparameter tuning
        n_est, max_feature, obb = rf_parameter_tune(X_train, y_train)
        # fitting the model with the best parameters found
        rf = rf_train(X_train, y_train, n_est, max_feature)

        # Making predictions on the test set
        predictions = rf.predict(X_test)

        # storing into dict ('obb_score' holds the OOB score; the key spelling is kept
        # because downstream code may look it up by this name)
        scrf[sensor] = {
            'X_train': X_train,
            'y_train': y_train,
            'X_test': X_test,
            'y_test': y_test,
            'n_estimator': n_est,
            'max_feature': max_feature,
            'obb_score': obb,
            'rf': rf,
            'feature_scores': rf.feature_importances_,
            'y_preds': predictions,
        }

    return scrf

In [6]:
def evaluate_rf(scrf):
    """
    Evaluates the test prediction scores for all sensors in the scrf dictionary.
    scrf: A dictionary with sensors as keys and dictionaries containing 'y_preds' and 'y_test' as values.
    Returns: A dictionary with sensors as keys and, for each sensor, a dictionary with the keys
        'MAE', 'MSE', 'RMSE' and 'R2'. An empty input gives an empty dictionary.
    """
    evaluation_results = {}

    for sensor, data in scrf.items():
        # Retrieve predictions and actual values
        y_preds = data['y_preds']
        y_test = data['y_test']

        # Calculate evaluation metrics
        mae = mean_absolute_error(y_test, y_preds)
        mse = mean_squared_error(y_test, y_preds)
        # RMSE is derived from the MSE instead of calling
        # mean_squared_error(..., squared=False): the `squared` argument is deprecated in
        # scikit-learn and removed in recent releases. For a single target (as produced by
        # train_df) the two are identical.
        rmse = np.sqrt(mse)
        r2 = r2_score(y_test, y_preds)

        # Store the results
        evaluation_results[sensor] = {
            'MAE': mae,
            'MSE': mse,
            'RMSE': rmse,
            'R2': r2
        }

    return evaluation_results

In [None]:
def plot_feature_importances(sensorsrf):
    """
    Plots one horizontal bar chart of feature importances per sensor, arranged in a grid.
    sensorsrf: dictionary of random forest results of every sensor; each value must contain
        'feature_scores' (the importances) and 'X_train' (whose columns name the features)
    Returns: None. Does nothing when sensorsrf is empty.
    """
    n_sensors = len(sensorsrf)
    if n_sensors == 0:
        # Nothing to draw
        return

    # Setup the subplot grid: three columns, at least two rows (the layout used for up to
    # six sensors), with extra rows added when there are more sensors than that
    num_cols = 3
    num_rows = max(2, -(-n_sensors // num_cols))  # ceiling division without importing math
    fig, axes = plt.subplots(num_rows, num_cols, figsize=(15, 5 * num_rows))
    axes = axes.flatten()  # Flatten the array of axes for easy iterating

    # Iterate over each sensor and create a subplot for its feature importances
    for ax, (sensor, data) in zip(axes, sensorsrf.items()):
        # Extract feature importances and sort them (ascending, so the most important
        # feature ends up at the top of the horizontal bar chart)
        importances = pd.Series(data['feature_scores'], index=data['X_train'].columns)
        importances_sorted = importances.sort_values()

        # Plot bar chart on the appropriate subplot
        ax.barh(importances_sorted.index, importances_sorted, color='skyblue')
        ax.set_title(f'{sensor}')
        ax.set_xlabel('Relative Importance')
        ax.set_ylabel('Features')

    # Hide any unused subplots when the grid has more cells than sensors
    for unused_ax in axes[n_sensors:]:
        fig.delaxes(unused_ax)

    # Adjust layout for better readability
    plt.tight_layout()
    plt.show()