# Random Forest: Extreme Precipitation

This notebook builds upon code from [rf_regress_christman.ipynb](https://github.com/eabarnes1010/course_ml_ats/blob/main/code/rf_regress_christman.ipynb).

In [None]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import export_graphviz
from sklearn.inspection import permutation_importance
import pydot
import matplotlib.pyplot as plt

## Import input data

In [None]:
# Load the preprocessed labels and features as pandas DataFrames.
# Both paths are relative to the notebook's working directory.
labels_df = pd.read_csv("../data/input_data_preprocessed/precip_classes.csv")
features_df = pd.read_csv("../data/input_data_preprocessed/slp_hgt_anoms.csv")

# Inner-join on the shared "time" column so that every remaining row has both
# a label and its features; timestamps present in only one file are dropped.
# validate="one_to_one" makes pandas raise a MergeError if a timestamp is
# repeated in either file, instead of silently duplicating rows in the
# dataset that is later used for training.
input_df = labels_df.merge(features_df, on="time", validate="one_to_one")

# Replace the "time" column with separate year / month / day columns
datetime_np = pd.to_datetime(input_df["time"].values)
input_df["year"] = datetime_np.year
input_df["month"] = datetime_np.month
input_df["day"] = datetime_np.day
input_df = input_df.drop(labels="time", axis="columns")

display(input_df)

In [None]:
# Descriptive statistics for each column (pandas' default describe() summary).
# Left as the cell's last expression so the resulting table is rendered as output.
input_df.describe()

In [None]:
# Columns of input_df that are fed to the model as predictors
features_list = ["slp_anom", "hgt_detrended_anom", "year", "month", "day"]

# Extract predictors and labels from the DataFrame as NumPy arrays
features_np = input_df[features_list].to_numpy()
labels_np = input_df["precip_classes"].to_numpy()

**TODO:** Confirm that the time index matches for both DataFrames (labels and features).

## Split dataset

In [None]:
# Partition the features/labels into a training set and a testing set.

# Tunable parameter: fraction of the dataset held out for testing;
# the remaining (1 - split_size) is used for training.
split_size = 0.25

# train_test_split arguments:
#     test_size: fraction of samples placed in the testing set
#     random_state: seed for the shuffle, fixed so the split is repeatable
(train_features, test_features,
 train_labels, test_labels) = train_test_split(
    features_np, labels_np, random_state=42, test_size=split_size
)

In [None]:
# Report the shape of each array produced by the train/test split
for shape_label, split_array in (
    ('Training Features Shape:', train_features),
    ('Training Labels Shape:', train_labels),
    ('Testing Features Shape:', test_features),
    ('Testing Labels Shape:', test_labels),
):
    print(shape_label, split_array.shape)

## Train model

In [None]:
# Tunable hyperparameters for the random forest
RAND_STATE = 24                # seed passed to the forest as random_state
number_of_trees = 10           # number of trees in the ensemble
tree_depth = None              # maximum depth of each tree (None sets no explicit limit)
node_split = 2                 # minimum number of training samples needed to split a node
leaf_samples = 1               # minimum number of training samples required to make a leaf node
criterion = 'squared_error'    # variance reduction; the MAE-based alternative is 'absolute_error'

# Build the regressor from the settings above.
# PARAMETERS:
#     n_estimators: number of trees in the forest
#     criterion: statistic used to evaluate a node split,
#                e.g. 'squared_error' or 'absolute_error'
#     max_depth: maximum depth of each tree
#     min_samples_split: minimum number of samples needed to split a node
#     min_samples_leaf: minimum number of samples needed to make a leaf
#     random_state: random seed
#     for more, see: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html#sklearn.ensemble.RandomForestRegressor
rf = RandomForestRegressor(
    n_estimators=number_of_trees,
    criterion=criterion,
    max_depth=tree_depth,
    min_samples_split=node_split,
    min_samples_leaf=leaf_samples,
    random_state=RAND_STATE,
)

# Fit the forest to the training split
rf.fit(train_features, train_labels)

## Establish a baseline 
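What should be used as the baseline here? **TODO:** choose a reference prediction (e.g., always predicting the mean of the training labels) to compare the model against.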

In [None]:
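# TODO: compute a baseline here (the prediction cell below expects
# mae_baseline_errors and mse_baseline_errors to be defined)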

## Make predictions

In [None]:
# NOTE: this cell is disabled (every line is commented out). The two baseline
# prints read mae_baseline_errors and mse_baseline_errors, which are not
# assigned in any cell shown above; define them before re-enabling this cell.

# # Use the forest's predict method on the test data
# predictions = rf.predict(test_features)

# # Use testing set to validate the performance
# # Print out the mean absolute error (MAE)
# mae_errors = abs(predictions - test_labels)
# print('Baseline error (MAE): ', round(np.mean(mae_baseline_errors), 2))
# print('Error (MAE): ', round(np.mean(mae_errors), 2))

# # See its performance. NOTE: np.sqrt(np.mean(err**2)) is the root-mean-square
# # error (RMSE), although the variable name and print labels below say "MSE".
# mse_errors = np.sqrt(np.mean((predictions - test_labels)**2))
# print('Baseline error (MSE): ', round( mse_baseline_errors, 2))
# print('Error (MSE): ', round(mse_errors, 2))