# Random Forest: Extreme Precipitation

This notebook builds upon code from [rf_class_palmerpenguins.ipynb](https://github.com/eabarnes1010/course_ml_ats/blob/main/code/rf_class_palmerpenguins.ipynb).

In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import pydot
import sklearn
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.tree import export_graphviz
from graphviz import Source # To plot trees

## Import input data

In [None]:
# Target class names, in label order:
#   "0" -> no extreme precip
#   "1" -> extreme precip
classes = [
    "0",
    "1",
]

# Name of the label column
labels_list = [
    "precip_classes",
]

# Names of the feature columns
features_list = [
    "slp_anom",
    "hgt_detrended_anom",
    "year",
    "month",
    "day",
]

In [None]:
# Directory for input data. The trailing slash is part of the value, so the
# relative paths appended below must NOT start with a slash (otherwise the
# resulting path contains "//").
data_dir = "../data/input_data_preprocessed/"

# Read each split's csv as a pandas DataFrame. Nothing is converted to numpy
# here: features and labels both stay DataFrames.

X_train = pd.read_csv(data_dir + "training/features_training.csv")
y_train = pd.read_csv(data_dir + "training/labels_training.csv")

X_val = pd.read_csv(data_dir + "validation/features_validation.csv")
y_val = pd.read_csv(data_dir + "validation/labels_validation.csv")

X_test = pd.read_csv(data_dir + "testing/features_testing.csv")
y_test = pd.read_csv(data_dir + "testing/labels_testing.csv")

## Inspect the training and validation splits

In [None]:
# Report the (rows, columns) shape of each training/validation table
shape_report = (
    ('Training Features Shape:', X_train),
    ('Training Labels Shape:', y_train),
    ('Validation Features Shape:', X_val),
    ('Validation Labels Shape:', y_val),
)
for caption, table in shape_report:
    print(caption, table.shape)

## Train model and make predictions

In [None]:
# Dictionary of hyperparameters
fd = {
    "tree_number": 2,     # number of trees to "average" together to create a random forest
    "tree_depth": 25,     # maximum depth allowed for each tree
    "node_split": 20,     # minimum number of training samples needed to split a node
    "leaf_samples": 1,    # minimum number of training samples required to make a leaf node
    "criterion": 'gini',  # information gain metric, 'gini' or 'entropy'
    "bootstrap": False,   # whether to perform "bagging=bootstrap aggregating" or not
    "max_samples": None,  # number of samples to grab when training each tree IF bootstrap=True, otherwise None
    "random_state": 13    # set random state for reproducibility
}

# Build the random forest from the hyperparameter dictionary
rf = RandomForestClassifier(
    n_estimators=fd["tree_number"],
    random_state=fd["random_state"],
    min_samples_split=fd["node_split"],
    min_samples_leaf=fd["leaf_samples"],
    criterion=fd["criterion"],
    max_depth=fd["tree_depth"],
    bootstrap=fd["bootstrap"],
    max_samples=fd["max_samples"],
)

# Train the model on the training data.
# y_train is a DataFrame (2-D); scikit-learn expects a 1-D target and emits a
# DataConversionWarning for a column vector. squeeze("columns") turns a
# single-column frame into a Series and leaves a multi-column frame untouched.
rf.fit(X_train, y_train.squeeze("columns"))

# Predictions on the training set itself (not on held-out data)
y_pred = rf.predict(X_train)

## Establish a baseline 
My baseline is a model that never predicts an extreme precipitation day, i.e. it always predicts class 0.

## Make confusion matrix 

In [None]:
# Sum the labels to count extreme days (this is a count only if the labels are
# coded 0/1 -- TODO confirm against the preprocessing step).
# y_train is a DataFrame, so a bare .sum() would return a Series and print as
# a Series repr; squeezing a single label column first yields a plain number.
num_extremes_train = y_train.squeeze("columns").sum()
num_extremes_pred = y_pred.sum()

print("Num extreme precip days in training data: {0}".format(num_extremes_train))
print("Num extreme precip days in predicted data: {0}".format(num_extremes_pred))

In [None]:
# Fraction of training samples whose predicted class equals the true class
acc = metrics.accuracy_score(y_train, y_pred)
print("training accuracy: ", np.around(acc*100), '%')

def confusion_matrix(predclasses, targclasses):
    """Build a confusion matrix normalised per true class, in percent.

    Parameters
    ----------
    predclasses : array-like
        Predicted class of every sample. Arrays, Series and single-column
        DataFrames are accepted; the input is flattened to 1-D.
    targclasses : array-like
        True class of every sample, with the same number of samples as
        ``predclasses``. Flattened to 1-D in the same way.

    Returns
    -------
    pandas.DataFrame
        Rows are predicted classes, columns are true classes. Each entry is
        the percentage of samples of that true class that received that
        predicted class, so every column sums to 100. A column is NaN when
        its class never occurs in ``targclasses``.

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of samples.
    """
    # Flatten so boolean masking works regardless of the container type
    pred = np.ravel(predclasses)
    targ = np.ravel(targclasses)
    if pred.shape != targ.shape:
        raise ValueError(
            "predclasses and targclasses must have the same number of samples, "
            "got {0} and {1}".format(pred.size, targ.size)
        )

    # Use the classes seen in EITHER input so a class the model never
    # predicts (or that never occurs) still gets a row and a column
    class_names = np.unique(np.concatenate([targ, pred]))

    table = []
    for pred_class in class_names:
        row = []
        for true_class in class_names:
            in_true_class = targ == true_class
            if in_true_class.any():
                row.append(100 * np.mean(pred[in_true_class] == pred_class))
            else:
                # No samples of this true class: the percentage is undefined
                row.append(np.nan)
        table.append(row)

    # Prefer the notebook-level display names when they line up one-to-one
    # with the observed classes; otherwise fall back to the raw values
    if len(classes) == len(class_names):
        class_titles = list(classes)
    else:
        class_titles = [str(name) for name in class_names]
    conf_matrix = pd.DataFrame(table, index=class_titles, columns=class_titles)
    return conf_matrix

# Argument order is (predictions, truth)
confusion_matrix(y_pred, y_train)

## Plot the tree

In [None]:
# Output location and base name for the exported Graphviz file
local_path = "../figs/"
fig_savename = "rf_extreme_precip"

# Index of the tree in the forest to visualise -- change this to see another one
tree_to_plot = 0
tree = rf[tree_to_plot]
tree_numstr = str(tree_to_plot)  # tree index becomes part of the file name

# File name has the form "<base name>_<tree index>.dot"
complete_savename = "".join([fig_savename, "_", tree_numstr, ".dot"])
out_file = "".join([local_path, complete_savename])

# Write the selected tree to disk in Graphviz .dot format
export_graphviz(
    tree,
    out_file=out_file,
    filled=True,
    proportion=False,
    leaves_parallel=False,
    class_names=classes,
    feature_names=features_list,
)

# Load the file that was just written; as the last expression it is the cell output
Source.from_file(out_file)

## Establish a baseline 
What should be used as the baseline here?

In [None]:
# I NEED A BASELINE
# TODO: implement the baseline in this cell. The notes earlier in this notebook
# propose predicting no extreme precip days at all (always class 0).

## Make predictions