# Random Forest: Extreme Precipitation

This notebook builds upon code from [rf_class_palmerpenguins.ipynb](https://github.com/eabarnes1010/course_ml_ats/blob/main/code/rf_class_palmerpenguins.ipynb).

In [1]:
import numpy as np
import pandas as pd 
from tqdm import tqdm
import matplotlib.pyplot as plt
import pydot
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.tree import export_graphviz
from graphviz import Source # To plot trees

# Import helper functions 
import sys
sys.path.insert(0, '../../utils')
from rf_utils import (
    confusion_matrix, 
    style_cm, 
    confusion_matrix_key, 
    rf_metrics
)

## Import input data

In [2]:
# Class 0: no extreme precip 
# Class 1: extreme precip
classes = [0,1]

# Columns to use for labels vs. features 
labels = "precip_classes"
features_list = ["slp_anom","hgt_detrended_anom"]

In [3]:
# Directory for input data 
data_dir = "../data/input_data_preprocessed/"

# Read csv as pandas DataFrame object
X_train_df = pd.read_csv(data_dir+"training/training_features.csv", index_col=False)
y_train_df = pd.read_csv(data_dir+"training/training_labels.csv", index_col=False)

X_val_df = pd.read_csv(data_dir+"/validation/validation_features.csv", index_col=False)
y_val_df = pd.read_csv(data_dir+"/validation/validation_labels.csv", index_col=False)

X_test_df = pd.read_csv(data_dir+"/testing/testing_features.csv", index_col=False)
y_test_df = pd.read_csv(data_dir+"/testing/testing_labels.csv", index_col=False)

# Confirm that the time index is equal 
for X,y in [(X_train_df,y_train_df),(X_val_df,y_val_df),(X_test_df,y_test_df)]:
    equal_bool = np.array_equal(X.time.values, y.time.values)
    if equal_bool is False: 
        print("Time indices are not equal") 
        print(X.time.values)
        print(y.time.values)

# Convert to numpy 
X_train = X_train_df[features_list].values
y_train = y_train_df[labels].values

X_val = X_val_df[features_list].values
y_val = y_val_df[labels].values

X_test = X_test_df[features_list].values
y_test = y_test_df[labels].values

In [4]:
print('Training Features Shape:', X_train.shape)
print('Training Labels Shape:', y_train.shape)
print('Validation Features Shape:', X_val.shape)
print('Validation Labels Shape:', y_val.shape)

Training Features Shape: (9131, 2)
Training Labels Shape: (9131,)
Validation Features Shape: (3287, 2)
Validation Labels Shape: (3287,)


## Train model and make predictions

### Optimize based on recall

In [5]:
vals = []
for tree_num in tqdm(np.arange(10,25,1)): 
    for tree_depth in np.arange(3,10,1):
        for node_split in np.arange(10,30,1): 

            # Build the random forest 
            rf = RandomForestClassifier(
                n_estimators = tree_num,   
                random_state = 13,    
                min_samples_split = node_split,
                min_samples_leaf = 1,
                criterion = "gini",
                max_depth = tree_depth,
                bootstrap = False,
                max_samples = None, 
                class_weight = {0:0.05, 1:0.95} 
            
            )
            # Train the model on training data
            rf.fit(X_train, y_train) # Runs the forest classifier
            y_pred = rf.predict(X_train)
            acc, recall, prec = rf_metrics(y_train, y_pred, print_console=False)
            vals.append([tree_num, tree_depth, node_split, acc, recall, prec])

100%|███████████████████████████████████████████| 15/15 [05:55<00:00, 23.72s/it]


In [6]:
# Put output into a dataframe 
df_op = pd.DataFrame(vals, columns=["Tree number","Tree depth","Node split","Accuracy","Recall","Precision"])

# Get values where recall is maximized
optimized_vals = df_op.loc[df_op["Recall"].idxmax()][["Tree number","Tree depth","Node split"]]
tree_num, tree_depth, node_split = [int(x) for x in optimized_vals.values]
print("Number of trees: {0}\nTree depth: {1}\nNode split: {2}".format(tree_num, tree_depth, node_split))

Number of trees: 17
Tree depth: 9
Node split: 28


## Build my model with my optimized hyperparameters 

In [7]:
hyperparameters  = {
    "tree_number": tree_num,    # number of trees to "average" together to create a random forest
    "tree_depth": tree_depth,      # maximum depth allowed for each tree
    "node_split": node_split,     # minimum number of training samples needed to split a node
    "leaf_samples": 1,    # minimum number of training samples required to make a leaf node
    "criterion": 'gini',  # information gain metric, 'gini' or 'entropy'
    "bootstrap": False,   # whether to perform "bagging=bootstrap aggregating" or not
    "max_samples": None,  # number of samples to grab when training each tree IF bootstrap=True, otherwise None 
    "random_state": 13,    # set random state for reproducibility
    "class_weight": "{0:0.05, 1:0.95}" 
}

In [8]:
forest = RandomForestClassifier(
     n_estimators = hyperparameters["tree_number"],
     random_state = hyperparameters["random_state"],
     min_samples_split = hyperparameters["node_split"],
     min_samples_leaf = hyperparameters["leaf_samples"],
     criterion = hyperparameters["criterion"],
     max_depth = hyperparameters["tree_depth"],
     bootstrap = hyperparameters["bootstrap"],
     max_samples = hyperparameters["max_samples"], 
     class_weight = eval(hyperparameters["class_weight"])
)
rf.fit(X_train, y_train) # Runs the forest classifier

## Make confusion matrix key 
Generate a helpful dataframe key for understanding a confusion matrix 

In [9]:
cm_key = style_cm(confusion_matrix_key())
cm_key

Unnamed: 0,Negative,Positive
Negative,True Negative,False Positive
Positive,False Negative,True Positive


### Try it with the training data 

In [10]:
y_pred_train = rf.predict(X_train)
acc, recall, prec = rf_metrics(y_train, y_pred_train)
conf_matrix_df = confusion_matrix(y_train, y_pred_train)
style_cm(conf_matrix_df)

Accuracy:  60.0 %
Recall:  88.0 %
Precision:  10.0 %


Unnamed: 0,0,1
0,98.860287,89.782286
1,1.139713,10.217714


### Try it with the validation data 

In [11]:
y_pred_val = rf.predict(X_val)
acc, recall, prec = rf_metrics(y_val, y_pred_val)
conf_matrix_df = confusion_matrix(y_val, y_pred_val)
style_cm(conf_matrix_df)

Accuracy:  57.0 %
Recall:  54.0 %
Precision:  6.0 %


Unnamed: 0,0,1
0,95.844576,93.723849
1,4.155424,6.276151


### Try it with the baseline

In [12]:
y_baseline = np.zeros(y_train.shape)
acc, recall, prec = rf_metrics(y_train, y_baseline)
conf_matrix_df = confusion_matrix(y_train, y_baseline)
style_cm(conf_matrix_df)

Accuracy:  95.0 %
Recall:  0.0 %
Precision:  0.0 %


  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,0,1
0,94.841748,0
1,5.158252,0


## Try with the testing data

In [13]:
y_pred_test = rf.predict(X_test)
acc, recall, prec = rf_metrics(y_test, y_pred_test)
conf_matrix_df = confusion_matrix(y_test, y_pred_test)
style_cm(conf_matrix_df)

Accuracy:  59.0 %
Recall:  58.0 %
Precision:  6.0 %


Unnamed: 0,0,1
0,96.777717,93.778111
1,3.222283,6.221889


## Generate figures for write up 

In [14]:
def make_pretty(styler):
    styler.set_caption("Hyperparameters")
    return styler

hp_df = pd.DataFrame(hyperparameters, index={"value":[0],}).T
hp_df.style.pipe(make_pretty)

Unnamed: 0,value
tree_number,17
tree_depth,9
node_split,28
leaf_samples,1
criterion,gini
bootstrap,False
max_samples,
random_state,13
class_weight,"{0:0.05, 1:0.95}"
