# Charts for paper - Random Forest Regressor

## Purpose and Context

This notebook creates all of the Random Forest regressor charts and data used in the final paper.

## Setup

Import libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# `display` and `HTML` come from the public IPython.display module;
# importing them from IPython.core.display is deprecated.
from IPython.display import display, HTML
# Stretch the notebook container to the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))

import tqdm.notebook
# Register tqdm's notebook progress bar with pandas (enables progress_apply and friends).
tqdm.notebook.tqdm_notebook.pandas()

import dask
import dask.dataframe as dd

from dask.diagnostics import ProgressBar
pbar = ProgressBar()
pbar.register() # global registration

import sys
import multiprocessing as mp

# On Windows machines with 61 or more CPUs, pin dask to 61 workers.
# NOTE(review): presumably this matches the 61-worker ceiling of Windows process pools - confirm.
if (sys.platform == 'win32' and mp.cpu_count() >= 61):
    dask.config.set(num_workers = 61)

import utils
import train
import labels

### Set Styles and Colors

In [None]:
# Four-colour palette shared by every chart in this notebook.
colors = ['#e66101', '#fdb863', '#b2abd2', '#5e3c99']

# Global seaborn theme: white grid background, slightly enlarged Calibri text.
sns.set_theme(
    style = "whitegrid",
    font_scale = 1.1,
    font = 'Calibri',
)
# NOTE(review): no axes have been created at this point, so this call acts on the
# current (empty) figure and likely does not affect later charts - confirm intent.
sns.despine(left = True)

# Install the palette above as the default colour cycle.
sns.set_palette(sns.color_palette(colors))

# Default figure size and padding values for the charts.
# NOTE(review): presumably inches, going by the name `padInches` - confirm at the point of use.
figureSize = (4, 3)
padInches = 0.05

## Load Data

In [None]:
# Read the development and validation datasets through the project helper.
development = utils.LoadDataFromOutput('dataset-development')
validation = utils.LoadDataFromOutput('dataset-validation')
print(f'Developement Dataset Count: {len(development)}')
print(f'Validate Dataset Count: {len(validation)}')

# Stack both datasets into a single frame with a fresh positional index.
data = pd.concat([development, validation]).reset_index(drop = True)
print(f'Total Count: {len(data)}')
# NOTE(review): this counts every column of `development`; if the target column is
# part of the frame it is included in the number - confirm.
print(f'Number of Training Features: {len(development.columns)}')
development.head(1)

### Load Model

In [None]:
import ast

# The first row of the stored parquet file describes the model that was selected.
regressorModelUsed = pd.read_parquet('modelUsed-RandomForestRegressor.gzip.parquet').iloc[0]
# 'Thresholds' is parsed from its stored text form back into a Python literal.
regressorModelUsed['Thresholds'] = ast.literal_eval(regressorModelUsed['Thresholds'])

# Build the regressor from the stored settings and keep it alongside them.
# NOTE(review): the model is built from `data` (development + validation combined), so the
# validation rows scored later in this notebook were presumably seen during training - confirm
# this is intended.
regressorModelUsed['Model'] = train.TrainRandomForestRegressor(
    regressorModelUsed['Model Params'],
    regressorModelUsed['Trial Type'],
    regressorModelUsed['High Epsilon Weight'],
    regressorModelUsed['Thresholds'],
    data,
)

# Summarise which configuration was used, then show the fitted estimator.
print('Model Used:')
print(regressorModelUsed['Trial Type'])
if regressorModelUsed['Trial Type'] != 'Thresholds Trial':
    print(f"High Epsilon Weight: {regressorModelUsed['High Epsilon Weight']!s}")
else:
    print(f"Thresholds: {regressorModelUsed['Thresholds']!s}")
display(regressorModelUsed['Model'])

### Graph

In [None]:
def GraphResults(data, model, title, ax, threshold = 150, scale = 1000, lineColor = '#5e3c99'):
    """Draw a predicted-vs-actual scatter plot for one dataset on `ax`.

    Parameters
    ----------
    data
        Dataset handed to `train.GetXandY` to obtain the features `X` and target `y`.
    model
        Mapping-like object indexed with 'Model' (a fitted estimator exposing
        `predict` and `score`), 'Trial Type', 'High Epsilon Weight' and 'Thresholds'.
    title
        Text placed before the score in the panel title.
    ax
        Matplotlib axes to draw on.
    threshold
        Position of the vertical and horizontal reference lines, expressed in the
        plotted (already scaled) units. Defaults to 150.
    scale
        Divisor applied to both the actual and predicted values before plotting.
        Defaults to 1000.
    lineColor
        Colour of the two reference lines. Defaults to '#5e3c99'.

    Returns
    -------
    None
        The chart is drawn on `ax` as a side effect.
    """
    X, y = train.GetXandY(data)
    # The same weighting settings stored with the model are used for the score below.
    y_weights = train.ComputeWeightsForRegressor(y, model['Trial Type'], model['High Epsilon Weight'], model['Thresholds'])

    estimator = model['Model']
    predict_y = estimator.predict(X)
    # Weighted score as reported by the estimator itself.
    score = estimator.score(X, y, y_weights)

    # Actual values on the x axis, predictions on the y axis, both divided by `scale`.
    chart = sns.scatterplot(x = y / scale, y = predict_y / scale, ax = ax)
    chart.set(title = title + ' Score: ' + format(score, '.2f'))
    chart.xaxis.set_label_text('Actual ' + labels.EpsilonFull)
    chart.yaxis.set_label_text('Predicted ' + labels.EpsilonFull)

    # Reference lines at the same value on both axes.
    chart.axvline(threshold, color = lineColor)
    chart.axhline(threshold, color = lineColor)

In [None]:
# Two side-by-side panels that share both axes so the datasets are directly comparable.
fig, axes = plt.subplots(
    ncols = 2,
    figsize = (8, 4),
    sharex = True,
    sharey = True,
    constrained_layout = True,
)

# Left panel: development data; right panel: validation data.
GraphResults(development, regressorModelUsed, 'Development', axes[0])
GraphResults(validation, regressorModelUsed, 'Validation', axes[1])

# Write the combined figure to disk at print resolution.
fig.savefig(
    '../output/chart-overall-RandomForestRegressor.png',
    bbox_inches = 'tight',
    dpi = 600,
)