# 3. Modeling - Random Forest

In [None]:
#imports
import pandas as pd
import os
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
from datetime import datetime
import csv
import itertools

#folders
data_folder = "data"

#machine learning
from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn.ensemble import RandomForestRegressor as rfr

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

from sklearn.model_selection import TimeSeriesSplit as tsp
from sklearn.model_selection import GridSearchCV as gscv

#warnings
import warnings
warnings.filterwarnings("ignore")

In [None]:
#switch for the slow hyper parameter searches further down; when False the
#notebook only reads the result files written by an earlier run
run_optim = False #runtime: n min

In [None]:
#plot styles: colour sequences handed to plotly express
plt_style_c = px.colors.sequential.haline #complex
plt_style_s = px.colors.diverging.Portland #simple

#default plot size in pixels
#NOTE(review): nothing in the visible notebook reads `size`; scale_show
#carries the same numbers as its own defaults
size = {
    "width" : 1500 ,
    "height" : 750 ,
}

#function for plotting
def scale_show(fig, width = 1500, height = 750):
    """Apply the notebook's font sizes and pixel size to `fig`, then show it.

    fig     : object offering `update_layout(**kwargs)` and `show()`
    width   : figure width passed to the layout
    height  : figure height passed to the layout

    Returns nothing; the figure is modified in place.
    """

    #shared typography for every figure of the notebook
    font_layout = {
        "font"              : dict(size=16),
        "title_font"        : dict(size=20),
        "xaxis_title_font"  : dict(size=18),
        "yaxis_title_font"  : dict(size=18),
    }
    fig.update_layout(**font_layout)

    #canvas size
    size_layout = {"width" : width, "height" : height}
    fig.update_layout(**size_layout)

    #render
    fig.show()

In [None]:
#load the main data frame from the data folder and show the first rows
#transposed, so that every column becomes a line of the preview
df = pd.read_csv(os.path.join(data_folder, "df_main.csv"))
df.head().T

In [None]:
#sources for reasoning

#data splitting: https://towardsdatascience.com/time-series-from-scratch-train-test-splits-and-evaluation-metrics-4fd654de1b37

#data standardizing (train, test, valid): 
#   https://stats.stackexchange.com/questions/202287/why-standardization-of-the-testing-set-has-to-be-performed-with-the-mean-and-sd
#   https://medium.com/analytics-vidhya/why-it-makes-a-difference-how-to-standardize-training-and-test-set-e95bf350bed3
#   https://stats.stackexchange.com/questions/248543/standardize-training-and-validation-data
#   https://www.kaggle.com/questions-and-answers/159183

#formula for standardizing: https://www.statisticshowto.com/standardized-values-examples/


In [None]:
class Base(): #parent
    """Data preparation, scoring and result storage shared by the models.

    On construction the frame is
      1. stripped of the columns that would leak future information,
      2. widened with lagged copies of every input column (shifting window),
      3. split into train / valid / test sets,
      4. standardized with the mean and std of the training set only.

    Child classes provide the model; this class offers `get_model_score`
    (bound to the classification or regression scorer depending on
    `model_metric`), plotting helpers and CSV based result storage.
    """

    def __init__(self, df : object, y_col : list, data_folder : str, results_file : str, model_metric : str, n_jobs : int = 1, window : int = 30):
        """
        df            : pandas data frame holding features and targets
        y_col         : names of the target columns (scorers expect two: t1, t2)
        data_folder   : folder in which the results file lives
        results_file  : file name of the csv used by save_result / get_results
        model_metric  : "c" = classification, "r" = regression
        n_jobs        : stored for the model of the child class
        window        : number of lagged copies created per input column
        """

        #save raw df (untouched copy, later tagged with the set of every row)
        self.df_raw         = df.copy()

        #drop forbidden cols on a private copy, the caller's frame stays as is
        df = Base.__drop_forbidden_cols(df.copy(), y_col)

        #set dataframe for referencing (becomes the windowed df)
        self.df             = df.copy()

        #set x_col and y_col
        self.y_col          = y_col
        self.x_col          = df.drop(labels = y_col, axis = 1).columns.to_list()

        #misc params
        self.random_state   = 42
        self.n_jobs         = n_jobs
        self.data_folder    = data_folder
        self.results_file   = os.path.join(data_folder,results_file)
        self.model_metric   = model_metric

        #no model yet, lets the scorers report a missing model properly
        self.model          = None

        #windowing parameters
        self.x_window       = window #number of shifting window input features

        self.__setup()

        return

    def __setup(self):
        """Run the data preparation and bind the scorer."""

        #order was chosen to minimize data loss, at the cost of more needed processing power

        #data preparation
        self.__windowing()
        self.__split_data()
        self.__standardize_data()

        #setup of metrics and results
        self.__set_assesment()

        return

    @staticmethod
    def __drop_forbidden_cols(df, y_col):
        """Drop (in place) every leaking column that is not a target; returns df."""

        forbidden_cols = ['date','t2m_t1', 't2m_t2', 't2m_t1_mean', 't2m_t2_mean', 't2m_t1_cat', 't2m_t2_cat']

        #keep the targets and skip columns a reduced frame does not contain,
        #dropping a missing label would raise
        forbidden_cols = [col for col in forbidden_cols if (col not in y_col) and (col in df.columns)]

        #drop forbidden cols, to prevent adding future information to the time series
        print(f"Removed forbidden cols:\n{forbidden_cols}")
        df.drop(labels = forbidden_cols, axis = 1, inplace = True)

        return df

    def __windowing(self):
        """creates the windowed data frame"""

        self.x_col_windowed = self.x_col.copy() #copy, because lists are passed by reference

        #input features: x, every column shifted by 1 .. x_window rows
        shifted = {}
        for i in range(1, self.x_window + 1):
            for x_col in self.x_col:

                x_col_i             = f"{x_col}_-{i}"
                #shift the frame of this object, not a notebook global
                shifted[x_col_i]    = self.df[x_col].shift(i)

                self.x_col_windowed.append(x_col_i)

        #attach all lagged columns at once (avoids a fragmented frame)
        if shifted:
            self.df = pd.concat([self.df, pd.DataFrame(shifted, index = self.df.index)], axis = 1)

        #clean na rows, which were caused by the shifts
        self.df.dropna(inplace = True)
        print(f"\nApplying shifitng window:\nx_window: -{self.x_window}")

        return

    def __split_data(self):
        """Split the windowed frame into valid | test | train | valid | test blocks."""

        #labels of the surviving rows, needed to tag df_raw after the reset
        original_labels = self.df.index.to_numpy()

        #reset index for splitting data
        self.df.reset_index(inplace = True, drop = True)
        length = self.df.shape[0]

        #setting split value fractions
        valid_frac_0      = 0.1
        test_frac_1       = 0.05
        train_frac_2      = 0.7
        valid_frac_3      = 0.1
        test_frac_4       = 0.05

        #get end positions of the five consecutive blocks
        index_end_list = []
        cum_frac = 0

        for frac in [valid_frac_0, test_frac_1, train_frac_2, valid_frac_3, test_frac_4]:
            cum_frac += frac
            index_end_list.append(round(length * cum_frac))

        #row positions per set
        positions   = np.arange(length)
        train_i     = positions[index_end_list[1] : index_end_list[2]]
        valid_i     = np.concatenate([positions[ : index_end_list[0]],                  positions[index_end_list[2] : index_end_list[3]]])
        test_i      = np.concatenate([positions[index_end_list[0] : index_end_list[1]], positions[index_end_list[3] : index_end_list[4]]])

        #create train df (copies, they get standardized in place later)
        self.df_train_x = self.df[self.x_col_windowed].iloc[train_i].copy()
        self.df_train_y = self.df[self.y_col].iloc[train_i].copy()

        #create valid df
        self.df_valid_x = self.df[self.x_col_windowed].iloc[valid_i].copy()
        self.df_valid_y = self.df[self.y_col].iloc[valid_i].copy()

        #create test df
        self.df_test_x = self.df[self.x_col_windowed].iloc[test_i].copy()
        self.df_test_y = self.df[self.y_col].iloc[test_i].copy()

        #check
        print("\nSplitting data:")
        for df, df_type in zip (
            [self.df_train_y,self.df_valid_y, self.df_test_y],
            "train,valid,test".split(",")
            ):

            print(f"{df_type} size:\t{round(df.shape[0] / length,2)}\t{df.shape[0]}")

        #set data for plotting in raw df; positions are translated back to the
        #original labels, rows lost to the windowing keep None
        self.df_raw["set"] = None
        for index_items, set_type in zip([train_i, valid_i, test_i],["train", "valid", "test"]):
            self.df_raw.loc[original_labels[index_items], "set"] = set_type

        return

    def plot_set_distribution(self, plotter, style, plt_style):
        """Plot how the rows of the raw frame are spread over the sets.

        plotter   : callable receiving the figure (e.g. scale_show)
        style     : "histogram" (classification only) or "scatter"
        plt_style : colour sequence for the plot

        Raises ValueError for any other style / metric combination.
        """

        if (style == "histogram") and (self.model_metric == "c"): #only plottable with classification model

            fig = px.histogram(
                data_frame = self.df_raw,
                x = "set",
                color = "t2m_t2_cat",
                histfunc = "count",

                barmode = "group",
                title = "Categorical distribution of sets",
                color_discrete_sequence = plt_style,
            )

        elif (style == "scatter"):

            fig = px.scatter(
                data_frame = self.df_raw,
                x = "date",
                y = "t2m",
                color = "set",

                title = "Trend distribution of sets",
                color_discrete_sequence = plt_style,
            )

        else:
            raise ValueError(f"style '{style}' is not available for model_metric '{self.model_metric}'")

        plotter(fig)

    def __split_data_deprecated(self):
        """Former split: one consecutive train, valid and test block (unused)."""

        #df length
        length = self.df.shape[0]

        #setting split values
        valid_frac     = 0.2
        test_frac      = 0.1

        #get indexes
        train_end       = round(length * (1 - (valid_frac + test_frac)))
        valid_end       = round(length * (1 - (test_frac)))
        test_end        = round(length * (1))

        #create train df
        self.df_train_x = self.df[self.x_col_windowed].iloc[:train_end]
        self.df_train_y = self.df[self.y_col].iloc[:train_end]

        #create valid df
        self.df_valid_x = self.df[self.x_col_windowed].iloc[train_end:valid_end]
        self.df_valid_y = self.df[self.y_col].iloc[train_end:valid_end]

        #create test df
        self.df_test_x = self.df[self.x_col_windowed].iloc[valid_end:test_end]
        self.df_test_y = self.df[self.y_col].iloc[valid_end:test_end]

        #check
        print("\nSplitting data:")
        for df, df_type in zip (
            [self.df_train_y,self.df_valid_y, self.df_test_y],
            "train,valid,test".split(",")
            ):

            print(f"{df_type} size:\t{round(df.shape[0] / length,2)}\t{df.shape[0]}")

        return

    @staticmethod
    def __safe_std(std):
        """Return std, or 1.0 when it is zero / NaN (constant or single row column)."""

        if pd.isna(std) or std == 0:
            return 1.0
        return std

    def __standardize_data(self):
        """Standardize all non binary columns with the statistics of the train set."""

        label_cat = [0,1]
        self.standardizing_values = {
            "x" : {},
            "y" : {},
            #    col1 : {"mean" : value, "std"  : value},
            #    col2 : {"mean" : value, "std"  : value},
            #}
            #"y" : ...
        }

        print("\nStandardizing values:")
        for col in self.df.columns:

            if sorted(self.df[col].unique()) == label_cat: #skip categorical values
                continue

            #mean and std always come from the train set, to keep valid and test unseen
            if col in self.x_col_windowed:
                col_type, train = "x", self.df_train_x

            elif col in self.y_col:
                col_type, train = "y", self.df_train_y

            else:
                continue

            self.standardizing_values[col_type][col] = {
                "mean"  : train[col].mean(),
                "std"   : Base.__safe_std(train[col].std()),
            }

        #apply values
        for df, col_type in zip([self.df_train_x, self.df_valid_x, self.df_test_x, self.df_train_y, self.df_valid_y, self.df_test_y], ["x","x","x","y","y","y"]):
            for col, values in self.standardizing_values[col_type].items():
                df[col] = (df[col] - values["mean"]) / values["std"] #standardization

        #check sum (train means should add up to about 0)
        print(f"Checksum train x: {self.df_train_x[list(self.standardizing_values['x'].keys())].mean().round(2).sum()}")
        print(f"Checksum train y: {self.df_train_y[list(self.standardizing_values['y'].keys())].mean().round(2).sum()}")

        return

    def unstandardize_data(self):
        """Reverse the standardization of all six set frames in place.

        Meant to be called once; a second call applies the reversal again.
        """

        #apply values
        for df, col_type in zip([self.df_train_x, self.df_valid_x, self.df_test_x, self.df_train_y, self.df_valid_y, self.df_test_y], ["x","x","x","y","y","y"]):
            for col, values in self.standardizing_values[col_type].items():
                df[col] = df[col] * values["std"] + values["mean"] #reversed standardization

        return

    def __unstanardize_y(self, y_t1, y_t2):
        """Map standardized values of the two targets back to their original scale."""

        values_t1 = self.standardizing_values["y"][self.y_col[0]]
        values_t2 = self.standardizing_values["y"][self.y_col[1]]

        #every target is rescaled with its own std and mean
        y_t1_unst  = y_t1 * values_t1["std"] + values_t1["mean"]
        y_t2_unst  = y_t2 * values_t2["std"] + values_t2["mean"]

        return y_t1_unst, y_t2_unst

    def __set_assesment(self):
        """Bind get_model_score to the scorer matching model_metric."""

        if self.model_metric == "c":
            self.get_model_score = self.__get_model_score_c

        elif self.model_metric == "r":
            self.get_model_score = self.__get_model_score_r

    def __resolve_model(self, model):
        """Return (model to score, True if it is the model stored on the object)."""

        if model is not None:
            return model, False

        return getattr(self, "model", None), True

    def __predict_sets(self, model, get_test_score):
        """Yield (set name, predictions, true targets); test only when requested."""

        sets = [
            ("train", self.df_train_x, self.df_train_y),
            ("valid", self.df_valid_x, self.df_valid_y),
        ]

        if get_test_score:
            sets.append(("test", self.df_test_x, self.df_test_y))

        for raw_key, x, y in sets:
            yield raw_key, model.predict(x), y

    @staticmethod
    def __print_score(score):
        """Print all float entries of a score dict."""

        for key, value in score.items():
            if isinstance(value, float):
                print(f"{key} :\t\t{value}")

    def __get_model_score_c(self, model = None, get_test_score = False): #used, when model_metric == "c"
        """Accuracy (per target and combined) of a classifier on the sets.

        model          : model to score; None scores self.model, stores the
                         result in self.score, prints it and adds the
                         confusion matrices
        get_test_score : also score the test set

        Returns the score dict, or None when no model is available.
        """

        mat_labels = [0,1]

        #a passed model comes from run_optim; only a single stored model gets confusion matrices
        model, get_conf_mat = self.__resolve_model(model)
        if model is None:
            print("No model has been set. Create a model first or pass one as a param")
            return

        score = {}

        #separate t1 and t2 for individual scoring
        for raw_key, y_pred, y in self.__predict_sets(model, get_test_score):

            #split
            y_pred_t1 = y_pred[:,0]
            y_pred_t2 = y_pred[:,1]

            y_t1 = y[self.y_col[0]]
            y_t2 = y[self.y_col[1]]

            #get acc
            score[f"{raw_key}_accuracy_t1"]     = round(accuracy_score(y_true = y_t1, y_pred = y_pred_t1),3)
            score[f"{raw_key}_accuracy_t2"]     = round(accuracy_score(y_true = y_t2, y_pred = y_pred_t2),3)
            score[f"{raw_key}_accuracy"]        = round(accuracy_score(y_true = y, y_pred = y_pred),3)

            #get conf mat
            if get_conf_mat:
                score[f"{raw_key}_mat_t1"]     = confusion_matrix(y_true = y_t1, y_pred = y_pred_t1, labels = mat_labels)
                score[f"{raw_key}_mat_t2"]     = confusion_matrix(y_true = y_t2, y_pred = y_pred_t2, labels = mat_labels)
                #a combined matrix is not possible, multi labels are not supported

        #keep and show the metrics of the stored model
        if get_conf_mat:
            self.score = score
            Base.__print_score(score)

        return score

    def __get_model_score_r(self, model = None, get_test_score = False, unstandardize_score = False): #used, when model_metric == "r"
        """R^2 and RMSE (per target and combined) of a regressor on the sets.

        model               : model to score; None scores self.model, stores
                              the result in self.score and prints it
        get_test_score      : also score the test set
        unstandardize_score : score on the original scale of the targets; the
                              stored set frames are left untouched

        Returns the score dict, or None when no model is available.
        """

        #a passed model comes from run_optim, the stored model is used otherwise
        model, set_score = self.__resolve_model(model)
        if model is None:
            print("No model has been set. Create a model first or pass one as a param")
            return

        score = {}

        #separate t1 and t2 for individual scoring
        for raw_key, y_pred, y in self.__predict_sets(model, get_test_score):

            #split
            y_pred_t1 = y_pred[:,0]
            y_pred_t2 = y_pred[:,1]

            y_t1 = y[self.y_col[0]]
            y_t2 = y[self.y_col[1]]

            #unstandardize on fresh objects, writing back into y would change
            #the stored targets and compound with every further call
            if unstandardize_score:
                y_t1, y_t2              = self.__unstanardize_y(y_t1 = y_t1, y_t2 = y_t2)
                y_pred_t1, y_pred_t2    = self.__unstanardize_y(y_t1 = y_pred_t1, y_t2 = y_pred_t2)

                y_pred  = np.column_stack([y_pred_t1, y_pred_t2])
                y       = pd.concat([y_t1, y_t2], axis = 1)

            #get r^2
            score[f"{raw_key}_r^2_t1"]      = round(r2_score(y_true = y_t1, y_pred = y_pred_t1),3)
            score[f"{raw_key}_r^2_t2"]      = round(r2_score(y_true = y_t2, y_pred = y_pred_t2),3)
            score[f"{raw_key}_r^2"]         = round(r2_score(y_true = y, y_pred = y_pred),3)

            #get rmse
            score[f"{raw_key}_rmse_t1"]      = round(np.sqrt(mean_squared_error(y_true = y_t1, y_pred = y_pred_t1)),3)
            score[f"{raw_key}_rmse_t2"]      = round(np.sqrt(mean_squared_error(y_true = y_t2, y_pred = y_pred_t2)),3)
            score[f"{raw_key}_rmse"]         = round(np.sqrt(mean_squared_error(y_true = y, y_pred = y_pred)),3)

        #keep and show the metrics of the stored model
        if set_score:
            self.score = score
            Base.__print_score(score)

        return score

    def __get_model_score_r_deprecated(self, model = None, get_test_score = False): #used, when model_metric == "r"
        """Former regression scorer (unused); same as the current one on standardized values."""

        return self.__get_model_score_r(model = model, get_test_score = get_test_score)

    def plot_confusion_mat(self, set = "valid"):
        """Show the confusion matrices stored in self.score for one set.

        set = 'train', 'valid', 'test'
        Needs a previous get_model_score() call of a classification model.
        """

        mat_keys = [key for key in self.score.keys() if ("mat" in key) and (set in key)]

        for mat_key in mat_keys:

            mat = self.score[mat_key]
            title = str(mat_key).replace("_mat_", " ")

            fig  = px.imshow(
                mat,
                color_continuous_scale = px.colors.sequential.haline_r,
                text_auto = True,
            )

            #labels and layout
            fig.update_layout(

                title = f"Confusion matrix: {title}",

                width=500,
                height=500,

                xaxis_title="Predicted label",
                yaxis_title="True label",

                xaxis = dict(
                    tickmode = 'array',
                    tickvals = [0,1],
                    ticktext = ["above", "below"]
                ),

                yaxis = dict(
                    tickmode = 'array',
                    tickvals = [0,1],
                    ticktext = ["above", "below"],
                ),
            )

            #set font
            fig.update_layout(
                font = dict(size=16),
                title_font = dict(size=20),
                xaxis_title_font = dict(size=18),
                yaxis_title_font = dict(size=18),
            )

            fig.show()

    def save_result(self, param, score):
        """Append one row (parameters + scores) to the results file.

        The file is created on the first call; `param` is not modified.
        """

        #merge into a new dict and create a one row dataframe
        data = {**param, **score}
        df_result = pd.DataFrame([data])

        #extend the saved results, DataFrame.append no longer exists in pandas 2
        if os.path.isfile(self.results_file):
            df_saved_result = pd.read_csv(self.results_file)
            df_result = pd.concat([df_saved_result, df_result], ignore_index = True)

        df_result.to_csv(self.results_file, index = False)

        return

    def get_results(self):
        """Read the results file into a data frame."""

        df = pd.read_csv(self.results_file)
        return df

In [None]:
class RF(Base): #child
    """Random forest (classifier or regressor) on top of the Base data pipeline."""

    def run_optim(self, n_trees = 12, n_depth=3, n_leafs = 5):
        """Fit one model per parameter combination and save every score.

        n_trees, n_depth, n_leafs are handed to create_param_list and set the
        size of the grid. Results are appended to the results file.
        """

        self.model = None #clear any models if there should be one

        param_list = self.create_param_list(
            n_trees = n_trees,
            n_depth = n_depth,
            n_leafs = n_leafs,
        )

        n_params = len(param_list)

        #enumerate gives the position directly, no lookup in the list needed
        for n_done, param in enumerate(param_list):

            print( f"Progress of optim:\t{round((n_done / n_params) * 100,1)}",end = "\r")

            score = self.create_model(param = param, single_model = False)
            self.save_result(param = param, score = score)

        print("Optim successfull. Read results with self.get_results()")
        return

    def create_param_list(self, n_trees, n_depth, n_leafs):
        """return list: [{param_1 : value_1},{},]

        n_estimators     : 2, 4, ..., 2**n_trees
        max_depth        : 10, 20, ..., 10*n_depth
        min_samples_leaf : 2, 4, ..., 2*n_leafs
        """

        #get individual numbers
        n_estimators        = [2**n for n in range(1,n_trees+1)]
        max_depths          = [10*n for n in range(1,n_depth+1)]
        min_sample_leafs    = [2*n  for n in range(1,n_leafs+1)]

        #every combination of the three value lists
        param_list = [
            {
                "n_estimators"      : n_estimator,
                "max_depth"         : max_depth,
                "min_samples_leaf"  : min_sample_leaf,
            }
            for n_estimator, max_depth, min_sample_leaf
            in itertools.product(n_estimators, max_depths, min_sample_leafs)
        ]

        print(f"\nGenerated param combinations: {len(param_list)}")
        return param_list


    def create_model(self, param, single_model = True):
        """Fit a random forest with `param` on the train set.

        if single_model == False:
            the scores get returned
            self.model is not set
        elif single_model == True:
            scores do not get returned
            self.model is set

        Raises ValueError when model_metric is neither "r" nor "c".
        """

        #pick the estimator class
        if self.model_metric == "r":
            ml_model = rfr

        elif self.model_metric == "c":
            ml_model = rfc

        else:
            raise ValueError(f"model_metric has to be 'r' or 'c', got '{self.model_metric}'")

        model = ml_model(
            n_jobs          = self.n_jobs,
            random_state    = self.random_state,
            **param, #unpack the dict and dumps its values
        )

        #fit model
        model.fit(X = self.df_train_x, y = self.df_train_y)

        #keep the single model, or hand back the score of a throwaway model
        if single_model:
            self.model = model
            print(self.model)
            return

        score = self.get_model_score(model)
        return score

## 3.1 RF Classification

### 3.1.1 Modeling and hyper parameter tuning

In [None]:
#classification forest on the full data frame; building the object already
#runs windowing, splitting and standardizing
rfc_obj = RF(
    df              = df,
    y_col           = ["t2m_t1_cat", "t2m_t2_cat"], #the two categorical targets (t1 and t2)

    n_jobs          = 4,

    data_folder     = data_folder,
    results_file    = "optim_reults_rfc.csv",

    model_metric    =  "c" # r = regression, c = classification
)

In [None]:
#optional plots showing how the rows are spread over train / valid / test
plot_dist = False

if plot_dist:
    rfc_obj.plot_set_distribution(plotter = scale_show, style = "histogram", plt_style = plt_style_s)
    rfc_obj.plot_set_distribution(plotter = scale_show, style = "scatter", plt_style = plt_style_s)

In [None]:
#grid search with the default grid of run_optim; every result is appended
#to the results file of rfc_obj
if run_optim is True:
    rfc_obj.run_optim()

In [None]:
#read the saved search results (the file has to exist from an earlier
#run_optim) and show the three best rows by combined validation accuracy
df_results = rfc_obj.get_results()
df_results.sort_values(by = "valid_accuracy", ascending = False, inplace = True)
df_results.head(3)

In [None]:
#three best rows by validation accuracy of the first target (t1)
df_results.sort_values(by = "valid_accuracy_t1", ascending = False, inplace = True)
df_results.head(3)

In [None]:
#three best rows by validation accuracy of the second target (t2)
df_results.sort_values(by = "valid_accuracy_t2", ascending = False, inplace = True)
df_results.head(3)

In [None]:
#fitting graphs: train vs. valid accuracy over the number of trees, once
#for the combined accuracy and once per target
for frame in ["", "_t1", "_t2"]:

    #the per target graphs carry the target name (without the underscore)
    title = f"Fitting graph {frame[1:]}: estimators" if frame != "" else "Fitting graph: estimators"

    accuracy_cols = [f"train_accuracy{frame}", f"valid_accuracy{frame}"]

    fig = px.scatter(
        data_frame = df_results,
        x = "n_estimators",
        y = accuracy_cols,
        color_discrete_sequence = plt_style_s,
        title = title,
        log_x  = True,
        labels = {"value": "accuracy"},
        range_y = [0,1.1],
    )

    scale_show(fig)

### 3.1.2 Top model evaluation

In [None]:
#creating sinlge model
optimal_param = {
    "n_estimators"      : 128,
    "max_depth"         : 10,
    "min_samples_leaf"   : 8
}

rfc_obj.create_model(param = optimal_param)
rfc_obj.get_model_score()

Reference for the annotated confusion-matrix heatmap: https://stackoverflow.com/questions/60860121/plotly-how-to-make-an-annotated-confusion-matrix-using-a-heatmap

In [None]:
#confusion matrices (t1 and t2) of the validation set, taken from the
#scores stored by the preceding get_model_score call
rfc_obj.plot_confusion_mat(set = "valid")

In [None]:
# confusion mat
#add plotting to class

In [None]:
# feature importance and further improvment
weights         = rfc_obj.model.feature_importances_
cols_window     = rfc_obj.x_col_windowed
n_col           = len(rfc_obj.x_col)
window_size     = rfc_obj.x_window

mat_head    = rfc_obj.x_col
mat         = []

last_satrt = 0

for i in range(1, 1+window_size):

    end = i * n_col
    mat.append(list(weights[last_satrt:end]))

    last_satrt = end

df_feature_importance = pd.DataFrame(mat, columns = rfc_obj.x_col)

In [None]:
#importance per feature: sum over all rows (time offsets) of the matrix,
#largest first
df_feature_importance_by_f = df_feature_importance.sum()
df_feature_importance_by_f.sort_values(inplace = True, ascending = False)

fig = px.histogram(
    x = df_feature_importance_by_f,
    y = df_feature_importance_by_f.index,
    nbins = n_col,
    histfunc = "sum",
    color_discrete_sequence = plt_style_s,
    title = "Feature importance by feature",

    labels = {"x" : "weights", "y" : "feature"}
)

#reversed axis puts the first (most important) feature on top
fig.update_yaxes(autorange="reversed")

scale_show(fig, width = 750, height = 750)

In [None]:
#importance per time offset: sum over all features of every row
#(the next cell computes the same series again before plotting it)
df_feature_importance_by_t = df_feature_importance.T.sum()

In [None]:
#importance per time offset, kept in chronological order (not sorted)
df_feature_importance_by_t = df_feature_importance.T.sum()
#df_feature_importance_by_t.sort_values(inplace = True, ascending = False)

fig = px.histogram(
    x = df_feature_importance_by_t,
    y = df_feature_importance_by_t.index.astype(str), #offsets as text labels
    nbins = n_col, #NOTE(review): number of features, not of offsets - confirm this is intended
    histfunc = "sum",
    color_discrete_sequence = plt_style_s,
    title = "Feature importance by time",

    labels = {"x" : "weights", "y" : "time offset"}
)

#reversed axis puts the first offset on top
fig.update_yaxes(autorange="reversed")

scale_show(fig, width = 750, height = 750)

In [None]:
#heatmap of the whole importance matrix: features on the y axis (the frame
#is transposed), time offsets on the x axis
fig = px.imshow(df_feature_importance.T.to_numpy(), color_continuous_scale = plt_style_c)

fig.update_layout(

    title = f"Feature importance",

    width=1000,
    height=1000,
    
    yaxis_title="Feature",
    xaxis_title="Time",

    #label every row with its feature name
    yaxis = dict(
        tickmode = 'array',
        tickvals = list(range(len(rfc_obj.x_col))),
        ticktext =  rfc_obj.x_col,
    ),
)

#same font sizes as scale_show; shown directly to keep the square size
fig.update_layout(
    font = dict(size=16),
    title_font = dict(size=20),
    xaxis_title_font = dict(size=18),
    yaxis_title_font = dict(size=18),
)

fig.show()

In [None]:
#apply to testing set

In [None]:
#score the stored model again, this time including the test set
rfc_obj.get_model_score( get_test_score = True)

In [None]:
#confusion matrices of the test set (needs the scores from get_test_score = True)
rfc_obj.plot_confusion_mat(set = "test")

### 3.1.3 Exploration of missing features

In [None]:
#copy of the data without the "year" column, to see what the model loses
df_mis = df.drop(labels = "year", axis = 1, inplace = False)

In [None]:
#same setup as rfc_obj, but on the frame without "year"
#NOTE: points at the same results file as rfc_obj; nothing is written to it
#as long as run_optim is not called on this object
rfc_obj_mis = RF(
    df              = df_mis,
    y_col           = ["t2m_t1_cat", "t2m_t2_cat"], #the two categorical targets (t1 and t2)

    n_jobs          = 4,

    data_folder     = data_folder,
    results_file    = "optim_reults_rfc.csv",

    model_metric    =  "c" # r = regression, c = classification
)

In [None]:
#fit with the parameters chosen above and score all three sets
rfc_obj_mis.create_model(param = optimal_param)
rfc_obj_mis.get_model_score(get_test_score = True)

In [None]:
#confusion matrices of the test set for the model without "year"
rfc_obj_mis.plot_confusion_mat(set = "test")

## 3.2 Dimensionality reduction

In [None]:
#dropping feautres
set_0 = df_feature_importance_by_f.index[:5].tolist() + ['date', 't2m_t1', 't2m_t2', 't2m_t1_mean', 't2m_t2_mean', "t2m_t1_cat", "t2m_t2_cat"]
set_1 = df_feature_importance_by_f.index[:10].tolist() + ['date', 't2m_t1', 't2m_t2', 't2m_t1_mean', 't2m_t2_mean', "t2m_t1_cat", "t2m_t2_cat"]
set_2 = df_feature_importance_by_f.index[:15].tolist() + ['date', 't2m_t1', 't2m_t2', 't2m_t1_mean', 't2m_t2_mean', "t2m_t1_cat", "t2m_t2_cat"]
set_3 = df.drop(labels = ["nao", "ao", "t2m", "mjo_amplitude", "sp", "day", "soi"], axis = 1, inplace = False).columns

In [None]:
#reduced data frames, one per column selection
df_0 = df[set_0] #top 5 features
df_1 = df[set_1] #top 10 features
df_2 = df[set_2] #top 15 features
df_3 = df[set_3] #handpicked

In [None]:
def magic(df, set_name):
    """Fit the fixed classification forest on `df` and return its scores.

    df       : data frame to build the RF object from
    set_name : label printed in front of the scores

    Returns the score dict of the train and valid set (no test set).
    """

    #fixed parameters of the single model
    forest_param = dict(
        n_estimators        = 128,
        max_depth           = 10,
        min_samples_leaf    = 8,
    )

    #full data preparation on the passed frame
    rfc_set = RF(
        df              = df,
        y_col           = ["t2m_t1_cat", "t2m_t2_cat"],
        n_jobs          = 4,
        data_folder     = data_folder,
        results_file    = "optim_reults_rfc.csv",
        model_metric    = "c", # r = regression, c = classification
    )

    rfc_set.create_model(param = forest_param)

    print(f"\n{set_name}")
    return rfc_set.get_model_score(get_test_score = False)

In [None]:
#scores of the fixed model per reduced data frame, keyed by the set name
results = {}

for df_set, set_name in zip([df_0, df_1, df_2, df_3], ["top 5 features", "top 10 features", "top 15 features", "handpicked"]):

    result = magic(df_set, set_name)
    results[set_name] = result

In [None]:
#raw score dicts (including the confusion matrices)
results

In [None]:
#table of the scores: one row per feature set after the transpose; the
#confusion matrix entries are removed, only the numbers stay
df_result = pd.DataFrame(results)
df_result.drop(labels = [ind for ind in df_result.index if "mat" in ind], axis = 0, inplace = True)
df_result.T


## 3.3 Overfitting prevention

In [None]:
def magic(df, param):
    """Fit a classification forest with `param` on `df` and return its scores.

    Replaces the earlier function of the same name from here on.

    df    : data frame to build the RF object from
    param : keyword arguments for the random forest

    Returns the score dict of the train and valid set (no test set).
    """

    #fixed setup, only the frame and the forest parameters vary
    setup = dict(
        y_col           = ["t2m_t1_cat", "t2m_t2_cat"],
        n_jobs          = 4,
        data_folder     = data_folder,
        results_file    = "optim_reults_rfc.csv",
        model_metric    = "c", # r = regression, c = classification
    )

    rfc_param = RF(df = df, **setup)
    rfc_param.create_model(param = param)

    return rfc_param.get_model_score(get_test_score = False)

In [None]:
#full data frame, transposed for display
df.T

In [None]:
#creating sinlge model
#four parameter sets, each changing one setting of the earlier single model
#(128 trees, depth 10, leaf size 8)

#set 0: fewer trees
param_set0 = {
        "n_estimators"      : 100,
        "max_depth"         : 10,
        "min_samples_leaf"  : 8,
    }

#set 1: shallower trees
param_set1 = {
        "n_estimators"      : 128,
        "max_depth"         : 5,
        "min_samples_leaf"  : 8,
    }

#set 2: much larger leafs
param_set2 = {
        "n_estimators"      : 128,
        "max_depth"         : 10,
        "min_samples_leaf"  : 200,
    }

#set 3: additionally a large minimum number of samples per split
param_set3 = {
        "n_estimators"      : 128,
        "max_depth"         : 10,
        "min_samples_leaf"  : 8,
        "min_samples_split" : 200,
    }

results = []

#fit every set on the top 15 feature frame and keep scores and parameters together
for param in [param_set0, param_set1, param_set2, param_set3]:

    result = magic(df_2, param)
    result.update(param)
    results.append(result)

In [None]:
#one column per parameter set after the transpose
#NOTE: this replaces the df_results that held the grid search results above
df_results = pd.DataFrame(results)
df_results.T

In [None]:
#overwrite RF param generator
class RFOP(RF):
    """RF with a parameter grid aimed at limiting overfitting."""

    def create_param_list(self, n_trees, n_depth, n_leafs):
        """return list: [{param_1 : value_1},{},]

        n_estimators     : always 128 (n_trees is accepted but not used)
        max_depth        : 2, 4, ..., 2*n_depth
        min_samples_leaf : 25, 50, ..., 25*n_leafs
        """

        #value lists of the grid
        tree_counts = [128]
        depths      = [2 * step for step in range(1, n_depth + 1)]
        leaf_sizes  = [25 * step for step in range(1, n_leafs + 1)]

        #one dict per combination, same order as itertools.product
        keys = ("n_estimators", "max_depth", "min_samples_leaf")
        param_list = [
            dict(zip(keys, values))
            for values in itertools.product(tree_counts, depths, leaf_sizes)
        ]

        print(f"\nGenerated param combinations: {len(param_list)}")
        return param_list

In [None]:
#forest with the overfitting grid on the full data frame; uses its own
#results file
rfc_op_obj = RFOP(
    df              = df,
    y_col           = ["t2m_t1_cat", "t2m_t2_cat"], #the two categorical targets (t1 and t2)

    n_jobs          = 4,

    data_folder     = data_folder,
    results_file    = "optim_reults_rfc_op.csv",

    model_metric    =  "c" # r = regression, c = classification
)

In [None]:
#display the full data frame
df

In [None]:
#grid search over the RFOP grid: depths 2 .. 14 and leaf sizes 25 .. 300,
#always 128 trees
if run_optim:
     rfc_op_obj.run_optim(n_depth = 7, n_leafs = 12)

In [None]:
#saved results of the overfitting grid (file has to exist from an earlier
#run), ten best rows by combined validation accuracy
df_results_op = rfc_op_obj.get_results()
df_results_op.sort_values(by = "valid_accuracy", ascending = False, inplace = True)

df_results_op.head(10)

In [None]:
#train vs. valid accuracy of every grid point; colour = leaf size,
#marker size = tree depth
fig = px.scatter(
    data_frame = df_results_op,
    x = "train_accuracy",
    y = "valid_accuracy",
    color = "min_samples_leaf",
    size = "max_depth",

    title = "Fitting graph RF Classifier",
    trendline = "lowess",
    color_continuous_scale = plt_style_c,
)

#reference lines at valid accuracy 0.55 and train accuracy 0.5
fig.add_hline(
    y = 0.55,
)
fig.add_vline(
    x = 0.5
)

#NOTE(review): called without data, this only adds an empty trace - confirm it is needed
fig.add_scatter()

scale_show(fig)

In [None]:
#parameters picked from the overfitting grid; replaces the earlier optimal_param
optimal_param = {
    "n_estimators" : 128,
    "max_depth": 8,
    "min_samples_leaf": 225 #225
}

In [None]:
#create optimal model
rfc_op_obj.create_model(param = optimal_param)
rfc_op_obj.get_model_score(get_test_score = True)
rfc_op_obj.plot_confusion_mat(set = "test")

In [None]:
#columns of the full frame that are not part of the top 15 frame
[item for item in df.columns.tolist() if item not in df_2.columns.tolist()]

In [None]:
#columns of the top 15 frame
df_2.columns.tolist()

In [None]:
#same object again, now on the top 15 feature frame (replaces rfc_op_obj)
#NOTE: same results file as before; a run_optim on this object would append
#its rows to the results of the full frame
rfc_op_obj = RFOP(
    df              = df_2,
    y_col           = ["t2m_t1_cat", "t2m_t2_cat"], #the two categorical targets (t1 and t2)

    n_jobs          = 4,

    data_folder     = data_folder,
    results_file    = "optim_reults_rfc_op.csv",

    model_metric    =  "c" # r = regression, c = classification
)

In [None]:
#create optimal model
rfc_op_obj.create_model(param = optimal_param)
rfc_op_obj.get_model_score(get_test_score = True)
rfc_op_obj.plot_confusion_mat(set = "test")