# 3 Modelling - Random forest classifier

In [1]:
#general
import pandas as pd
import numpy as np
import os
import plotly.express as px
from datetime import datetime
import csv

#ml
import sklearn
from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn.ensemble import RandomForestRegressor as rfr

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

from sklearn.model_selection import TimeSeriesSplit as tsp
from sklearn.model_selection import GridSearchCV as gscv

#data folder
data_folder : str = "data"

#plot styles
plt_style_c = px.colors.sequential.haline #complex
plt_style_s = px.colors.diverging.Portland #simple

#decide if the data gets saved or not
run_optim : bool = False #takes approx. 20 min. with n_jobs = 1

## 3.1 Create model and hyper parameter tuning

In [2]:
df_main = pd.read_csv(os.path.join("data", "df.csv"), index_col = "index")
df_main.shape

(527, 27)

In [3]:
#create test set
n_years_test = 4
years = list(set(df_main["year"].to_list()))
years.sort()

#set relevant years
indexes = [int(len(years) * (i/n_years_test)) - 1 for i in range(1, 1 + n_years_test)]
test_years = [years[i-1] for i in indexes] ##added minus 1 to get a better distributed data set

#create df_test und df
df = df_main[~df_main["year"].isin(test_years)]
df_test = df_main[df_main["year"].isin(test_years)]

print(f"test years:\t{test_years}")
print(f"test set:\t{df_test.shape[0]}\ntrain set:\t{df.shape[0]}")


test years:	[1988, 1999, 2010, 2021]
test set:	48
train set:	479


In [4]:
df.shape

(479, 27)

In [5]:
#check data distribution
fig = px.histogram(
    data_frame = df,
    x = "t2m_cat_offset",
    nbins = 2,
    color = "month",
    histfunc = "count",

    title = "train set - amount per category",
    width = 1000,
    height = 500,
    color_discrete_sequence = plt_style_c,
)

fig.show()

In [6]:
fig = px.histogram(
    data_frame = df_test,
    color = "t2m_cat_offset",
    x = "year",
    color_discrete_sequence = plt_style_s,
    title = "distribution of categories over time",
    histfunc= "count",
    nbins = len(set(df["year"].to_list())) + 4,

    width = 1000,
    height = 500,
)

fig.show()

In [7]:
fig = px.histogram(
    data_frame = df,
    color = "t2m_cat_offset",
    x = "year",
    color_discrete_sequence = plt_style_s,
    title = "distribution of categories over time",
    histfunc= "count",
    nbins = len(set(df["year"].to_list())) + 4,

    width = 1000,
    height = 500,
)

fig.show()

In [8]:
#check data distribution
fig = px.histogram(
    data_frame = df_test,
    x = "t2m_cat_offset",
    nbins = 2,
    color = "month",
    histfunc = "count",

    title = "test set - amount per category",
    width = 1000,
    height = 500,
    color_discrete_sequence = plt_style_c,
)

fig.show()

- not optimal distribution

In [9]:
fib = []
pow2 = []

n = list(range(1,15))
list(map(lambda i: fib.append(fib[-2] + fib[-1]) if len(fib) >= 2 else fib.append(1), n))
list(map(lambda i: pow2.append(2**i), n))

fig = px.line(
    x = n,
    y = [fib, pow2],
    color_discrete_sequence = plt_style_s,
    title = "2**n vs fib"
)

fig.show()

Finding:
- use n^2 for optimization list
- should yield sufficient percise enough results

In [10]:
#https://scikit-learn.org/stable/modules/cross_validation.html#timeseries-cv
#see chapter 3.1.2.6. Cross validation of time series data for time series data

#create time series plit list
#pass list into cross valid as cv

class RFC():

    def __init__ (self, y_col : str, df : object, run_optim : bool, data_folder : str, n_estim : int = 0, n_depth : int = 0, n_jobs : int = 1):

        #set base infromation
        self.y_col : str            = y_col
        self.df : object            = df
        self.run_optim : bool       = run_optim
        self.n_estim : int          = n_estim
        self.n_depth : int          = n_depth
        self.n_jobs : int           = n_jobs

        #fixed params
        self.n_folds : int          = 4
        self.random_state : int     = 42

        #set saving infos
        self.log_file : str         = os.path.join(data_folder, "optim_log.txt")
        self.result_file : str      = os.path.join(data_folder, "rfc_results.csv")

        #run optim if asked
        if run_optim:
            self.__run_optim()
        else:
            self.__df_split()

        #read results file
        self.__get_results()

        return

    def __run_optim(self):

        self.__generate_param_list()
        self.__log(message = f"Started optim with parameters: estim {self.n_estim}, depth {self.n_depth}")
        self.__df_split()
        self.__cross_valid()
        self.__log(message = f"Finished optim with parameters: estim {self.n_estim}, depth {self.n_depth}")

        return

    def __generate_param_list(self):
        """list of tuples: (estim, depth)"""

        self.param_list : list = []

        #generate the param list for n_estimattors and max depth
        for estim in range(1, self.n_estim + 1):
            for depth in range(1, self.n_depth + 1):

                n_estim = 2**estim
                n_depth = 2**depth
                self.param_list.append((n_estim, n_depth))

        return

    def __df_split(self):

        #create x and y
        self.X = self.df.drop(labels = self.y_col, axis = 1)
        self.y = self.df[self.y_col]

        #instanciate time splitter
        self.cv = tsp(n_splits=self.n_folds)

        return

    def __cross_valid(self):

        for param in self.param_list:

            print(f"Progress: {round((self.param_list.index(param) +1)/ len(self.param_list) * 100)}%", end = "\r")

            n_estim : int       = param[0]
            max_depth : int     = param[1]

            model = rfc(
                n_jobs = self.n_jobs,
                n_estimators = n_estim,
                max_depth = max_depth,
                random_state = self.random_state
            )

            result = cross_validate(
                estimator = model,
                X = self.X,
                y = self.y,
                cv = self.cv,
                return_train_score = True
            )

            train_score = round(np.mean(result["train_score"]),4)
            test_score = round(np.mean(result["test_score"]),4)

            self.__save_result(n_folds = self.n_folds, n_estim = n_estim, n_depth = max_depth, test_score = test_score, train_score = train_score)

        return

    def __save_result(self, n_folds, n_estim, n_depth, test_score, train_score):

        #check if file exists and create if it does not
        if os.path.isfile(self.result_file) is False:

            file = open(self.result_file, "w", newline='')
            writer = csv.writer(file)
            writer.writerow(["n_folds", "n_estim", "n_depth", "test_score", "train_score"])
            file.close()

        #save data
        file = open(self.result_file, "a", newline='')
        writer = csv.writer(file)
        writer.writerow([n_folds, n_estim, n_depth, test_score, train_score])
        file.close()

        return

    def __log(self, message):

        #create log entry
        log_time : str = datetime.now()
        message = f"source_rfc,{log_time},{message}\n"

        #write log entry
        file_object = open(self.log_file, 'a')
        file_object.write(message)
        file_object.close()

        return

    def __get_results(self):

        self.results = pd.read_csv(self.result_file)

    def __single_model_confusion_matrix(self, test_df):

        #set labels
        self.conf_mat_labels = [0, 1]

        #test conf mat
        if test_df is not None:

            X_test = test_df.drop(labels = self.y_col, axis = 1)
            y_test = test_df[self.y_col]

            self.y_hat_test = self.model.predict(X_test)
            self.confussion_mat_test = sklearn.metrics.multilabel_confusion_matrix(y_test, self.y_hat_test, labels = self.conf_mat_labels)

        #train conf mat
        self.y_hat_train = self.model.predict(self.X_train)
        self.confussion_mat_train = sklearn.metrics.multilabel_confusion_matrix(self.y_train, self.y_hat_train, labels = self.conf_mat_labels)

        #valid conf mat
        self.y_hat_valid = self.model.predict(self.X_valid)
        self.confussion_mat_valid = sklearn.metrics.multilabel_confusion_matrix(self.y_valid, self.y_hat_valid, labels = self.conf_mat_labels)

        return

    def __single_model_score(self, test_df):

        #get the test score, if a test set is provided
        if test_df is not None:
            X_test = test_df.drop(labels = self.y_col, axis = 1)
            y_test = test_df[self.y_col]
            test_score = self.model.score(X_test, y_test)

        else:
            test_score = None

        #get model socres
        train_score : float = self.model.score(self.X_train, self.y_train)
        valid_score : float = self.model.score(self.X_valid, self.y_valid)

        self.single_model_scores = {
            "train_score" : train_score,
            "valid_score" : valid_score,
            "test_score" : test_score,
        }

        return

    def __single_model_split_v1(self, valid_frac):
        """[deprecated] use v2 instead"""

        index = round(self.df.shape[0] * valid_frac)

        self.X_train = self.X.iloc[index:]
        self.y_train = self.y.iloc[index:]

        self.X_valid = self.X.iloc[:index]
        self.y_valid = self.y.iloc[:index]

        return

    def __single_model_split_v2(self, valid_frac):

        years = list(set(self.df["year"].to_list()))
        years.sort()

        n_years = int(len(years) * valid_frac)
        n_years_half = ((n_years % 2) + n_years) / 2 #round to even numbers and split in half

        #get target year list
        valid_years = years[round(((len(years) - 1) / 2) - n_years_half) : round((len(years) / 2 ) -1 )] + years[int(-n_years_half):]
        train_years = [year for year in years if year not in valid_years]

        #generate valid and train dfs
        df_valid = self.df[self.df["year"].isin(valid_years)]
        df_train =self.df[self.df["year"].isin(train_years)]

        #generate x and y
        self.X_train = df_train.drop(labels = self.y_col, axis = 1)
        self.y_train = df_train[self.y_col]

        self.X_valid = df_valid.drop(labels = self.y_col, axis = 1)
        self.y_valid = df_valid[self.y_col]

        #free up memory
        del df_valid, df_train, valid_years, train_years #free up memory

        return

    def __read_top_result(self , n : int = 0):

        #sort results df to get the "best" result
        df_top = self.results.sort_values(
                by = ["test_score", "train_score", "n_estim", "n_depth"],
                ascending = [False, False, False, True]
            )

        n_estim = int(df_top.iloc[n]["n_estim"])
        n_depth = int(df_top.iloc[n]["n_depth"])

        return n_estim, n_depth

    def create_model(self, n_estim : int = 1, n_depth : int = 1, top_result : bool = True, valid_frac : float = 0.15, test_df : object = None, params : dict = None):

        #split data
        #self.__single_model_split_v1(valid_frac = valid_frac)
        self.__single_model_split_v2(valid_frac = valid_frac)

        #get top performing values
        if top_result:
            n_estim, n_depth = self.__read_top_result()

        #create and fit model
        if params is None:
            self.model = rfc(
                n_jobs = self.n_jobs,
                n_estimators = n_estim,
                max_depth = n_depth,
                random_state = self.random_state,
            )

        else:
            self.model = rfc(
                n_jobs = self.n_jobs,
                random_state = self.random_state,
                **params,
            )

        self.model.fit(X = self.X_train, y = self.y_train)

        #generate results
        self.__single_model_score(test_df)
        self.__single_model_confusion_matrix(test_df)

        return

    def run_grid_search(self):

        #get fixed params
        n_estim_0, n_depth_0 = self.__read_top_result(n = 0)
        n_estim_1, n_depth_1 = self.__read_top_result(n = 1)

        #create grid
        param_grid : dict = {
            "max_depth" : [n_depth_0],
            "n_estimators" : [n_estim_0, n_estim_1],
            "min_samples_split" : [2, 4, 6], #2 : default
            "min_samples_leaf" : [1, 2, 4], #1 : default
            "max_features" : ["auto", "sqrt", "log2"], #sqrt : default
            "bootstrap" : [True, False] #defaulf false
        }

        #create esimator
        model = rfc(
            n_jobs = self.n_jobs,
            random_state = self.random_state,
        )

        grid_search = gscv(estimator = model, param_grid = param_grid, cv = self.cv)
        grid_search.fit(self.X, self.y)

        # Print the best parameters and score
        print(f"Best parameters: {grid_search.best_params_}")
        print(f"Best score: {grid_search.best_score_}")

        return grid_search.best_params_


In [11]:
#create the optimizer object

rfc_optim = RFC(

    y_col = "t2m_cat_offset",
    df = df,
    run_optim = run_optim,
    data_folder = data_folder,

    n_estim = 12,
    n_depth = 12,

    n_jobs = 2 #n or cpu cores used, use at own risk of overheating and memory bottleneck
)

In [12]:
#top 10
rfc_optim.results.sort_values(
                by = ["test_score", "train_score", "n_estim", "n_depth"],
                ascending = [False, False, False, True]
            ).head(20)

Unnamed: 0,n_folds,n_estim,n_depth,test_score,train_score
62,4,64,8,0.5868,0.9978
86,4,256,8,0.5842,0.9978
64,4,64,32,0.5816,1.0
65,4,64,64,0.5816,1.0
66,4,64,128,0.5816,1.0
67,4,64,256,0.5816,1.0
68,4,64,512,0.5816,1.0
69,4,64,1024,0.5816,1.0
70,4,64,2048,0.5816,1.0
71,4,64,4096,0.5816,1.0


In [13]:
#bottom 10
rfc_optim.results.sort_values(
                by = ["test_score", "train_score", "n_estim", "n_depth"],
                ascending = [True, True, True, False]
            ).head(10)

Unnamed: 0,n_folds,n_estim,n_depth,test_score,train_score
0,4,2,2,0.4605,0.6765
36,4,16,2,0.4737,0.6824
13,4,4,4,0.4763,0.779
1,4,2,4,0.4895,0.7413
12,4,4,2,0.4921,0.6976
35,4,8,4096,0.4921,0.953
34,4,8,2048,0.4921,0.953
33,4,8,1024,0.4921,0.953
32,4,8,512,0.4921,0.953
31,4,8,256,0.4921,0.953


In [14]:
#create a single model from the top results
rfc_optim.create_model(top_result = True)
rfc_optim.single_model_scores

{'train_score': 1.0, 'valid_score': 0.6197183098591549, 'test_score': None}

In [15]:
#create strin values as categories to make plots overviewable
df_result_plot = rfc_optim.results.copy()
df_result_plot[["n_depth", "n_estim"]] = df_result_plot[["n_depth", "n_estim"]].astype(str)

In [16]:
fig = px.box(
    data_frame = df_result_plot,
    x = "n_depth",
    y = "test_score",

    title = "fitting graph: max depth",
    color_discrete_sequence = plt_style_s,
    width = 1000,
    height = 500,
)

fig.show()

In [17]:
fig = px.box(
    data_frame = df_result_plot,
    x = "n_estim",
    y = "test_score",

    title = "fitting graph: n estimators",
    color_discrete_sequence = plt_style_s,
    width = 1000,
    height = 500,
)

fig.show()

In [18]:
fig = px.scatter(
    data_frame = df_result_plot,
    x = "train_score",
    y = "test_score",

    title = "train vs test score",
    color_discrete_sequence = plt_style_s,
    width = 1000,
    height = 500,
)

fig.show()

Findings:
- The more n_estim, the better the results
- The deeper the rees, the worse the performance (overfitting)
- Model is useless

## 3.2 Feature importance and confusion matrix

In [19]:
#data prep for plotting feature importance
feat_imp = {
    "importance" : rfc_optim.model.feature_importances_,
    "feature" : rfc_optim.X_train.columns
}

df_feature_importance = pd.DataFrame(feat_imp)
df_feature_importance.sort_values(by = "importance", ascending = False, inplace = True)

In [20]:
#set categories for the 3x used
df_feature_importance["type"] = "mjo"

types = {
    "time_related" : ["year", "month", "year_cos", "year_sin", "decade_cos", "decade_sin"],
    "t2m_related" : [col for col in df.columns.to_list() if "t2m" in col],
    "enso_related" : [col for col in df.columns.to_list() if "enso" in col],
    "pv_related" : [col for col in df.columns.to_list() if "pv" in col],
}

for key in types.keys():

    for col in types[key]:
        df_feature_importance.loc[df_feature_importance["feature"] == col, "type"] = key

    #df.loc[(df["t2m_mean"] <= lower_bound) & (df["month"] == month), "t2m_cat"] = target_cat["below"]

In [21]:
fig = px.bar(
    data_frame = df_feature_importance,
    x = "feature",
    y = "importance",

    title = "feature importance (by weight)",
    width = 1000,
    height = 500,
    color_discrete_sequence = plt_style_c,

    #category_orders = {"group" : df_feature_importance["feature"].to_list(),},
    color = "type",
)

fig.update_xaxes(categoryorder='array', categoryarray= df_feature_importance["feature"].to_list())

fig.show()

In [22]:
fig = px.bar(
    data_frame = df_feature_importance,
    x = "feature",
    y = "importance",

    title = "feature importance (by type)",
    width = 1000,
    height = 500,
    color_discrete_sequence = plt_style_c,

    #category_orders = {"group" : df_feature_importance["feature"].to_list(),},
    color = "type",
)

fig.show()

In [23]:
label_translation : dict = {
    -1 : "Valid: below",
    0 : "Valid: avg",
    1 : "Valid: above",
}

for i in range(len(rfc_optim.conf_mat_labels)):

    #get data
    mat = rfc_optim.confussion_mat_valid[i]
    cat_num = rfc_optim.conf_mat_labels[i]
    cat_str = label_translation[cat_num]

    fig = px.imshow(
        mat,
        title = cat_str,
        text_auto = True,
        color_continuous_scale = plt_style_c,
        )
    fig.show()



## 3.3 Hyper parameter tuning (grid search)

In [24]:
rfc_gs = RFC(
    y_col = "t2m_cat_offset",
    df = df,
    run_optim = False,
    data_folder = data_folder,
    n_jobs = 2
)

In [25]:
opt_params : dict = rfc_gs.run_grid_search()

#output should:
#Best parameters: 'bootstrap': True, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 4, 'n_estimators': 256
#Best score: 0.5868421052631579


`max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.


`max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.


`max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.


`max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and 

Best parameters: {'bootstrap': False, 'max_depth': 8, 'max_features': 'auto', 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 64}
Best score: 0.5947368421052631



`max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.



In [26]:
opt_params

{'bootstrap': False,
 'max_depth': 8,
 'max_features': 'auto',
 'min_samples_leaf': 4,
 'min_samples_split': 2,
 'n_estimators': 64}

In [27]:
rfc_gs.create_model(
    params = opt_params,
    top_result = False,
    test_df = df_test
)

rfc_gs.single_model_scores



`max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.



{'train_score': 0.9852941176470589,
 'valid_score': 0.7323943661971831,
 'test_score': 0.4375}

Performs worse in single model training

## 3.3 Test set

In [28]:
rfc_optim.create_model(
    test_df = df_test,
    top_result = True,
)

rfc_optim.single_model_scores

{'train_score': 1.0,
 'valid_score': 0.6197183098591549,
 'test_score': 0.4166666666666667}

In [29]:
label_translation : dict = {
    -1 : "below",
    0 : "avg",
    1 : "above",
}

for i in range(len(rfc_optim.conf_mat_labels)):

    #get data
    mat = rfc_optim.confussion_mat_test[i]
    cat_num = rfc_optim.conf_mat_labels[i]
    cat_str = label_translation[cat_num]

    fig = px.imshow(
        mat,
        title = f"test set {cat_str}",
        text_auto = True,
        color_continuous_scale = plt_style_c,

        )
    fig.show()