# 4 Modelling - Multi layer perceptron

In [21]:
#general
import pandas as pd
import numpy as np
import os
import plotly.express as px
from datetime import datetime
import csv
import re

#ml
import sklearn
from sklearn.model_selection import TimeSeriesSplit as tsp
from sklearn.neural_network import MLPClassifier as mlp
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate

#data folder
data_folder : str = "data"

#plot styles
plt_style_c = px.colors.sequential.haline #complex
plt_style_s = px.colors.diverging.Portland #simple

#decide if the data gets saved or not
run_optim : bool = False

In [22]:
import warnings
warnings.filterwarnings('ignore')

## 4.1 Create model and hyper parameter tuning

In [23]:
df_main = pd.read_csv(os.path.join("data", "df.csv"), index_col = "index")
df_main.shape

(527, 27)

In [24]:
#create test set
n_years_test = 4
years = list(set(df_main["year"].to_list()))
years.sort()

#set relevant years
indexes = [int(len(years) * (i/n_years_test)) - 1 for i in range(1, 1 + n_years_test)]
test_years = [years[i-1] for i in indexes] ##added minus 2 to get a better distributed data set

#create df_test und df
df = df_main[~df_main["year"].isin(test_years)]
df_test = df_main[df_main["year"].isin(test_years)]

print(f"test years:\t{test_years}")
print(f"test set:\t{df_test.shape[0]}\ntrain set:\t{df.shape[0]}")


test years:	[1988, 1999, 2010, 2021]
test set:	48
train set:	479


In [25]:
df.shape

(479, 27)

In [26]:
#see: https://scikit-learn.org/stable/auto_examples/neural_networks/plot_mlp_alpha.html#sphx-glr-auto-examples-neural-networks-plot-mlp-alpha-py
#see: https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html
#see: https://becominghuman.ai/what-does-feature-scaling-mean-when-to-normalize-data-and-when-to-standardize-data-c3de654405ed

class MLPR():

    def __init__ (self, y_col : str, df : object, run_optim : bool, data_folder : str, n_jobs : int):

        #set base infromation
        self.y_col : str            = y_col
        self.df : object            = df
        self.run_optim : bool       = run_optim
        self.n_jobs : int           = n_jobs

        #mlp parameters
        self.acitvation_func        = "relu"
        self.solver                 = "adam"
        self.alpha                  = 0.5 #optim with 0.5
        self.max_iter               = 500 # optim with 250

        #arch parameters
        self.node_step_fact         = 2
        self.max_node_multiple      = 3
        self.min_node_division      = 2
        self.n_layers               = 4

        #fixed params
        self.n_folds                 = 4
        self.random_state            = 42

        #set saving infos
        self.log_file : str         = os.path.join(data_folder, "optim_log.txt")
        self.result_file : str      = os.path.join(data_folder, "mlpc_results.csv")

        self.__standardize()

        #run optim if asked
        if run_optim:
            self.__run_optim()
        else:
            self.__df_split()

        #read results file
        self.__get_results()

        return

    def __run_optim(self):

        self.__log(message = f"Started optim with parameters")

        self.__df_split()
        #self.__standardize()
        self.__generate_arch_list()
        self.__cross_valid()

        self.__log(message = f"Finished optim with parameters")

        return

    def __standardize(self):

        #save an undstandardized version
        self.df_unstand = self.df.copy()

        #standardize columns
        x_cols = self.df.columns.to_list(); x_cols.remove(self.y_col)
        self.df[x_cols] = (self.df[x_cols] - self.df[x_cols].mean()) / self.df[x_cols].std()

        #clean up all columns with no variation
        #unneded_cols = self.df.describe().T["std"].loc[self.df.describe().T["std"] == 0].index.to_list()
        #self.df.drop(labels = unneded_cols, axis = 1, inplace = True)

        true_indexes = []

        for index, value in self.df.std().isna().items():
                if value == True:
                    true_indexes.append(index)

        self.df.drop(labels = true_indexes, axis = 1, inplace = True)

        return


    def __generate_arch_list(self):
        """list of tuples: (estim, depth)"""

        self.arch_list : list = []
        n_node_list : list = []
        n_input_nodes : int = len(self.X.columns)

        #multiple
        for i in range(1, self.max_node_multiple + 1):
            n_node_list.append(n_input_nodes * i)

        #divisonal
        div : int = 2 #steps

        while min(n_node_list) > self.min_node_division:
            n_node_list.append(round(n_input_nodes / div))
            div += 1

        #clean up
        n_node_list = list(set(n_node_list)); n_node_list.append(0); n_node_list.sort()

        #create archs
        for i in range(1, (len(n_node_list)**self.n_layers) + 1):

            arch = []

            for j in list(range(self.n_layers))[::-1]:

                v =  int((i % (len(n_node_list) ** (j + 1)) / (len(n_node_list) ** j)))
                arch.append(n_node_list[v])

            arch = [k for k in arch if k != 0] #remove zero value

            if arch in self.arch_list:
                continue
            self.arch_list.append(arch)

        return

    def __df_split(self):

        #create x and y
        self.X = self.df.drop(labels = self.y_col, axis = 1)
        self.y = self.df[self.y_col]

        #instanciate time splitter
        self.cv = tsp(n_splits=self.n_folds)

        return

    def __cross_valid(self):

        for arch in self.arch_list:

            print(f"Progress: {round((self.arch_list.index(arch) +1)/ len(self.arch_list) * 100, 4)}%", end = "\r")

            model = mlp(
                activation = self.acitvation_func,
                solver = self.solver,
                alpha = self.alpha,
                max_iter = self.max_iter,
                hidden_layer_sizes = arch
            )

            result = cross_validate(
                estimator = model,
                X = self.X,
                y = self.y,
                cv = self.cv,
                return_train_score = True
            )

            train_score = round(np.mean(result["train_score"]),4)
            test_score = round(np.mean(result["test_score"]),4)

            self.__save_result(n_folds = self.n_folds, test_score = test_score, train_score = train_score, arch = arch) #add features

        return

    def __save_result(self, n_folds, test_score, train_score, arch): #add features

        #create file if does not exist
        if os.path.isfile(self.result_file) is False:

            file = open(self.result_file, "w", newline='')
            writer = csv.writer(file)
            writer.writerow(["n_folds", "test_score", "train_score", "arch", "fixed_parameters"]) #add features
            file.close()

        fixed_params : dict = {
        "acitvation_func" : self.acitvation_func,
        "solver" : self.solver,
        "alpha" : self.alpha,
        "max_iter" : self.max_iter,
        }

        #save data
        file = open(self.result_file, "a", newline='')
        writer = csv.writer(file)
        writer.writerow([n_folds, test_score, train_score, arch, str(fixed_params)]) #add features
        file.close()

        return

    def __log(self, message):

        #create log entry
        log_time : str = datetime.now()
        message = f"source_mlp,{log_time},{message}\n"

        #write log entry
        file_object = open(self.log_file, 'a')
        file_object.write(message)
        file_object.close()

        return

    def __get_results(self):

        self.results = pd.read_csv(self.result_file)

    def __single_model_confusion_matrix(self, test_df):

        #set labels
        self.conf_mat_labels = [0, 1]

        #test conf mat
        if test_df is not None:

            X_test = test_df.drop(labels = self.y_col, axis = 1)
            y_test = test_df[self.y_col]

            self.y_hat_test = self.model.predict(X_test)
            self.confussion_mat_test = sklearn.metrics.multilabel_confusion_matrix(y_test, self.y_hat_test, labels = self.conf_mat_labels)

        #train conf mat
        self.y_hat_train = self.model.predict(self.X_train)
        self.confussion_mat_train = sklearn.metrics.multilabel_confusion_matrix(self.y_train, self.y_hat_train, labels = self.conf_mat_labels)

        #valid conf mat
        self.y_hat_valid = self.model.predict(self.X_valid)
        self.confussion_mat_valid = sklearn.metrics.multilabel_confusion_matrix(self.y_valid, self.y_hat_valid, labels = self.conf_mat_labels)

        return

    def __single_model_score(self, test_df):

        if test_df is not None:
            X_test = test_df.drop(labels = self.y_col, axis = 1)
            y_test = test_df[self.y_col]

            test_score = self.model.score(X_test, y_test)

        else:
            test_score = None

        train_score : float = self.model.score(self.X_train, self.y_train)
        valid_score : float = self.model.score(self.X_valid, self.y_valid)

        self.single_model_scores = {
            "train_score" : train_score,
            "valid_score" : valid_score,
            "test_score" : test_score,
        }

        return

    def __single_model_split_v1(self, valid_frac):

        index = round(self.df.shape[0] * valid_frac)

        self.X_train = self.X.iloc[index:]
        self.y_train = self.y.iloc[index:]

        self.X_valid = self.X.iloc[:index]
        self.y_valid = self.y.iloc[:index]

        return

    def __single_model_split_v2(self, valid_frac):

        years = list(set(self.df["year"].to_list()))
        years.sort()

        n_years = int(len(years) * valid_frac)
        n_years_half = ((n_years % 2) + n_years) / 2 #round to even numbers and split in half

        #get target year list
        valid_years = years[round(((len(years) - 1) / 2) - n_years_half) : round((len(years) / 2 ) -1 )] + years[int(-n_years_half):]
        train_years = [year for year in years if year not in valid_years]

        #generate valid and train dfs
        df_valid = self.df[self.df["year"].isin(valid_years)]
        df_train =self.df[self.df["year"].isin(train_years)]

        #generate x and y
        self.X_train = df_train.drop(labels = self.y_col, axis = 1)
        self.y_train = df_train[self.y_col]

        self.X_valid = df_valid.drop(labels = self.y_col, axis = 1)
        self.y_valid = df_valid[self.y_col]

        del df_valid, df_train, valid_years, train_years #free up memory

        return

    def __test_df_prep(self, test_df):

        #cleanup
        unneded_cols = [col for col in test_df.columns.to_list() if col not in list(self.df.columns.to_list())]
        test_df.drop(labels = unneded_cols, axis = 1, inplace = True)

        #standardize with valid df values
        x_cols = test_df.columns.to_list(); x_cols.remove(self.y_col)
        test_df[x_cols] = (test_df[x_cols] - self.df_unstand[x_cols].mean()) / self.df_unstand[x_cols].std()

        return test_df

    def create_model(self, top_result : bool = False, valid_frac : float = 0.15, test_df : object = None, arch : list = None): #add features

        #split data
        #self.__single_model_split_v1(valid_frac = valid_frac)
        test_df = self.__test_df_prep(test_df)
        self.__single_model_split_v2(valid_frac = valid_frac)

        #get top performing values
        if top_result:

            df_top = self.results.sort_values(
                by = ["test_score", "train_score"],
                ascending = [False, False]
            )

            arch : str = df_top.iloc[0]["arch"]
            arch : list = [int(i) for i in arch.strip('][').split(', ')]
            del df_top

        #create and fit model
        #add random state

        self.model = mlp(
                activation = self.acitvation_func,
                solver = self.solver,
                alpha = self.alpha,
                max_iter = self.max_iter,
                hidden_layer_sizes = arch,
                random_state = self.random_state,
            )

        self.model.fit(X = self.X_train, y = self.y_train)

        #generate results
        self.__single_model_score(test_df)
        self.__single_model_confusion_matrix(test_df)

        return

In [27]:
mlpc_optim = MLPR(

    y_col = "t2m_cat_offset",
    df = df,
    run_optim = run_optim,
    data_folder = data_folder,

    #add params for network architecture

    n_jobs = 4 #n or cpu cores used, use at own risk of overheating and memory bottleneck
)

In [28]:
#to 10
mlpc_optim.results.sort_values(by = ["test_score"], ascending = False).head(25)

Unnamed: 0,n_folds,test_score,train_score,arch,fixed_parameters
3306,4,0.6421,0.8742,"[4, 3, 78, 13]","{'acitvation_func': 'relu', 'solver': 'adam', ..."
6470,4,0.6316,0.7767,"[9, 5, 13, 2]","{'acitvation_func': 'relu', 'solver': 'adam', ..."
3292,4,0.6237,0.87,"[4, 3, 52, 4]","{'acitvation_func': 'relu', 'solver': 'adam', ..."
5208,4,0.6211,0.9198,"[6, 2, 78, 52]","{'acitvation_func': 'relu', 'solver': 'adam', ..."
7161,4,0.6211,0.7391,"[13, 2, 9, 3]","{'acitvation_func': 'relu', 'solver': 'adam', ..."
8554,4,0.6211,0.845,"[26, 6, 6, 6]","{'acitvation_func': 'relu', 'solver': 'adam', ..."
4031,4,0.6211,0.6323,"[4, 78, 4, 3]","{'acitvation_func': 'relu', 'solver': 'adam', ..."
4720,4,0.6211,0.7608,"[5, 13, 3, 2]","{'acitvation_func': 'relu', 'solver': 'adam', ..."
4650,4,0.6184,0.7156,"[5, 9, 6, 2]","{'acitvation_func': 'relu', 'solver': 'adam', ..."
483,4,0.6184,0.8845,"[5, 26, 5]","{'acitvation_func': 'relu', 'solver': 'adam', ..."


In [29]:
#bottom 10
mlpc_optim.results.sort_values(by = ["test_score"], ascending = True).head(10)

Unnamed: 0,n_folds,test_score,train_score,arch,fixed_parameters
150,4,0.3711,0.5315,"[2, 6, 2]","{'acitvation_func': 'relu', 'solver': 'adam', ..."
1320,4,0.3711,0.5288,"[2, 4, 3, 2]","{'acitvation_func': 'relu', 'solver': 'adam', ..."
6241,4,0.3763,0.5855,"[9, 3, 5, 3]","{'acitvation_func': 'relu', 'solver': 'adam', ..."
9110,4,0.3816,0.5958,"[52, 2, 2, 2]","{'acitvation_func': 'relu', 'solver': 'adam', ..."
1771,4,0.3842,0.5442,"[2, 13, 13, 3]","{'acitvation_func': 'relu', 'solver': 'adam', ..."
2100,4,0.3895,0.5592,"[2, 78, 78, 2]","{'acitvation_func': 'relu', 'solver': 'adam', ..."
4071,4,0.3895,0.7611,"[4, 78, 13, 3]","{'acitvation_func': 'relu', 'solver': 'adam', ..."
2083,4,0.3895,0.547,"[2, 78, 26, 5]","{'acitvation_func': 'relu', 'solver': 'adam', ..."
2070,4,0.3921,0.7425,"[2, 78, 13, 2]","{'acitvation_func': 'relu', 'solver': 'adam', ..."
1391,4,0.3921,0.5956,"[2, 4, 52, 3]","{'acitvation_func': 'relu', 'solver': 'adam', ..."


In [30]:
mlpc_optim.results.loc[mlpc_optim.results["arch"] == "[2]", "n_layers"] = 1

In [31]:
#adding data for plotting the perfomrance
mlpc_optim.results["n_layers"] = None
mlpc_optim.results["n_nodes"] = None
mlpc_optim.results["n_node_layer_avg"] = None

for arch in mlpc_optim.results["arch"].to_list():

    #convert to list
    arch_list = eval(arch)

    #set values
    try:
        mlpc_optim.results.loc[mlpc_optim.results["arch"] == arch, "n_layers"]          = len(arch_list)
        mlpc_optim.results.loc[mlpc_optim.results["arch"] == arch, "n_nodes"]            = sum(arch_list)
        mlpc_optim.results.loc[mlpc_optim.results["arch"] == arch, "n_node_layer_avg"]  = sum(arch_list) / len(arch_list)

    #chat empty arch
    except ZeroDivisionError:
        continue

mlpc_optim.results.head(5)

In [None]:
fig = px.box(
    data_frame = mlpc_optim.results,
    y = "test_score",
    x = "n_layers",

    title = "fitting graph: hidden layers",
    width = 1000,
    height = 500,
    color_discrete_sequence = plt_style_s,
)

fig.show()


In [None]:
fig = px.scatter(
    data_frame = mlpc_optim.results,
    y = "test_score",
    x = "n_layers",
    color = "train_score",

    opacity = 0.2,
    trendline = "ols",

    title = "fitting graph: n layers",
    width = 1000,
    height = 500,
    color_continuous_scale = plt_style_c,
)

fig.update_traces(marker={'size': 15})

fig.show()

In [None]:
fig = px.scatter(
    data_frame = mlpc_optim.results,
    y = "test_score",
    x = "n_nodes",
    color = "train_score",

    opacity = 0.2,
    trendline = "ols",

    title = "fitting graph: n nodes",
    width = 1000,
    height = 500,
    color_continuous_scale = plt_style_c,
)

fig.show()

In [None]:
fig = px.scatter(
    data_frame = mlpc_optim.results,
    y = "test_score",
    x = "n_node_layer_avg",
    color = "train_score",

    opacity = 0.2,
    trendline = "ols",

    title = "fitting graph: n node per layer average",
    width = 1000,
    height = 500,
    color_continuous_scale = plt_style_c,
)

fig.show()

Finding:
- no clear tren in architecure which can be spotted
- further optimization for a fourth layer needed?

A closer look at three different architecutres and their non corss validation performance

In [None]:
model_0 = MLPR(
    y_col = "t2m_cat_offset",
    df = df,
    run_optim = False,
    data_folder = data_folder,
    n_jobs = 1 #n or cpu cores used, use at own risk of overheating and memory bottleneck
)

model_1 = MLPR(
    y_col = "t2m_cat_offset",
    df = df,
    run_optim = False,
    data_folder = data_folder,
    n_jobs = 1 #n or cpu cores used, use at own risk of overheating and memory bottleneck
)

model_2 = MLPR(
    y_col = "t2m_cat_offset",
    df = df,
    run_optim = False,
    data_folder = data_folder,
    n_jobs = 1 #n or cpu cores used, use at own risk of overheating and memory bottleneck
)

model_3 = MLPR(
    y_col = "t2m_cat_offset",
    df = df,
    run_optim = False,
    data_folder = data_folder,
    n_jobs = 1 #n or cpu cores used, use at own risk of overheating and memory bottleneck
)

In [None]:

model_0.create_model(top_result = True, test_df = df_test)
model_1.create_model(arch = [3, 13, 6], test_df = df_test)
model_2.create_model(arch = [4, 3, 6], test_df = df_test)
model_3.create_model(arch = [26,26,26,26], test_df = df_test)

Findings:
- very inconsisten across the board
- no clear pattern can be found
- not the right model for the job

In [None]:
def plot_confusions_mat(model, model_i):

    print(f"\n\nmodel_{model_i}")

    for key in model.single_model_scores.keys():
        print(f"{key}:\t{round(model.single_model_scores[key],3)}")


    label_translation : dict = {
        0 : "below",
        1 : "above",
    }

    for i in range(len(model.conf_mat_labels)):

        #get data
        mat = model.confussion_mat_test[i]
        cat_num = model.conf_mat_labels[i]
        cat_str = f"model_{model_i}: {label_translation[cat_num]}"

        fig = px.imshow(
            mat,
            title = f"test set {cat_str}",
            text_auto = True,
            color_continuous_scale = plt_style_c,
            )

        fig.show()

In [None]:
list(map(plot_confusions_mat, [model_0, model_1, model_2, model_3], [0,1,2,3]))



model_0
train_score:	0.672
valid_score:	0.704
test_score:	0.438




model_1
train_score:	0.598
valid_score:	0.62
test_score:	0.438




model_2
train_score:	0.664
valid_score:	0.732
test_score:	0.438




model_3
train_score:	0.713
valid_score:	0.718
test_score:	0.438


[None, None, None, None]

Findings:
- Models do not work
- Classify everything as either above or below