# 6 Modelling - Support Vector Machines
- paper: https://www.eecis.udel.edu/~shatkay/Course/papers/USVMHeartBeatClassifier.pdf
- scikit: https://scikit-learn.org/stable/modules/svm.html

Pipelines: https://c3.ai/glossary/machine-learning/machine-learning-pipeline/

In [1]:
#general
import pandas as pd
import numpy as np
import os
import plotly.express as px
from datetime import datetime
import csv
import itertools

#ml
import sklearn
from sklearn.model_selection import TimeSeriesSplit as tsp
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV as gscv
from sklearn.svm import NuSVC as nusvc

#folder that RFC uses for the optimisation log and the results csv
data_folder : str = "data"

#plot styles
plt_style_c = px.colors.sequential.haline #complex
plt_style_s = px.colors.diverging.Portland #simple

#decide whether RFC re-runs the hyper-parameter optimisation (and appends to the results csv) on instantiation;
#if False, RFC only prepares the data and reads the previously saved results
run_optim : bool = False

In [2]:
import warnings
#silence every warning category for the rest of the notebook
#NOTE(review): a blanket 'ignore' also hides warnings raised while fitting the models - consider narrowing the filter
warnings.filterwarnings('ignore')

## 6.1 NuSVC Base model

In [3]:
#load the prepared data set from the configured data folder (instead of a second hard-coded "data" literal);
#the csv column "index" becomes the row index
df_main = pd.read_csv(os.path.join(data_folder, "df.csv"), index_col = "index")
df_main.shape

(527, 27)

In [4]:
#hold out n_years_test years, spread over the whole period, as the test set
n_years_test = 4
years = sorted(set(df_main["year"].to_list()))

#positions at 1/n, 2/n, ... n/n of the sorted year list (zero-based, hence the -1)
indexes = [int(len(years) * (step / n_years_test)) - 1 for step in range(1, n_years_test + 1)]

#take the year just before each position to get a better distributed data set
test_years = [years[position - 1] for position in indexes]

#rows of the selected years form df_test, all remaining rows form df
df_test = df_main.loc[df_main["year"].isin(test_years)]
df = df_main.loc[~df_main["year"].isin(test_years)]

print(f"test years:\t{test_years}")
print(f"test set:\t{df_test.shape[0]}\ntrain set:\t{df.shape[0]}")

test years:	[1988, 1999, 2010, 2021]
test set:	48
train set:	479


In [5]:

class RFC():
    """Hyper-parameter search and single-model evaluation around ``NuSVC``.

    On construction the class either runs the full optimisation (one pass on a
    fixed train/validation split and one pass with time-series cross-validation,
    every score being appended to ``nusvc_results.csv``) or only prepares the
    data. In both cases the results file is then loaded into ``self.results``.

    NOTE: the class name and the ``source_rfc`` log prefix are kept unchanged so
    that existing callers and log readers keep working, although the wrapped
    estimator is NuSVC.

    Args:
        y_col: name of the target column in ``df``.
        df: data frame with the features, the target and a ``year`` column.
        run_optim: if True, run the optimisation before loading the results.
        data_folder: folder holding ``optim_log.txt`` and ``nusvc_results.csv``.
        n_jobs: number of parallel jobs handed to ``cross_validate`` and
            ``GridSearchCV``.
    """

    def __init__ (self, y_col : str, df : object, run_optim : bool, data_folder : str, n_jobs : int = 2):

        #set base information
        self.y_col : str            = y_col
        self.df : object            = df
        self.run_optim : bool       = run_optim
        self.n_jobs : int           = n_jobs

        #fixed params
        self.n_folds : int          = 4
        self.random_state : int     = 42

        #set saving infos
        self.log_file : str         = os.path.join(data_folder, "optim_log.txt")
        self.result_file : str      = os.path.join(data_folder, "nusvc_results.csv")

        #run optim if asked, otherwise only prepare X, y and the cv splitter
        if run_optim:
            self.__run_optim()
        else:
            self.__df_split()

        #read results file (raises FileNotFoundError if no optimisation has been saved yet)
        self.__get_results()

    def __run_optim(self):
        """Run both optimisation passes and log start and end to the log file."""

        self.__generate_param_list()
        self.__log(message = "NuSVC optim started")
        self.__df_split()
        self.__normal_valid()
        self.__cross_valid()
        #the original logged "started" a second time here
        self.__log(message = "NuSVC optim finished")

    def __generate_param_list(self):
        """Store in ``self.param_list`` one dict (kernel, nu, degree) per model to try."""

        #set values to optimize for with the model
        kernels     = ["linear", "rbf", "sigmoid"]
        nus         = [i / 10 for i in range(1,10)]
        degrees     = list(range(1,6))

        #degree is only varied for the poly kernel; the other kernels get a placeholder degree of 0
        combinations = list(itertools.product(kernels, nus, [0])) + list(itertools.product(["poly"], nus, degrees))

        #convert the tuples to dictionaries with keys 'kernel', 'nu' and 'degree'
        self.param_list  = [dict(kernel=kernel, nu=nu, degree=int(degree)) for (kernel, nu, degree) in combinations]

    def __drop_zero_columns(self):
        """Remove every column of ``self.df`` whose sum equals 0.

        Works on ``self.df`` (the original read the notebook-global ``df``) and
        rebinds ``self.df`` to a new frame so the caller's frame is not mutated.
        """

        #NOTE(review): a column whose positive and negative values cancel out also sums to 0
        #and is dropped - confirm this cannot happen for the features used
        zero_cols = [column for column, total in self.df.sum().items() if total == 0]

        self.df = self.df.drop(labels = zero_cols, axis = 1)

    def __df_split(self):
        """Drop zero columns, split ``self.df`` into ``self.X`` / ``self.y`` and create the cv splitter."""

        self.__drop_zero_columns()

        #create x and y
        self.X = self.df.drop(labels = self.y_col, axis = 1)
        self.y = self.df[self.y_col]

        #instantiate time splitter
        self.cv = tsp(n_splits=self.n_folds)

    def __normal_valid(self):
        """Fit every parameter set on the fixed train split and save train / validation accuracy (n_folds = 0)."""

        print("Optimizing with without cross validaiton")

        self.__single_model_split_v2(valid_frac = 0.15)

        n_params = len(self.param_list)

        for position, param in enumerate(self.param_list, start = 1):
            print(f"Progress: {round(position / n_params * 100)}% \t{param}", end = "\r")

            #create the model
            model = nusvc(
                **param,
                random_state = self.random_state,
            )

            #fit the model
            model.fit(self.X_train, self.y_train)

            #get model scores
            train_score : float = model.score(self.X_train, self.y_train)
            valid_score : float = model.score(self.X_valid, self.y_valid)

            self.__save_result(n_folds = 0, params = param, test_score = valid_score, train_score = train_score)

    def __cross_valid(self):
        """Cross-validate every parameter set with the time-series splitter and save the mean scores."""

        print("Optimizing with with cross validaiton")

        n_params = len(self.param_list)

        for position, param in enumerate(self.param_list, start = 1):
            print(f"Progress: {round(position / n_params * 100)}%\t{param}", end = "\r")

            model = nusvc(
                **param,
                random_state = self.random_state
            )

            result = cross_validate(
                estimator = model,
                X = self.X,
                y = self.y,
                cv = self.cv,
                return_train_score = True,
                n_jobs = self.n_jobs, #previously stored but never used
            )

            train_score = round(np.mean(result["train_score"]),4)
            test_score = round(np.mean(result["test_score"]),4)

            self.__save_result(n_folds = self.n_folds, params = param, test_score = test_score, train_score = train_score)

    def __save_result(self, n_folds, params, test_score, train_score):
        """Append one result row to the results csv, writing the header first if the file is new."""

        write_header = not os.path.isfile(self.result_file)

        #context manager closes the file even if writing fails
        with open(self.result_file, "a", newline='') as file:
            writer = csv.writer(file)

            if write_header:
                writer.writerow(["n_folds", "kernel", "degree", "nu", "test_score", "train_score"])

            writer.writerow([n_folds, params["kernel"], params["degree"], params["nu"], test_score, train_score])

    def __log(self, message):
        """Append ``source_rfc,<timestamp>,<message>`` as one line to the log file."""

        #create log entry
        log_time : datetime = datetime.now()
        entry = f"source_rfc,{log_time},{message}\n"

        #write log entry
        with open(self.log_file, 'a') as file_object:
            file_object.write(entry)

    def __get_results(self):
        """Load the results csv into ``self.results``."""

        self.results = pd.read_csv(self.result_file)

    def __split_test_frame(self, test_df):
        """Return (X_test, y_test) with X_test restricted to the columns the model was trained on."""

        #selecting by the training columns keeps predict() working when columns
        #(e.g. dropped zero columns) exist in test_df but not in the training data
        X_test = test_df[list(self.X_train.columns)]
        y_test = test_df[self.y_col]

        return X_test, y_test

    def __single_model_confusion_matrix(self, test_df):
        """Store per-label confusion matrices of ``self.model`` for train, valid and (optionally) test data."""

        #set labels
        self.conf_mat_labels = [0, 1]

        #test conf mat
        if test_df is not None:

            X_test, y_test = self.__split_test_frame(test_df)

            self.y_hat_test = self.model.predict(X_test)
            self.confussion_mat_test = sklearn.metrics.multilabel_confusion_matrix(y_test, self.y_hat_test, labels = self.conf_mat_labels)

        #train conf mat
        self.y_hat_train = self.model.predict(self.X_train)
        self.confussion_mat_train = sklearn.metrics.multilabel_confusion_matrix(self.y_train, self.y_hat_train, labels = self.conf_mat_labels)

        #valid conf mat
        self.y_hat_valid = self.model.predict(self.X_valid)
        self.confussion_mat_valid = sklearn.metrics.multilabel_confusion_matrix(self.y_valid, self.y_hat_valid, labels = self.conf_mat_labels)

    def __single_model_score(self, test_df):
        """Store train / valid / test accuracy of ``self.model`` in ``self.single_model_scores`` (test is None without test_df)."""

        if test_df is not None:
            X_test, y_test = self.__split_test_frame(test_df)
            test_score = self.model.score(X_test, y_test)

        else:
            test_score = None

        train_score : float = self.model.score(self.X_train, self.y_train)
        valid_score : float = self.model.score(self.X_valid, self.y_valid)

        self.single_model_scores = {
            "train_score" : train_score,
            "valid_score" : valid_score,
            "test_score" : test_score,
        }

    def __single_model_split_v1(self, valid_frac):
        """[deprecated] use v2 instead"""
        #takes the first valid_frac share of the rows as validation set, the rest as training set

        index = round(self.df.shape[0] * valid_frac)

        self.X_train = self.X.iloc[index:]
        self.y_train = self.y.iloc[index:]

        self.X_valid = self.X.iloc[:index]
        self.y_valid = self.y.iloc[:index]

    def __single_model_split_v2(self, valid_frac):
        """Split ``self.df`` by year into train and validation parts.

        The validation years are taken from two places: a block just before the
        middle of the sorted year list and a block at its end.
        """

        years = sorted(set(self.df["year"].to_list()))

        n_years = int(len(years) * valid_frac)
        n_years_half = (n_years + (n_years % 2)) // 2 #round up to an even number and split in half

        #block before the middle of the period
        mid_start = round(((len(years) - 1) / 2) - n_years_half)
        mid_stop = round((len(years) / 2) - 1)
        mid_block = years[mid_start:mid_stop]

        #block at the end of the period; years[-0:] would be the whole list, so guard the zero case
        tail_block = years[-n_years_half:] if n_years_half > 0 else []

        valid_years = mid_block + tail_block

        #every row that is not in a validation year belongs to the training set
        is_valid_row = self.df["year"].isin(valid_years)
        df_valid = self.df[is_valid_row]
        df_train = self.df[~is_valid_row]

        #generate x and y
        self.X_train = df_train.drop(labels = self.y_col, axis = 1)
        self.y_train = df_train[self.y_col]

        self.X_valid = df_valid.drop(labels = self.y_col, axis = 1)
        self.y_valid = df_valid[self.y_col]

    def create_model(self, param : dict , valid_frac : float = 0.15, test_df : object = None, params : dict = None):
        """Fit one NuSVC model and store its scores and confusion matrices.

        Args:
            param: keyword arguments for ``NuSVC`` (e.g. kernel, nu, degree).
            valid_frac: share of the years used for validation.
            test_df: optional frame with the same columns as the training data;
                if given, test score and test confusion matrices are computed.
            params: unused, kept only for backward compatibility.
        """

        #split data
        self.__single_model_split_v2(valid_frac = valid_frac)

        #create and fit model
        self.model = nusvc(
            random_state = self.random_state,
            **param,
        )

        self.model.fit(X = self.X_train, y = self.y_train)

        #generate results
        self.__single_model_score(test_df)
        self.__single_model_confusion_matrix(test_df)

    def run_grid_search(self, fixed_param : dict, param_grid : dict = None):
        """Run a ``GridSearchCV`` over ``param_grid`` with ``fixed_param`` held constant.

        Args:
            fixed_param: keyword arguments passed unchanged to ``NuSVC``.
            param_grid: ``{"param_name": [values]}``; defaults to an empty grid,
                which evaluates only the fixed parameters (the previous behaviour).

        Returns:
            The best parameter combination found.
        """

        if param_grid is None:
            param_grid = {}

        #create estimator
        model = nusvc(
            random_state = self.random_state,
            **fixed_param,
        )

        grid_search = gscv(estimator = model, param_grid = param_grid, cv = self.cv, n_jobs = self.n_jobs)
        grid_search.fit(self.X, self.y)

        # Print the best parameters and score
        print(f"Best parameters: {grid_search.best_params_}")
        print(f"Best score: {grid_search.best_score_}")

        return grid_search.best_params_

    def clean_results (self):
        """Remove all same-guess answers from the results table.

        A row counts as same guess when its rounded ``test_score`` equals the
        mean of the validation target or one minus that mean.
        NOTE(review): this equals the accuracy of always predicting one class
        only if the target is coded 0/1 - confirm.
        """

        self.__single_model_split_v2(valid_frac = 0.15)

        score_cols = ["train_score", "test_score"]
        self.results[score_cols] = self.results[score_cols].round(2)

        valid_mean = self.y_valid.describe()["mean"].round(2)
        #round the complement as well: e.g. 1 - 0.67 is 0.32999999999999996 and would never match a rounded score
        same_guess_prop_valid = [valid_mean, (1 - valid_mean).round(2)]

        #clean up results df
        self.results["same_guess"] = 0
        self.results.loc[self.results["test_score"].isin(same_guess_prop_valid), "same_guess"] = 1

        self.results = self.results.loc[self.results["same_guess"] != 1].sort_values(by = "test_score", axis = 0, ascending = False)

        print(same_guess_prop_valid)


In [6]:
#build the optimiser on the training frame; with run_optim False it only prepares the data and loads the saved results
nusvc_optim = RFC(

    y_col = "t2m_cat_offset",
    df = df,
    run_optim = run_optim,
    data_folder = data_folder,

    n_jobs = 4 #requested number of cpu cores (NOTE(review): confirm RFC forwards this to scikit-learn), use at own risk of overheating and memory bottleneck
)

## 6.2 NuSVC results

In [7]:
#round the scores, drop the rows flagged as same guess and print the two same-guess accuracies that were filtered on
nusvc_optim.clean_results()

[0.63, 0.37]


In [8]:
#best 20 parameter sets of the pass without cross-validation (n_folds == 0), highest validation score first
nusvc_optim.results[nusvc_optim.results["n_folds"] == 0].sort_values(
    ["test_score", "train_score"], ascending = False
).head(20)

Unnamed: 0,n_folds,kernel,degree,nu,test_score,train_score,same_guess
46,0,poly,5,0.4,0.77,0.65,0
63,0,poly,2,0.8,0.75,0.65,0
7,0,linear,0,0.8,0.75,0.64,0
22,0,sigmoid,0,0.5,0.75,0.64,0
21,0,sigmoid,0,0.4,0.75,0.63,0
6,0,linear,0,0.7,0.73,0.64,0
8,0,linear,0,0.9,0.73,0.64,0
17,0,rbf,0,0.9,0.72,0.64,0
52,0,poly,1,0.6,0.72,0.62,0
23,0,sigmoid,0,0.6,0.72,0.62,0


In [9]:
#best 20 parameter sets of the cross-validated pass (n_folds != 0), highest mean test score first
nusvc_optim.results[nusvc_optim.results["n_folds"] != 0].sort_values(
    ["test_score", "train_score"], ascending = False
).head(20)

Unnamed: 0,n_folds,kernel,degree,nu,test_score,train_score,same_guess
94,4,sigmoid,0,0.5,0.62,0.61,0
93,4,sigmoid,0,0.4,0.61,0.61,0
85,4,rbf,0,0.5,0.61,0.6,0
109,4,poly,1,0.3,0.61,0.57,0
92,4,sigmoid,0,0.3,0.61,0.54,0
119,4,poly,1,0.5,0.6,0.63,0
104,4,poly,1,0.2,0.6,0.58,0
83,4,rbf,0,0.3,0.6,0.58,0
101,4,poly,3,0.1,0.6,0.58,0
84,4,rbf,0,0.4,0.59,0.61,0


In [10]:
#worst 20 parameter sets over both passes, lowest test score first
nusvc_optim.results.sort_values(
    ["test_score", "train_score"], ascending = True
).head(20)

Unnamed: 0,n_folds,kernel,degree,nu,test_score,train_score,same_guess
26,0,sigmoid,0,0.9,0.25,0.37,0
25,0,sigmoid,0,0.8,0.28,0.37,0
4,0,linear,0,0.5,0.28,0.4,0
24,0,sigmoid,0,0.7,0.3,0.36,0
3,0,linear,0,0.4,0.3,0.41,0
35,0,poly,4,0.2,0.31,0.38,0
45,0,poly,4,0.4,0.31,0.41,0
40,0,poly,4,0.3,0.31,0.42,0
31,0,poly,5,0.1,0.32,0.43,0
2,0,linear,0,0.3,0.34,0.41,0


In [11]:
#same view as before: best 20 parameter sets without cross-validation (n_folds == 0)
nusvc_optim.results[nusvc_optim.results["n_folds"] == 0].sort_values(
    ["test_score", "train_score"], ascending = False
).head(20)

Unnamed: 0,n_folds,kernel,degree,nu,test_score,train_score,same_guess
46,0,poly,5,0.4,0.77,0.65,0
63,0,poly,2,0.8,0.75,0.65,0
7,0,linear,0,0.8,0.75,0.64,0
22,0,sigmoid,0,0.5,0.75,0.64,0
21,0,sigmoid,0,0.4,0.75,0.63,0
6,0,linear,0,0.7,0.73,0.64,0
8,0,linear,0,0.9,0.73,0.64,0
17,0,rbf,0,0.9,0.72,0.64,0
52,0,poly,1,0.6,0.72,0.62,0
23,0,sigmoid,0,0.6,0.72,0.62,0


In [12]:
#test_score against train_score for every row of the results table, coloured by kernel and sized by nu
fig = px.scatter(
    nusvc_optim.results,
    x = "train_score", y = "test_score",
    color = "kernel", size = "nu",
    title = "validation vs train accuracy nusvc",
    color_discrete_sequence = plt_style_c,
    width = 1000, height = 500,
    #optional: labels = {"test_score" : "validation_acc"}
)

fig.show()

In [13]:
#distribution of test_score for every value of nu
fig = px.box(
    nusvc_optim.results,
    x = "nu", y = "test_score",
    title = "fitting graph: nu",
    color_discrete_sequence = plt_style_s,
    width = 1000, height = 500,
    #optional: labels = {"test_score" : "validation_acc"}
)

fig.show()

In [14]:
#distribution of test_score for every polynomial degree (degree 0 holds the non-poly kernels)
fig = px.box(
    nusvc_optim.results,
    x = "degree", y = "test_score",
    title = "fitting graph: poly degree",
    color_discrete_sequence = plt_style_s,
    width = 1000, height = 500,
    #optional: labels = {"test_score" : "validation_acc"}
)

fig.show()

In [15]:
#distribution of test_score per kernel, to compare kernel performance
fig = px.box(
    nusvc_optim.results,
    x = "kernel", y = "test_score",
    title = "fitting graph: kernel",
    color_discrete_sequence = plt_style_s,
    width = 1000, height = 500,
    #optional: labels = {"test_score" : "validation_acc"}
)

fig.show()

## 6.3 Test set

In [16]:
#create top models from each kernel

class Build():
    """Fit, score and plot the best model of every kernel found in the results table.

    All methods are static: they work on the optimiser object passed in and can
    be called on the class (``Build.main(...)``) or on an instance.
    """

    @staticmethod
    def main(nusvc_optim, df_test):
        """For every kernel in ``nusvc_optim.results`` fit its best model, print the scores and plot the test matrices."""

        #get all unique kernels
        kernels = nusvc_optim.results["kernel"].unique().tolist()

        #iterate through kernels
        for kernel in kernels:

            print(f"Model: {kernel}")

            params : dict = Build.get_params(nusvc_optim, kernel)
            nusvc_optim.create_model(test_df = df_test, param = params)

            print(nusvc_optim.single_model_scores)
            Build.plot_mat(nusvc_optim, kernel)

    @staticmethod
    def get_params(nusvc_optim, kernel):
        """Return the model parameters of the best non-cross-validated result for ``kernel``.

        Raises:
            IndexError: if the results hold no row with n_folds == 0 for this kernel.
        """

        results = nusvc_optim.results

        #rows of this kernel from the pass without cross-validation, best score first
        candidates = results.loc[
            (results["n_folds"] == 0) &
            (results["kernel"] == kernel)].sort_values(
                        by = ["test_score", "train_score"],
                        ascending = [False, False]
                    )

        params : dict = candidates.iloc[0].to_dict()

        #remove the columns that are not model parameters; "same_guess" only exists
        #after clean_results() has run, so a missing key is not an error
        for key in ["n_folds", "test_score", "train_score", "same_guess"]:
            params.pop(key, None)

        return params

    @staticmethod
    def plot_mat(nusvc_optim, kernel):
        """Show one heatmap per label of the test confusion matrices stored on ``nusvc_optim``."""

        label_translation : dict = {
            -1 : "below",
            0 : "avg",
            1 : "above",
        }

        #confussion_mat_test holds one matrix per entry of conf_mat_labels, in the same order
        for cat_num, mat in zip(nusvc_optim.conf_mat_labels, nusvc_optim.confussion_mat_test):

            cat_str = label_translation[cat_num]

            fig = px.imshow(
                mat,
                title = f"Model (test): {kernel} - {cat_str}",
                text_auto = True,
                color_continuous_scale = plt_style_c,
                )

            fig.show()


In [17]:
#fit the best model of every kernel, print its train / valid / test scores and plot the test confusion matrices
Build.main(nusvc_optim = nusvc_optim, df_test = df_test)

Model: poly
{'train_score': 0.6495098039215687, 'valid_score': 0.7746478873239436, 'test_score': 0.5}


Model: sigmoid
{'train_score': 0.6372549019607843, 'valid_score': 0.7464788732394366, 'test_score': 0.5208333333333334}


Model: linear
{'train_score': 0.6421568627450981, 'valid_score': 0.7464788732394366, 'test_score': 0.4791666666666667}


Model: rbf
{'train_score': 0.6397058823529411, 'valid_score': 0.7183098591549296, 'test_score': 0.5}
