In [1]:
from gaitlink.data import load_mobilised_matlab_format, get_all_lab_example_data_paths
from sklearn import metrics
import numpy as np
import matplotlib.pyplot as plt

from mlxtend.plotting import plot_decision_regions
from sklearn.decomposition import PCA

# ARNE: String enum should be available in the standard library
from enum import StrEnum
# from strenum import StrEnum
from typing import Union

import pandas as pd
from sklearn.base import ClassifierMixin
from sklearn.linear_model import Lasso
from sklearn.utils.validation import check_is_fitted
from tpcp import Algorithm

import os
import joblib

from sklearn import svm
from sklearn import neighbors
from sklearn.ensemble import RandomForestClassifier

from gaitlink.data import LabExampleDataset


In [2]:
# ---------------------------------------------
#           Load some example data
# ---------------------------------------------

dataset = LabExampleDataset(reference_system="INDIP")
datapoint = dataset.get_subset(cohort="HA", participant_id="001", test="Test11", trial="Trial1")
imu_data = datapoint.data["LowerBack"]

ref_data = datapoint.reference_parameters_
ref_walking_bouts = ref_data.walking_bouts
sampling_rate_hz = datapoint.sampling_rate_hz
sampling_rate_hz_ref = datapoint.reference_sampling_rate_hz_

ref_ics = ref_data.initial_contacts

Segment the data according to Gait Sequences.

In [3]:
# ok, we can now write a function to iterate through all the walking bouts (or gait sequences (GS)) and extract the reference IC locations and LR labels, relative to the start of each walking bout.
# also, keep in mind that event_list is no longer necessary here..

data_list = []
ic_list = []
label_list = []

for gs in range(len(ref_walking_bouts)):
    gs_start = ref_walking_bouts.iloc[gs].start
    gs_end = ref_walking_bouts.iloc[gs].end
    
    data_list.append(imu_data.iloc[gs_start : gs_end].reset_index(drop = True))
    ic_list.append(ref_ics.loc[ref_ics.index.get_level_values('wb_id') == gs + 1, ['ic']].reset_index(drop = True) - gs_start) 
    label_list.append(ref_ics.loc[ref_ics.index.get_level_values('wb_id') == gs + 1, ['lr_label']].reset_index(drop = True))

# McCamley Test

In [6]:
from gaitlink.lr_detection._lr_detect_McCamley import McCamleyLRDetection

my_algo = McCamleyLRDetection()
my_algo.detect(data_list[0], ic_list[0])

my_algo.ic_lr

# then, you can simply loop through all the gait sequences. This 


Unnamed: 0,predicted_lr_label
0,Left
1,Left
2,Left
3,Right
4,Left
5,Right
6,Left


In [8]:
label_list[0]

# this works, but is misclassifying the second IC.

Unnamed: 0,lr_label
0,Left
1,Right
2,Left
3,Right
4,Left
5,Right
6,Left


In [7]:
import os


class PretrainedModel(StrEnum):
    """
    Enum class for the pretrained models.
    """
    # ARNE: I think the pretrained Models that Martin had all used the same ML Algo, but where trained on different datasets
    # ALEX: Correct, but the reference paper states all of them, so I just assumed that the other options might be available as well.
    svm_linear = "svm_linear"
    svm_rbf = "svm_rbf"
    knn = "knn"
    rfc = "rfc"
    
    @staticmethod
    def load_pretrained_model(model_name):
        
        if model_name == PretrainedModel.svm_linear:
            # model_path = "C:\\Users\\mer20as\\Documents\\PhD_Repos\\gaitlink\\gaitlink\\lr_detection\\pretrained_models\\msproject_ms_model.gz"
            base_dir = Path(os.getenv('MODEL_DIR', './pretrained_models'))
            model_path = base_dir / 'msproject_ms_model.gz'
            return joblib.load(model_path)
    
        # ARNE: These are not really pretrained models right? They are just optimized Hyper Paras
        # ALEX: Yes, I have put them here for convenience, since I did not have the other pre-trained models.
        elif model_name == "svm_rbf":
            return svm.SVC(kernel='rbf',
                           C=100, gamma=0.001,
                           probability=True)
            
        elif model_name == PretrainedModel.knn:
            return neighbors.KNeighborsClassifier(n_neighbors = 5)
        
        elif model_name == PretrainedModel.rfc:
            return RandomForestClassifier(n_estimators = 100,
                                          max_depth = 2,
                                          random_state = 0)
        else:
            raise NotImplementedError("The model specified is not supported yet.")
        
        
class BaseLRDetection(Algorithm):
    lr_list_: list
    
    def detect(self, data: pd.DataFrame):
        raise NotImplementedError

A couple of things to remember:
* Optimization of the `parameters` specified in the `__init__` of the the algorithm is performed via _internal_ optimization implemented in a `self_optimize` method.
* There should also be an option to perform _external_ optimization using the `GridSearch` wrapper from `tpcp`.


    

In [8]:
from sklearn.exceptions import NotFittedError
from gaitlink.data_transform import ButterworthFilter

class UllrichLRDetection(BaseLRDetection):
    # ARNE: Having this here is just for convenience -> This way users don't need to import another thing, but can just do UllrichLRDetection.PRETRAINED_MODEL. ... to get the models.
    PRETRAINED_MODEL = PretrainedModel # why is this here???
    
    def __init__(self, model: Union[PretrainedModel, ClassifierMixin] = PretrainedModel.svm_linear): # ok, so the default model is model_1
        
        self.model = model
        
        # I don't really think that additional parameters are required here. If you are using a pretrained model, then you should not be able to change the model, i.e. you should not provide a model parameter separately.
        # also, sklearn models can be assigned with different parameters after initialization using the _set_params method. Then you can just provide a dictionary of the required parameters.
        # self.param_1 = param_1
        # self.param_2 = param_2...


        # ARNE: Move the model check into the action method. 
        # ALEX: I think that this should actually be here, since PretrainedModel class does not have any set_params methods, users might attempt to change hyperparameters before doing anything else.
        # self.model = self._check_and_init_model(self.model)


    # ARNE: No need for that.
    # UllrichLRDetection.set_params(model__para_name=c) does the same thing. You just need to prefix all parameters with `model__`
    # def _set_params(self, param_dict: dict):
    #     """
    #     Set the parameters of the model.
    #     """
    #     self.model.set_params(**param_dict)
    #     return self
    
    @staticmethod
    def _check_and_init_model(model):
        """
        This is an utility function for checking the type of the given model and initializing it, if it is a PretrainedModel.
        If the model is an instance of ClassifierMixin, it simply returns the model.
        If the model is not an instance of either PretrainedModel or ClassifierMixin, it raises a TypeError.

        Parameters:
        model (PretrainedModel or ClassifierMixin): The model to check and possibly initialize.

        Returns:
        model (ClassifierMixin): The checked and possibly initialized model.

        Raises:
        TypeError: If the model is not an instance of either PretrainedModel or ClassifierMixin.
        """
        print('Model checking...')
        if isinstance(model, PretrainedModel):
            model = PretrainedModel.load_pretrained_model(model.value)
        if isinstance(model, ClassifierMixin):
            return model
        raise TypeError(f"Unknown model type {type(model)}. The model must be of type {PretrainedModel} or {ClassifierMixin}")
    
    # ARNE: sampling_rate -> sampling_rate_hz
    def detect(self, data: pd.DataFrame, ics: pd.Series, sampling_rate_hz: float = 100):
        """
        This function applies the model to the provided data to make predictions. 

        Parameters:
        data (pd.DataFrame): A dataframe representing data from a GS.
        ics (pd.Series): A series representing a list of ICs within a GS, 0-index at the start of the GS.
        sampling_rate_hz (float, optional): The sampling rate in Hz. Defaults to 100.

        Returns:
        self: The instance of the class with the predictions stored in the ic_lr attribute.

        Raises:
        RuntimeError: If the model is not fitted.
        """
        # Alex: I do not agree that we should be providing data and ics as instances of this class. They should be kept independent and only be called when fitting the model or when predicting. Other than that, we do not need to drag them around.
        self.sampling_rate_hz = sampling_rate_hz
        self.data = data
        self.ics = ics

        self.model = self._check_and_init_model(self.model)

        model = self.model
        
        try:
            check_is_fitted(model)
        except NotFittedError:
            raise RuntimeError("Model is not fitted. Call self_optimize before calling detect.")
        
        # preprocess data
        # TODO: for consistency, you might want to export a data frame here as well
        processed_data = self.extract_features(data, ics, sampling_rate_hz)
        
        # ALEX: It might be more elegant to only return the value of the prediction and nothing else.
        self.ic_lr = model.predict(processed_data)

        return self
    
    # ARNE: sampling_rate -> sampling_rate_hz
    def self_optimize(self, data: list[pd.DataFrame], ic_list: list[pd.Series], label_list: list[pd.Series], sampling_rate_hz: float = 100):
        """
        This function optimizes the model by fitting it on the provided data.

        Parameters:
        data (list[pd.DataFrame]): List of dataframes, each representing data from a GS. Data from multiple GSs is preferred as it results in a larger feature set.
        ic_list (list[pd.Series]): List of series, each representing a list of ICs within a GS, with sample 0 as the start of GS.
        label_list (list[pd.Series]): List of series, each representing a list of IC labels.
        sampling_rate_hz (float, optional): The sampling rate in Hz. Defaults to 100.

        Returns:
        self: The instance of the class with the optimized model.

        Raises:
        Exception: If there is an error during the concatenation of features or labels.
        """
        # data: data from either one GS, or for a list of all GSs. The latter is preferred, as it will results in a bigger feature set.
        # ic_list: a list of lists of ICs within a GS, sample 0 start of GS
        # label_list: a list of IC labels
        
        model = self.model

        # ARNE: I am not a big fan of doing convenience conversion. I would suggest to check if it is a list and force the user to make the conversion
        # ALEX: ok, I'll change that.
        if not isinstance(data, list):
            raise TypeError("data must be a list")
            
        if not isinstance(ic_list, list):
            raise TypeError("ic_list must be a list")
        
        # preprocess data
        features = []
        for gs in range(len(data)):
            features.append(self.extract_features(data[gs], ic_list[gs], sampling_rate_hz))

        # ARNE: What is the logic here? Do you need the except, if there is only 1 GS? If yes make that check explicit
        # ALEX: this has designed to handle both single and multiple GSs. if you really want to optimize your model, then you should provide a list of GSs (the more data, the better). However, if they only provide one GS, then it will still work. (normally it will fail during concatenation).
        try:    
            all_features = pd.concat(features, axis = 0, ignore_index = True)
        except:
            all_features = features
            
        try:
            all_labels = pd.concat(label_list, axis = 0, ignore_index = True)
        except:
            
            all_labels = label_list
        
        self.model = model.fit(all_features, all_labels)

        return self
    
    def extract_features(self, data, ics, sampling_rate_hz):
        """
        This function extracts features from the provided data. These are the filtered vertical and anterior-posterior angular velocity, plus their first and second derivative @ the location of the detected initial contacts.

        Parameters:
        data (pd.DataFrame): A dataframe representing data from a GS.
        ics (pd.Series): A corresponding series representing the initial contacts per GS.
        sampling_rate_hz (float): The signal sampling rate.

        Returns:
        feature_df (pd.DataFrame): A dataframe containing the extracted features.

        Note:
        The function currently uses fixed lower and upper bands for the Butterworth filter. 
        These parameters should be exposed in the future to allow for more flexibility. 
        The function also currently shifts the last IC by 3 samples to make the second derivative work. 
        This approach may need to be revised in the future.
        """

        # ARNE: We should probably expose these parameters, but than we also need to "store" them together with the pretrained models, as the models are specific to the preprocssing paras.
        #       -> We need to think about a good way to store models and other parameters together. I will come up with something
        lower_band = 0.5
        upper_band = 2
        # TODO: find a way of exposing these filtering parameters..

        # Apply Butterworth filtering and extract the first and second derivatives.
        butter_filter = ButterworthFilter(order=4, cutoff_freq_hz=(lower_band, upper_band), filter_type="bandpass")
        
        gyr = data[["gyr_x", "gyr_z"]].rename({"gyr_x": "v", "gyr_z": "ap"})
        gyr_filtered = butter_filter.filter(gyr, sampling_rate_hz = sampling_rate_hz).filtered_data_
        gyr_diff = gyr_filtered.diff()
        gyr_diff_2 = gyr_diff.diff()
        signal_paras = pd.concat({"filtered": gyr_filtered, "gradient": gyr_diff, "diff_2": gyr_diff_2}, axis=1)
        # Squash the multi index
        signal_paras.columns = ["_".join(c) for c in signal_paras.columns]

        ics -= 1
        # shift the last IC by 3 samples to make the second derivative work
        ics[ics < 2] = 2

        feature_df = signal_paras.loc[ics['ic'].values.tolist()]
  
        
        return feature_df

#### Model initialisation

In [9]:
# this is option 1, loading a pretrained model, but how about specifying a custom model?
# my_algo = UllrichLRDetection(model = PretrainedModel.svm_linear)

# this is option 2, specifying a custom model.
# my_algo = UllrichLRDetection(model = neighbors.KNeighborsClassifier(n_neighbors = 3))
my_algo = UllrichLRDetection(model = svm.SVC())

# Note that all models must be sklearn models for this to work.

# TODO: when using pre-trained models, the following inconsistency warning is expected: InconsistentVersionWarning: Trying to unpickle estimator SVC from version 0.23.1 when using version 1.3.2. This might lead to breaking code or invalid results. Use at your own risk.
# TODO: How can we avoid this?
my_algo.model


#### How ca we change model's hyperparameters?

In [10]:
# This is how you might change hyperparameters, nothing that this only works when using custom models.
# For PretrainedModels, this will only work once an action method has been triggered (i.e. my_algo.detect), which is expected, as users should not attempt changing pretrained models hyperparameters.

my_algo.set_params(model__C=25, model__gamma=0.002)

# Or
# my_paras = {'model__C': 25, 'model__gamma': 0.002, 'model__kernel': 'linear'}
# my_algo.set_params(**my_paras)

my_algo.model

#### How can we get the model's hyperparameters?

In [11]:
my_algo.model.get_params()

{'C': 25,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 0.002,
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [94]:
# ARNE: another option might be:
my_algo.get_params()

{'model__C': 25,
 'model__break_ties': False,
 'model__cache_size': 200,
 'model__class_weight': None,
 'model__coef0': 0.0,
 'model__decision_function_shape': 'ovr',
 'model__degree': 3,
 'model__gamma': 0.002,
 'model__kernel': 'rbf',
 'model__max_iter': -1,
 'model__probability': False,
 'model__random_state': None,
 'model__shrinking': True,
 'model__tol': 0.001,
 'model__verbose': False,
 'model': SVC(C=25, gamma=0.002)}

#### Using the model for L/R detection

In [10]:
my_algo.detect(data = data_list[0],
               ics = ic_list[0],
               sampling_rate_hz = sampling_rate_hz)

# this will throw an error, because the model is not fitted yet.

RuntimeError: Model is not fitted. Call self_optimize before calling detect.

In [12]:
# fit the model
my_algo.self_optimize(data = data_list,
                      ic_list = ic_list,
                      label_list = label_list,
                      sampling_rate_hz = sampling_rate_hz)

  y = column_or_1d(y, warn=True)


UllrichLRDetection(model=SVC(C=25, gamma=0.002))

In [13]:
my_algo.detect(data = data_list[0], ics = ic_list[0], sampling_rate_hz = sampling_rate_hz)
# ok, so this seems to be working now...
# an alternative way would be to either do the processing before feeding the data_list above across all walking bouts, or allowing a list the be handled directly in the method, as in self_optimize.

Model checking...


UllrichLRDetection(model=SVC(C=25, gamma=0.002))

In [217]:
my_algo.ic_lr

array(['Left', 'Right', 'Left', 'Right', 'Left', 'Right', 'Left'],
      dtype=object)

In [97]:
my_algo.ic_lr # maybe this should be put into a DF, for consistency?
result_name = ["predicted_lr_label"]
results = pd.DataFrame(my_algo.ic_lr, columns = result_name)
results

Unnamed: 0,predicted_lr_label
0,Left
1,Right
2,Left
3,Right
4,Left
5,Right
6,Left


In [98]:
# check the results
label_list[0]

Unnamed: 0,lr_label
0,Left
1,Right
2,Left
3,Right
4,Left
5,Right
6,Left


### How can we optimize the model, using grid search?

In [99]:
# ARNE:
# This is a trainable model -> GridSearchCV (or custom Optuna Optimizer with internace CV)
# Optimization in tpcp only works with Pipelines not with algorithms (https://tpcp.readthedocs.io/en/latest/guides/algorithms_pipelines_datasets.html#pipelines)
# -> we need wrapper around dataset and Algorithm
#
# Dataset we already have for the example data (other data could be loaded with `GenericMobilisedDataset`)
from gaitlink.data import LabExampleDataset

dataset = LabExampleDataset(reference_system="INDIP").get_subset(cohort="HA", test="Test11")
dataset

Unnamed: 0,cohort,participant_id,time_measure,test,trial
0,HA,1,TimeMeasure1,Test11,Trial1
1,HA,2,TimeMeasure1,Test11,Trial1


In [16]:
# A helper -> we should probably move something like this into the data loader itself
import pandas as pd


def extract_ref_data(imu_data, ref_walking_bouts):

    data_list = []
    ic_list = []
    label_list = []

    for gs in range(len(ref_walking_bouts)):
        gs_start = ref_walking_bouts.iloc[gs].start
        gs_end = ref_walking_bouts.iloc[gs].end
        
        data_list.append(imu_data.iloc[gs_start : gs_end].reset_index(drop = True))
        ic_list.append(ref_ics.loc[ref_ics.index.get_level_values('wb_id') == gs + 1, ['ic']].reset_index(drop = True) - gs_start) 
        label_list.append(ref_ics.loc[ref_ics.index.get_level_values('wb_id') == gs + 1, ['lr_label']].reset_index(drop = True))
    
    return data_list, ic_list, label_list

In [17]:
ref_data

ReferenceData(walking_bouts=       start    end  n_strides  duration_s  length_m  avg_speed_mps  \
wb_id                                                                 
1        633    988          5        3.55  3.428989       0.975373   
2       2865   3325          4        4.60  1.452572       0.411857   
3       3854   5085         16       12.31  7.044042       0.617801   
4       7642   8621         12        9.79  4.396574       0.510108   
5       9452   9932          6        4.80  3.545277       0.755728   
6      11990  12517          6        5.27  3.514735       0.880632   

       avg_cadence_spm  avg_stride_length_m termination_reason  
wb_id                                                           
1           104.069084             1.124391              Pause  
2            81.296475             0.581029              Pause  
3            89.246331             0.838960              Pause  
4            94.370318             0.645176              Pause  
5            

In [18]:
ref_walking_bouts = ref_data.walking_bouts
ref_walking_bouts

Unnamed: 0_level_0,start,end,n_strides,duration_s,length_m,avg_speed_mps,avg_cadence_spm,avg_stride_length_m,termination_reason
wb_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,633,988,5,3.55,3.428989,0.975373,104.069084,1.124391,Pause
2,2865,3325,4,4.6,1.452572,0.411857,81.296475,0.581029,Pause
3,3854,5085,16,12.31,7.044042,0.617801,89.246331,0.83896,Pause
4,7642,8621,12,9.79,4.396574,0.510108,94.370318,0.645176,Pause
5,9452,9932,6,4.8,3.545277,0.755728,88.778698,1.021695,Pause
6,11990,12517,6,5.27,3.514735,0.880632,95.832693,1.021576,Pause


In [19]:
# how can we make the pipeline optimization handle a list of dataframes?

dp = 0
datapoint = dataset[dp]

imu_data = datapoint.data["LowerBack"]
ref_data = datapoint.reference_parameters_
ref_walking_bouts = ref_data.walking_bouts

gs_data, ics, labels = extract_ref_data(imu_data, ref_walking_bouts)

# once you have the gs_data, ics and labels, then you can simply run the optimization on the ML algo itself, by doing my_algo.self_optimize(gs_data, ics, labels)



In [20]:
# Pipeline we need to build, but this is relativly easy
from tpcp import OptimizablePipeline, OptimizableParameter

class LROptiPipeline(OptimizablePipeline):
    # This is a trick we use internally to check that the optimization is not doing something strange.
    # Only the model is allowed to change. If other things change, we get an error by default.
    algo__model: OptimizableParameter

    algo_with_results_: UllrichLRDetection

    # A couple of ways to do the parameters here, the easiest is that (just requires a little bit annoying prefix nesting)
    def __init__(self, algo):
        self.algo = algo

    @property
    def ic_lr_(self):
        return self.algo_with_results_.ic_lr

    def run(self, dp):
        # Here, dp might just be a single datapoint
        
        imu_data = dp.data["LowerBack"]
        ref_data = dp.reference_parameters_
        ref_walking_bouts = ref_data.walking_bouts

        # if not isinstance(ref_walking_bouts, list):
        #     ref_walking_bouts = [ref_walking_bouts]
        gs_data, ics, _ = extract_ref_data(imu_data, ref_walking_bouts)
        # We just run this with the first GS here, but we could add a loop to handle multiple GS
        self.algo_with_results_ = self.algo.clone().detect(gs_data[0], ics[0], dp.sampling_rate_hz)
        return self

    def self_optimize(self, ds):
        # ds = dataset
        # Here we get a full dataset -> I.e. multiple datapoints
        sampling_rate_hz = ds[0].sampling_rate_hz
        
        all_gs_data = []
        all_ics = []
        all_labels = []
        for dp in ds:
            # dp = datapoint

            imu_data = dp.data["LowerBack"]
            ref_data = dp.reference_parameters_
            ref_walking_bouts = ref_data.walking_bouts

            # if not isinstance(ref_walking_bouts, list):
            #     ref_walking_bouts = [ref_walking_bouts]
            gs_data, ics, labels = extract_ref_data(imu_data, ref_walking_bouts)
            
            all_gs_data.extend(gs_data) # I haven't used this extend method before.
            all_ics.extend(ics)
            all_labels.extend(labels)

        # No cloning here -> We actually want to modify the object
        # this is the self_optimize method of the algorithm, not for the pipeline
        self.algo.self_optimize(all_gs_data, all_ics, all_labels)
        return self


In [22]:
# Just testing the pipeline

In [21]:
pipe = LROptiPipeline(UllrichLRDetection(model=svm.SVC()))
pipe.self_optimize(dataset)
pipe.run(dataset[0]).ic_lr_

  y = column_or_1d(y, warn=True)


array(['Left', 'Right', 'Left', 'Right', 'Left', 'Right', 'Left'],
      dtype=object)

In [22]:
dataset[0].reference_parameters_.walking_bouts

Unnamed: 0_level_0,start,end,n_strides,duration_s,length_m,avg_speed_mps,avg_cadence_spm,avg_stride_length_m,termination_reason
wb_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,633,988,5,3.55,3.428989,0.975373,104.069084,1.124391,Pause
2,2865,3325,4,4.6,1.452572,0.411857,81.296475,0.581029,Pause
3,3854,5085,16,12.31,7.044042,0.617801,89.246331,0.83896,Pause
4,7642,8621,12,9.79,4.396574,0.510108,94.370318,0.645176,Pause
5,9452,9932,6,4.8,3.545277,0.755728,88.778698,1.021695,Pause
6,11990,12517,6,5.27,3.514735,0.880632,95.832693,1.021576,Pause


In [23]:
# For a Gridsearch we need one more thing: A scorer.
# In tpcp, as we have a general case, we always need to write our own scorer function
from sklearn.metrics import accuracy_score, f1_score

def scoring(pipe, dp):
    pipe = pipe.safe_run(dp)
    _, _, labels = extract_ref_data(dp.data["LowerBack"], dp.reference_parameters_.walking_bouts)
    return {"accuracy": accuracy_score(labels[0].to_numpy(), pipe.ic_lr_)}

In [24]:
# Testing the scorer
scoring(pipe, dataset[0])

{'accuracy': 1.0}

In [25]:
pipe.get_params()

{'algo__model__C': 1.0,
 'algo__model__break_ties': False,
 'algo__model__cache_size': 200,
 'algo__model__class_weight': None,
 'algo__model__coef0': 0.0,
 'algo__model__decision_function_shape': 'ovr',
 'algo__model__degree': 3,
 'algo__model__gamma': 'scale',
 'algo__model__kernel': 'rbf',
 'algo__model__max_iter': -1,
 'algo__model__probability': False,
 'algo__model__random_state': None,
 'algo__model__shrinking': True,
 'algo__model__tol': 0.001,
 'algo__model__verbose': False,
 'algo__model': SVC(),
 'algo': UllrichLRDetection(model=SVC())}

In [26]:
from tpcp.optimize import GridSearchCV
from sklearn.model_selection import ParameterGrid

# Note: the deep nesting of the paras here. This is a little arkward, but hey
para_grid = ParameterGrid({
    "algo__model__C": [1.0, 10., 25]
})

gs = GridSearchCV(LROptiPipeline(UllrichLRDetection(model=svm.SVC())), para_grid, scoring=scoring, cv=2, return_optimized="accuracy").optimize(dataset)

  from .autonotebook import tqdm as notebook_tqdm
  y = column_or_1d(y, warn=True)
Datapoints: 100%|██████████| 1/1 [00:00<00:00,  8.43it/s]
  y = column_or_1d(y, warn=True)
Datapoints: 100%|██████████| 1/1 [00:00<00:00, 10.62it/s]
  y = column_or_1d(y, warn=True)
Datapoints: 100%|██████████| 1/1 [00:00<00:00,  8.75it/s]
  y = column_or_1d(y, warn=True)
Datapoints: 100%|██████████| 1/1 [00:00<00:00,  9.84it/s]
  y = column_or_1d(y, warn=True)
Datapoints: 100%|██████████| 1/1 [00:00<00:00,  8.67it/s]
  y = column_or_1d(y, warn=True)
Datapoints: 100%|██████████| 1/1 [00:00<00:00,  9.80it/s]
Split-Para Combos: 100%|██████████| 6/6 [00:01<00:00,  5.29it/s]
  y = column_or_1d(y, warn=True)


In [28]:
pd.DataFrame(gs.cv_results_)

Unnamed: 0,mean_optimize_time,std_optimize_time,mean_score_time,std_score_time,split0_test_data_labels,split1_test_data_labels,split0_train_data_labels,split1_train_data_labels,param_algo__model__C,params,split0_test_accuracy,split1_test_accuracy,mean_test_accuracy,std_test_accuracy,rank_test_accuracy,split0_test_single_accuracy,split1_test_single_accuracy
0,0.071965,0.003652,0.112969,0.012812,"[(HA, 001, TimeMeasure1, Test11, Trial1)]","[(HA, 002, TimeMeasure1, Test11, Trial1)]","[(HA, 002, TimeMeasure1, Test11, Trial1)]","[(HA, 001, TimeMeasure1, Test11, Trial1)]",1.0,{'algo__model__C': 1.0},0.428571,0.428571,0.428571,0.0,1,[0.42857142857142855],[0.42857142857142855]
1,0.068424,0.00918,0.114587,0.006086,"[(HA, 001, TimeMeasure1, Test11, Trial1)]","[(HA, 002, TimeMeasure1, Test11, Trial1)]","[(HA, 002, TimeMeasure1, Test11, Trial1)]","[(HA, 001, TimeMeasure1, Test11, Trial1)]",10.0,{'algo__model__C': 10.0},0.142857,0.428571,0.285714,0.142857,3,[0.14285714285714285],[0.42857142857142855]
2,0.069759,0.006859,0.115274,0.006507,"[(HA, 001, TimeMeasure1, Test11, Trial1)]","[(HA, 002, TimeMeasure1, Test11, Trial1)]","[(HA, 002, TimeMeasure1, Test11, Trial1)]","[(HA, 001, TimeMeasure1, Test11, Trial1)]",25.0,{'algo__model__C': 25},0.142857,0.571429,0.357143,0.214286,2,[0.14285714285714285],[0.5714285714285714]


In [29]:
gs.best_params_

{'algo__model__C': 1.0}

In [30]:
gs.optimized_pipeline_.get_params()

{'algo__model__C': 1.0,
 'algo__model__break_ties': False,
 'algo__model__cache_size': 200,
 'algo__model__class_weight': None,
 'algo__model__coef0': 0.0,
 'algo__model__decision_function_shape': 'ovr',
 'algo__model__degree': 3,
 'algo__model__gamma': 'scale',
 'algo__model__kernel': 'rbf',
 'algo__model__max_iter': -1,
 'algo__model__probability': False,
 'algo__model__random_state': None,
 'algo__model__shrinking': True,
 'algo__model__tol': 0.001,
 'algo__model__verbose': False,
 'algo__model': SVC(),
 'algo': UllrichLRDetection(model=SVC())}

In [106]:
# getting the optimized model.
optimized_model = gs.optimized_pipeline_.algo
optimized_model

UllrichLRDetection(model=SVC())