In [1]:
from gaitlink.lr_detection import UllrichLRDetection
from gaitlink.lr_detection import McCamleyLRDetection
from gaitlink.lr_detection import PretrainedModel

from sklearn import neighbors

from gaitlink.data import load_mobilised_matlab_format, get_all_lab_example_data_paths
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn import svm
from sklearn import neighbors
from sklearn.ensemble import RandomForestClassifier

from gaitlink.data import LabExampleDataset

In [2]:
# ---------------------------------------------
#           Load some example data
# ---------------------------------------------

# Select a single trial from the lab example data, using INDIP as the reference system.
dataset = LabExampleDataset(reference_system="INDIP")
datapoint = dataset.get_subset(cohort="HA", participant_id="001", test="Test11", trial="Trial1")

# Sampling rates reported by the datapoint for the recording and for the reference system.
sampling_rate_hz = datapoint.sampling_rate_hz
sampling_rate_hz_ref = datapoint.reference_sampling_rate_hz_


from gaitlink.lr_detection._utils import extract_ref_data

# extract_ref_data returns three lists that are used together throughout this notebook:
# the data, the initial contacts (ICs) and the reference L/R labels
# (presumably one entry per gait sequence of the datapoint -- TODO confirm).
data_list, ic_list, label_list = extract_ref_data(datapoint)


## ML Test
____
Option 1 - using pretrained models

In [3]:
# Note: the parameters of a pretrained model cannot be changed by the user
# until an action method has been triggered.
my_algo_pretrained = UllrichLRDetection(model=PretrainedModel.svm_linear)
my_algo_pretrained.model

<PretrainedModel.svm_linear: 'svm_linear'>

Option 2 - using a custom sklearn model

In [5]:
# Wrap an untrained sklearn SVC (default parameters) as the custom model.
my_algo_custom = UllrichLRDetection(model=svm.SVC())
my_algo_custom.model

### Setting model parameters
____

Custom model parameters can be changed using `set_params()`:

In [6]:
# The `model__` prefix addresses parameters of the nested sklearn model.
my_algo_custom.set_params(model__C=25, model__gamma=0.002)
my_algo_custom.model

Alternatively, setting params can also be done using a dictionary:

In [6]:
# Same mechanism, but with the parameters collected in a dictionary and unpacked via `**`.
my_paras = {'model__C': 30, 'model__gamma': 0.003, 'model__kernel': 'linear'}
my_algo_custom.set_params(**my_paras)
my_algo_custom.model

### Retrieving model parameters
____
Custom model parameters can be retrieved using `get_params()`

In [44]:
# Parameters of the wrapped sklearn model itself (plain sklearn names, no prefix).
my_algo_custom.model.get_params()

{'C': 30,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 0.003,
 'kernel': 'linear',
 'max_iter': -1,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

Or

In [45]:
# Parameters of the algorithm; the nested model's parameters appear with the `model__` prefix.
my_algo_custom.get_params()

{'model__C': 30,
 'model__break_ties': False,
 'model__cache_size': 200,
 'model__class_weight': None,
 'model__coef0': 0.0,
 'model__decision_function_shape': 'ovr',
 'model__degree': 3,
 'model__gamma': 0.003,
 'model__kernel': 'linear',
 'model__max_iter': -1,
 'model__probability': False,
 'model__random_state': None,
 'model__shrinking': True,
 'model__tol': 0.001,
 'model__verbose': False,
 'model': SVC(C=30, gamma=0.003, kernel='linear')}

## Using the model for L/R Detection
____
### Option 1 - using a pretrained model.
Please note that this will raise a warning due to sklearn version inconsistencies. We suggest re-saving the model with the sklearn version currently used by this package.


In [8]:
# Apply the pretrained model to the example data.
my_algo_pretrained.detect(
    data=data_list,
    ic_list=ic_list,
    sampling_rate_hz=sampling_rate_hz,
)

Model_checking...


UllrichLRDetection(model=SVC(C=100, kernel='linear'))

### Option 2 - Using a custom model

The following is expected to raise an error because the model has not been fitted yet.

In [9]:
# Calling detect() before the custom model has been fitted is expected to fail.
# The error is the demonstration here, so it is caught and displayed instead of
# aborting the notebook; this keeps "Restart Kernel -> Run All" working.
try:
    my_algo_custom.detect(
        data=data_list,
        ic_list=ic_list,
        sampling_rate_hz=sampling_rate_hz,
    )
except RuntimeError as error:
    print(f"RuntimeError: {error}")

Model_checking...


RuntimeError: Model is not fitted. Call self_optimize before calling detect.

Fitting the custom model - This can be done using the `self_optimize()` method.

In [10]:
# Fit the custom model on the example data and its reference labels.
my_algo_custom.self_optimize(
    data=data_list,
    ic_list=ic_list,
    label_list=label_list,
    sampling_rate_hz=sampling_rate_hz,
)

UllrichLRDetection(model=SVC(C=30, gamma=0.003, kernel='linear'))

Finally, return the assigned labels.

In [31]:
# Run the fitted custom model and show the predicted labels of the first entry.
my_algo_custom.detect(
    data=data_list,
    ic_list=ic_list,
    sampling_rate_hz=sampling_rate_hz,
)
my_algo_custom.ic_lr[0]

Model_checking...


Unnamed: 0,predicted_lr_label
0,Left
1,Right
2,Left
3,Right
4,Left
5,Right
6,Left


Check against ground truth.

In [133]:
# Reference L/R labels of the first entry, for comparison with the predictions above.
label_list[0]

# ok, so the custom model seems to be doing an okay job here.

Unnamed: 0,lr_label
0,Left
1,Right
2,Left
3,Right
4,Left
5,Right
6,Left


Similarly, the predictions using the pretrained model are the following:

In [52]:
# Run the pretrained model and show the predicted labels of the first entry.
# You can see that this misclassifies the second IC in this case, not ideal.
my_algo_pretrained.detect(
    data=data_list,
    ic_list=ic_list,
    sampling_rate_hz=sampling_rate_hz,
)
my_algo_pretrained.ic_lr[0]

Model_checking...


Unnamed: 0,predicted_lr_label
0,Left
1,Left
2,Left
3,Right
4,Left
5,Right
6,Left


### Optimization using Grid Search
_____
We can do optimization using `tpcp`: https://tpcp.readthedocs.io/en/latest/auto_examples/parameter_optimization/_03_gridsearch_cv.html

For this, we need:
1) a dataset
2) a pipeline
3) a scoring method for a parameter search. 

We already have a dataset: it was loaded at the beginning of this tutorial. Below, we select the subset used for the grid search and explore it:

In [7]:
# Restrict the lab example data to the healthy-adult ("HA") cohort and "Test11".
dataset = (
    LabExampleDataset(reference_system="INDIP")
    .get_subset(cohort="HA", test="Test11")
)
dataset

Unnamed: 0,cohort,participant_id,time_measure,test,trial
0,HA,1,TimeMeasure1,Test11,Trial1
1,HA,2,TimeMeasure1,Test11,Trial1


### Defining the pipeline

In [8]:
# we should already have access to data_list, ic_list and label_list, these were extracted at the beginning of this tutorial.
# with these, we can simply run the optimization on the ML algo itself, by doing my_algo.self_optimize(gs_data, ics, labels)

from tpcp import OptimizablePipeline, OptimizableParameter

# TODO: it turns out that you really need to provide a dataset... especially to the run method.

class LROptiPipeline(OptimizablePipeline):
    """Optimizable pipeline that wraps a left/right (L/R) detection algorithm.

    The pipeline makes the wrapped algorithm usable with the ``tpcp`` optimization tools
    (e.g. ``GridSearchCV``): ``self_optimize`` fits the algorithm on a whole dataset and
    ``run`` applies the algorithm to a single datapoint.

    Parameters
    ----------
    algo
        The L/R detection algorithm to wrap (an ``UllrichLRDetection`` instance in this notebook).

    Attributes
    ----------
    algo_with_results_
        Clone of ``algo`` on which ``detect`` was called during the last ``run``.
    ic_lr_
        The ``ic_lr`` result of ``algo_with_results_``.
    """

    # This is a trick we use internally to check that the optimization is not doing something strange.
    # Only the model is allowed to change. If other things change, we get an error by default.
    algo__model: OptimizableParameter
    algo_with_results_: UllrichLRDetection

    def __init__(self, algo):
        self.algo = algo

    @property
    def ic_lr_(self):
        """Return the ``ic_lr`` result of the algorithm used in the last ``run`` call.

        Only available after ``run`` has been called, because ``algo_with_results_`` is set there.
        """
        return self.algo_with_results_.ic_lr

    def run(self, datapoint):
        """Apply the wrapped algorithm to a single datapoint.

        Parameters
        ----------
        datapoint
            The datapoint to process. Its ``sampling_rate_hz`` and the data and initial contacts
            returned by ``extract_ref_data`` are passed on to the algorithm's ``detect`` method.

        Returns
        -------
        self
            The pipeline itself, with ``algo_with_results_`` set.
        """
        sampling_rate_hz = datapoint.sampling_rate_hz

        # Firstly, we need to extract the data_list and ic_list from the datapoint. Note, that datapoint can contain multiple GSs.
        # We can use the extract_ref_data utility function for this.
        data_list, ic_list, _ = extract_ref_data(datapoint)

        # Work on a clone so that running the pipeline never modifies ``self.algo``.
        self.algo_with_results_ = self.algo.clone().detect(data_list, ic_list, sampling_rate_hz)

        return self

    def self_optimize(self, dataset):
        """Fit the wrapped algorithm on all datapoints of ``dataset``.

        The output of ``extract_ref_data`` is pooled over all datapoints and the algorithm is
        optimized once on the pooled data. Datapoints for which the extraction fails are skipped
        with a warning instead of being dropped silently.

        Parameters
        ----------
        dataset
            Iterable of datapoints to fit the algorithm on.

        Returns
        -------
        self
            The pipeline itself, with ``algo`` optimized in place.
        """
        import warnings  # local import keeps this notebook cell self-contained

        all_gs_data = []
        all_ics = []
        all_labels = []

        for position, datapoint in enumerate(dataset):
            # TODO: this is a temporary fix, as HA, with participant_id = 002 (TimeMeasure1, Test5, Trial1) is problematic
            try:
                data_list, ic_list, label_list = extract_ref_data(datapoint)
            except Exception as error:
                # Stay best-effort, but report what was skipped. Unlike a bare ``except``,
                # this does not swallow KeyboardInterrupt or SystemExit.
                warnings.warn(
                    f"Skipping datapoint at position {position} during self_optimize, "
                    f"because its reference data could not be extracted: {error!r}",
                    stacklevel=2,
                )
                continue

            all_gs_data.extend(data_list)
            all_ics.extend(ic_list)
            all_labels.extend(label_list)

        # No cloning here -> we actually want to modify the object
        self.algo.self_optimize(all_gs_data, all_ics, all_labels)

        return self

#### Testing the pipeline

In [9]:
# Fit the pipeline on the full dataset, then predict on its second datapoint.
pipe = LROptiPipeline(UllrichLRDetection(model=svm.SVC()))
pipe.self_optimize(dataset)
pipe.run(dataset[1]).ic_lr_

[  predicted_lr_label
 0              Right
 1               Left
 2              Right
 3               Left
 4              Right
 5               Left
 6              Right
 7              Right
 8              Right,
    predicted_lr_label
 0                Left
 1                Left
 2               Right
 3               Right
 4                Left
 5               Right
 6               Right
 7               Right
 8               Right
 9               Right
 10               Left
 11              Right
 12              Right
 13               Left
 14              Right
 15               Left
 16              Right
 17              Right,
    predicted_lr_label
 0                Left
 1                Left
 2                Left
 3               Right
 4                Left
 5               Right
 6               Right
 7               Right
 8               Right
 9               Right
 10              Right
 11              Right
 12               Left
 13              Ri

### Defining the scorer

In [10]:
from sklearn.metrics import accuracy_score, f1_score

def scoring(pipe, datapoint):
    """Compute the L/R classification accuracy of a pipeline on a single datapoint.

    Parameters
    ----------
    pipe
        Pipeline providing ``safe_run`` and, after running, the ``ic_lr_`` results.
    datapoint
        The datapoint to run the pipeline on. Its reference labels are obtained
        with ``extract_ref_data``.

    Returns
    -------
    dict
        ``{"accuracy": <accuracy over all flattened labels of the datapoint>}``
    """
    # safe_run already executes the pipeline, so its results are reused below instead of
    # running the pipeline a second time.
    pipe = pipe.safe_run(datapoint)
    _, _, labels = extract_ref_data(datapoint)

    # accuracy_score expects 1D arrays or lists of labels, so the per-entry tables are flattened
    flat_labels = [item for sublist in labels for item in sublist.values.flatten()]
    flat_predictions = [item for sublist in pipe.ic_lr_ for item in sublist.values.flatten()]

    return {"accuracy": accuracy_score(flat_labels, flat_predictions)}


In [11]:
# Score the fitted pipeline on the second datapoint of the dataset.
datapoint = dataset[1]
scoring(pipe, datapoint) # ok, this seems to work now...

{'accuracy': 0.7608695652173914}

In [12]:
# Nested parameters of the pipeline; names such as `algo__model__C` are what the parameter grid refers to.
pipe.get_params()

{'algo__model__C': 1.0,
 'algo__model__break_ties': False,
 'algo__model__cache_size': 200,
 'algo__model__class_weight': None,
 'algo__model__coef0': 0.0,
 'algo__model__decision_function_shape': 'ovr',
 'algo__model__degree': 3,
 'algo__model__gamma': 'scale',
 'algo__model__kernel': 'rbf',
 'algo__model__max_iter': -1,
 'algo__model__probability': False,
 'algo__model__random_state': None,
 'algo__model__shrinking': True,
 'algo__model__tol': 0.001,
 'algo__model__verbose': False,
 'algo__model': SVC(),
 'algo': UllrichLRDetection(model=SVC())}

In [41]:
# Inspect the model held by the pipeline's algorithm.
pipe.algo.model

### Finally, we can test the grid search...

In [30]:
from tpcp.optimize import GridSearchCV
from sklearn.model_selection import ParameterGrid

# Candidate values for the regularization parameter C of the nested SVC.
para_grid = ParameterGrid({"algo__model__C": [0.5, 0.75, 1, 1.25, 1.5, 2, 5, 10]})

# 2-fold cross-validated grid search over the grid, selecting by "accuracy".
gs = GridSearchCV(
    LROptiPipeline(UllrichLRDetection(model=svm.SVC())),
    para_grid,
    scoring=scoring,
    cv=2,
    return_optimized="accuracy",
).optimize(dataset)

Datapoints: 100%|██████████| 1/1 [00:00<00:00,  3.35it/s]
Datapoints: 100%|██████████| 1/1 [00:00<00:00,  4.15it/s].55it/s]
Datapoints: 100%|██████████| 1/1 [00:00<00:00,  3.39it/s].72it/s]
Datapoints: 100%|██████████| 1/1 [00:00<00:00,  4.06it/s].67it/s]
Datapoints: 100%|██████████| 1/1 [00:00<00:00,  3.39it/s].73it/s]
Datapoints: 100%|██████████| 1/1 [00:00<00:00,  3.90it/s].68it/s]
Datapoints: 100%|██████████| 1/1 [00:00<00:00,  3.41it/s].70it/s]
Datapoints: 100%|██████████| 1/1 [00:00<00:00,  4.04it/s].67it/s]
Datapoints: 100%|██████████| 1/1 [00:00<00:00,  3.47it/s].70it/s]
Datapoints: 100%|██████████| 1/1 [00:00<00:00,  4.02it/s].68it/s]
Datapoints: 100%|██████████| 1/1 [00:00<00:00,  3.35it/s]2.73it/s]
Datapoints: 100%|██████████| 1/1 [00:00<00:00,  4.19it/s]2.68it/s]
Datapoints: 100%|██████████| 1/1 [00:00<00:00,  3.39it/s]2.73it/s]
Datapoints: 100%|██████████| 1/1 [00:00<00:00,  4.21it/s]2.69it/s]
Datapoints: 100%|██████████| 1/1 [00:00<00:00,  3.48it/s]2.76it/s]
Datapoints: 1

In [31]:
# Tabulate the cross-validation results (one row per parameter combination).
pd.DataFrame(gs.cv_results_)

Unnamed: 0,mean_optimize_time,std_optimize_time,mean_score_time,std_score_time,split0_test_data_labels,split1_test_data_labels,split0_train_data_labels,split1_train_data_labels,param_algo__model__C,params,split0_test_accuracy,split1_test_accuracy,mean_test_accuracy,std_test_accuracy,rank_test_accuracy,split0_test_single_accuracy,split1_test_single_accuracy
0,0.089924,0.007707,0.276672,0.028362,"[(HA, 001, TimeMeasure1, Test11, Trial1)]","[(HA, 002, TimeMeasure1, Test11, Trial1)]","[(HA, 002, TimeMeasure1, Test11, Trial1)]","[(HA, 001, TimeMeasure1, Test11, Trial1)]",0.5,{'algo__model__C': 0.5},0.777778,0.673913,0.725845,0.051932,2,[0.7777777777777778],[0.6739130434782609]
1,0.087108,0.009522,0.277363,0.02426,"[(HA, 001, TimeMeasure1, Test11, Trial1)]","[(HA, 002, TimeMeasure1, Test11, Trial1)]","[(HA, 002, TimeMeasure1, Test11, Trial1)]","[(HA, 001, TimeMeasure1, Test11, Trial1)]",0.75,{'algo__model__C': 0.75},0.777778,0.673913,0.725845,0.051932,2,[0.7777777777777778],[0.6739130434782609]
2,0.088991,0.008639,0.282011,0.019153,"[(HA, 001, TimeMeasure1, Test11, Trial1)]","[(HA, 002, TimeMeasure1, Test11, Trial1)]","[(HA, 002, TimeMeasure1, Test11, Trial1)]","[(HA, 001, TimeMeasure1, Test11, Trial1)]",1.0,{'algo__model__C': 1},0.777778,0.673913,0.725845,0.051932,2,[0.7777777777777778],[0.6739130434782609]
3,0.089752,0.011793,0.276702,0.022855,"[(HA, 001, TimeMeasure1, Test11, Trial1)]","[(HA, 002, TimeMeasure1, Test11, Trial1)]","[(HA, 002, TimeMeasure1, Test11, Trial1)]","[(HA, 001, TimeMeasure1, Test11, Trial1)]",1.25,{'algo__model__C': 1.25},0.793651,0.673913,0.733782,0.059869,1,[0.7936507936507936],[0.6739130434782609]
4,0.087044,0.006292,0.27511,0.019961,"[(HA, 001, TimeMeasure1, Test11, Trial1)]","[(HA, 002, TimeMeasure1, Test11, Trial1)]","[(HA, 002, TimeMeasure1, Test11, Trial1)]","[(HA, 001, TimeMeasure1, Test11, Trial1)]",1.5,{'algo__model__C': 1.5},0.793651,0.652174,0.722912,0.070738,5,[0.7936507936507936],[0.6521739130434783]
5,0.088441,0.009643,0.275011,0.029915,"[(HA, 001, TimeMeasure1, Test11, Trial1)]","[(HA, 002, TimeMeasure1, Test11, Trial1)]","[(HA, 002, TimeMeasure1, Test11, Trial1)]","[(HA, 001, TimeMeasure1, Test11, Trial1)]",2.0,{'algo__model__C': 2},0.793651,0.652174,0.722912,0.070738,5,[0.7936507936507936],[0.6521739130434783]
6,0.085874,0.008302,0.272837,0.028896,"[(HA, 001, TimeMeasure1, Test11, Trial1)]","[(HA, 002, TimeMeasure1, Test11, Trial1)]","[(HA, 002, TimeMeasure1, Test11, Trial1)]","[(HA, 001, TimeMeasure1, Test11, Trial1)]",5.0,{'algo__model__C': 5},0.777778,0.608696,0.693237,0.084541,7,[0.7777777777777778],[0.6086956521739131]
7,0.088206,0.010889,0.272889,0.02118,"[(HA, 001, TimeMeasure1, Test11, Trial1)]","[(HA, 002, TimeMeasure1, Test11, Trial1)]","[(HA, 002, TimeMeasure1, Test11, Trial1)]","[(HA, 001, TimeMeasure1, Test11, Trial1)]",10.0,{'algo__model__C': 10},0.777778,0.608696,0.693237,0.084541,7,[0.7777777777777778],[0.6086956521739131]


In [32]:
# Best mean test accuracy across the cross-validation splits.
gs.best_score_

0.7337819185645272

In [36]:
# Parameters of the optimized pipeline selected by the grid search.
gs.optimized_pipeline_.get_params()

{'algo__model__C': 1.25,
 'algo__model__break_ties': False,
 'algo__model__cache_size': 200,
 'algo__model__class_weight': None,
 'algo__model__coef0': 0.0,
 'algo__model__decision_function_shape': 'ovr',
 'algo__model__degree': 3,
 'algo__model__gamma': 'scale',
 'algo__model__kernel': 'rbf',
 'algo__model__max_iter': -1,
 'algo__model__probability': False,
 'algo__model__random_state': None,
 'algo__model__shrinking': True,
 'algo__model__tol': 0.001,
 'algo__model__verbose': False,
 'algo__model': SVC(C=1.25),
 'algo': UllrichLRDetection(model=SVC(C=1.25))}

Assign the best model to the pipeline algo.

In [38]:
# Reuse the algorithm of the optimized pipeline in the existing `pipe` object.
pipe.algo = gs.optimized_pipeline_.algo
pipe.algo

UllrichLRDetection(model=SVC(C=1.25))

Test the 'optimized' params.

In [39]:
# Score the pipeline, now using the optimized algorithm, on the second datapoint.
datapoint = dataset[1]
scoring(pipe, datapoint) # ok, this seems to work now...

{'accuracy': 0.7391304347826086}

# Testing the optimizer class now...

In [1]:
from gaitlink.lr_detection import UllrichLRDetection
from gaitlink.lr_detection._utils import extract_ref_data
from gaitlink.data import LabExampleDataset
from sklearn import svm
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import accuracy_score, f1_score

from gaitlink.lr_detection._lr_optimizer import LROptiPipeline, UllrichLROptimizer

# Restrict the lab example data to the healthy-adult ("HA") cohort and "Test11".
dataset = (
    LabExampleDataset(reference_system="INDIP")
    .get_subset(cohort="HA", test="Test11")
)
dataset

  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,cohort,participant_id,time_measure,test,trial
0,HA,1,TimeMeasure1,Test11,Trial1
1,HA,2,TimeMeasure1,Test11,Trial1


In [2]:
# Fit the imported pipeline on the full dataset, then predict on its second datapoint.
pipe = LROptiPipeline(UllrichLRDetection(model=svm.SVC()))
pipe.self_optimize(dataset)
pipe.run(dataset[1]).ic_lr_

[  predicted_lr_label
 0              Right
 1               Left
 2              Right
 3               Left
 4              Right
 5               Left
 6              Right
 7              Right
 8              Right,
    predicted_lr_label
 0                Left
 1                Left
 2               Right
 3               Right
 4                Left
 5               Right
 6               Right
 7               Right
 8               Right
 9               Right
 10               Left
 11              Right
 12              Right
 13               Left
 14              Right
 15               Left
 16              Right
 17              Right,
    predicted_lr_label
 0                Left
 1                Left
 2                Left
 3               Right
 4                Left
 5               Right
 6               Right
 7               Right
 8               Right
 9               Right
 10              Right
 11              Right
 12               Left
 13              Ri

In [3]:
# Candidate values for the regularization parameter C of the nested SVC.
para_grid = ParameterGrid({"algo__model__C": [0.5, 0.75, 1, 1.25, 1.5, 2, 5, 10]})

def scoring(pipe, datapoint):
    """Compute the L/R classification accuracy of a pipeline on a single datapoint.

    Parameters
    ----------
    pipe
        Pipeline providing ``safe_run`` and, after running, the ``ic_lr_`` results.
    datapoint
        The datapoint to run the pipeline on. Its reference labels are obtained
        with ``extract_ref_data``.

    Returns
    -------
    dict
        ``{"accuracy": <accuracy over all flattened labels of the datapoint>}``
    """
    # safe_run already executes the pipeline, so its results are reused below instead of
    # running the pipeline a second time.
    pipe = pipe.safe_run(datapoint)
    _, _, labels = extract_ref_data(datapoint)

    # accuracy_score expects 1D arrays or lists of labels, so the per-entry tables are flattened
    flat_labels = [item for sublist in labels for item in sublist.values.flatten()]
    flat_predictions = [item for sublist in pipe.ic_lr_ for item in sublist.values.flatten()]

    return {"accuracy": accuracy_score(flat_labels, flat_predictions)}

# Score the fitted pipeline on the second datapoint of the dataset.
datapoint = dataset[1]
scoring(pipe, datapoint) # ok, this seems to work now...

{'accuracy': 0.7608695652173914}

In [5]:
# Run the grid search through the optimizer class, scoring with the function defined above.
optimizer = UllrichLROptimizer(pipe, para_grid)
optimizer.optimize(dataset, scoring_function=scoring)

Datapoints: 100%|██████████| 1/1 [00:00<00:00,  3.42it/s]
Datapoints: 100%|██████████| 1/1 [00:00<00:00,  4.20it/s].56it/s]
Datapoints: 100%|██████████| 1/1 [00:00<00:00,  3.39it/s].75it/s]
Datapoints: 100%|██████████| 1/1 [00:00<00:00,  3.95it/s].68it/s]
Datapoints: 100%|██████████| 1/1 [00:00<00:00,  3.39it/s].71it/s]
Datapoints: 100%|██████████| 1/1 [00:00<00:00,  4.16it/s].66it/s]
Datapoints: 100%|██████████| 1/1 [00:00<00:00,  3.07it/s].72it/s]
Datapoints: 100%|██████████| 1/1 [00:00<00:00,  4.10it/s].60it/s]
Datapoints: 100%|██████████| 1/1 [00:00<00:00,  3.47it/s].67it/s]
Datapoints: 100%|██████████| 1/1 [00:00<00:00,  4.23it/s].66it/s]
Datapoints: 100%|██████████| 1/1 [00:00<00:00,  3.47it/s]2.73it/s]
Datapoints: 100%|██████████| 1/1 [00:00<00:00,  4.05it/s]2.70it/s]
Datapoints: 100%|██████████| 1/1 [00:00<00:00,  3.41it/s]2.74it/s]
Datapoints: 100%|██████████| 1/1 [00:00<00:00,  4.16it/s]2.69it/s]
Datapoints: 100%|██████████| 1/1 [00:00<00:00,  3.48it/s]2.74it/s]
Datapoints: 1

GridSearchCV(cv=2, n_jobs=None, optimize_with_info=True, parameter_grid=<sklearn.model_selection._search.ParameterGrid object at 0x177001f10>, pipeline=LROptiPipeline(algo=UllrichLRDetection(model=SVC())), pre_dispatch='n_jobs', progress_bar=True, pure_parameters=False, return_optimized='accuracy', return_train_score=False, safe_optimize=True, scoring=<function scoring at 0x176776fc0>, verbose=0)

In [7]:
# Best parameter combination found by the optimizer.
optimizer.optimization_results.best_params_

{'algo__model__C': 1.25}