### Imports

In [1]:
import json
import numpy as np
import pandas as pd
import random
import time

from pathlib import Path
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import StratifiedShuffleSplit, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
from xgboost import XGBClassifier

import torch
from torch.utils.data import DataLoader, Subset

from datasets import SurfaceDatasetXGB

### Seed

In [2]:
# One seed for every random number generator this notebook touches.
seed = 0
for seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    seed_rng(seed)

# cuDNN flags: request deterministic behaviour and disable benchmark mode.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

def seed_worker(worker_id):
    """Reseed NumPy and ``random`` from the current torch seed.

    The seed is ``torch.initial_seed()`` reduced modulo 2**32. The function is
    shaped like a DataLoader ``worker_init_fn`` (it takes the worker id), but
    ``worker_id`` itself is not used.

    NOTE(review): no DataLoader visible in this notebook passes this function;
    confirm whether it is still needed.
    """
    derived_seed = torch.initial_seed() % (1 << 32)
    for reseed in (np.random.seed, random.seed):
        reseed(derived_seed)

# Dedicated torch generator seeded with the notebook seed.
# NOTE(review): no DataLoader visible in this notebook receives `g`; confirm
# whether it is still needed.
# manual_seed() returns the generator itself, so chaining it into the
# assignment keeps the cell from ending on a bare expression and stops the
# Generator repr (with its run-specific memory address) from being written
# into the saved notebook output.
g = torch.Generator().manual_seed(seed)

<torch._C.Generator at 0x2636829ab90>

### Constants

In [3]:
# Locations: per-run CSV files and the folder that receives tuning results.
DATA_DIR = Path('../data/train_set/csv/')
HISTORY_DIR = Path('../results/tuning/')

# Windowing parameters handed to SurfaceDatasetXGB.
LOOKBACK = 8 / 3  # passed as lookback; presumably seconds, TODO confirm
SAMPLING_FREQUENCY = 75.0  # passed as sample_freq
DATASET_FREQUENCY = 150.0  # passed as data_freq

# Experiment selection.
CONFIGURATIONS = ('4W', '6W')  # accepted values of the 'kinematics' label field
SUBSET = ('imu', 'servo')  # passed as subset to the dataset

### Load and split data

In [4]:
# Per-run label metadata: a mapping from run name to its dict of label fields.
labels = json.loads(Path('../data/train_set/labels.json').read_text())

In [5]:
# Keep only the runs whose kinematics is one of CONFIGURATIONS, whose spacing is
# 'R1' and whose trajectory name contains 'T1'; pair each run's CSV path with
# its surface label.
dataset = [
    (DATA_DIR / f'{run_name}.csv', meta['surface'])
    for run_name, meta in labels.items()
    if meta['kinematics'] in CONFIGURATIONS
    and meta['spacing'] == 'R1'
    and 'T1' in meta['trajectory']
]

In [6]:
# Split the (csv_path, surface) pairs into the input series and the raw labels.
X = pd.Series([csv_path for csv_path, _ in dataset], name='bag_name')
y_primary = [surface for _, surface in dataset]

In [7]:
# Optional coarse relabelling of the surfaces into 'slippery' / 'grippy' /
# 'neutral'. While this list is empty, the label-encoding cell falls back to
# the raw surface names in y_primary. To train on a grouping instead,
# uncomment exactly one of the alternatives below.
y_secondary = []
# Alternative grouping (unnamed):
# y_secondary = ['slippery' if label in ('1_Panele', '5_Spienione_PCV', '6_Linoleum')
#                else 'grippy' if label in ('3_Wykladzina_jasna', '8_Pusta_plyta', '9_podklady')
#                else 'neutral' for label in y_primary]
# Alternative grouping:
# y_secondary = ['slippery' if label in ('3_Wykladzina_jasna', '4_Trawa')
#                else 'grippy' if label in ('5_Spienione_PCV', '8_Pusta_plyta', '9_podklady', '10_Mata_ukladana')
#                else 'neutral' for label in y_primary] # Pawel set
# Alternative grouping:
# y_secondary = ['slippery' if label in ('3_Wykladzina_jasna', '4_Trawa')
#                else 'grippy' if label in ('2_Wykladzina_czarna', '5_Spienione_PCV', '9_podklady', '10_Mata_ukladana')
#                else 'neutral' for label in y_primary] # Clustering set

In [8]:
# Encode whichever label set is active: the secondary grouping when it is
# non-empty, otherwise the raw surface names.
active_labels = y_secondary if y_secondary else y_primary
le = LabelEncoder().fit(active_labels)
y = le.transform(active_labels)

# Class names, in the order the encoder uses for the integer codes.
classes = le.classes_
num_classes = len(classes)

### Custom datasets

In [9]:
# Dataset over every selected run, configured from the notebook constants.
dataset_options = dict(
    sample_freq=SAMPLING_FREQUENCY,
    data_freq=DATASET_FREQUENCY,
    lookback=LOOKBACK,
    subset=SUBSET,
)
cv_data = SurfaceDatasetXGB(X, y, **dataset_options)

In [10]:
# Hyperparameter search space; the training loop hands it to
# RandomizedSearchCV as param_distributions.
param_grid = dict(
    # Ensemble size and step size
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.1, 0.3],
    # Tree shape
    max_depth=[3, 4, 5],
    min_child_weight=[1, 3, 5],
    # Row / column subsampling
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
    # Regularisation strengths
    reg_alpha=[0.1, 0.5],
    reg_lambda=[0.1, 0.5],
)

### Training loop

In [11]:
# Repeated hold-out evaluation. Every outer split
#   1. tunes hyperparameters with a randomized search on its training part,
#   2. keeps the most important features of the best estimator,
#   3. refits on those features with the best hyperparameters,
#   4. scores the refit model on the held-out part.
# Per-split metrics and selected features are collected in `history` and
# written to a timestamped JSON file at the end.
N_BEST_FEATURES = 25  # number of top-importance features kept for the final model

history = {}

# Explicit random_state values make the splits, the sampled candidates and the
# boosters repeatable no matter how much of the global NumPy RNG earlier cells
# have consumed (e.g. when cells are re-run out of order).
sss = StratifiedShuffleSplit(test_size=0.2, random_state=seed)
for i, (training_index, test_index) in enumerate(sss.split(X, y)):
    # Initialize the model in each split
    xgb_model = XGBClassifier(
        objective='multi:softprob',
        num_class=num_classes,
        random_state=seed,
    )

    train_subset = Subset(cv_data, training_index)
    test_subset = Subset(cv_data, test_index)

    # A single batch as large as the subset yields all of its samples at once
    train_dataloader = DataLoader(train_subset, batch_size=len(train_subset))
    test_dataloader = DataLoader(test_subset, batch_size=len(test_subset))
    # Extract the whole datasets to variables; convert the collated batches to
    # NumPy arrays, the input type scikit-learn and XGBoost are documented for
    X_train, y_train = (np.asarray(part) for part in next(iter(train_dataloader)))
    X_test, y_true = (np.asarray(part) for part in next(iter(test_dataloader)))

    # Find the best hyperparameters
    clf_search = RandomizedSearchCV(
        estimator=xgb_model,
        param_distributions=param_grid,
        cv=5,
        scoring='accuracy',
        n_jobs=-1,
        verbose=10,
        random_state=seed + i,  # reproducible, yet different candidates per outer split
    )
    clf_search.fit(X_train, y_train)

    # Find the most important features from the best estimator
    importances = clf_search.best_estimator_.feature_importances_
    idx = np.argsort(importances)  # ascending: the most important indexes come last
    best_features = idx[-N_BEST_FEATURES:]

    # Fit the estimator on the best sets of hyperparameters and features
    xgb_tuned = XGBClassifier(
        objective='multi:softprob',
        num_class=num_classes,
        random_state=seed,
        **clf_search.best_params_,
    )
    xgb_tuned.fit(X_train[:, best_features], y_train)
    y_pred = xgb_tuned.predict(X_test[:, best_features])

    history[i + 1] = {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1_score': f1_score(y_true, y_pred, average='macro'),
        # Reversed so the names are stored from most to least important
        'best_features': cv_data.engineered_features[best_features[::-1]].tolist(),
    }

# Persist the results: make sure the target folder exists and close the file
# deterministically instead of leaving an open handle behind.
HISTORY_DIR.mkdir(parents=True, exist_ok=True)
history_filename = 'xgb_' + '_'.join((str(num_classes),) + CONFIGURATIONS + SUBSET) + '_' + time.strftime("%Y-%m-%d-%H-%M-%S")
with open(HISTORY_DIR / f'{history_filename}.json', 'w') as fp:
    json.dump(history, fp)

  crest_factor = peak / rms  # how extreme the peaks are in a waveform
  form_factor = rms / mean  # the ratio of the RMS (root mean square) value to the average value
  pulse_indicator = peak / mean
  crest_factor = peak / rms  # how extreme the peaks are in a waveform
  form_factor = rms / mean  # the ratio of the RMS (root mean square) value to the average value
  pulse_indicator = peak / mean
  crest_factor = peak / rms  # how extreme the peaks are in a waveform
  form_factor = rms / mean  # the ratio of the RMS (root mean square) value to the average value
  pulse_indicator = peak / mean
  crest_factor = peak / rms  # how extreme the peaks are in a waveform
  form_factor = rms / mean  # the ratio of the RMS (root mean square) value to the average value
  pulse_indicator = peak / mean
  crest_factor = peak / rms  # how extreme the peaks are in a waveform
  form_factor = rms / mean  # the ratio of the RMS (root mean square) value to the average value
  pulse_indicator = peak / mean


Fitting 5 folds for each of 10 candidates, totalling 50 fits


  crest_factor = peak / rms  # how extreme the peaks are in a waveform
  form_factor = rms / mean  # the ratio of the RMS (root mean square) value to the average value
  pulse_indicator = peak / mean
  crest_factor = peak / rms  # how extreme the peaks are in a waveform
  form_factor = rms / mean  # the ratio of the RMS (root mean square) value to the average value
  pulse_indicator = peak / mean
  crest_factor = peak / rms  # how extreme the peaks are in a waveform
  form_factor = rms / mean  # the ratio of the RMS (root mean square) value to the average value
  pulse_indicator = peak / mean
  crest_factor = peak / rms  # how extreme the peaks are in a waveform
  form_factor = rms / mean  # the ratio of the RMS (root mean square) value to the average value
  pulse_indicator = peak / mean
  crest_factor = peak / rms  # how extreme the peaks are in a waveform
  form_factor = rms / mean  # the ratio of the RMS (root mean square) value to the average value
  pulse_indicator = peak / mean


Fitting 5 folds for each of 10 candidates, totalling 50 fits


  crest_factor = peak / rms  # how extreme the peaks are in a waveform
  form_factor = rms / mean  # the ratio of the RMS (root mean square) value to the average value
  pulse_indicator = peak / mean
  crest_factor = peak / rms  # how extreme the peaks are in a waveform
  form_factor = rms / mean  # the ratio of the RMS (root mean square) value to the average value
  pulse_indicator = peak / mean
  crest_factor = peak / rms  # how extreme the peaks are in a waveform
  form_factor = rms / mean  # the ratio of the RMS (root mean square) value to the average value
  pulse_indicator = peak / mean
  crest_factor = peak / rms  # how extreme the peaks are in a waveform
  form_factor = rms / mean  # the ratio of the RMS (root mean square) value to the average value
  pulse_indicator = peak / mean
  crest_factor = peak / rms  # how extreme the peaks are in a waveform
  form_factor = rms / mean  # the ratio of the RMS (root mean square) value to the average value
  pulse_indicator = peak / mean


Fitting 5 folds for each of 10 candidates, totalling 50 fits


  crest_factor = peak / rms  # how extreme the peaks are in a waveform
  form_factor = rms / mean  # the ratio of the RMS (root mean square) value to the average value
  pulse_indicator = peak / mean
  crest_factor = peak / rms  # how extreme the peaks are in a waveform
  form_factor = rms / mean  # the ratio of the RMS (root mean square) value to the average value
  pulse_indicator = peak / mean
  crest_factor = peak / rms  # how extreme the peaks are in a waveform
  form_factor = rms / mean  # the ratio of the RMS (root mean square) value to the average value
  pulse_indicator = peak / mean
  crest_factor = peak / rms  # how extreme the peaks are in a waveform
  form_factor = rms / mean  # the ratio of the RMS (root mean square) value to the average value
  pulse_indicator = peak / mean
  crest_factor = peak / rms  # how extreme the peaks are in a waveform
  form_factor = rms / mean  # the ratio of the RMS (root mean square) value to the average value
  pulse_indicator = peak / mean


Fitting 5 folds for each of 10 candidates, totalling 50 fits


  crest_factor = peak / rms  # how extreme the peaks are in a waveform
  form_factor = rms / mean  # the ratio of the RMS (root mean square) value to the average value
  pulse_indicator = peak / mean
  crest_factor = peak / rms  # how extreme the peaks are in a waveform
  form_factor = rms / mean  # the ratio of the RMS (root mean square) value to the average value
  pulse_indicator = peak / mean
  crest_factor = peak / rms  # how extreme the peaks are in a waveform
  form_factor = rms / mean  # the ratio of the RMS (root mean square) value to the average value
  pulse_indicator = peak / mean
  crest_factor = peak / rms  # how extreme the peaks are in a waveform
  form_factor = rms / mean  # the ratio of the RMS (root mean square) value to the average value
  pulse_indicator = peak / mean
  crest_factor = peak / rms  # how extreme the peaks are in a waveform
  form_factor = rms / mean  # the ratio of the RMS (root mean square) value to the average value
  pulse_indicator = peak / mean


Fitting 5 folds for each of 10 candidates, totalling 50 fits


  crest_factor = peak / rms  # how extreme the peaks are in a waveform
  form_factor = rms / mean  # the ratio of the RMS (root mean square) value to the average value
  pulse_indicator = peak / mean
  crest_factor = peak / rms  # how extreme the peaks are in a waveform
  form_factor = rms / mean  # the ratio of the RMS (root mean square) value to the average value
  pulse_indicator = peak / mean
  crest_factor = peak / rms  # how extreme the peaks are in a waveform
  form_factor = rms / mean  # the ratio of the RMS (root mean square) value to the average value
  pulse_indicator = peak / mean
  crest_factor = peak / rms  # how extreme the peaks are in a waveform
  form_factor = rms / mean  # the ratio of the RMS (root mean square) value to the average value
  pulse_indicator = peak / mean
  crest_factor = peak / rms  # how extreme the peaks are in a waveform
  form_factor = rms / mean  # the ratio of the RMS (root mean square) value to the average value
  pulse_indicator = peak / mean


Fitting 5 folds for each of 10 candidates, totalling 50 fits


  crest_factor = peak / rms  # how extreme the peaks are in a waveform
  form_factor = rms / mean  # the ratio of the RMS (root mean square) value to the average value
  pulse_indicator = peak / mean
  crest_factor = peak / rms  # how extreme the peaks are in a waveform
  form_factor = rms / mean  # the ratio of the RMS (root mean square) value to the average value
  pulse_indicator = peak / mean
  crest_factor = peak / rms  # how extreme the peaks are in a waveform
  form_factor = rms / mean  # the ratio of the RMS (root mean square) value to the average value
  pulse_indicator = peak / mean
  crest_factor = peak / rms  # how extreme the peaks are in a waveform
  form_factor = rms / mean  # the ratio of the RMS (root mean square) value to the average value
  pulse_indicator = peak / mean
  crest_factor = peak / rms  # how extreme the peaks are in a waveform
  form_factor = rms / mean  # the ratio of the RMS (root mean square) value to the average value
  pulse_indicator = peak / mean


Fitting 5 folds for each of 10 candidates, totalling 50 fits


  crest_factor = peak / rms  # how extreme the peaks are in a waveform
  form_factor = rms / mean  # the ratio of the RMS (root mean square) value to the average value
  pulse_indicator = peak / mean
  crest_factor = peak / rms  # how extreme the peaks are in a waveform
  form_factor = rms / mean  # the ratio of the RMS (root mean square) value to the average value
  pulse_indicator = peak / mean
  crest_factor = peak / rms  # how extreme the peaks are in a waveform
  form_factor = rms / mean  # the ratio of the RMS (root mean square) value to the average value
  pulse_indicator = peak / mean
  crest_factor = peak / rms  # how extreme the peaks are in a waveform
  form_factor = rms / mean  # the ratio of the RMS (root mean square) value to the average value
  pulse_indicator = peak / mean
  crest_factor = peak / rms  # how extreme the peaks are in a waveform
  form_factor = rms / mean  # the ratio of the RMS (root mean square) value to the average value
  pulse_indicator = peak / mean


Fitting 5 folds for each of 10 candidates, totalling 50 fits


  crest_factor = peak / rms  # how extreme the peaks are in a waveform
  form_factor = rms / mean  # the ratio of the RMS (root mean square) value to the average value
  pulse_indicator = peak / mean
  crest_factor = peak / rms  # how extreme the peaks are in a waveform
  form_factor = rms / mean  # the ratio of the RMS (root mean square) value to the average value
  pulse_indicator = peak / mean
  crest_factor = peak / rms  # how extreme the peaks are in a waveform
  form_factor = rms / mean  # the ratio of the RMS (root mean square) value to the average value
  pulse_indicator = peak / mean
  crest_factor = peak / rms  # how extreme the peaks are in a waveform
  form_factor = rms / mean  # the ratio of the RMS (root mean square) value to the average value
  pulse_indicator = peak / mean
  crest_factor = peak / rms  # how extreme the peaks are in a waveform
  form_factor = rms / mean  # the ratio of the RMS (root mean square) value to the average value
  pulse_indicator = peak / mean


Fitting 5 folds for each of 10 candidates, totalling 50 fits


  crest_factor = peak / rms  # how extreme the peaks are in a waveform
  form_factor = rms / mean  # the ratio of the RMS (root mean square) value to the average value
  pulse_indicator = peak / mean
  crest_factor = peak / rms  # how extreme the peaks are in a waveform
  form_factor = rms / mean  # the ratio of the RMS (root mean square) value to the average value
  pulse_indicator = peak / mean
  crest_factor = peak / rms  # how extreme the peaks are in a waveform
  form_factor = rms / mean  # the ratio of the RMS (root mean square) value to the average value
  pulse_indicator = peak / mean
  crest_factor = peak / rms  # how extreme the peaks are in a waveform
  form_factor = rms / mean  # the ratio of the RMS (root mean square) value to the average value
  pulse_indicator = peak / mean
  crest_factor = peak / rms  # how extreme the peaks are in a waveform
  form_factor = rms / mean  # the ratio of the RMS (root mean square) value to the average value
  pulse_indicator = peak / mean


Fitting 5 folds for each of 10 candidates, totalling 50 fits
