### Imports

In [1]:
import json
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random
import time

from pathlib import Path
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
from xgboost import XGBClassifier

import torch
import torch.nn as nn
from torch.optim.lr_scheduler import ExponentialLR
from torch.utils.data import DataLoader

from datasets import SurfaceDatasetXGB
from helpers import EarlyStopper
from models import CNNSurfaceClassifier

### Seed

In [2]:
# cuDNN reproducibility flags (see the PyTorch reproducibility notes).
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Seed every global RNG this notebook relies on with the same value:
# PyTorch, NumPy's legacy global state, and Python's `random` module.
torch.manual_seed(0)
np.random.seed(0)
random.seed(0)

def seed_worker(worker_id):
    """Re-seed Python's and NumPy's global RNGs from the current torch seed.

    The torch initial seed is reduced modulo 2**32 and used to seed both
    `random` and `np.random`. `worker_id` is accepted but not used.

    Presumably meant to be passed as `worker_init_fn` to a `DataLoader`
    (the signature matches that hook).
    """
    derived_seed = torch.initial_seed() % (1 << 32)
    random.seed(derived_seed)
    np.random.seed(derived_seed)

# Dedicated torch RNG seeded with 0, kept separate from the global torch RNG.
# Presumably intended for `DataLoader(generator=g)` together with
# `seed_worker` as `worker_init_fn`.
g = torch.Generator()
# `manual_seed` returns the generator itself, which is why this cell displays
# a Generator repr as its output.
g.manual_seed(0)

<torch._C.Generator at 0x22323bcb130>

### Constants

In [3]:
# Directory holding one CSV file per recorded run.
DATA_DIR = Path('../data/train_set/csv/')

# Windowing parameters forwarded to SurfaceDatasetXGB as `lookback`,
# `sample_freq` and `data_freq` respectively.
# NOTE(review): presumably seconds and Hz - confirm against the dataset class.
LOOKBACK = 8 / 3
SAMPLING_FREQUENCY = 75.0
DATASET_FREQUENCY = 150.0

# Signal groups handed to the dataset as `subset`.
SUBSET = ('imu',)

# Only runs whose 'kinematics' label is listed here are kept.
CONFIGURATIONS = ('4W',)

### Load and split data

In [4]:
# Load the per-run metadata. `labels` maps a run name to a dict; the cells
# below read its 'kinematics', 'spacing', 'trajectory' and 'surface' entries.
# The encoding is given explicitly because JSON is UTF-8 by specification,
# while `open()` would otherwise fall back to the platform default
# (e.g. cp1252 on Windows).
with open('../data/train_set/labels.json', encoding='utf-8') as fp:
    labels = json.load(fp)

In [5]:
# Build (csv_path, surface) pairs for the runs of interest: kinematics in
# CONFIGURATIONS, spacing 'R1', and a trajectory entry containing 'T1'.
dataset = [
    (DATA_DIR.joinpath(run_name + '.csv'), run_meta['surface'])
    for run_name, run_meta in labels.items()
    if run_meta['kinematics'] in CONFIGURATIONS
    and run_meta['spacing'] == 'R1'
    and 'T1' in run_meta['trajectory']
]

In [6]:
# Split the (csv_path, surface) pairs into two parallel containers:
# a Series of file paths and a list of primary surface labels.
X = pd.Series([csv_path for csv_path, _ in dataset], name='bag_name')
y_primary = [surface for _, surface in dataset]

In [7]:
# Coarse secondary classes derived from the primary surface labels.
# The active grouping is the "Pawel set". Use `y_secondary = []` instead to
# make the label-encoding cell fall back to the primary surface labels.
#
# Other groupings that were tried (anything unlisted is 'neutral'):
#   unnamed set:
#     slippery: '1_Panele', '5_Spienione_PCV', '6_Linoleum'
#     grippy:   '3_Wykladzina_jasna', '8_Pusta_plyta', '9_podklady'
#   "Clustering set":
#     slippery: '3_Wykladzina_jasna', '4_Trawa'
#     grippy:   '2_Wykladzina_czarna', '5_Spienione_PCV', '9_podklady', '10_Mata_ukladana'
_slippery_surfaces = ('3_Wykladzina_jasna', '4_Trawa')
_grippy_surfaces = ('5_Spienione_PCV', '8_Pusta_plyta', '9_podklady', '10_Mata_ukladana')

y_secondary = [
    'slippery' if surface in _slippery_surfaces
    else 'grippy' if surface in _grippy_surfaces
    else 'neutral'
    for surface in y_primary
]

In [8]:
# Encode string labels as integers. The secondary (coarse) labels take
# precedence; an empty `y_secondary` falls back to the primary surfaces.
le = LabelEncoder()
target_labels = y_secondary if y_secondary else y_primary
le.fit(target_labels)
y = le.transform(target_labels)

# Class names in encoder order, and how many there are.
classes = le.classes_
num_classes = len(classes)

In [9]:
# Hold out 20% of the runs for testing, stratified by class. The explicit
# random_state makes the partition independent of how much of NumPy's global
# RNG has already been consumed, so re-running this cell on its own (or after
# inserting cells above) reproduces the same split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=0
)

# Renumber both path Series from 0 so label and positional indices coincide.
# Rebinding instead of `inplace=True` keeps the cell's data flow explicit.
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)

### Custom datasets

In [10]:
# Dataset arguments shared by the train and test splits.
dataset_kwargs = dict(
    sample_freq=SAMPLING_FREQUENCY,
    data_freq=DATASET_FREQUENCY,
    lookback=LOOKBACK,
    subset=SUBSET,
)

# Training loader: one batch sized to the number of training runs, shuffled.
# The seeded generator `g` and `seed_worker` from the seed cell are wired in
# so the shuffle order comes from a dedicated RNG rather than from whatever
# state the global torch RNG happens to be in.
train_dataloader = DataLoader(
    SurfaceDatasetXGB(X_train, y_train, **dataset_kwargs),
    batch_size=len(X_train),
    shuffle=True,
    worker_init_fn=seed_worker,
    generator=g,
)

# Test loader: same dataset configuration, original order.
test_dataloader = DataLoader(
    SurfaceDatasetXGB(X_test, y_test, **dataset_kwargs),
    batch_size=len(X_test),
)

In [11]:
def _first_batch(loader):
    """Return the first batch yielded by `loader`."""
    return next(iter(loader))


# Pull one batch from each loader and rebind the split names to it.
# From here on `X_train` / `X_test` hold the loaded batches, no longer the
# Series of CSV paths.
# NOTE(review): only the first batch is consumed; this covers a whole split
# only if the dataset yields at most `batch_size` items - confirm against
# SurfaceDatasetXGB.__len__.
X_train, y_train = _first_batch(train_dataloader)
X_test, y_true = _first_batch(test_dataloader)

In [12]:
# Candidate hyper-parameter values sampled by the randomized search below.
param_grid = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.1, 0.3],
    max_depth=[3, 4, 5],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
    min_child_weight=[1, 3, 5],
    reg_alpha=[0.1, 0.5],
    reg_lambda=[0.1, 0.5],
)

### Model

In [13]:
# Untuned base estimator handed to the hyper-parameter search:
# multi-class soft-probability objective over `num_classes` classes.
xgb_model = XGBClassifier(objective='multi:softprob', num_class=num_classes)

### Training loop

In [14]:
# 1) Randomized hyper-parameter search with 5-fold CV, scored by accuracy.
#    random_state pins which candidates are sampled, so the search does not
#    depend on how much of NumPy's global RNG was consumed by earlier cells.
clf_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=10,
    random_state=0,
)
clf_search.fit(X_train, y_train)

# 2) Rank features by the best estimator's importances (ascending) and keep
#    the 25 most important column indices. If there are fewer than 25
#    features, the slice simply keeps all of them.
importances = clf_search.best_estimator_.feature_importances_
idx = np.argsort(importances)
best_features = idx[-25:]

# 3) Refit a fresh classifier with the best hyper-parameters on the selected
#    feature columns only.
xgb_tuned = XGBClassifier(
    objective='multi:softprob',
    num_class=num_classes,
    **clf_search.best_params_,
)
xgb_tuned.fit(X_train[:, best_features], y_train)

# 4) Save the tuned model under a timestamped name. The target directory is
#    created first so a fresh checkout does not fail at save time.
# NOTE(review): `best_features` is not persisted with the checkpoint; anything
# loading this file outside the current kernel needs the same column subset.
model_name =  'xgb_classifier_' + '_'.join((str(num_classes),) + CONFIGURATIONS + SUBSET) + '_' + time.strftime("%Y-%m-%d-%H-%M-%S")
Path('../results/checkpoints').mkdir(parents=True, exist_ok=True)
xgb_tuned.save_model(f"../results/checkpoints/{model_name}.json")

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [15]:
# Rebuild an estimator and restore the checkpoint written by the training
# cell (same `model_name`, same path). Note that this rebinds `xgb_model`,
# replacing the untuned base estimator defined earlier.
xgb_model = XGBClassifier(objective='multi:softprob', num_class=num_classes)
checkpoint_path = f"../results/checkpoints/{model_name}.json"
xgb_model.load_model(checkpoint_path)

In [16]:
# Predict on the held-out batch, restricted to the same feature columns the
# tuned model was fitted on.
X_test_selected = X_test[:, best_features]
y_pred = xgb_model.predict(X_test_selected)