In [None]:
import datetime
import glob
import logging
import multiprocessing
import os
import sys
import json
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from scipy.interpolate import interp1d
from sklearn.metrics import average_precision_score
from sklearn.preprocessing import StandardScaler
from torch import nn, optim
from torch.cuda.amp import GradScaler, autocast
from torch.nn.utils.rnn import pad_sequence
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader, Dataset, SubsetRandomSampler
from tqdm.notebook import tqdm

# --- Run configuration ------------------------------------------------------
RND_SEED = 0  # random seed constant (not consumed by the cells visible here)
GPU_ID = 0  # index of the CUDA device to run on
USE_GPU = True  # set to False to force CPU execution

# Start from the CPU and switch to the requested GPU only when it is both
# allowed by the configuration and actually available.
device = torch.device("cpu")
if USE_GPU and torch.cuda.is_available():
    gpu_name = torch.cuda.get_device_name(GPU_ID)
    device = torch.device(f"cuda:{GPU_ID}")
    print(f"Using GPU {GPU_ID} - {gpu_name}")

# Root folder of the competition data, relative to the working directory.
BASE_FOLDER = os.path.join(
    "..",
    "input",
    "tlvmc-parkinsons-freezing-gait-prediction",
)

N_CPU_CORES = multiprocessing.cpu_count()
print(f"Number of CPU cores available: {N_CPU_CORES}")

## Model

In [None]:
class ResidualBiGRU(nn.Module):
    """Bidirectional GRU followed by a two-layer MLP and a skip connection.

    The block maps features of size ``hidden_size`` to features of the same
    size, which is what allows the input to be added back to the output.

    Args:
        hidden_size: feature size of both the input and the output.
        n_layers: number of stacked GRU layers inside this block.
    """

    def __init__(self, hidden_size, n_layers=1):
        super().__init__()

        self.hidden_size = hidden_size
        self.n_layers = n_layers

        # Attribute names and creation order are kept stable: they define the
        # state_dict keys used by saved checkpoints.
        self.gru = nn.GRU(
            hidden_size,
            hidden_size,
            n_layers,
            batch_first=True,
            bidirectional=True,
        )
        wide_size = 4 * hidden_size
        self.fc1 = nn.Linear(2 * hidden_size, wide_size)
        self.ln1 = nn.LayerNorm(wide_size)
        self.fc2 = nn.Linear(wide_size, hidden_size)
        self.ln2 = nn.LayerNorm(hidden_size)

    def forward(self, x, h=None):
        """Run the block on ``x`` with an optional initial GRU state ``h``.

        Returns:
            A ``(features, new_h)`` tuple: the block output (same shape as
            ``x``) and the hidden state returned by the GRU.
        """
        # gru_out.shape = (batch_size, sequence_size, 2*hidden_size)
        gru_out, new_h = self.gru(x, h)

        # expand -> normalize -> activate, then project back to hidden_size
        expanded = nn.functional.relu(self.ln1(self.fc1(gru_out)))
        projected = nn.functional.relu(self.ln2(self.fc2(expanded)))

        # skip connection around the whole block
        return projected + x, new_h


class MultiResidualBiGRU(nn.Module):
    """Stack of ``ResidualBiGRU`` blocks between an input and an output layer.

    Args:
        input_size: number of features per time step in the input.
        hidden_size: feature size used by every residual block.
        out_size: number of output values per time step.
        n_layers: number of stacked ``ResidualBiGRU`` blocks.
    """

    def __init__(self, input_size, hidden_size, out_size, n_layers):
        super().__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.out_size = out_size
        self.n_layers = n_layers

        # Attribute names and creation order are kept stable: they define the
        # state_dict keys used by saved checkpoints.
        self.fc_in = nn.Linear(input_size, hidden_size)
        self.ln = nn.LayerNorm(hidden_size)
        blocks = []
        for _ in range(n_layers):
            blocks.append(ResidualBiGRU(hidden_size, n_layers=1))
        self.res_bigrus = nn.ModuleList(blocks)
        self.fc_out = nn.Linear(hidden_size, out_size)

    def forward(self, x, h=None):
        """Run the network on ``x``.

        Args:
            x: input tensor whose last dimension is ``input_size``.
            h: optional list with one hidden state per residual block;
                ``None`` means "start of a sequence" (no previous state).

        Returns:
            A ``(output, new_h)`` tuple: the raw output of the last linear
            layer (no softmax applied here) and the list of hidden states
            returned by each residual block.
        """
        # no hidden state given: every block starts from its default state
        states_in = [None] * self.n_layers if h is None else h

        x = nn.functional.relu(self.ln(self.fc_in(x)))

        states_out = []
        for idx, block in enumerate(self.res_bigrus):
            x, block_state = block(x, states_in[idx])
            states_out.append(block_state)

        return self.fc_out(x), states_out

## Auxiliary functions

In [None]:
def get_model_id(model, params, extra_info="", timestamp=""):
    """Build an identifier string for a model and its hyper-parameters.

    The id is the underscore-joined sequence of: the model's class name, one
    ``<key><value>`` chunk per entry of ``params``, ``extra_info`` (if not
    empty) and ``timestamp``. When ``timestamp`` is empty the current local
    time, formatted as ``%y%m%d%H%M%S``, is used instead.
    """
    chunks = [f"{type(model).__name__}"]
    chunks.extend(f"{key}{value}" for key, value in params.items())

    if extra_info != "":
        chunks.append(f"{extra_info}")

    if timestamp == "":
        timestamp = datetime.datetime.now().strftime("%y%m%d%H%M%S")
    chunks.append(f"{timestamp}")

    return "_".join(chunks)


def read_seq(fpath):
    """Read one sequence CSV file.

    Returns:
        A ``(seq_id, seq)`` tuple where ``seq_id`` is the file name up to its
        first dot and ``seq`` is the file content as a DataFrame.
    """
    file_name = fpath.rsplit(os.path.sep, 1)[-1]
    seq_id, _, _ = file_name.partition(".")
    return seq_id, pd.read_csv(fpath)


def resample_seq_df(df, in_hz, out_hz, with_classes=True, with_bool_cols=True):
    """Resample a sequence DataFrame from ``in_hz`` to ``out_hz``.

    ``df["Time"]`` is interpreted as a sample counter at ``in_hz`` and turned
    into a timedelta index used for resampling. The input DataFrame is left
    untouched, so the function can safely be called again on the same frame
    (e.g. when a notebook cell is re-run).

    Args:
        df: DataFrame with a ``Time`` column and the ``AccV``, ``AccML`` and
            ``AccAP`` columns (plus the class / boolean columns if requested).
        in_hz: sampling frequency of ``df``.
        out_hz: sampling frequency of the returned DataFrame.
        with_classes: also resample ``StartHesitation``, ``Turn``, ``Walking``.
        with_bool_cols: also resample ``Valid`` and ``Task``.

    Returns:
        A new DataFrame indexed by time (timedelta) at ``out_hz``.
    """
    FLOAT_COLS = ["AccV", "AccML", "AccAP"]
    CLASSES_COLS = ["StartHesitation", "Turn", "Walking"] if with_classes else []
    BOOL_COLS = ["Valid", "Task"] if with_bool_cols else []

    in_ms = (1 / in_hz) * 1000
    out_ms = (1 / out_hz) * 1000
    rule = f"{out_ms}ms"

    # Build the time index on a copy: assigning to df["Time"] directly would
    # overwrite the caller's column and corrupt any later call on the same df.
    timed_df = df.assign(
        Time=pd.to_timedelta(df["Time"] * in_ms, unit="ms")
    ).set_index("Time")

    resampled_df = (
        timed_df[FLOAT_COLS]
        .resample(rule)
        .mean()  # new val = mean of the samples falling in each output interval
        .interpolate()  # when upsampling some intervals are empty: fill them
        # by interpolation (linear by default)
    )

    cols = CLASSES_COLS + BOOL_COLS
    if cols:
        resampled_df[cols] = (
            timed_df[cols]
            .resample(rule)
            .first()
            .ffill()  # new val = previous val
        )

    # needed as the introduction of NaNs forced pd to make all cols float
    if CLASSES_COLS:
        resampled_df[CLASSES_COLS] = resampled_df[CLASSES_COLS].astype(int)
    if BOOL_COLS:
        resampled_df[BOOL_COLS] = resampled_df[BOOL_COLS].astype(bool)

    return resampled_df


def convert_g_to_ms2(df):
    """Convert the acceleration columns of ``df`` from g to m/s^2.

    The ``AccV``, ``AccML`` and ``AccAP`` columns are overwritten in place and
    the same DataFrame object is returned.
    """
    G_IN_MS2 = 9.80665  # 1g = 9.80665m/s^2
    for acc_col in ("AccV", "AccML", "AccAP"):
        df[acc_col] = df[acc_col] * G_IN_MS2
    return df


def normalize(seq_features):
    """Standardize ``seq_features`` with a freshly fitted ``StandardScaler``.

    The scaler is fitted on the given data only, so every sequence is
    normalized with its own statistics.
    """
    scaler = StandardScaler()
    return scaler.fit_transform(seq_features)


def preprocess_tdcs_seq(seq_df, device, down_hz=None):
    """Turn a tdcsfog sequence DataFrame into a model-ready tensor.

    Args:
        seq_df: DataFrame holding the ``AccV``, ``AccML`` and ``AccAP`` columns.
        device: torch device the returned tensor is moved to.
        down_hz: when given, the sequence is first resampled from 128Hz to
            this frequency; when ``None`` no resampling is done.

    Returns:
        A float tensor of shape ``(1, n_steps, 3)`` (batch dimension added).
    """
    feature_cols = ["AccV", "AccML", "AccAP"]

    if down_hz is not None:
        # tdcsfog data is handled as 128Hz: bring it to the requested rate
        seq_df = resample_seq_df(
            seq_df, 128, down_hz, with_classes=False, with_bool_cols=False
        )

    # keep only the feature columns and normalize them
    features = normalize(seq_df[feature_cols].values)
    features = torch.from_numpy(features).float().to(device)
    return features.unsqueeze(0)  # adding batch dim


def preprocess_defog_seq(seq_df, device, down_hz=None):
    """Turn a defog sequence DataFrame into a model-ready tensor.

    The sequence is resampled from 100Hz to ``down_hz`` (or to 128Hz when
    ``down_hz`` is ``None``), converted from g to m/s^2 and then handed to
    ``preprocess_tdcs_seq`` without any further resampling.

    Args:
        seq_df: DataFrame with ``Time``, ``AccV``, ``AccML`` and ``AccAP``.
        device: torch device the returned tensor is moved to.
        down_hz: target frequency, or ``None`` to resample to 128Hz.

    Returns:
        The tensor produced by ``preprocess_tdcs_seq``.
    """
    # no explicit target rate means "match the 128Hz rate used for tdcsfog"
    target_hz = 128 if down_hz is None else down_hz
    resampled_df = resample_seq_df(
        seq_df, 100, target_hz, with_classes=False, with_bool_cols=False
    )

    # defog data is in g: we convert it into m/s^2
    converted_df = convert_g_to_ms2(resampled_df)

    return preprocess_tdcs_seq(converted_df, device)


def predict_lbls_df(model, seq, seq_id):
    """Return the per-step probabilities of the first three classes.

    Args:
        model: callable returning ``(output, hidden_state)`` for ``seq``.
        seq: batched input; only the first element of the batch is used.
        seq_id: unused, kept for interface compatibility.

    Returns:
        A numpy array with one row per step and three columns. The softmax is
        taken over all model outputs before the first three are selected.
    """
    logits, _ = model(seq)
    probs = torch.nn.functional.softmax(logits[0], dim=1)
    return probs.cpu().numpy()[:, :3]


def resample_seq(seq_inhz, in_hz, out_hz):
    """Linearly resample every column of a 2-D array from ``in_hz`` to ``out_hz``.

    Both the input and the output rows are placed on evenly spaced grids over
    [0, 1], and each column is interpolated from one grid to the other.

    Returns:
        A float array with ``int(n_rows * out_hz / in_hz)`` rows and the same
        number of columns as ``seq_inhz``.
    """
    n_in, n_cols = seq_inhz.shape[0], seq_inhz.shape[1]
    n_out = int(n_in * (out_hz / in_hz))

    grid_in = np.linspace(0, 1, n_in)
    grid_out = np.linspace(0, 1, n_out)

    seq_outhz = np.zeros((n_out, n_cols))
    for col in range(n_cols):
        column_interp = interp1d(grid_in, seq_inhz[:, col])
        seq_outhz[:, col] = column_interp(grid_out)

    return seq_outhz


def build_res_df(preds, seq_id=None):
    """Wrap per-step predictions in a DataFrame indexed by submission ids.

    Args:
        preds: 2-D array with one row per step and one column per class
            (``StartHesitation``, ``Turn``, ``Walking``).
        seq_id: id of the sequence the predictions belong to. When omitted,
            the notebook-level ``seq_id`` variable is used, which keeps
            existing ``build_res_df(preds)`` calls working; passing it
            explicitly is preferred because it avoids hidden global state.

    Returns:
        A DataFrame whose index is ``<seq_id>_<step>`` for every row of
        ``preds``.

    Raises:
        NameError: if ``seq_id`` is neither passed nor defined globally.
    """
    CLASSES = ["StartHesitation", "Turn", "Walking"]

    if seq_id is None:
        # backward-compatible fallback on the global set by the calling cell
        try:
            seq_id = globals()["seq_id"]
        except KeyError:
            raise NameError(
                "seq_id was not passed and no global 'seq_id' is defined"
            ) from None

    steps_ids = [f"{seq_id}_{i}" for i in range(preds.shape[0])]
    res_df = pd.DataFrame(data=preds, columns=CLASSES, index=steps_ids)
    return res_df

## Submission

In [None]:
# Folder holding one sub-folder per trained model (params.json + model.pth).
MODELS_FOLDER = os.path.join("..", "input", "models")
model_id = "best_model" # best model
# model_id = "simplified_model" # simplified model, using only 1 layer instead of 3

# Hyper-parameters saved next to the weights; they drive both the model
# architecture and the preprocessing (e.g. PARAMS["DOWNHZ"]).
params_path = os.path.join(MODELS_FOLDER, model_id, "params.json")
with open(params_path, "r", encoding="utf-8") as f:
    PARAMS = json.load(f)
print(PARAMS)

model = MultiResidualBiGRU(
    PARAMS["ISIZE"],
    PARAMS["HSIZE"],
    PARAMS["NC"],
    PARAMS["NL"],
)

model_path = os.path.join(MODELS_FOLDER, model_id, "model.pth")
model = model.to(device)
# map_location remaps the stored tensors onto the device selected above, so a
# checkpoint saved from a GPU can still be loaded when running on CPU.
# NOTE: torch.load unpickles the file - only load checkpoints you trust.
model.load_state_dict(torch.load(model_path, map_location=device))
model.eval()

In [None]:
TEST_ROOT_PATH = os.path.join(BASE_FOLDER, "test")
defog_TEST_PATH = os.path.join(TEST_ROOT_PATH, "defog")
tDCS_TEST_PATH = os.path.join(TEST_ROOT_PATH, "tdcsfog")

# every entry directly inside each test folder is treated as a sequence file
defog_test_fpaths = glob.glob(os.path.join(defog_TEST_PATH, "**"))
tdcs_test_fpaths = glob.glob(os.path.join(tDCS_TEST_PATH, "**"))

SUB_PATH = os.path.join(BASE_FOLDER, "sample_submission.csv")
sub_df = pd.read_csv(SUB_PATH)

# Id column temporarily becomes the explicit pandas index
sub_df.set_index("Id", inplace=True)

model.eval()
with torch.no_grad():
    for root_path, fpaths in [
        (tDCS_TEST_PATH, tdcs_test_fpaths),
        (defog_TEST_PATH, defog_test_fpaths),
    ]:
        is_defog = root_path == defog_TEST_PATH
        for fpath in fpaths:
            # reading the sequence and its id
            seq_id, seq_df = read_seq(fpath)
            # number of steps (rows) that need a prediction for this sequence
            n_steps = seq_df.shape[0]

            # preprocessing the sequence
            if is_defog:
                seq = preprocess_defog_seq(
                    seq_df, device, down_hz=PARAMS["DOWNHZ"]
                )
            else:
                seq = preprocess_tdcs_seq(
                    seq_df, device, down_hz=PARAMS["DOWNHZ"]
                )

            # using the model to predict labels
            preds = predict_lbls_df(model, seq, seq_id)

            # bringing the predictions back to the original sampling rate
            if is_defog:
                in_hz = 128 if PARAMS["DOWNHZ"] is None else PARAMS["DOWNHZ"]
                preds = resample_seq(preds, in_hz, 100)
            elif PARAMS["DOWNHZ"] is not None:  # tdcs
                preds = resample_seq(preds, PARAMS["DOWNHZ"], 128)

            # The two resampling steps round the number of rows, so the
            # predictions may be slightly shorter or longer than the original
            # sequence. Missing trailing steps would silently keep the
            # placeholder values of the sample submission: repeat the last
            # prediction for them, and drop any extra step.
            n_preds = preds.shape[0]
            if 0 < n_preds < n_steps:
                preds = np.pad(
                    preds, ((0, n_steps - n_preds), (0, 0)), mode="edge"
                )
            elif n_preds > n_steps:
                preds = preds[:n_steps]

            # build_res_df reads the notebook-level `seq_id` set above
            res_df = build_res_df(preds)

            # updating the submission dataframe with these partial results
            sub_df.update(res_df)

# kept from the original flow: the model is switched back to training mode
model.train()

# making Id back to a column and saving the dataframe in a csv file
sub_df.reset_index(inplace=True)
sub_df.to_csv("submission.csv", index=False)

display(sub_df)