# Combined model tsfresh

In [None]:
import pandas as pd
import numpy as np

from tsfresh.feature_extraction import extract_features, EfficientFCParameters, MinimalFCParameters
from tsfresh.utilities.dataframe_functions import impute
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler

import matplotlib.pyplot as plt

from xgboost import XGBClassifier, plot_importance
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report, balanced_accuracy_score
from sklearn.utils.class_weight import compute_class_weight
from sklearn.utils.class_weight import compute_sample_weight
from tsfresh.feature_selection import select_features
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.model_selection import cross_validate

from pyts.transformation import BOSS

from catboost import CatBoostClassifier

In [6]:
# Directory prefix of the preprocessed PADS files (file_list.csv and movement/*.bin).
# Keep the trailing slash: file_list.csv is located by plain string concatenation
# onto this prefix.
raw_data_prep_dir = '../raw_data/pads-parkinsons-disease-smartwatch-dataset-1.0.0/preprocessed/'

In [7]:
# Per-subject metadata table (one row per subject, including `id` and `label`).
file_list = pd.read_csv(raw_data_prep_dir  + "file_list.csv")
# Bare expression so the notebook renders the table.
file_list

Unnamed: 0,resource_type,id,study_id,condition,disease_comment,age_at_diagnosis,age,height,weight,gender,handedness,appearance_in_kinship,appearance_in_first_grade_kinship,effect_of_alcohol_on_tremor,label
0,patient,1,PADS,Healthy,-,56,56,173,78,male,right,True,True,Unknown,0
1,patient,2,PADS,Other Movement Disorders,Left-Sided resting tremor and hypokinesia with...,69,81,193,104,male,right,False,,No effect,2
2,patient,3,PADS,Healthy,-,45,45,170,78,female,right,False,,Unknown,0
3,patient,4,PADS,Parkinson's,IPS akinetic-rigid type,63,67,161,90,female,right,False,,No effect,1
4,patient,5,PADS,Parkinson's,IPS tremordominant type,65,75,172,86,male,left,False,,Unknown,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
464,patient,465,PADS,Parkinson's,IPS mixed type,62,65,175,80,male,right,True,False,No effect,1
465,patient,466,PADS,Healthy,-,84,84,172,74,female,right,True,True,No effect,0
466,patient,467,PADS,Parkinson's,"Essential Tremor, starting IPS tremordominant ...",55,57,190,100,male,right,False,,Improvement,1
467,patient,468,PADS,Parkinson's,IPS mixed type,73,76,198,118,male,right,False,,No effect,1


In [21]:
def load_time_series_data(file_list, data_dir=None, n_timesteps=976):
    """Load every subject's movement recording together with its label.

    For each entry of ``file_list['id']`` the file
    ``<data_dir>/movement/<id zero-padded to 3 digits>_ml.bin`` is read as raw
    float32 values and reshaped into rows of ``n_timesteps`` samples.

    Args:
        file_list: DataFrame with an ``id`` column (integer subject ids) and a
            ``label`` column, one row per subject.
        data_dir: Directory containing the ``movement`` folder. Defaults to
            the notebook-level ``raw_data_prep_dir``.
        n_timesteps: Number of samples per row of a recording file.

    Returns:
        A tuple ``(time_data, X_to_split, y)``:
        ``time_data`` is the recordings stacked along a new first axis,
        ``X_to_split`` holds the row positions ``0..len(file_list)-1`` (meant
        to be split instead of the data itself), and ``y`` is the label of
        each row, in the same order.
    """
    import os

    if data_dir is None:
        data_dir = raw_data_prep_dir

    recordings = []
    for subject_id in file_list['id']:
        # os.path.join works whether or not data_dir ends with a separator.
        file_name = os.path.join(data_dir, 'movement', f'{subject_id:03d}_ml.bin')
        recording = np.fromfile(file_name, dtype=np.float32).reshape((-1, n_timesteps))
        recordings.append(recording)

    time_data = np.array(recordings)
    X_to_split = np.arange(len(recordings))
    # Take labels row by row rather than looking each id up with a boolean
    # mask: the mask is quadratic overall and returns zero or several labels
    # for a missing/duplicated id, which would misalign y with X_to_split.
    y = file_list['label'].to_numpy(copy=True)
    return time_data, X_to_split, y

# Placeholder globals; the next cell rebinds all three to the arrays returned
# by load_time_series_data(file_list).
time_data = []
y = []
X_to_split = []


In [22]:
# Read every subject's recording: time_data holds the signals, X_to_split the
# row positions into time_data, and y the matching labels.
time_data, X_to_split, y = load_time_series_data(file_list)

In [23]:
# Sanity check of the stacked array: (subjects, rows per file, samples per row).
time_data.shape

(469, 132, 976)

In [24]:
# Stratified 80/20 split over row positions rather than over the data itself,
# so the resulting index arrays can be used to slice time_data.
# random_state pins the shuffle for reproducibility.
X_train_idx, X_test_idx, y_train, y_test = train_test_split(X_to_split, y, test_size=0.20, random_state=42, stratify = y)
X_train = time_data[X_train_idx]
X_test = time_data[X_test_idx]
# Data and label partitions should have pairwise equal lengths.
print(len(X_train), len(X_test))
print(len(y_train), len(y_test))
X_train.shape

375 94
375 94


(375, 132, 976)

In [None]:
# shape (N, F, T)
# N is the size of the test split here; F and T are the two trailing axes of
# the recordings (132 and 976 in the time_data.shape output above).
N, F, T = X_test.shape
# Names feat_0 ... feat_{F-1}, one per index along the F axis.
feature_names = [f"feat_{i}" for i in range(F)]

def make_long_df(batch_data: np.ndarray, id_offset: int = 0, kind_names=None) -> pd.DataFrame:
    """Flatten a batch of multi-series recordings into a long-format table.

    All dimensions are taken from ``batch_data`` itself, so the function works
    for any batch regardless of notebook-level globals.

    Args:
        batch_data: Array of shape ``(n_batch, n_kinds, n_times)``.
        id_offset: Value of the ``id`` column for the first sample; sample
            ``i`` gets ``id_offset + i``.
        kind_names: Optional sequence of ``n_kinds`` labels for the ``kind``
            column. Defaults to ``feat_0 ... feat_{n_kinds-1}``.

    Returns:
        DataFrame with columns ``id``, ``time``, ``kind`` and ``value`` and
        ``n_batch * n_times * n_kinds`` rows, ordered by sample, then time
        step, then kind.

    Raises:
        ValueError: If ``batch_data`` is not 3-dimensional or ``kind_names``
            does not have ``n_kinds`` entries.
    """
    batch_data = np.asarray(batch_data)
    if batch_data.ndim != 3:
        raise ValueError(
            f"batch_data must have shape (n_batch, n_kinds, n_times), got {batch_data.shape}"
        )
    n_batch, n_kinds, n_times = batch_data.shape

    if kind_names is None:
        kind_names = [f"feat_{i}" for i in range(n_kinds)]
    elif len(kind_names) != n_kinds:
        raise ValueError(
            f"kind_names has {len(kind_names)} entries but batch_data has {n_kinds} kinds"
        )

    # (n, F, T) -> (n, T, F): after flattening, kind varies fastest, then
    # time, then sample. The index columns below follow the same order.
    values = np.transpose(batch_data, (0, 2, 1)).reshape(-1)

    ids = np.repeat(np.arange(id_offset, id_offset + n_batch), n_times * n_kinds)
    times = np.tile(np.repeat(np.arange(n_times), n_kinds), n_batch)
    kinds = np.tile(np.array(list(kind_names), dtype=object), n_batch * n_times)

    return pd.DataFrame({"id": ids, "time": times, "kind": kinds, "value": values})

# Build the long-format table from the training split; the row count noted on
# the final print (375) is that of X_train.
df_long = make_long_df(X_train, id_offset=0)


# MinimalFCParameters limits tsfresh to a small set of basic summary features
# per (id, kind) pair; n_jobs=0 runs the extraction without multiprocessing.
X_features = extract_features(
    df_long,
    column_id="id",
    column_sort="time",
    column_kind="kind",
    column_value="value",
    default_fc_parameters=MinimalFCParameters(),
    n_jobs=0
)

print(X_features.shape)  # (375, num_features)