In [None]:
import glob
import os
import pickle

import librosa as lib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedGroupKFold

In [None]:
# Point the paths below at the directory where you saved the competition data.

# [1:] drops the first metadata row, which the original author describes as an
# example entry rather than a real recording.
tdcs_meta = pd.read_csv('tdcsfog_metadata.csv')[1:]
defog_meta = pd.read_csv('defog_metadata.csv')[1:]

# glob returns matches in an arbitrary, filesystem-dependent order. Sorting makes
# the file order (and therefore the row order of everything built from it)
# identical from run to run and machine to machine.
defog_list = sorted(glob.glob('train/defog/*.csv'))
tdcs_list = sorted(glob.glob('train/tdcsfog/*.csv'))

# Combined metadata and combined recording list, defog first in both.
meta = pd.concat([defog_meta, tdcs_meta])
lists = defog_list + tdcs_list

In [None]:

'''
Extract the accelerometer signals from each recording CSV (or parquet, if you
are using extra data for pseudo-labeling), save them as .npy files, optionally
apply per-channel mean-std normalization, and pickle the result so the dataset
class can read everything from a single file.
'''

# np.save does not create missing directories.
os.makedirs('train/data', exist_ok=True)

for csv_path in lists:
    # os.path.basename copes with the platform's path separator(s); a hard-coded
    # split('/') leaves the directory in the name for backslash-separated paths.
    name = os.path.basename(csv_path).split('.')[0]
    df = pd.read_csv(csv_path)
    data = np.asarray(df[['AccV', 'AccAP', 'AccML']])
    np.save(f'train/data/{name}.npy', data)

npy_l = glob.glob('train/data/*.npy')

data = {}

# Do mean-std Normalization
Normalize = True

for npy_path in npy_l:
    npy = np.load(npy_path)
    if Normalize:
        # Each array is (n_samples, 3): one column per accelerometer axis.
        # Normalize every column over time (axis 0). Reshaping to (3, -1) does
        # NOT transpose: it cuts the row-major flattened signal into three
        # consecutive chunks that each mix all three axes, so statistics taken
        # that way are not per-channel.
        # NOTE(review): this changes the stored values relative to earlier
        # pickles; anything trained on the old normalization must be retrained
        # or fed data preprocessed the old way.
        mean = npy.mean(0, keepdims=True)
        std = npy.std(0, keepdims=True)
        # A constant channel has zero std; divide by 1 instead to avoid NaN/inf.
        std = np.where(std == 0, 1.0, std)
        npy = (npy - mean) / std
    # Key each signal by the bare recording id, independent of path separator.
    name = os.path.basename(npy_path).split('.')[0]
    data[name] = npy


# Save the data
with open('npy.pickle', 'wb') as handle:
    pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
'''
Now build one dataframe by concatenating every defog and tdcsfog recording.
The variables 'lists' and 'meta' come from the data-loading cell above.
Each row is tagged with the id of its recording ('id') and with the subject
that recording belongs to ('Subject').
'''

# Recording id -> subject lookup. If an Id appears more than once in the
# metadata, the last occurrence wins.
subject_by_id = dict(zip(meta['Id'], meta['Subject']))

frames = []
for path in lists:
    b = pd.read_csv(path)
    # os.path.basename handles the platform's separator; splitting on '\\' only
    # works for Windows-style paths and otherwise leaves the whole path as the id,
    # so no row would ever match the metadata.
    rec_id = os.path.basename(path).split('.')[0]
    b['id'] = rec_id
    # Recordings without a metadata entry keep the placeholder subject 0.
    b['Subject'] = subject_by_id.get(rec_id, 0)
    frames.append(b)

if not frames:
    raise FileNotFoundError("No recording CSVs found; check the paths used to build 'lists'.")

# Concatenate once (growing a frame inside the loop is quadratic) and give the
# result a fresh 0..n-1 index so index labels are unique across recordings.
a = pd.concat(frames, ignore_index=True)
    


In [None]:
'''
    Now we need to make folds using StratifiedGroupKFold(y = new label, groups = Subject).
    The original labels cannot be split by StratifiedGroupKFold because it does not
    support multi-label data, so we collapse them into a single multi-class label:
        0 = no event, 1 = StartHesitation only, 2 = Turn only, 3 = Walking only.
    Rows matching none of these patterns keep the label -1.
'''

# make new multi-class
# Positional boolean masks, so the assignment is correct even if the frame's
# index contains duplicate labels.
sh_on = (a['StartHesitation'] == 1).to_numpy()
sh_off = (a['StartHesitation'] == 0).to_numpy()
turn_on = (a['Turn'] == 1).to_numpy()
turn_off = (a['Turn'] == 0).to_numpy()
walk_on = (a['Walking'] == 1).to_numpy()
walk_off = (a['Walking'] == 0).to_numpy()

a['new_label'] = -1
# Use .loc rather than a['new_label'][idx] = ...: chained assignment writes to a
# temporary under pandas Copy-on-Write and silently leaves `a` unchanged.
a.loc[sh_off & turn_off & walk_off, 'new_label'] = 0
a.loc[sh_on & turn_off & walk_off, 'new_label'] = 1
a.loc[sh_off & turn_on & walk_off, 'new_label'] = 2
a.loc[sh_off & turn_off & walk_on, 'new_label'] = 3


def get_folds(df, seed, n_splits):
    """Assign a cross-validation fold number to every row of ``df``.

    Rows are split with ``StratifiedGroupKFold`` using ``df['new_label']`` as
    the stratification target and ``df['Subject']`` as the group, so all rows
    sharing a subject land in the same fold.

    Args:
        df: DataFrame with ``new_label`` and ``Subject`` columns. It is
            modified in place: a ``fold`` column is added or overwritten.
        seed: Random state passed to the splitter.
        n_splits: Number of folds.

    Returns:
        The same ``df`` object, with ``fold`` holding the index of the fold in
        which each row is a validation row (-1 for rows never yielded as
        validation).
    """
    skf = StratifiedGroupKFold(n_splits=n_splits, random_state=seed, shuffle=True)

    # split() yields *positional* row indices. Write them into a plain array
    # instead of df.loc[...], which would treat them as index labels and hit the
    # wrong rows (or raise) when the index is not a unique 0..n-1 range.
    fold_of_row = np.full(len(df), -1, dtype=np.int64)
    splits = skf.split(X=df, y=df['new_label'], groups=df['Subject'])
    for fold, (_, valid_pos) in enumerate(splits):
        fold_of_row[valid_pos] = fold

    df['fold'] = fold_of_row
    return df

# get folds and save!
# Use a fixed, named seed so the saved split can be regenerated exactly after a
# kernel restart; a seed drawn at random and never recorded cannot be.
# Change FOLD_SEED to produce a different split.
FOLD_SEED = 42
d = get_folds(a, seed=FOLD_SEED, n_splits=5)
d.to_parquet('train_5fold.parquet')