In [384]:
import pandas as pd
import os
import numpy as np
import numpy as np
import pandas as pd
import numpy as np
import pandas as pd
from scipy.signal import butter, filtfilt
import os

In [385]:
DATA_DIR="data"
ANNOTATIONS_DIR = "train_val"
RAW_DATA_DIR = os.path.join(DATA_DIR,"raw")
TEMP_DATA_DIR = os.path.join(DATA_DIR,"temp")
FINAL_DATA_DIR = os.path.join(DATA_DIR,"final")

In [386]:

def load_data(path):
    with open(path, 'rb') as f:
        df = pd.read_pickle(f)
    return df

def save_data(df, path):
    with open(path, 'wb') as f:
        pd.to_pickle(df, f)
        
def save_step(train_data, test_data, step):
    save_data(train_data, os.path.join(TEMP_DATA_DIR, f"train_{step}.pkl"))
    save_data(test_data, os.path.join(TEMP_DATA_DIR, f"test_{step}.pkl"))

        
def dataset_info(data):
    print("Dataset shape: ", data.shape)
    print("Dataset columns: ", data.columns)
    # Number of classes
    print("Number of classes: ", len(data['description'].unique()))

In [387]:
train_split = load_data('train_val/train.pkl')
test_split = load_data('train_val/test.pkl')

In [388]:
dataset_info(train_split)
dataset_info(test_split)

Dataset shape:  (527, 4)
Dataset columns:  Index(['index', 'file', 'description', 'labels'], dtype='object')
Number of classes:  22
Dataset shape:  (59, 4)
Dataset columns:  Index(['index', 'file', 'description', 'labels'], dtype='object')
Number of classes:  20


# Remove duplicates

In [389]:
STEP="remove_duplicates"

In [390]:
classes ={'Slice a potato', 'Spread jelly on a bread slice', 'Load dishwasher: 3 each large/small plates, bowls, mugs, glasses, sets of utensils', 'Unload dishwasher: 3 each large/small plates, bowls, mugs, glasses, sets of utensils', 'Clear cutting board', 'Get items from cabinets: 3 each large/small plates, bowls, mugs, glasses, sets of utensils', 'Spread almond butter on a bread slice', 'Clean a plate with a sponge', 'Slice a cucumber', 'Clean a pan with a sponge', 'Slice bread', 'Clean a plate with a towel', 'Pour water from a pitcher into a glass', 'Peel a cucumber', 'Set table: 3 each large/small plates, bowls, mugs, glasses, sets of utensils', 'Open/close a jar of almond butter', 'Peel a potato', 'Get/replace items from refrigerator/cabinets/drawers', 'Stack on table: 3 each large/small plates, bowls', 'Clean a pan with a towel'}
classes_map = {c: i for i, c in enumerate(classes)}

In [391]:
def remove_synonyms(description):
    if description=="Get items from refrigerator/cabinets/drawers":
        return "Get/replace items from refrigerator/cabinets/drawers"
    elif description=="Open a jar of almond butter":
        return "Open/close a jar of almond butter"
    else:
        return description
    
def filter_dataset(data, name, step=STEP):
    filtered_dataset = data.copy()
    classes = filtered_dataset['description'].unique()
    print(f"{len(classes)} classes found.")
    filtered_dataset['description'] = filtered_dataset['description'].apply(remove_synonyms)
    classes = filtered_dataset['description'].unique()
    print(f"{len(classes)} classes after removing synonyms.")
    filtered_dataset['label'] = filtered_dataset['description'].apply(lambda x: classes_map[x])
    # Drop column labels
    filtered_dataset = filtered_dataset.drop(columns=['labels'])
    return filtered_dataset

In [392]:
train_split_filtered = filter_dataset(train_split, "train")
test_split_filtered = filter_dataset(test_split, "test")

22 classes found.
20 classes after removing synonyms.
20 classes found.
19 classes after removing synonyms.


In [393]:
dataset_info(train_split_filtered)
dataset_info(test_split_filtered)

Dataset shape:  (527, 4)
Dataset columns:  Index(['index', 'file', 'description', 'label'], dtype='object')
Number of classes:  20
Dataset shape:  (59, 4)
Dataset columns:  Index(['index', 'file', 'description', 'label'], dtype='object')
Number of classes:  19


In [394]:
save_step(train_split_filtered, test_split_filtered, STEP)

# Merge

In [395]:
STEP="merge"
train_data = load_data(os.path.join(TEMP_DATA_DIR, "train_remove_duplicates.pkl"))
test_data = load_data(os.path.join(TEMP_DATA_DIR, "test_remove_duplicates.pkl"))

In [396]:
def merge(mode):
    if mode=="train":
        data = load_data(os.path.join(TEMP_DATA_DIR, "train_remove_duplicates.pkl"))
    elif mode=="test":
        data = load_data(os.path.join(TEMP_DATA_DIR, "test_remove_duplicates.pkl"))
    rows = []
    for tup in data.iterrows():
        row = tup[1].copy()
        file = row["file"]
        idx = row["index"]
        emg = load_data(os.path.join(RAW_DATA_DIR, file))
        emg = emg.iloc[idx].copy()
        emg["subject"]=file.split('.')[0]
        emg["label"]=row["label"]
        emg["description"]=row["description"]
        rows.append(emg)
    merged = pd.DataFrame(rows)
    return merged

In [397]:
train_data = merge("train")
test_data = merge("test")

In [398]:
dataset_info(train_data)
dataset_info(test_data)

Dataset shape:  (527, 9)
Dataset columns:  Index(['description', 'start', 'stop', 'myo_left_timestamps',
       'myo_left_readings', 'myo_right_timestamps', 'myo_right_readings',
       'subject', 'label'],
      dtype='object')
Number of classes:  20
Dataset shape:  (59, 9)
Dataset columns:  Index(['description', 'start', 'stop', 'myo_left_timestamps',
       'myo_left_readings', 'myo_right_timestamps', 'myo_right_readings',
       'subject', 'label'],
      dtype='object')
Number of classes:  19


In [399]:
save_step(train_data, test_data, STEP)

# Data Augmentation

In [400]:
STEP="augment"
train_data = load_data(os.path.join(TEMP_DATA_DIR, f"train_merge.pkl"))
test_data = load_data(os.path.join(TEMP_DATA_DIR, f"test_merge.pkl"))
DURATION = 10
NUM_CLIPS=20

In [401]:
# Function to process each row of the DataFrame
def data_augmentation(data:pd.DataFrame, duration, num_clips=None):
    if num_clips is None:
        augmented=[]
        for tup in data.iterrows():
            row = tup[1].copy()
            tot_time =  row["stop"]- row["start"]
            if tot_time>duration:
                cuts = np.arange(row["start"], row["stop"], duration)[:-1]
                for i, c in enumerate(cuts):
                    new_row = row.copy()
                    new_row["start"] = c
                    end = c+duration
                    if i==len(cuts)-1:
                        end = row["stop"]
                    new_row["stop"] = end
                    left_indexes=(new_row["myo_left_timestamps"]>=c) & (new_row["myo_left_timestamps"]<end)
                    right_indexes=(new_row["myo_right_timestamps"]>=c) & (new_row["myo_right_timestamps"]<end)
                    new_row["myo_left_timestamps"] = new_row["myo_left_timestamps"][left_indexes]
                    new_row["myo_right_timestamps"] = new_row["myo_right_timestamps"][right_indexes]
                    new_row["myo_left_readings"] = new_row["myo_left_readings"][left_indexes,:]
                    new_row["myo_right_readings"] = new_row["myo_right_readings"][right_indexes,:]
                    augmented.append(new_row)
            else:
                augmented.append(row)
        return pd.DataFrame(augmented)
    else:
        augmented=[]
        for tup in data.iterrows():
            row = tup[1].copy()
            tot_time =  row["stop"]- row["start"]
            duration = min(duration, tot_time)
            highest_offset=max(row["start"], row["stop"]-duration)
            cuts = np.linspace(row["start"], highest_offset, num_clips)
            for c in cuts:
                new_row = row.copy()
                new_row["start"] = c
                if c+duration>row["stop"]:
                    new_row["stop"] = row["stop"]
                else:
                    new_row["stop"] = c+duration
                left_indexes=(new_row["myo_left_timestamps"]>=c) & (new_row["myo_left_timestamps"]<=c+duration)
                right_indexes=(new_row["myo_right_timestamps"]>=c) & (new_row["myo_right_timestamps"]<=c+duration)
                new_row["myo_left_timestamps"] = new_row["myo_left_timestamps"][left_indexes]
                new_row["myo_right_timestamps"] = new_row["myo_right_timestamps"][right_indexes]
                new_row["myo_left_readings"] = new_row["myo_left_readings"][left_indexes,:]
                new_row["myo_right_readings"] = new_row["myo_right_readings"][right_indexes,:]
                augmented.append(new_row)
        a = pd.DataFrame(augmented)
        return a

In [402]:
train_data_augmented = data_augmentation(train_data, DURATION, NUM_CLIPS)
test_data_augmented = data_augmentation(test_data, DURATION, NUM_CLIPS)

In [403]:
dataset_info(train_data_augmented)
dataset_info(test_data_augmented)

Dataset shape:  (10540, 9)
Dataset columns:  Index(['description', 'start', 'stop', 'myo_left_timestamps',
       'myo_left_readings', 'myo_right_timestamps', 'myo_right_readings',
       'subject', 'label'],
      dtype='object')
Number of classes:  20
Dataset shape:  (1180, 9)
Dataset columns:  Index(['description', 'start', 'stop', 'myo_left_timestamps',
       'myo_left_readings', 'myo_right_timestamps', 'myo_right_readings',
       'subject', 'label'],
      dtype='object')
Number of classes:  19


In [404]:
save_step(train_data_augmented, test_data_augmented, STEP)

# Preprocessing

In [405]:
STEP = "emg"
train_data = load_data(os.path.join(TEMP_DATA_DIR, f"train_augment.pkl"))
test_data = load_data(os.path.join(TEMP_DATA_DIR, f"test_augment.pkl"))
FS = 160  # Sampling frequency
CUTOFF = 5  # Cutoff frequency
NUM_CHANNELS=8

In [406]:
def rectify_signal(data):
    return np.abs(data)

def filter_signal(data):
    for i in range(NUM_CHANNELS):
        data[:,i] = low_pass_filter(data[:,i])
    return data

def low_pass_filter(data, order=5):
    nyquist = 0.5 * FS
    normal_cutoff = CUTOFF / nyquist
    b, a = butter(order, normal_cutoff, btype='low', analog=False)
    return filtfilt(b, a, data, padlen=5)

def normalization(data, mean, std):
    """Normalize with mean and std
    data: (n_samples, n_channels)
    mean and std: (n_channels,)
    """
    return (data - mean) / std

def mean_std(data):
    sides=["left", "right"]
    means=[]
    stds=[]
    for s in sides:
        normalized_data = data.copy()
        normalized_data[f"myo_{s}_mean"] = normalized_data[f"myo_{s}_readings"].apply(lambda x: np.mean(x, axis=0))
        normalized_data[f"myo_{s}_std"] = normalized_data[f"myo_{s}_readings"].apply(lambda x: np.std(x, axis=0))
        means.append(np.mean(normalized_data[f"myo_{s}_mean"].to_list(), axis=0))
        stds.append(np.mean(normalized_data[f"myo_{s}_std"].to_list(), axis=0))
    return (means[0], stds[0]), (means[1], stds[1])

def preprocess(train_data, test_data, normalize=False):
    # Preprocess train data
    train = train_data.copy()
    steps=[rectify_signal, filter_signal]
    sides=["left", "right"]
    for side in sides:
        for step in steps:
            train[f"myo_{side}_readings"] = train[f"myo_{side}_readings"].apply(step)
    if normalize:
        (left_mean, left_std), (right_mean, right_std) = mean_std(train)
        print(left_mean, left_std, right_mean, right_std)
        train[f"myo_left_readings"] = train[f"myo_left_readings"].apply(normalization, args=(left_mean, left_std))
        train[f"myo_right_readings"] = train[f"myo_right_readings"].apply(normalization, args=(right_mean, right_std))
    # Preprocess test data
    test = test_data.copy()
    for side in sides:
        for step in steps:
            test[f"myo_{side}_readings"] = test[f"myo_{side}_readings"].apply(step)
    if normalize:
        test[f"myo_left_readings"] = test[f"myo_left_readings"].apply(normalization, args=(left_mean, left_std))
        test[f"myo_right_readings"] = test[f"myo_right_readings"].apply(normalization, args=(right_mean, right_std))
    return train, test

In [407]:
preprocessed_train, preprocessed_test = preprocess(train_data, test_data, normalize=True)

[13.83767894 13.11577493 14.54656682 15.66527495 16.68707235 12.93012465
 10.64835843 10.58289213] [4.86291377 4.11740436 4.45829986 5.06286368 5.6143549  4.56827959
 3.66215016 3.88714868] [13.66348284 16.86827578 19.50473562 16.98609403 12.46804998  9.97944208
 12.97797236 10.96904905] [4.88247377 6.38570786 6.67814243 5.83467806 4.49633266 3.71481983
 5.16284828 4.31314243]


In [408]:
dataset_info(preprocessed_train)
dataset_info(preprocessed_test)

Dataset shape:  (10540, 9)
Dataset columns:  Index(['description', 'start', 'stop', 'myo_left_timestamps',
       'myo_left_readings', 'myo_right_timestamps', 'myo_right_readings',
       'subject', 'label'],
      dtype='object')
Number of classes:  20
Dataset shape:  (1180, 9)
Dataset columns:  Index(['description', 'start', 'stop', 'myo_left_timestamps',
       'myo_left_readings', 'myo_right_timestamps', 'myo_right_readings',
       'subject', 'label'],
      dtype='object')
Number of classes:  19


In [409]:
print(preprocessed_train[["myo_left_readings","myo_right_readings"]].head())
print(preprocessed_test[["myo_left_readings","myo_right_readings"]].head())

                                    myo_left_readings  \
40  [[-1.611724844692337, -0.5138613413610372, 1.6...   
40  [[-2.023000901476728, -1.7282186313955923, -2....   
40  [[-1.8173628730845324, -0.2709898833541262, 1....   
40  [[-2.2286389298689233, -2.9425759214301475, -2...   
40  [[-2.6399149866533147, -2.213961547409414, -2....   

                                   myo_right_readings  
40  [[-2.593661214426692, -2.954766518282782, 1.12...  
40  [[-2.1840327973688884, 0.4904271052454041, 2.1...  
40  [[-0.9551475461954783, -2.3283676776412934, 9....  
40  [[-1.3647759632532817, 3.779021018613218, 1.72...  
40  [[-3.0032896314844955, -2.3283676776412934, 0....  
                                   myo_left_readings  \
4  [[-2.84555301504551, 7.015153856853204, 2.7933...   
4  [[-0.5835347027313591, -0.028118425347215224, ...   
4  [[-2.2286389298689233, 1.4291103226942508, -4....   
4  [[-3.4624671002220966, -1.2424757153817703, -2...   
4  [[2.089759666367183, -0.99960425737485

In [410]:
save_step(preprocessed_train, preprocessed_test, STEP)

# Save Multimodal data

In [411]:
STEP="multimodal"
train_data = load_data(os.path.join(TEMP_DATA_DIR, f"train_emg.pkl"))
test_data = load_data(os.path.join(TEMP_DATA_DIR, f"test_emg.pkl"))

In [412]:
train_rgb_data = train_data.loc[train_data["subject"]=="S04_1"]
test_rgb_data = test_data.loc[test_data["subject"]=="S04_1"]

In [413]:
dataset_info(train_rgb_data)
dataset_info(test_rgb_data)

Dataset shape:  (1020, 9)
Dataset columns:  Index(['description', 'start', 'stop', 'myo_left_timestamps',
       'myo_left_readings', 'myo_right_timestamps', 'myo_right_readings',
       'subject', 'label'],
      dtype='object')
Number of classes:  19
Dataset shape:  (160, 9)
Dataset columns:  Index(['description', 'start', 'stop', 'myo_left_timestamps',
       'myo_left_readings', 'myo_right_timestamps', 'myo_right_readings',
       'subject', 'label'],
      dtype='object')
Number of classes:  7


In [414]:
save_step(train_rgb_data, test_rgb_data, STEP)

# Add RGB data

In [415]:
STEP="multimodal_with_frames"
train_data = load_data(os.path.join(TEMP_DATA_DIR, f"train_multimodal.pkl"))
test_data = load_data(os.path.join(TEMP_DATA_DIR, f"test_multimodal.pkl"))

In [416]:
def add_frames(data):
    data = data.copy()
    fps=29.67
    offset=load_data(os.path.join(RAW_DATA_DIR, "S04_1.pkl")).iloc[1]["start"]
    data["start_frame"]=(data["start"]-offset)*fps
    data["stop_frame"]=(data["stop"]-offset)*fps
    #convert column to int
    data[["start_frame","stop_frame"]]=data[["start_frame","stop_frame"]].astype(int)
    return data

In [417]:
train_data_with_frames = add_frames(train_data)
test_data_with_frames = add_frames(test_data)

In [418]:
dataset_info(train_data_with_frames)
dataset_info(test_data_with_frames)

Dataset shape:  (1020, 11)
Dataset columns:  Index(['description', 'start', 'stop', 'myo_left_timestamps',
       'myo_left_readings', 'myo_right_timestamps', 'myo_right_readings',
       'subject', 'label', 'start_frame', 'stop_frame'],
      dtype='object')
Number of classes:  19
Dataset shape:  (160, 11)
Dataset columns:  Index(['description', 'start', 'stop', 'myo_left_timestamps',
       'myo_left_readings', 'myo_right_timestamps', 'myo_right_readings',
       'subject', 'label', 'start_frame', 'stop_frame'],
      dtype='object')
Number of classes:  7


In [419]:
print(train_data_with_frames[["start_frame","stop_frame"]].head())
print(test_data_with_frames[["start_frame","stop_frame"]].head())

    start_frame  stop_frame
10        16413       16710
10        16507       16804
10        16601       16898
10        16695       16991
10        16789       17085
    start_frame  stop_frame
34        43132       43429
34        43141       43438
34        43150       43447
34        43159       43456
34        43168       43465


In [420]:
save_step(train_data_with_frames, test_data_with_frames, STEP)

# Analyze final data

In [428]:
train_emg = load_data(os.path.join(TEMP_DATA_DIR, "train_emg.pkl"))
test_emg = load_data(os.path.join(TEMP_DATA_DIR, "test_emg.pkl"))
train_rgb = load_data(os.path.join(TEMP_DATA_DIR, "train_multimodal_with_frames.pkl"))
test_rgb = load_data(os.path.join(TEMP_DATA_DIR, "test_multimodal_with_frames.pkl"))

In [429]:
# reset index
train_emg = train_emg.reset_index(drop=True)
test_emg = test_emg.reset_index(drop=True)
train_rgb = train_rgb.reset_index(drop=True)
test_rgb = test_rgb.reset_index(drop=True)

In [430]:
dataset_info(train_emg)
dataset_info(test_emg)

Dataset shape:  (10540, 9)
Dataset columns:  Index(['description', 'start', 'stop', 'myo_left_timestamps',
       'myo_left_readings', 'myo_right_timestamps', 'myo_right_readings',
       'subject', 'label'],
      dtype='object')
Number of classes:  20
Dataset shape:  (1180, 9)
Dataset columns:  Index(['description', 'start', 'stop', 'myo_left_timestamps',
       'myo_left_readings', 'myo_right_timestamps', 'myo_right_readings',
       'subject', 'label'],
      dtype='object')
Number of classes:  19


In [431]:
print(train_emg.head())
print(test_emg.head())

                     description         start          stop  \
0  Spread jelly on a bread slice  1.657739e+09  1.657739e+09   
1  Spread jelly on a bread slice  1.657739e+09  1.657739e+09   
2  Spread jelly on a bread slice  1.657739e+09  1.657739e+09   
3  Spread jelly on a bread slice  1.657739e+09  1.657739e+09   
4  Spread jelly on a bread slice  1.657739e+09  1.657739e+09   

                                 myo_left_timestamps  \
0  [1657738827.4506874, 1657738827.458187, 165773...   
1  [1657738828.076678, 1657738828.080177, 1657738...   
2  [1657738828.710679, 1657738828.718178, 1657738...   
3  [1657738829.3326783, 1657738829.340179, 165773...   
4  [1657738829.963178, 1657738829.9706826, 165773...   

                                   myo_left_readings  \
0  [[-1.611724844692337, -0.5138613413610372, 1.6...   
1  [[-2.023000901476728, -1.7282186313955923, -2....   
2  [[-1.8173628730845324, -0.2709898833541262, 1....   
3  [[-2.2286389298689233, -2.9425759214301475, -2...  

In [432]:
dataset_info(train_rgb)
dataset_info(test_rgb)

Dataset shape:  (1020, 11)
Dataset columns:  Index(['description', 'start', 'stop', 'myo_left_timestamps',
       'myo_left_readings', 'myo_right_timestamps', 'myo_right_readings',
       'subject', 'label', 'start_frame', 'stop_frame'],
      dtype='object')
Number of classes:  19
Dataset shape:  (160, 11)
Dataset columns:  Index(['description', 'start', 'stop', 'myo_left_timestamps',
       'myo_left_readings', 'myo_right_timestamps', 'myo_right_readings',
       'subject', 'label', 'start_frame', 'stop_frame'],
      dtype='object')
Number of classes:  7


In [433]:
print(train_rgb.head())
print(test_rgb.head())

                                         description         start  \
0  Get/replace items from refrigerator/cabinets/d...  1.655241e+09   
1  Get/replace items from refrigerator/cabinets/d...  1.655241e+09   
2  Get/replace items from refrigerator/cabinets/d...  1.655241e+09   
3  Get/replace items from refrigerator/cabinets/d...  1.655241e+09   
4  Get/replace items from refrigerator/cabinets/d...  1.655241e+09   

           stop                                myo_left_timestamps  \
0  1.655241e+09  [1655240527.6204886, 1655240527.624489, 165524...   
1  1.655241e+09  [1655240530.7854846, 1655240530.796483, 165524...   
2  1.655241e+09  [1655240533.954478, 1655240533.957978, 1655240...   
3  1.655241e+09  [1655240537.1189718, 1655240537.126472, 165524...   
4  1.655241e+09  [1655240540.2919679, 1655240540.299468, 165524...   

                                   myo_left_readings  \
0  [[-2.434276958261119, -3.185447379437058, -1.0...   
1  [[-2.6399149866533147, -1.9710900894025032,

In [434]:
save_data(train_emg, os.path.join(FINAL_DATA_DIR, "train_EMG.pkl"))
save_data(test_emg, os.path.join(FINAL_DATA_DIR, "test_EMG.pkl"))
save_data(train_rgb, os.path.join(FINAL_DATA_DIR, "train_MULTIMODAL.pkl"))
save_data(test_rgb, os.path.join(FINAL_DATA_DIR, "test_MULTIMODAL.pkl"))