In [81]:
import pandas as pd
import os
import numpy as np
import numpy as np
import pandas as pd
import numpy as np
import pandas as pd
from scipy.signal import butter, filtfilt
import os

In [82]:
DATA_DIR="data"
ANNOTATIONS_DIR = "train_val"
RAW_DATA_DIR = os.path.join(DATA_DIR,"raw")
TEMP_DATA_DIR = os.path.join(DATA_DIR,"temp")
FINAL_DATA_DIR = os.path.join(DATA_DIR,"final")

In [83]:

def load_data(path):
    with open(path, 'rb') as f:
        df = pd.read_pickle(f)
    return df

def save_data(df, path):
    with open(path, 'wb') as f:
        pd.to_pickle(df, f)
        
def save_step(train_data, test_data, step):
    save_data(train_data, os.path.join(TEMP_DATA_DIR, f"train_{step}.pkl"))
    save_data(test_data, os.path.join(TEMP_DATA_DIR, f"test_{step}.pkl"))

        
def dataset_info(data):
    print("Dataset shape: ", data.shape)
    print("Dataset columns: ", data.columns)
    # Number of classes
    print("Number of classes: ", len(data['description'].unique()))

In [84]:
train_split = load_data('train_val/train.pkl')
test_split = load_data('train_val/test.pkl')

In [85]:
dataset_info(train_split)
dataset_info(test_split)

Dataset shape:  (527, 4)
Dataset columns:  Index(['index', 'file', 'description', 'labels'], dtype='object')
Number of classes:  22
Dataset shape:  (59, 4)
Dataset columns:  Index(['index', 'file', 'description', 'labels'], dtype='object')
Number of classes:  20


# Remove duplicates

In [86]:
STEP="remove_duplicates"

In [87]:
classes ={'Slice a potato', 'Spread jelly on a bread slice', 'Load dishwasher: 3 each large/small plates, bowls, mugs, glasses, sets of utensils', 'Unload dishwasher: 3 each large/small plates, bowls, mugs, glasses, sets of utensils', 'Clear cutting board', 'Get items from cabinets: 3 each large/small plates, bowls, mugs, glasses, sets of utensils', 'Spread almond butter on a bread slice', 'Clean a plate with a sponge', 'Slice a cucumber', 'Clean a pan with a sponge', 'Slice bread', 'Clean a plate with a towel', 'Pour water from a pitcher into a glass', 'Peel a cucumber', 'Set table: 3 each large/small plates, bowls, mugs, glasses, sets of utensils', 'Open/close a jar of almond butter', 'Peel a potato', 'Get/replace items from refrigerator/cabinets/drawers', 'Stack on table: 3 each large/small plates, bowls', 'Clean a pan with a towel'}
classes_map = {c: i for i, c in enumerate(classes)}

In [88]:
def remove_synonyms(description):
    if description=="Get items from refrigerator/cabinets/drawers":
        return "Get/replace items from refrigerator/cabinets/drawers"
    elif description=="Open a jar of almond butter":
        return "Open/close a jar of almond butter"
    else:
        return description
    
def filter_dataset(data, name, step=STEP):
    filtered_dataset = data.copy()
    classes = filtered_dataset['description'].unique()
    print(f"{len(classes)} classes found.")
    filtered_dataset['description'] = filtered_dataset['description'].apply(remove_synonyms)
    classes = filtered_dataset['description'].unique()
    print(f"{len(classes)} classes after removing synonyms.")
    filtered_dataset['label'] = filtered_dataset['description'].apply(lambda x: classes_map[x])
    # Drop column labels
    filtered_dataset = filtered_dataset.drop(columns=['labels'])
    return filtered_dataset

In [89]:
train_split_filtered = filter_dataset(train_split, "train")
test_split_filtered = filter_dataset(test_split, "test")

22 classes found.
20 classes after removing synonyms.
20 classes found.
19 classes after removing synonyms.


In [90]:
dataset_info(train_split_filtered)
dataset_info(test_split_filtered)

Dataset shape:  (527, 4)
Dataset columns:  Index(['index', 'file', 'description', 'label'], dtype='object')
Number of classes:  20
Dataset shape:  (59, 4)
Dataset columns:  Index(['index', 'file', 'description', 'label'], dtype='object')
Number of classes:  19


In [91]:
save_step(train_split_filtered, test_split_filtered, STEP)

# Merge

In [92]:
STEP="merge"
train_data = load_data(os.path.join(TEMP_DATA_DIR, "train_remove_duplicates.pkl"))
test_data = load_data(os.path.join(TEMP_DATA_DIR, "test_remove_duplicates.pkl"))

In [93]:
def merge(mode):
    if mode=="train":
        data = load_data(os.path.join(TEMP_DATA_DIR, "train_remove_duplicates.pkl"))
    elif mode=="test":
        data = load_data(os.path.join(TEMP_DATA_DIR, "test_remove_duplicates.pkl"))
    rows = []
    for tup in data.iterrows():
        row = tup[1].copy()
        file = row["file"]
        idx = row["index"]
        emg = load_data(os.path.join(RAW_DATA_DIR, file))
        emg = emg.iloc[idx].copy()
        emg["subject"]=file.split('.')[0]
        emg["label"]=row["label"]
        emg["description"]=row["description"]
        rows.append(emg)
    merged = pd.DataFrame(rows)
    return merged

In [94]:
train_data = merge("train")
test_data = merge("test")

In [95]:
dataset_info(train_data)
dataset_info(test_data)

Dataset shape:  (527, 9)
Dataset columns:  Index(['description', 'start', 'stop', 'myo_left_timestamps',
       'myo_left_readings', 'myo_right_timestamps', 'myo_right_readings',
       'subject', 'label'],
      dtype='object')
Number of classes:  20
Dataset shape:  (59, 9)
Dataset columns:  Index(['description', 'start', 'stop', 'myo_left_timestamps',
       'myo_left_readings', 'myo_right_timestamps', 'myo_right_readings',
       'subject', 'label'],
      dtype='object')
Number of classes:  19


In [96]:
save_step(train_data, test_data, STEP)

# Data Augmentation

In [97]:
STEP="augment"
train_data = load_data(os.path.join(TEMP_DATA_DIR, f"train_merge.pkl"))
test_data = load_data(os.path.join(TEMP_DATA_DIR, f"test_merge.pkl"))
DURATION = 10
NUM_CLIPS=None

In [98]:
# Function to process each row of the DataFrame
def data_augmentation(data:pd.DataFrame, duration, num_clips=None):
    if num_clips is None:
        augmented=[]
        for tup in data.iterrows():
            row = tup[1].copy()
            tot_time =  row["stop"]- row["start"]
            if tot_time>duration:
                cuts = np.arange(row["start"], row["stop"], duration)[:-1]
                for i, c in enumerate(cuts):
                    new_row = row.copy()
                    new_row["start"] = c
                    end = c+duration
                    if i==len(cuts)-1:
                        end = row["stop"]
                    new_row["stop"] = end
                    left_indexes=(new_row["myo_left_timestamps"]>=c) & (new_row["myo_left_timestamps"]<end)
                    right_indexes=(new_row["myo_right_timestamps"]>=c) & (new_row["myo_right_timestamps"]<end)
                    new_row["myo_left_timestamps"] = new_row["myo_left_timestamps"][left_indexes]
                    new_row["myo_right_timestamps"] = new_row["myo_right_timestamps"][right_indexes]
                    new_row["myo_left_readings"] = new_row["myo_left_readings"][left_indexes,:]
                    new_row["myo_right_readings"] = new_row["myo_right_readings"][right_indexes,:]
                    augmented.append(new_row)
            else:
                augmented.append(row)
        return pd.DataFrame(augmented)
    else:
        augmented=[]
        for tup in data.iterrows():
            row = tup[1].copy()
            tot_time =  row["stop"]- row["start"]
            duration = min(duration, tot_time)
            highest_offset=max(row["start"], row["stop"]-duration)
            cuts = np.linspace(row["start"], highest_offset, num_clips)
            for c in cuts:
                new_row = row.copy()
                new_row["start"] = c
                if c+duration>row["stop"]:
                    new_row["stop"] = row["stop"]
                else:
                    new_row["stop"] = c+duration
                left_indexes=(new_row["myo_left_timestamps"]>=c) & (new_row["myo_left_timestamps"]<=c+duration)
                right_indexes=(new_row["myo_right_timestamps"]>=c) & (new_row["myo_right_timestamps"]<=c+duration)
                new_row["myo_left_timestamps"] = new_row["myo_left_timestamps"][left_indexes]
                new_row["myo_right_timestamps"] = new_row["myo_right_timestamps"][right_indexes]
                new_row["myo_left_readings"] = new_row["myo_left_readings"][left_indexes,:]
                new_row["myo_right_readings"] = new_row["myo_right_readings"][right_indexes,:]
                augmented.append(new_row)
        a = pd.DataFrame(augmented)
        return a

In [99]:
train_data_augmented = data_augmentation(train_data, DURATION, NUM_CLIPS)
test_data_augmented = data_augmentation(test_data, DURATION, NUM_CLIPS)

In [100]:
dataset_info(train_data_augmented)
dataset_info(test_data_augmented)

Dataset shape:  (3311, 9)
Dataset columns:  Index(['description', 'start', 'stop', 'myo_left_timestamps',
       'myo_left_readings', 'myo_right_timestamps', 'myo_right_readings',
       'subject', 'label'],
      dtype='object')
Number of classes:  20
Dataset shape:  (362, 9)
Dataset columns:  Index(['description', 'start', 'stop', 'myo_left_timestamps',
       'myo_left_readings', 'myo_right_timestamps', 'myo_right_readings',
       'subject', 'label'],
      dtype='object')
Number of classes:  19


In [101]:
save_step(train_data_augmented, test_data_augmented, STEP)

# Preprocessing

In [102]:
STEP = "emg"
train_data = load_data(os.path.join(TEMP_DATA_DIR, f"train_augment.pkl"))
test_data = load_data(os.path.join(TEMP_DATA_DIR, f"test_augment.pkl"))
FS = 160  # Sampling frequency
CUTOFF = 5  # Cutoff frequency
NUM_CHANNELS=8

In [103]:
def rectify_signal(data):
    return np.abs(data)

def filter_signal(data):
    for i in range(NUM_CHANNELS):
        data[:,i] = low_pass_filter(data[:,i])
    return data

def low_pass_filter(data, order=5):
    nyquist = 0.5 * FS
    normal_cutoff = CUTOFF / nyquist
    b, a = butter(order, normal_cutoff, btype='low', analog=False)
    return filtfilt(b, a, data, padlen=5)

def normalization(data, mean, std):
    """Normalize with mean and std
    data: (n_samples, n_channels)
    mean and std: (n_channels,)
    """
    return (data - mean) / std

def mean_std(data):
    sides=["left", "right"]
    means=[]
    stds=[]
    for s in sides:
        normalized_data = data.copy()
        normalized_data[f"myo_{s}_mean"] = normalized_data[f"myo_{s}_readings"].apply(lambda x: np.mean(x, axis=0))
        normalized_data[f"myo_{s}_std"] = normalized_data[f"myo_{s}_readings"].apply(lambda x: np.std(x, axis=0))
        means.append(np.mean(normalized_data[f"myo_{s}_mean"].to_list(), axis=0))
        stds.append(np.mean(normalized_data[f"myo_{s}_std"].to_list(), axis=0))
    return (means[0], stds[0]), (means[1], stds[1])

def preprocess(train_data, test_data, normalize=False):
    # Preprocess train data
    train = train_data.copy()
    steps=[rectify_signal, filter_signal]
    sides=["left", "right"]
    for side in sides:
        for step in steps:
            train[f"myo_{side}_readings"] = train[f"myo_{side}_readings"].apply(step)
    if normalize:
        (left_mean, left_std), (right_mean, right_std) = mean_std(train)
        print(left_mean, left_std, right_mean, right_std)
        train[f"myo_left_readings"] = train[f"myo_left_readings"].apply(normalization, args=(left_mean, left_std))
        train[f"myo_right_readings"] = train[f"myo_right_readings"].apply(normalization, args=(right_mean, right_std))
    # Preprocess test data
    test = test_data.copy()
    for side in sides:
        for step in steps:
            test[f"myo_{side}_readings"] = test[f"myo_{side}_readings"].apply(step)
    if normalize:
        test[f"myo_left_readings"] = test[f"myo_left_readings"].apply(normalization, args=(left_mean, left_std))
        test[f"myo_right_readings"] = test[f"myo_right_readings"].apply(normalization, args=(right_mean, right_std))
    return train, test

In [104]:
preprocessed_train, preprocessed_test = preprocess(train_data, test_data, normalize=True)

[11.56361771 11.21051396 11.9618695  13.23233926 16.07757608 12.7148602
  9.6654832   9.25045687] [5.91995267 4.91895658 5.27592846 6.09308719 7.61099604 6.4389358
 4.69545215 4.98935264] [12.53663034 16.36586102 19.19948703 15.95498182 12.36631464  9.65959738
 11.89806297 10.40399191] [6.20287008 8.56938787 8.76395084 7.43965282 5.96235598 4.7795494
 6.65425632 5.69748396]


In [105]:
dataset_info(preprocessed_train)
dataset_info(preprocessed_test)

Dataset shape:  (3311, 9)
Dataset columns:  Index(['description', 'start', 'stop', 'myo_left_timestamps',
       'myo_left_readings', 'myo_right_timestamps', 'myo_right_readings',
       'subject', 'label'],
      dtype='object')
Number of classes:  20
Dataset shape:  (362, 9)
Dataset columns:  Index(['description', 'start', 'stop', 'myo_left_timestamps',
       'myo_left_readings', 'myo_right_timestamps', 'myo_right_readings',
       'subject', 'label'],
      dtype='object')
Number of classes:  19


In [106]:
print(preprocessed_train[["myo_left_readings","myo_right_readings"]].head())
print(preprocessed_test[["myo_left_readings","myo_right_readings"]].head())

                                    myo_left_readings  \
40  [[-0.939807803185361, -0.0427964656607138, 1.9...   
40  [[-1.9533294198689748, -2.279043082640766, -1....   
40  [[-1.784409150421706, -2.0757479356425796, -2....   
40  [[-1.784409150421706, -2.279043082640766, -1.8...   
10  [[-1.6154888809744368, -2.279043082640766, -0....   

                                   myo_right_readings  
40  [[-1.8598858582399078, -2.143194040127815, 0.8...  
40  [[1.0419966212858311, 1.5910283417032969, 0.31...  
40  [[-1.3762387783189514, -1.3263328941022594, -1...  
40  [[1.6868593945137733, -0.8595550963733705, -2....  
10  [[-1.6986701649329223, -0.9762495458055928, -1...  
                                   myo_left_readings  \
4  [[-1.9533294198689748, 6.259353091283071, 2.85...   
4  [[1.7629165079709426, -0.6526819066552735, -0....   
4  [[-0.09520645594901608, 0.567088975333846, 3.6...   
4  [[-0.26412672539628507, 2.1934501513193387, -1...   
4  [[-0.09520645594901608, 2.600040445315

In [107]:
save_step(preprocessed_train, preprocessed_test, STEP)

# Save Multimodal data

In [108]:
STEP="multimodal"
train_data = load_data(os.path.join(TEMP_DATA_DIR, f"train_emg.pkl"))
test_data = load_data(os.path.join(TEMP_DATA_DIR, f"test_emg.pkl"))

In [109]:
train_rgb_data = train_data.loc[train_data["subject"]=="S04_1"]
test_rgb_data = test_data.loc[test_data["subject"]=="S04_1"]

In [110]:
dataset_info(train_rgb_data)
dataset_info(test_rgb_data)

Dataset shape:  (335, 9)
Dataset columns:  Index(['description', 'start', 'stop', 'myo_left_timestamps',
       'myo_left_readings', 'myo_right_timestamps', 'myo_right_readings',
       'subject', 'label'],
      dtype='object')
Number of classes:  19
Dataset shape:  (33, 9)
Dataset columns:  Index(['description', 'start', 'stop', 'myo_left_timestamps',
       'myo_left_readings', 'myo_right_timestamps', 'myo_right_readings',
       'subject', 'label'],
      dtype='object')
Number of classes:  7


In [111]:
save_step(train_rgb_data, test_rgb_data, STEP)

# Add RGB data

In [112]:
STEP="multimodal_with_frames"
train_data = load_data(os.path.join(TEMP_DATA_DIR, f"train_multimodal.pkl"))
test_data = load_data(os.path.join(TEMP_DATA_DIR, f"test_multimodal.pkl"))

In [113]:
def add_frames(data):
    data = data.copy()
    fps=29.67
    offset=load_data(os.path.join(RAW_DATA_DIR, "S04_1.pkl")).iloc[1]["start"]
    data["start_frame"]=(data["start"]-offset)*fps
    data["stop_frame"]=(data["stop"]-offset)*fps
    #convert column to int
    data[["start_frame","stop_frame"]]=data[["start_frame","stop_frame"]].astype(int)
    return data

In [114]:
train_data_with_frames = add_frames(train_data)
test_data_with_frames = add_frames(test_data)

In [115]:
dataset_info(train_data_with_frames)
dataset_info(test_data_with_frames)

Dataset shape:  (335, 11)
Dataset columns:  Index(['description', 'start', 'stop', 'myo_left_timestamps',
       'myo_left_readings', 'myo_right_timestamps', 'myo_right_readings',
       'subject', 'label', 'start_frame', 'stop_frame'],
      dtype='object')
Number of classes:  19
Dataset shape:  (33, 11)
Dataset columns:  Index(['description', 'start', 'stop', 'myo_left_timestamps',
       'myo_left_readings', 'myo_right_timestamps', 'myo_right_readings',
       'subject', 'label', 'start_frame', 'stop_frame'],
      dtype='object')
Number of classes:  7


In [116]:
print(train_data_with_frames[["start_frame","stop_frame"]].head())
print(test_data_with_frames[["start_frame","stop_frame"]].head())

    start_frame  stop_frame
10        16413       16561
10        16561       16710
10        16710       16858
10        16858       17006
10        17006       17155
    start_frame  stop_frame
34        43132       43281
34        43281       43429
34        43429       43599
23        33737       33885
23        33885       34033


In [117]:
save_step(train_data_with_frames, test_data_with_frames, STEP)

# Analyze final data

In [118]:
train_emg = load_data(os.path.join(TEMP_DATA_DIR, "train_emg.pkl"))
test_emg = load_data(os.path.join(TEMP_DATA_DIR, "test_emg.pkl"))
train_rgb = load_data(os.path.join(TEMP_DATA_DIR, "train_multimodal_with_frames.pkl"))
test_rgb = load_data(os.path.join(TEMP_DATA_DIR, "test_multimodal_with_frames.pkl"))

In [119]:
# reset index
train_emg = train_emg.reset_index(drop=True)
test_emg = test_emg.reset_index(drop=True)
train_rgb = train_rgb.reset_index(drop=True)
test_rgb = test_rgb.reset_index(drop=True)

In [120]:
dataset_info(train_emg)
dataset_info(test_emg)

Dataset shape:  (3311, 9)
Dataset columns:  Index(['description', 'start', 'stop', 'myo_left_timestamps',
       'myo_left_readings', 'myo_right_timestamps', 'myo_right_readings',
       'subject', 'label'],
      dtype='object')
Number of classes:  20
Dataset shape:  (362, 9)
Dataset columns:  Index(['description', 'start', 'stop', 'myo_left_timestamps',
       'myo_left_readings', 'myo_right_timestamps', 'myo_right_readings',
       'subject', 'label'],
      dtype='object')
Number of classes:  19


In [121]:
print(train_emg.head())
print(test_emg.head())

                                         description         start  \
0                      Spread jelly on a bread slice  1.657739e+09   
1                      Spread jelly on a bread slice  1.657739e+09   
2                      Spread jelly on a bread slice  1.657739e+09   
3                      Spread jelly on a bread slice  1.657739e+09   
4  Get/replace items from refrigerator/cabinets/d...  1.655241e+09   

           stop                                myo_left_timestamps  \
0  1.657739e+09  [1657738827.4506874, 1657738827.458187, 165773...   
1  1.657739e+09  [1657738832.453177, 1657738832.456678, 1657738...   
2  1.657739e+09  [1657738837.447678, 1657738837.455178, 1657738...   
3  1.657739e+09  [1657738842.451178, 1657738842.4621787, 165773...   
4  1.655241e+09  [1655240527.6204886, 1655240527.624489, 165524...   

                                   myo_left_readings  \
0  [[-0.939807803185361, -0.0427964656607138, 1.9...   
1  [[-1.9533294198689748, -2.279043082640766, 

In [122]:
dataset_info(train_rgb)
dataset_info(test_rgb)

Dataset shape:  (335, 11)
Dataset columns:  Index(['description', 'start', 'stop', 'myo_left_timestamps',
       'myo_left_readings', 'myo_right_timestamps', 'myo_right_readings',
       'subject', 'label', 'start_frame', 'stop_frame'],
      dtype='object')
Number of classes:  19
Dataset shape:  (33, 11)
Dataset columns:  Index(['description', 'start', 'stop', 'myo_left_timestamps',
       'myo_left_readings', 'myo_right_timestamps', 'myo_right_readings',
       'subject', 'label', 'start_frame', 'stop_frame'],
      dtype='object')
Number of classes:  7


In [123]:
print(train_rgb.head())
print(test_rgb.head())

                                         description         start  \
0  Get/replace items from refrigerator/cabinets/d...  1.655241e+09   
1  Get/replace items from refrigerator/cabinets/d...  1.655241e+09   
2  Get/replace items from refrigerator/cabinets/d...  1.655241e+09   
3  Get/replace items from refrigerator/cabinets/d...  1.655241e+09   
4  Get/replace items from refrigerator/cabinets/d...  1.655241e+09   

           stop                                myo_left_timestamps  \
0  1.655241e+09  [1655240527.6204886, 1655240527.624489, 165524...   
1  1.655241e+09  [1655240532.626479, 1655240532.6304793, 165524...   
2  1.655241e+09  [1655240537.617971, 1655240537.629471, 1655240...   
3  1.655241e+09  [1655240542.624464, 1655240542.628464, 1655240...   
4  1.655241e+09  [1655240547.619455, 1655240547.623455, 1655240...   

                                   myo_left_readings  \
0  [[-1.6154888809744368, -2.279043082640766, -0....   
1  [[-0.09520645594901608, 1.3802695633265922,

In [124]:
save_data(train_emg, os.path.join(FINAL_DATA_DIR, "train_EMG.pkl"))
save_data(test_emg, os.path.join(FINAL_DATA_DIR, "test_EMG.pkl"))
save_data(train_rgb, os.path.join(FINAL_DATA_DIR, "train_MULTIMODAL.pkl"))
save_data(test_rgb, os.path.join(FINAL_DATA_DIR, "test_MULTIMODAL.pkl"))