In [1]:
import pandas as pd
import os
import numpy as np
import numpy as np
import pandas as pd
import numpy as np
import pandas as pd
from scipy.signal import butter, filtfilt
import os

In [2]:
DATA_DIR="data"
ANNOTATIONS_DIR = "train_val"
RAW_DATA_DIR = os.path.join(DATA_DIR,"raw")
TEMP_DATA_DIR = os.path.join(DATA_DIR,"temp_a")
FINAL_DATA_DIR = os.path.join(DATA_DIR,"final_a")

In [3]:

def load_data(path):
    with open(path, 'rb') as f:
        df = pd.read_pickle(f)
    return df

def save_data(df, path):
    with open(path, 'wb') as f:
        pd.to_pickle(df, f)
        
def save_step(train_data, test_data, step):
    save_data(train_data, os.path.join(TEMP_DATA_DIR, f"train_{step}.pkl"))
    save_data(test_data, os.path.join(TEMP_DATA_DIR, f"test_{step}.pkl"))

        
def dataset_info(data):
    print("Dataset shape: ", data.shape)
    print("Dataset columns: ", data.columns)
    # Number of classes
    print("Number of classes: ", len(data['description'].unique()))

In [4]:
train_split = load_data('train_val/train.pkl')
test_split = load_data('train_val/test.pkl')

In [5]:
dataset_info(train_split)
dataset_info(test_split)

Dataset shape:  (527, 4)
Dataset columns:  Index(['index', 'file', 'description', 'labels'], dtype='object')
Number of classes:  22
Dataset shape:  (59, 4)
Dataset columns:  Index(['index', 'file', 'description', 'labels'], dtype='object')
Number of classes:  20


# Remove duplicates

In [6]:
STEP="remove_duplicates"

In [7]:
classes ={'Slice a potato', 'Spread jelly on a bread slice', 'Load dishwasher: 3 each large/small plates, bowls, mugs, glasses, sets of utensils', 'Unload dishwasher: 3 each large/small plates, bowls, mugs, glasses, sets of utensils', 'Clear cutting board', 'Get items from cabinets: 3 each large/small plates, bowls, mugs, glasses, sets of utensils', 'Spread almond butter on a bread slice', 'Clean a plate with a sponge', 'Slice a cucumber', 'Clean a pan with a sponge', 'Slice bread', 'Clean a plate with a towel', 'Pour water from a pitcher into a glass', 'Peel a cucumber', 'Set table: 3 each large/small plates, bowls, mugs, glasses, sets of utensils', 'Open/close a jar of almond butter', 'Peel a potato', 'Get/replace items from refrigerator/cabinets/drawers', 'Stack on table: 3 each large/small plates, bowls', 'Clean a pan with a towel'}
classes_map = {c: i for i, c in enumerate(classes)}

In [8]:
def remove_synonyms(description):
    if description=="Get items from refrigerator/cabinets/drawers":
        return "Get/replace items from refrigerator/cabinets/drawers"
    elif description=="Open a jar of almond butter":
        return "Open/close a jar of almond butter"
    else:
        return description
    
def filter_dataset(data, name, step=STEP):
    filtered_dataset = data.copy()
    classes = filtered_dataset['description'].unique()
    print(f"{len(classes)} classes found.")
    filtered_dataset['description'] = filtered_dataset['description'].apply(remove_synonyms)
    classes = filtered_dataset['description'].unique()
    print(f"{len(classes)} classes after removing synonyms.")
    filtered_dataset['label'] = filtered_dataset['description'].apply(lambda x: classes_map[x])
    # Drop column labels
    filtered_dataset = filtered_dataset.drop(columns=['labels'])
    return filtered_dataset

In [9]:
train_split_filtered = filter_dataset(train_split, "train")
test_split_filtered = filter_dataset(test_split, "test")

22 classes found.
20 classes after removing synonyms.
20 classes found.
19 classes after removing synonyms.


In [10]:
dataset_info(train_split_filtered)
dataset_info(test_split_filtered)

Dataset shape:  (527, 4)
Dataset columns:  Index(['index', 'file', 'description', 'label'], dtype='object')
Number of classes:  20
Dataset shape:  (59, 4)
Dataset columns:  Index(['index', 'file', 'description', 'label'], dtype='object')
Number of classes:  19


In [12]:
save_step(train_split_filtered, test_split_filtered, STEP)

# Merge

In [13]:
STEP="merge"
train_data = load_data(os.path.join(TEMP_DATA_DIR, "train_remove_duplicates.pkl"))
test_data = load_data(os.path.join(TEMP_DATA_DIR, "test_remove_duplicates.pkl"))

In [14]:
def merge(mode):
    if mode=="train":
        data = load_data(os.path.join(TEMP_DATA_DIR, "train_remove_duplicates.pkl"))
    elif mode=="test":
        data = load_data(os.path.join(TEMP_DATA_DIR, "test_remove_duplicates.pkl"))
    rows = []
    for tup in data.iterrows():
        row = tup[1].copy()
        file = row["file"]
        idx = row["index"]
        emg = load_data(os.path.join(RAW_DATA_DIR, file))
        emg = emg.iloc[idx].copy()
        emg["subject"]=file.split('.')[0]
        emg["label"]=row["label"]
        emg["description"]=row["description"]
        rows.append(emg)
    merged = pd.DataFrame(rows)
    return merged

In [15]:
train_data = merge("train")
test_data = merge("test")

In [16]:
dataset_info(train_data)
dataset_info(test_data)

Dataset shape:  (527, 9)
Dataset columns:  Index(['description', 'start', 'stop', 'myo_left_timestamps',
       'myo_left_readings', 'myo_right_timestamps', 'myo_right_readings',
       'subject', 'label'],
      dtype='object')
Number of classes:  20
Dataset shape:  (59, 9)
Dataset columns:  Index(['description', 'start', 'stop', 'myo_left_timestamps',
       'myo_left_readings', 'myo_right_timestamps', 'myo_right_readings',
       'subject', 'label'],
      dtype='object')
Number of classes:  19


In [17]:
save_step(train_data, test_data, STEP)

# Preprocessing

In [18]:
STEP = "emg"
train_data = load_data(os.path.join(TEMP_DATA_DIR, f"train_merge.pkl"))
test_data = load_data(os.path.join(TEMP_DATA_DIR, f"test_merge.pkl"))
FS = 160  # Sampling frequency
CUTOFF = 5  # Cutoff frequency
NUM_CHANNELS=8

In [19]:
def rectify_signal(data):
    return np.abs(data)

def filter_signal(data):
    for i in range(NUM_CHANNELS):
        data[:,i] = low_pass_filter(data[:,i])
    return data

def low_pass_filter(data, order=5):
    nyquist = 0.5 * FS
    normal_cutoff = CUTOFF / nyquist
    b, a = butter(order, normal_cutoff, btype='low', analog=False)
    return filtfilt(b, a, data, padlen=5)

def normalization(data, mean, std):
    """Normalize with mean and std
    data: (n_samples, n_channels)
    mean and std: (n_channels,)
    """
    return (data - mean) / std

def mean_std(data):
    sides=["left", "right"]
    means=[]
    stds=[]
    for s in sides:
        normalized_data = data.copy()
        normalized_data[f"myo_{s}_mean"] = normalized_data[f"myo_{s}_readings"].apply(lambda x: np.mean(x, axis=0))
        normalized_data[f"myo_{s}_std"] = normalized_data[f"myo_{s}_readings"].apply(lambda x: np.std(x, axis=0))
        means.append(np.mean(normalized_data[f"myo_{s}_mean"].to_list(), axis=0))
        stds.append(np.mean(normalized_data[f"myo_{s}_std"].to_list(), axis=0))
    return (means[0], stds[0]), (means[1], stds[1])

def preprocess(train_data, test_data, normalize=False):
    # Preprocess train data
    train = train_data.copy()
    steps=[rectify_signal, filter_signal]
    sides=["left", "right"]
    for side in sides:
        for step in steps:
            train[f"myo_{side}_readings"] = train[f"myo_{side}_readings"].apply(step)
    if normalize:
        (left_mean, left_std), (right_mean, right_std) = mean_std(train)
        print(left_mean, left_std, right_mean, right_std)
        train[f"myo_left_readings"] = train[f"myo_left_readings"].apply(normalization, args=(left_mean, left_std))
        train[f"myo_right_readings"] = train[f"myo_right_readings"].apply(normalization, args=(right_mean, right_std))
    # Preprocess test data
    test = test_data.copy()
    for side in sides:
        for step in steps:
            test[f"myo_{side}_readings"] = test[f"myo_{side}_readings"].apply(step)
    if normalize:
        test[f"myo_left_readings"] = test[f"myo_left_readings"].apply(normalization, args=(left_mean, left_std))
        test[f"myo_right_readings"] = test[f"myo_right_readings"].apply(normalization, args=(right_mean, right_std))
    return train, test

In [20]:
preprocessed_train, preprocessed_test = preprocess(train_data, test_data, normalize=True)

[13.75624219 13.06355075 14.42117728 15.51845746 16.66506348 12.8855857
 10.57188832 10.54096876] [7.42496851 6.0592204  6.56498921 7.43746219 8.41245864 7.11516333
 5.51764957 5.98278444] [13.54149584 16.74649575 19.52858889 16.95801181 12.46527825  9.95593723
 12.8419363  10.88651791] [7.31494636 9.73468516 9.50411395 8.2842876  6.1577131  5.2747223
 7.71895575 6.48097835]


In [21]:
dataset_info(preprocessed_train)
dataset_info(preprocessed_test)

Dataset shape:  (527, 9)
Dataset columns:  Index(['description', 'start', 'stop', 'myo_left_timestamps',
       'myo_left_readings', 'myo_right_timestamps', 'myo_right_readings',
       'subject', 'label'],
      dtype='object')
Number of classes:  20
Dataset shape:  (59, 9)
Dataset columns:  Index(['description', 'start', 'stop', 'myo_left_timestamps',
       'myo_left_readings', 'myo_right_timestamps', 'myo_right_readings',
       'subject', 'label'],
      dtype='object')
Number of classes:  19


In [22]:
print(preprocessed_train[["myo_left_readings","myo_right_readings"]].head())
print(preprocessed_test[["myo_left_readings","myo_right_readings"]].head())

                                    myo_left_readings  \
40  [[-1.0446161729358687, -0.3405637377935135, 1....   
10  [[-1.5833389963599647, -2.155978803913611, -0....   
6   [[-0.9099354670798447, -0.5056014710771587, -0...   
24  [[-1.3139775846479167, -0.6706392043608039, -2...   
6   [[-0.9099354670798447, -1.3307901374953848, -2...   

                                   myo_right_readings  
40  [[-1.7145027755423212, -1.925742378200468, 0.7...  
10  [[-1.5777963732581994, -0.8984877896617861, -1...  
6   [[1.156331672424237, 0.2314922577307637, -0.58...  
24  [[-1.1676771664058339, -0.17940957768470897, 4...  
6   [[-0.8942643618375903, -0.48758595424631346, 2...  
                                    myo_left_readings  \
4   [[-1.8527004080720129, 4.775605993999488, 1.91...   
34  [[-1.4486582905039407, 2.7951531945957453, -2....   
48  [[-1.8527004080720129, -1.9909410706299657, -2...   
4   [[-0.3712126436557488, -1.6608656040626752, -2...   
62  [[-1.7180197022159887, -1.660865

In [23]:
save_step(preprocessed_train, preprocessed_test, STEP)