In [59]:
import pandas as pd
import matplotlib.pyplot as plt
import time, os, shutil
import dv_processing as dv
from PIL import Image
import numpy as np
import helper_funcs as hf

# Data Preprocessing 

This notebook preprocesses the data created after executing the `utils/generate_letter_datasets.py` script. The outputs of this script go to the directory specified, for example `../data/train/a.csv`. Within the CSV file, there are the following columns: `timestamp`, `x`, `y`, `polarity`. These CSV files contain events from the randomly sampled batches from all subjects recordings, all concatenated into these files.

This notebook processes these files and store all events into Numpy binary files for easy dataloading into PyTorch during model training.

### Process Into Training and Testing Data

This section creates Numpy binaries for all letters in training and testing datasets.

In [13]:
def process_data(MODE, letters):
    """
    Process data for a given mode ("train" or "test") and a list of letters.

    Args:
    MODE (str): The mode, either "train" or "test".
    letters (list): A list of letters to process.

    Raises:
    FileNotFoundError: If the input CSV file for a letter is not found.

    Notes:
    - This function reads CSV files located at '../data/{MODE}/{letter}.csv'.
    - For each letter, it creates a directory '../data/{MODE}/{letter}' if it doesn't exist.
    - It then reads the CSV file, reindexes the DataFrame, and finds the start indices of each sample.
    - Each sample is saved as a binary file '{MODE.lower()}_{letter}_{index}.bin' in the respective directory.
    - Samples with fewer than 1000 events are skipped.

    Example:
    >>> MODE = "train"
    >>> letters = ['a', 'b', 'c']
    >>> process_data(MODE, letters)
    """

    for l in letters:
        DIR = f'../data/{MODE}/{l}'
        if not os.path.isdir(DIR):
            os.makedirs(DIR)

        FILE = f"{DIR}.csv"
        df = pd.read_csv(FILE)
        df = df.reindex(columns=['x', 'y', 'timestamp', 'polarity'])
        starts = df[df['timestamp'] == 0].index


        for i, t in enumerate(starts):
            sample = df.iloc[t:starts[i+1]] if i+1 < len(starts) else df.iloc[t:]
            SAVE = f"{DIR}/{MODE.lower()}_{l}_{i:04}.bin"

            # Throw out samples with < 1000 events 
            if len(sample) < 1000:
                continue

            data = [tuple(row) for row in sample.to_numpy()]
            data = np.array(data, dtype=[('x', '<i8'), ('y', '<i8'), ('t', '<i8'), ('p', '<i8')])

            with open(SAVE, 'wb') as f:
                np.save(f, data)

In [None]:
letters = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'k',
           'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 
           'v', 'w', 'x', 'y']

process_data("train", letters)
process_data("test", letters)

In [None]:
def process_files_by_letter(letters, data_dir, train_ratio=0.7):
    """
    Process files for each letter in the given list within the specified data directory.

    Args:
    letters (list): List of letters to process.
    data_dir (str): Path to the directory containing the data.
    train_ratio (float, optional): The ratio of training data to total data. Defaults to 0.7.

    Notes:
    - For each letter, the function reads the files within the corresponding subdirectory in the data directory.
    - It shuffles the indices to create training and test sets based on the train_ratio split.
    - It then prints the filenames in the set for each letter.

    Example:
    >>> letters = ['a', 'b', 'c']
    >>> data_dir = "/path/to/data"
    >>> process_files_by_letter(letters, data_dir)
    """
    for l in letters:
        letter_dir = os.path.join(data_dir, l)
        files = sorted(os.listdir(letter_dir))

        # Shuffle indices - create train and test sets
        idx = np.arange(len(files))
        np.random.shuffle(idx)
        s = int(len(files) * train_ratio)
        train_idx = idx[:s]
        test_idx = idx[s:]

        train_files = [f for i, f in enumerate(files) if i in train_idx]
        test_files = [f for i, f in enumerate(files) if i in test_idx]

        # Print filenames in the test set for each letter
        for f in test_files:
            print(f)

        print('----------\n')


In [None]:
# Define the directory containing the data and the training ratio to split data
MODE = "train"
DIR = f"/Users/ria/Documents/GitHub/COSC525_ASL_SNN/data/{MODE}"
train_ratio = 0.7

# Process files for each letter
process_files_by_letter(letters, DIR, train_ratio)

### Create Trainset and Testset Labels CSV

Creates a CSV for both the trainset and testset that contains the following:
- `file`: the file path to the sample binary file
- `label`: the class label associated to the sample

In [None]:
def create_data_csv(data_dir, output_path):
    """
    Create a DataFrame from files in the specified data directory and save it as a CSV.

    Args:
    data_dir (str): Path to the directory containing the data files.
    output_path (str): Path to save the output CSV file.
    """
    df = pd.DataFrame(columns=['file', 'label'])
    labels = {}
    label_count = 0

    for f in sorted(os.listdir(data_dir)):
        label = f.split('_')[0]

        if label not in labels:
            labels[label] = label_count
            label_count += 1

        path = os.path.join(data_dir, f)

        data = pd.DataFrame({'file': [path], 'label': [labels[label]]})
        df = pd.concat([df, data])

    df.to_csv(output_path, index=False)


In [None]:
create_data_csv("../data/train", "../data/train/train_data.csv")
create_data_csv("../data/test", "../data/test/test_data.csv")

### Metadata

This prints out the number of samples created for training and testing (used as validation in our model training).

In [110]:
trainset = pd.read_csv("../data/train_data.csv")
testset = pd.read_csv("../data/test_data.csv")

print(f"Trainset: {len(trainset)} items")
print(f"Testset: {len(testset)} items")
print('-----')
print(f"Total data: {len(trainset) + len(testset)} items")

Trainset: 497 items
Testset: 214 items
-----
Total data: 711 items
