# Data Explorer

This notebook visualizes what is going on in the dataset. 

Each `.aedat4` file contains an event stream that contains event batches.
Event batches contains events that occured in each frame. 
The number of events in each batch can vary, especially for each letter and each subject.

This notebook aims to help us understand our dataset better and how the authors collected it.
It will assist us in passing the data as input spikes to the spiking neural network we created.

#### Helpful References:
* [IniVation DV-Processing API Documentation](https://dv-processing.inivation.com/rel_1_7/api.html#api)
   * Particularly, the [`Accumulator` class](https://dv-processing.inivation.com/rel_1_7/api.html#_CPPv4N2dv11AccumulatorE) is helpful in explaining how events are stored in event batches

## 0 - Import packages and define helper functions

In [82]:
import pandas as pd
import matplotlib.pyplot as plt
import time, os
import dv_processing as dv
from PIL import Image
import numpy as np
import json

In [2]:
# generate_gif(imgs, subject, letter)
# generate_images_from_aedat(file, gif=False)

# events_to_img(sample, resolution)
# eviz = dv_processing.visualization.EventVisualizer(resolution, white, black, black)

In [18]:
def generate_gif(imgs, subject, letter):
    imgs = [Image.fromarray(img) for img in imgs]
    d = 100 # duration of each frame in GIF (in milliseconds)

    imgs[0].save(f"../animations/subject{subject}_{letter}.gif", save_all=True, append_images=imgs[1:], duration=d, loop=0)
    

def events_to_img(sample, resolution):
    white = (255.0, 255.0, 255.0)
    black = (0.0, 0.0, 0.0)

    eviz = dv.visualization.EventVisualizer(resolution, black, white, white)
    img = eviz.generateImage(sample)
    img = Image.fromarray(img).convert('L') # Convert to grayscale image (3 channels -> 1 channel)
    img = np.array(img) / 255.0

    return img

def parse_aedat(file):

    # Read event stream from file and split into event packets
    recording = dv.io.MonoCameraRecording(file)
    resolution = recording.getEventResolution()

    min_batch_size, max_batch_size = np.inf, 0
    imgs = []

    if recording.isEventStreamAvailable():
        samples = 0
        while True:
            sample = recording.getNextEventBatch()
            if sample is None:
                break
            sample_length = len(sample.numpy())

            if min_batch_size > sample_length:
                min_batch_size = sample_length
            
            if max_batch_size < sample_length:
                max_batch_size = sample_length

            # Create image from sample of recording
            imgs.append(events_to_img(sample, resolution))
            
            samples += 1


        print("\n-----")
        print("Total # of Samples:", samples)
        print("Minimum # of Events For Recording:", min_batch_size)
        print("Maximum # of Events For Recording:", max_batch_size)
    
    return np.array(imgs)

## 1 - Parse AEDAT4 Data

In [19]:
subject = 1
letter = 'a'
AEDAT = f"../data/aedat/subject{subject}/{letter}.aedat4"
GIF = False

In [57]:
imgs = parse_aedat(AEDAT)

if GIF:
    generate_gif(imgs, subject, letter)

img = imgs[0]




-----
Total # of Samples: 2663
Minimum # of Events For Recording: 9
Maximum # of Events For Recording: 3652


In [38]:
imgs[0].flatten().shape

(43200,)

### Store image data into csv for all subjects

In [87]:
def store_data_per_letter(letter):
    subjects = [1, 2, 3, 4, 5]
    cols = ["subject", "sample", "num_events", "duration", "image_array"]
    df = pd.DataFrame(columns=cols)

    for sub in subjects:
        FILE = f"../data/aedat/subject{sub}/{letter}.aedat4"
        if os.path.isfile(FILE):

            # Read event stream from file and split into event packets
            recording = dv.io.MonoCameraRecording(FILE)
            resolution = recording.getEventResolution()

            if recording.isEventStreamAvailable():
                samples = 0
                while True:
                    # Get the next sample in the recording
                    sample = recording.getNextEventBatch()
                    if sample is None:
                        break

                    num_events = len(sample.numpy())
                    duration = dv.EventStore.duration(sample)

                    # Create image from the sample 
                    img = events_to_img(sample, resolution).flatten()
                    # simg = pd.arrays.SparseArray(img.flatten(), fill_value=0)
                    

                    # Save image into csv format
                    data = {
                        'subject': sub, 
                        'sample': samples, 
                        'num_events': num_events, 
                        'duration':duration, 
                        'image_array': [np.array(img)]
                        }
                    d = pd.DataFrame(data, dtype=object)
                    d['image_array'] = d['image_array'].apply(lambda x: json.dumps(x.tolist()))
                    
                    df = pd.concat([df, d])
                    samples += 1
        

    SAVE = f"../data/all/{letter}.csv"
    df.to_csv(SAVE, index=False)

    return

def gen_train_test_dfs(letters, train=0.7):
    cols = ["letter", "subject", "sample", "num_events", "duration", "image_array"]
    full_train = pd.DataFrame(columns=cols)
    full_test = pd.DataFrame(columns=cols)

    for letter in letters:
        subjects = [1, 2, 3, 4, 5]
        FILE = f"../data/all/{letter}.csv"
 
        if os.path.isfile(FILE):
            df = pd.read_csv(FILE)
            size = df.shape[0]
            print("DF size:", size)
            
            df.insert(0, "letter", [letter] * size, True)

            for sub in subjects:
                s = df[df["subject"] == sub]
                sub_size = s.shape[0]
                print(f"Total # of subject {sub} samples: {sub_size}")

            print()

            train_df = df.sample(frac=train)
            full_train = pd.concat([full_train, train_df])
            print("Randomized Trainset size:", train_df.shape[0])
            
            test_idx = df.index.symmetric_difference(train_df.index)
            test_df = df.iloc[test_idx]
            full_test = pd.concat([full_test, test_df])
            print("Randomized Testset size:", test_df.shape[0])

            for sub in subjects:
                s = train_df[train_df["subject"] == sub]
                print(f"# of subject {sub} trainset samples: {s.shape[0]}")
            
                f = test_df[test_df["subject"] == sub]
                print(f"# of subject {sub} testset samples: {f.shape[0]}")

    full_train.to_csv("../data/trainset.csv", index=False)
    full_test.to_csv("../data/testset.csv", index=False)
            
            


In [88]:
# DIR = "../data/all"
# if not os.path.isdir(DIR):
#     os.makedirs(DIR)

letters = [i.split('/')[0].split('.')[0] for i in sorted(os.listdir("../data/aedat/subject1")) if "aedat" in i]
letters.append('z')

store_data_per_letter('a')
# for letter in letters:
#     store_data_per_letter(letter)

KeyboardInterrupt: 

In [None]:
# store_data_per_letter('y')
# store_data_per_letter('z')

## 2 - Random Sampling of Recording

In [81]:
df = pd.read_csv("../data/all/a.csv")

simg = df['image_array']
np.array(simg[0])


array('[0. 0. 0. ... 0. 0. 0.]', dtype='<U23')

In [196]:
# subjects = [1, 2, 3, 4, 5]

# size = df.shape[0]
# print("DF size:", size)
# for sub in subjects:
#     s = df[df["subject"] == sub]
#     sub_size = s.shape[0]
#     print(f"Total # of subject {sub} samples: {sub_size}")

# print()

# random = df.sample(frac=0.7)
# print("Randomized DF size:", random.shape[0])
# for sub in subjects:
#     s = random[random["subject"] == sub]
#     sub_size = s.shape[0]
#     print(f"# of subject {sub} randomized samples: {sub_size}")




In [197]:
gen_train_test_dfs(letters)

DF size: 40481
Total # of subject 1 samples: 2663
Total # of subject 2 samples: 6558
Total # of subject 3 samples: 7814
Total # of subject 4 samples: 5910
Total # of subject 5 samples: 17536

Randomized Trainset size: 28337
Randomized Testset size: 12144
# of subject 1 trainset samples: 1902
# of subject 1 testset samples: 761
# of subject 2 trainset samples: 4638
# of subject 2 testset samples: 1920
# of subject 3 trainset samples: 5416
# of subject 3 testset samples: 2398
# of subject 4 trainset samples: 4099
# of subject 4 testset samples: 1811
# of subject 5 trainset samples: 12282
# of subject 5 testset samples: 5254
DF size: 49949
Total # of subject 1 samples: 4836
Total # of subject 2 samples: 9941
Total # of subject 3 samples: 9268
Total # of subject 4 samples: 7213
Total # of subject 5 samples: 18691

Randomized Trainset size: 34964
Randomized Testset size: 14985
# of subject 1 trainset samples: 3405
# of subject 1 testset samples: 1431
# of subject 2 trainset samples: 6988
# 

In [7]:
train = pd.read_csv('../data/trainset.csv', index_col=False)
train = train[~train.letter.str.contains('z')]
train

Unnamed: 0,letter,subject,sample,num_events,duration,image_array
0,a,2,1399,813,0 days 00:00:00.010004,[255 255 255 ... 255 255 255]
1,a,5,4934,1759,0 days 00:00:00.010002,[255 255 255 ... 255 255 255]
2,a,2,6254,233,0 days 00:00:00.010103,[255 255 255 ... 255 255 255]
3,a,3,6018,1854,0 days 00:00:00.010001,[255 255 255 ... 255 255 255]
4,a,2,3001,2048,0 days 00:00:00.010024,[255 255 255 ... 255 255 255]
...,...,...,...,...,...,...
761555,y,3,5499,3009,0 days 00:00:00.010000,[255 255 255 ... 255 255 255]
761556,y,2,6139,2822,0 days 00:00:00.010000,[255 255 255 ... 255 0 255]
761557,y,1,463,192,0 days 00:00:00.010020,[255 255 255 ... 255 255 255]
761558,y,2,1397,1194,0 days 00:00:00.010001,[255 255 255 ... 255 255 255]


In [27]:
np.array(train['image_array'].iloc[0])
    

array('[255 255 255 ... 255 255 255]', dtype='<U29')