## Import hfo data from .mat files and process it

### Check WD (change if necessary) and file loading

In [117]:
# Show current directory
import os
curr_dir = os.getcwd()
print(curr_dir)

# Check if the current WD is the file location
if "/src/seeg_data/clinical" not in os.getcwd():
    # Set working directory to this file location
    file_location = f"{os.getcwd()}/thesis-lava/src/seeg_data/clinical"
    print("File Location: ", file_location)

    # Change the current working Directory
    os.chdir(file_location)

    # New Working Directory
    print("New Working Directory: ", os.getcwd())

# Choose the patient
PATIENT_LABEL = 'ics' # 'csl'
PATH_TO_FILE = f'patients/{PATIENT_LABEL}/' # 'patients/csl/'   # This is needed if the WD is not the same as the file location

/home/monkin/Desktop/feup/thesis/thesis-lava/src/seeg_data/clinical


The `.mat` files are not uploaded to GitHub. As such, they must be taken from GDrive and moved to the `clinical` folder

In [118]:
import scipy.io as sio
import numpy as np

# Load the data
data = sio.loadmat(f'{PATH_TO_FILE}signal.mat')

# Print the data structure
print(data.keys())

dict_keys(['__header__', '__version__', '__globals__', 'channel_types', 'channels', 'data', 'duration', 'samples', 'sr'])


# Print the content of the .mat file

## General experiment information

In [119]:
sampling_rate = data['sr'][0][0]
input_duration = data['duration'][0][0]
num_samples = data['samples'][0][0]

print(f"sr: {sampling_rate}")  # Sampling rate

print(f"duration: {input_duration}")

print(f"samples: {num_samples}")

sr: 2048.0
duration: 61.0625
samples: 125056


- The **sampling rate** is 2048 Hz, which means that 2048 samples are recorded per second (for each electrode).
- The **duration** of the experiment is ~63.1s.
- There is a **total of 129239 samples** (sampling_rate * duration) per electrode.

## Channel information

In [120]:
num_channels = data['channels'].shape[1]

print(f"Shape of channels: {data['channels'].shape}")

print(f"channels: {data['channels']}")

print("=====================================\n\n\n")

print(f"channel_types: {data['channel_types']}")

Shape of channels: (1, 101)
channels: [[array(['AMS1-AMS2'], dtype='<U9') array(['AMS2-AMS3'], dtype='<U9')
  array(['AMS3-AMS4'], dtype='<U9') array(['AMS4-AMS5'], dtype='<U9')
  array(['AMS5-AMS6'], dtype='<U9') ...
  array(['Ch119-Ch120'], dtype='<U11')
  array(['Ch120-Ch121'], dtype='<U11')
  array(['Ch121-Ch122'], dtype='<U11')
  array(['Ch122-Ch123'], dtype='<U11')
  array(['Ch123-Ch124'], dtype='<U11')]]



channel_types: [[array(['SEEG'], dtype='<U4') array(['SEEG'], dtype='<U4')
  array(['SEEG'], dtype='<U4') array(['SEEG'], dtype='<U4')
  array(['SEEG'], dtype='<U4') ... array(['SEEG'], dtype='<U4')
  array(['SEEG'], dtype='<U4') array(['SEEG'], dtype='<U4')
  array(['SEEG'], dtype='<U4') array(['SEEG'], dtype='<U4')]]


This recording has 86 channels, which correspond to 86 electrodes**. 

The channels are divided into **X groups of Y channels each**. Each **group** corresponds to a **different brain region**:
- **Group 1**: OT'8
- **Group 2**: B'1
- **Group 3**: GPH'2
- **Group 4**: A'1
- **Group 5**: I6
- **Group 6**: PM6
- **Group 7**: PM10
- **Group 8**: CR5

## Markers information
The `.mat` file does not contain relevant info about the markers. We will load it from a `.csv` file

In [121]:
if 'markers' not in data.keys():
    print("No markers in the data")
else:
    print("Shape of markers: ", data['markers'].shape)

    print(f"markers: {data['markers']}")

No markers in the data


## Read Markers data from the csv file

In [122]:
# Read csv file 
import pandas as pd

markers_csv = pd.read_csv(f'{PATH_TO_FILE}markers_for_test.csv')

# print(markers_csv)

# Remove marker column- `Value`: I'm not sure what it refers to
markers_csv = markers_csv.drop(columns=['Value'])

print(markers_csv)

           Label        Pos  Duration     Target
0         Ripple   1.877930       0.0  HPA1-HPA2
1         Ripple   1.877930       0.0  HPA2-HPA3
2         Ripple   1.877930       0.0  HPA3-HPA4
3         Ripple   1.877930       0.0  HPA4-HPA5
4         Ripple   2.577148       0.0  HPA1-HPA2
..           ...        ...       ...        ...
610  Fast Ripple  60.981934       0.0  HPA3-HPA4
611       Ripple  60.982422       0.0  HPA1-HPA2
612       Ripple  60.982422       0.0  HPA2-HPA3
613       Ripple  60.982422       0.0  HPA3-HPA4
614       Ripple  60.982422       0.0  HPA4-HPA5

[615 rows x 4 columns]


## Create a column for the channel indices

## Map channel names to indices

In [123]:
channel_idx_map = {}

for idx in range(num_channels):
    # print(f"Channel {idx}: {data['channels'][0][idx][0]}")
    channel_idx_map[data['channels'][0][idx][0]] = idx
    

print("channel_idx_map: ", channel_idx_map)

channel_idx_map:  {'AMS1-AMS2': 0, 'AMS2-AMS3': 1, 'AMS3-AMS4': 2, 'AMS4-AMS5': 3, 'AMS5-AMS6': 4, 'AMS6-AMS7': 5, 'AMS7-AMS8': 6, 'AMS8-AMS9': 7, 'AMS9-AMS10': 8, 'AMS10-AMS11': 9, 'CA1-CA2': 10, 'CA2-CA3': 11, 'CA3-CA4': 12, 'CA8-CA9': 13, 'CA9-CA10': 14, 'CA10-CA11': 15, 'CA11-CA12': 16, 'OpF1-OpF2': 17, 'OpF2-OpF3': 18, 'OpF3-OpF4': 19, 'OpF4-OpF5': 20, 'OpF5-OpF6': 21, 'OpF6-OpF7': 22, 'FO1-FO2': 23, 'FO2-FO3': 24, 'FO3-FO4': 25, 'FO6-FO7': 26, 'FO7-FO8': 27, 'FO11-FO12': 28, 'PT1-PT2': 29, 'PT2-PT3': 30, 'PT3-PT4': 31, 'PT4-PT5': 32, 'PT5-PT6': 33, 'PT6-PT7': 34, 'PT7-PT8': 35, 'PT8-PT9': 36, 'A1-A2': 37, 'A2-A3': 38, 'A3-A4': 39, 'A4-A5': 40, 'A8-A9': 41, 'A9-A10': 42, 'A10-A11': 43, 'A11-A12': 44, 'A12-A13': 45, 'A13-A14': 46, 'HPA1-HPA2': 47, 'HPA2-HPA3': 48, 'HPA3-HPA4': 49, 'HPA7-HPA8': 50, 'HPA8-HPA9': 51, 'HPA9-HPA10': 52, 'HPA10-HPA11': 53, 'HPA11-HPA12': 54, 'HPA12-HPA13': 55, 'HPP1-HPP2': 56, 'HPP2-HPP3': 57, 'HPP5-HPP6': 58, 'HPP6-HPP7': 59, 'HPP7-HPP8': 60, 'HPP8-HPP9

### Get the channel idx from the channel names

In [124]:
# Map the channel names to the channel index. If mapping is not found, delete the row
markers_csv['channel_idx'] = markers_csv['Target'].map(channel_idx_map)

# Drop rows with NaN values in the channel_idx column (TODO: Check why some channel names are not valid)
markers_csv = markers_csv.dropna(subset=['channel_idx'])

# Convert the channel_idx column to int
markers_csv['channel_idx'] = markers_csv['channel_idx'].astype(int)

# Drop the column with the channel name now that we have the index
markers_csv = markers_csv.drop(columns=['Target'])

markers_csv

Unnamed: 0,Label,Pos,Duration,channel_idx
0,Ripple,1.877930,0.0,47
1,Ripple,1.877930,0.0,48
2,Ripple,1.877930,0.0,49
4,Ripple,2.577148,0.0,47
5,Ripple,2.577148,0.0,48
...,...,...,...,...
609,Fast Ripple,60.981934,0.0,48
610,Fast Ripple,60.981934,0.0,49
611,Ripple,60.982422,0.0,47
612,Ripple,60.982422,0.0,48


###  Convert the Position and Duration column from seconds to milliseconds

In [125]:
markers_csv['Pos'] = markers_csv['Pos'].map(lambda x: x*1000)
markers_csv['Duration'] = markers_csv['Duration'].map(lambda x: x*1000)   # Should be 0 already in the synthetic dataset

print(markers_csv)

           Label           Pos  Duration  channel_idx
0         Ripple   1877.929688       0.0           47
1         Ripple   1877.929688       0.0           48
2         Ripple   1877.929688       0.0           49
4         Ripple   2577.148438       0.0           47
5         Ripple   2577.148438       0.0           48
..           ...           ...       ...          ...
609  Fast Ripple  60981.933594       0.0           48
610  Fast Ripple  60981.933594       0.0           49
611       Ripple  60982.421875       0.0           47
612       Ripple  60982.421875       0.0           48
613       Ripple  60982.421875       0.0           49

[523 rows x 4 columns]


### Create a numpy array from the csv file

In [126]:
markers_npy = markers_csv.to_numpy()

markers_npy

array([['Ripple', 1877.9296875, 0.0, 47],
       ['Ripple', 1877.9296875, 0.0, 48],
       ['Ripple', 1877.9296875, 0.0, 49],
       ['Ripple', 2577.1484375, 0.0, 47],
       ['Ripple', 2577.1484375, 0.0, 48],
       ...,
       ['Fast Ripple', 60981.93359375, 0.0, 48],
       ['Fast Ripple', 60981.93359375, 0.0, 49],
       ['Ripple', 60982.421875, 0.0, 47],
       ['Ripple', 60982.421875, 0.0, 48],
       ['Ripple', 60982.421875, 0.0, 49]], dtype=object)

## Join all the markers data into a single structured datatype
We want to group the markers by channel, so the shape of the structure datatype = (num_channels, [events])

The structured datatype will have the following fields:
- `label`: the type of event that occurred (can be a combination of 'Spike', 'Ripple' and 'Fast-Ripple')
- `position`: the position of the marker in milliseconds
- `duration`: the duration of the event in milliseconds

The index of each row corresponds to the channel the event is related to

In [127]:
from utils.io import preview_np_array

# Create an empty np array to store the markers
# markers_arr = [ np.empty(shape=(1), dtype=[('label', 'U64'), ('position', np.float32), ('duration', np.float32)]) for _ in range(num_channels)]
markers_arr = np.empty(num_channels).tolist()
for idx in range(num_channels):
    markers_arr[idx] = [np.empty(shape=(0), dtype=[('label', 'U64'), ('position', np.float32), ('duration', np.float32)])]

print("markers_arr: ", markers_arr)

markers_arr:  [[array([],
      dtype=[('label', '<U64'), ('position', '<f4'), ('duration', '<f4')])], [array([],
      dtype=[('label', '<U64'), ('position', '<f4'), ('duration', '<f4')])], [array([],
      dtype=[('label', '<U64'), ('position', '<f4'), ('duration', '<f4')])], [array([],
      dtype=[('label', '<U64'), ('position', '<f4'), ('duration', '<f4')])], [array([],
      dtype=[('label', '<U64'), ('position', '<f4'), ('duration', '<f4')])], [array([],
      dtype=[('label', '<U64'), ('position', '<f4'), ('duration', '<f4')])], [array([],
      dtype=[('label', '<U64'), ('position', '<f4'), ('duration', '<f4')])], [array([],
      dtype=[('label', '<U64'), ('position', '<f4'), ('duration', '<f4')])], [array([],
      dtype=[('label', '<U64'), ('position', '<f4'), ('duration', '<f4')])], [array([],
      dtype=[('label', '<U64'), ('position', '<f4'), ('duration', '<f4')])], [array([],
      dtype=[('label', '<U64'), ('position', '<f4'), ('duration', '<f4')])], [array([],
      

In [128]:
# Iterate all the marked events and add them to markers_arr
for rowIdx in range(len(markers_npy)):    
    currObj = markers_npy[rowIdx]
    channelIdx = currObj[3]

    currArr = markers_arr[channelIdx]
        
    currRow = np.array((currObj[0], currObj[1], currObj[2]), dtype=[('label', 'U64'), ('position', np.float32), ('duration', np.float32)])  # Create tuple (Label, Pos, Duration)
    # currArr.append(currRow)
    # Append the tuple to the numpy array
    currArr = np.append(currArr, currRow)
    
    # Update the array of the current channel    
    markers_arr[channelIdx] = currArr

print("Markers Array: ", markers_arr)

Markers Array:  [[array([],
      dtype=[('label', '<U64'), ('position', '<f4'), ('duration', '<f4')])], [array([],
      dtype=[('label', '<U64'), ('position', '<f4'), ('duration', '<f4')])], [array([],
      dtype=[('label', '<U64'), ('position', '<f4'), ('duration', '<f4')])], [array([],
      dtype=[('label', '<U64'), ('position', '<f4'), ('duration', '<f4')])], [array([],
      dtype=[('label', '<U64'), ('position', '<f4'), ('duration', '<f4')])], [array([],
      dtype=[('label', '<U64'), ('position', '<f4'), ('duration', '<f4')])], [array([],
      dtype=[('label', '<U64'), ('position', '<f4'), ('duration', '<f4')])], [array([],
      dtype=[('label', '<U64'), ('position', '<f4'), ('duration', '<f4')])], [array([],
      dtype=[('label', '<U64'), ('position', '<f4'), ('duration', '<f4')])], [array([],
      dtype=[('label', '<U64'), ('position', '<f4'), ('duration', '<f4')])], [array([],
      dtype=[('label', '<U64'), ('position', '<f4'), ('duration', '<f4')])], [array([],
    

###  Convert the array to a numpy array to allow writing to a .npy file

In [129]:
# Convert the 2D array to a 1D numpy array
final_markers_npy = np.array(markers_arr, dtype=object)

preview_np_array(final_markers_npy, "final_markers_npy")

final_markers_npy Shape: (101,).
Preview: [list([array([],
       dtype=[('label', '<U64'), ('position', '<f4'), ('duration', '<f4')])])
 list([array([],
       dtype=[('label', '<U64'), ('position', '<f4'), ('duration', '<f4')])])
 list([array([],
       dtype=[('label', '<U64'), ('position', '<f4'), ('duration', '<f4')])])
 list([array([],
       dtype=[('label', '<U64'), ('position', '<f4'), ('duration', '<f4')])])
 list([array([],
       dtype=[('label', '<U64'), ('position', '<f4'), ('duration', '<f4')])])
 ...
 list([array([],
       dtype=[('label', '<U64'), ('position', '<f4'), ('duration', '<f4')])])
 list([array([],
       dtype=[('label', '<U64'), ('position', '<f4'), ('duration', '<f4')])])
 list([array([],
       dtype=[('label', '<U64'), ('position', '<f4'), ('duration', '<f4')])])
 list([array([],
       dtype=[('label', '<U64'), ('position', '<f4'), ('duration', '<f4')])])
 list([array([],
       dtype=[('label', '<U64'), ('position', '<f4'), ('duration', '<f4')])])]


### Write the processed markers into a .npy file

In [130]:
file_name = f"{PATH_TO_FILE}seeg_{PATIENT_LABEL}_markers.npy"

EXPORT_MARKERS = True
if EXPORT_MARKERS:
    np.save(file_name, final_markers_npy)   # Save the data to a numpy file (not stored in git due to size)

## See what is the average number of annotated events per channel

In [131]:
ripple_counter = 0
fr_counter = 0

for marker in markers_npy:
    curr_label = marker[0]
    curr_pos = marker[1]

    if curr_label == 'Ripple':
        ripple_counter += 1
    elif curr_label == 'Fast Ripple':
        fr_counter += 1
    else:
        print(f"Unknown label: {curr_label}")

print(f"Ripple Count: {ripple_counter}")
print(f"Fast Ripple Count: {fr_counter}")

Ripple Count: 329
Fast Ripple Count: 194


In [132]:
# Ripple Frequency per minute and per channel
ripple_freq = ripple_counter / ((input_duration / 60) * num_channels)
fr_freq = fr_counter / ((input_duration / 60) * num_channels)

print(f"Ripple Frequency: {ripple_freq} ripples / minute / channel")
print(f"Fast Ripple Frequency: {fr_freq} fast ripples / minute / channel")

Ripple Frequency: 3.2007458678314094 ripples / minute / channel
Fast Ripple Frequency: 1.8873699038276395 fast ripples / minute / channel


### Find the channels with the most annotated events

In [133]:
# Build a dictionary with the channel idx and the number of annotated events
channel_event_count = {}
for idx in range(len(final_markers_npy)):
    channel_event_count[idx] = len(final_markers_npy[idx])
print("Channel Event Count: ", channel_event_count)

# Sort the dictionary by the number of events
sorted_channel_event_count = dict(sorted(channel_event_count.items(), key=lambda item: item[1], reverse=True))
print("Sorted Channel Event Count: ", sorted_channel_event_count)
# {ch_idx: num_events}

Channel Event Count:  {0: 1, 1: 1, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1, 7: 1, 8: 1, 9: 1, 10: 1, 11: 1, 12: 1, 13: 1, 14: 1, 15: 1, 16: 1, 17: 1, 18: 1, 19: 1, 20: 1, 21: 1, 22: 1, 23: 1, 24: 1, 25: 1, 26: 1, 27: 1, 28: 1, 29: 4, 30: 4, 31: 4, 32: 3, 33: 1, 34: 1, 35: 1, 36: 1, 37: 8, 38: 8, 39: 8, 40: 3, 41: 1, 42: 1, 43: 1, 44: 1, 45: 1, 46: 1, 47: 132, 48: 132, 49: 132, 50: 1, 51: 1, 52: 1, 53: 1, 54: 1, 55: 1, 56: 43, 57: 8, 58: 1, 59: 1, 60: 1, 61: 1, 62: 1, 63: 1, 64: 1, 65: 1, 66: 1, 67: 1, 68: 1, 69: 1, 70: 1, 71: 1, 72: 1, 73: 1, 74: 1, 75: 1, 76: 2, 77: 1, 78: 1, 79: 1, 80: 1, 81: 1, 82: 1, 83: 2, 84: 2, 85: 2, 86: 2, 87: 2, 88: 1, 89: 1, 90: 1, 91: 1, 92: 1, 93: 1, 94: 1, 95: 1, 96: 1, 97: 1, 98: 1, 99: 1, 100: 1}
Sorted Channel Event Count:  {47: 132, 48: 132, 49: 132, 56: 43, 37: 8, 38: 8, 39: 8, 57: 8, 29: 4, 30: 4, 31: 4, 32: 3, 40: 3, 76: 2, 83: 2, 84: 2, 85: 2, 86: 2, 87: 2, 0: 1, 1: 1, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1, 7: 1, 8: 1, 9: 1, 10: 1, 11: 1, 12: 1, 13: 1, 14: 1, 15: 1, 

---

## SEEG Data

In [134]:
print(f"data: {data['data']}")

print(f"Shape of data: {data['data'].shape}")
# Shape of the data is (channels, samples)
# Nº of channels = 960 (Each channel represents a different electrode)
# Nº of samples = 245760

data: [[-107.66629   -111.653915  -113.51482   -111.38808   -114.844025  ...
    11.16539     10.10202     15.418877    10.10202      4.253479 ]
 [ -81.347855   -82.942924   -81.61371    -85.60135    -84.53797   ...
     9.038654     7.443596     4.7851715    6.1143875    6.6460648]
 [  19.14067     17.811462    10.3678665    5.582695     2.6584244 ...
    58.48539     57.156174    52.90269     49.446735    46.788315 ]
 [  34.027863    39.07888     46.25663     51.0418      55.029434  ...
    12.760448    14.88719     17.811459    18.608988    20.73573  ]
 [-127.60449   -130.26292   -130.7946    -127.87033   -128.13617   ...
  -172.00021   -166.9492    -165.88583   -159.23976   -155.25212  ]
 ...
 [  35.35707     34.825394    36.95214     37.749657    38.547188  ...
  -287.11008   -289.23685   -287.37592   -286.31256   -286.84424  ]
 [  36.42044     38.015503    36.952133    36.420456    37.48381   ...
    24.723358    24.723389    20.735718    19.938202    18.077301 ]
 [ -25.25505    

### Shape of the data
The shape of the data is (n_channels, n_samples)
- Number of channels: 960
- Number of samples: 245760

The **data corresponds** to the recordings of the SEEG signals from 960 channels, acquiring a total of 245760 samples for each channel.

In [135]:
recorded_data = data['data']

Each value of the `recorded_data` is a float number that represents the amplitude of the signal at that specific time (voltage). The voltage is measured in millivolts (mV)?? TODO: Check units

# Change the structure of the data

Let's change the structure of the data to a 2D array that is ordered by time. This way, we can use the input data of various channels together by following the time order. 

Therefore, let's **transform the shape from (num_channels, num_samples) to (num_samples, num_channels)**. This way, each row will represent a time point and contains the voltage values of all channels at that time point. 

It is not necessary to specify the time of each row since there is a designated sampling rate of the input.

The structure is exemplified below, with a total of 245760 rows:

| Channel 1       | Channel 2     | Channel ...     | Channel 960   |
|-----------------|---------------|-----------------|---------------|
| 3               | 1             | 3               | 1             |
| 8               | 0             | 7               | 15            |
| ...             | ...           | ...             | ...           |
| 14              | 5             | 3               | 1             |


## Select the channels to be used
For the sake of simplicity, we can define a list of channels to be used.

In [136]:
# channels_used: set = {1, 2, 3, 4, 5, 6, 7, 8}
channels_used = set(range(1, num_channels+1, 1))

print(channels_used)

{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101}


In [137]:
ordered_recorded_data = recorded_data.T     # Swap the structure of the recorded_data to (num_samples, num_channels)

ordered_recorded_data.shape

(125056, 101)

# Write the processed data to a .npy file

Finally, we write the processed data to a .npy file. This way, we can use it in the Spiking Neural Networks (SNN) model.

The .npy file is a binary file that contains the processed data in a numpy array format. This format is easy to read and write, and it is compatible with the numpy library.

In [138]:
file_name = f"{PATH_TO_FILE}seeg_{PATIENT_LABEL}.npy"

EXPORT_SEEG = True
if EXPORT_SEEG:
    np.save(file_name, ordered_recorded_data)   # Save the data to a numpy file (not stored in git due to size)