## Import hfo data from .mat files and process it

### Check WD (change if necessary) and file loading

In [1]:
# Show current directory
import os
curr_dir = os.getcwd()
print(curr_dir)

# Check if the current WD is the file location
if "/src/seeg_data/clinical" not in os.getcwd():
    # Set working directory to this file location
    file_location = f"{os.getcwd()}/thesis-lava/src/seeg_data/clinical"
    print("File Location: ", file_location)

    # Change the current working Directory
    os.chdir(file_location)

    # New Working Directory
    print("New Working Directory: ", os.getcwd())

# Choose the patient
PATIENT_LABEL = 'csl' # 'csl'
PATH_TO_FILE = f'patients/{PATIENT_LABEL}/' # 'patients/csl/'   # This is needed if the WD is not the same as the file location

/home/monkin/Desktop/feup/thesis
File Location:  /home/monkin/Desktop/feup/thesis/thesis-lava/src/seeg_data/clinical
New Working Directory:  /home/monkin/Desktop/feup/thesis/thesis-lava/src/seeg_data/clinical


The `.mat` files are not uploaded to GitHub. As such, they must be taken from GDrive and moved to the `clinical` folder

In [2]:
import scipy.io as sio
import numpy as np

# Load the data
data = sio.loadmat(f'{PATH_TO_FILE}signal.mat')

# Print the data structure
print(data.keys())

dict_keys(['__header__', '__version__', '__globals__', 'channel_types', 'channels', 'data', 'duration', 'markers', 'samples', 'sr'])


# Print the content of the .mat file

## General experiment information

In [3]:
sampling_rate = data['sr'][0][0]
input_duration = data['duration'][0][0]
num_samples = data['samples'][0][0]

print(f"sr: {sampling_rate}")  # Sampling rate

print(f"duration: {input_duration}")

print(f"samples: {num_samples}")

sr: 2048.0
duration: 63.10498046875
samples: 129239


- The **sampling rate** is 2048 Hz, which means that 2048 samples are recorded per second (for each electrode).
- The **duration** of the experiment is ~63.1s.
- There is a **total of 129239 samples** (sampling_rate * duration) per electrode.

## Channel information

In [4]:
num_channels = data['channels'].shape[1]

print(f"Shape of channels: {data['channels'].shape}")

print(f"channels: {data['channels']}")

print("=====================================\n\n\n")

print(f"channel_types: {data['channel_types']}")

Shape of channels: (1, 86)
channels: [[array(["CA'1-CA'2"], dtype='<U9') array(["CA'2-CA'3"], dtype='<U9')
  array(["CA'3-CA'4"], dtype='<U9') array(["CA'9-CA'10"], dtype='<U10')
  array(["CA'10-CA'11"], dtype='<U11')
  array(["CA'11-CA'12"], dtype='<U11')
  array(["CA'12-CA'13"], dtype='<U11')
  array(["CA'13-CA'14"], dtype='<U11') array(["TP'1-TP'2"], dtype='<U9')
  array(["TP'2-TP'3"], dtype='<U9') array(["TP'5-TP'6"], dtype='<U9')
  array(["TP'6-TP'7"], dtype='<U9') array(["TP'7-TP'8"], dtype='<U9')
  array(["Hes'1-Hes'2"], dtype='<U11')
  array(["Hes'2-Hes'3"], dtype='<U11')
  array(["Hes'5-Hes'6"], dtype='<U11')
  array(["Hes'6-Hes'7"], dtype='<U11')
  array(["TSM'1-TSM'2"], dtype='<U11')
  array(["TSM'2-TSM'3"], dtype='<U11')
  array(["TSM'3-TSM'4"], dtype='<U11')
  array(["TSM'4-TSM'5"], dtype='<U11')
  array(["TSM'5-TSM'6"], dtype='<U11')
  array(["Amg'1-Amg'2"], dtype='<U11')
  array(["Amg'2-Amg'3"], dtype='<U11')
  array(["Amg'3-Amg'4"], dtype='<U11')
  array(["Amg'4-Amg'5"]

This recording has 86 channels, which correspond to 86 electrodes**. 

The channels are divided into **X groups of Y channels each**. Each **group** corresponds to a **different brain region**:
- **Group 1**: OT'8
- **Group 2**: B'1
- **Group 3**: GPH'2
- **Group 4**: A'1
- **Group 5**: I6
- **Group 6**: PM6
- **Group 7**: PM10
- **Group 8**: CR5

## Markers information
The `.mat` file does not contain relevant info about the markers. We will load it from a `.csv` file

In [5]:
if 'markers' not in data.keys():
    print("No markers in the data")
else:
    print("Shape of markers: ", data['markers'].shape)

    print(f"markers: {data['markers']}")

Shape of markers:  (1, 18836)
markers: [[array(['FP - Fast Ripple'], dtype='<U16')
  array(['FP - Fast Ripple'], dtype='<U16')
  array(['FP - Ripple'], dtype='<U11') ...
  array(['FP - Fast Ripple'], dtype='<U16')
  array(['FP - Fast Ripple'], dtype='<U16')
  array(['FP - Ripple'], dtype='<U11')]]


## Read Markers data from the csv file

In [6]:
# Read csv file 
import pandas as pd

markers_csv = pd.read_csv(f'{PATH_TO_FILE}markers_for_test.csv')

# print(markers_csv)

# Remove marker column- `Value`: I'm not sure what it refers to
markers_csv = markers_csv.drop(columns=['Value'])

print(markers_csv)

       Label        Pos  Duration       Target
0     Ripple   0.522949       0.0  HPA'1-HPA'2
1     Ripple   0.522949       0.0  HPA'2-HPA'3
2     Ripple   0.522949       0.0  HPA'3-HPA'4
3     Ripple   0.522949       0.0  HPA'4-HPA'5
4     Ripple   0.522949       0.0    PH'1-PH'2
...      ...        ...       ...          ...
1596  Ripple  62.346191       0.0  Amg'2-Amg'3
1597  Ripple  62.346191       0.0  Amg'3-Amg'4
1598  Ripple  62.897949       0.0  Amg'1-Amg'2
1599  Ripple  62.897949       0.0  Amg'2-Amg'3
1600  Ripple  62.897949       0.0  Amg'3-Amg'4

[1601 rows x 4 columns]


## Create a column for the channel indices

## Map channel names to indices

In [7]:
channel_idx_map = {}

for idx in range(num_channels):
    # print(f"Channel {idx}: {data['channels'][0][idx][0]}")
    channel_idx_map[data['channels'][0][idx][0]] = idx
    

print("channel_idx_map: ", channel_idx_map)

channel_idx_map:  {"CA'1-CA'2": 0, "CA'2-CA'3": 1, "CA'3-CA'4": 2, "CA'9-CA'10": 3, "CA'10-CA'11": 4, "CA'11-CA'12": 5, "CA'12-CA'13": 6, "CA'13-CA'14": 7, "TP'1-TP'2": 8, "TP'2-TP'3": 9, "TP'5-TP'6": 10, "TP'6-TP'7": 11, "TP'7-TP'8": 12, "Hes'1-Hes'2": 13, "Hes'2-Hes'3": 14, "Hes'5-Hes'6": 15, "Hes'6-Hes'7": 16, "TSM'1-TSM'2": 17, "TSM'2-TSM'3": 18, "TSM'3-TSM'4": 19, "TSM'4-TSM'5": 20, "TSM'5-TSM'6": 21, "Amg'1-Amg'2": 22, "Amg'2-Amg'3": 23, "Amg'3-Amg'4": 24, "Amg'4-Amg'5": 25, "Amg'10-Amg'11": 26, "Amg'11-Amg'12": 27, "Amg'12-Amg'13": 28, "HPA'1-HPA'2": 29, "HPA'2-HPA'3": 30, "HPA'3-HPA'4": 31, "HPA'4-HPA'5": 32, "HPA'9-HPA'10": 33, "HPA'10-HPA'11": 34, "HPA'11-HPA'12": 35, "HPP'1-HPP'2": 36, "HPP'2-HPP'3": 37, "HPP'6-HPP'7": 38, "HPP'7-HPP'8": 39, "HPP'8-HPP'9": 40, "Et'1-Et'2": 41, "Et'2-Et'3": 42, "Et'5-Et'6": 43, "Et'6-Et'7": 44, "Et'7-Et'8": 45, "Et'8-Et'9": 46, "Et'9-Et'10": 47, "PH'1-PH'2": 48, "PH'2-PH'3": 49, "PH'3-PH'4": 50, "PH'6-PH'7": 51, "PH'7-PH'8": 52, "PH'8-PH'9": 

### Get the channel idx from the channel names

In [8]:
# Map the channel names to the channel index. If mapping is not found, delete the row
markers_csv['channel_idx'] = markers_csv['Target'].map(channel_idx_map)

# Drop rows with NaN values in the channel_idx column (TODO: Check why some channel names are not valid)
markers_csv = markers_csv.dropna(subset=['channel_idx'])

# Convert the channel_idx column to int
markers_csv['channel_idx'] = markers_csv['channel_idx'].astype(int)

# Drop the column with the channel name now that we have the index
markers_csv = markers_csv.drop(columns=['Target'])

markers_csv

Unnamed: 0,Label,Pos,Duration,channel_idx
0,Ripple,0.522949,0.0,29
1,Ripple,0.522949,0.0,30
2,Ripple,0.522949,0.0,31
3,Ripple,0.522949,0.0,32
4,Ripple,0.522949,0.0,48
...,...,...,...,...
1596,Ripple,62.346191,0.0,23
1597,Ripple,62.346191,0.0,24
1598,Ripple,62.897949,0.0,22
1599,Ripple,62.897949,0.0,23


###  Convert the Position and Duration column from seconds to milliseconds

In [9]:
markers_csv['Pos'] = markers_csv['Pos'].map(lambda x: x*1000)
markers_csv['Duration'] = markers_csv['Duration'].map(lambda x: x*1000)   # Should be 0 already in the synthetic dataset

print(markers_csv)

       Label           Pos  Duration  channel_idx
0     Ripple    522.949219       0.0           29
1     Ripple    522.949219       0.0           30
2     Ripple    522.949219       0.0           31
3     Ripple    522.949219       0.0           32
4     Ripple    522.949219       0.0           48
...      ...           ...       ...          ...
1596  Ripple  62346.191406       0.0           23
1597  Ripple  62346.191406       0.0           24
1598  Ripple  62897.949219       0.0           22
1599  Ripple  62897.949219       0.0           23
1600  Ripple  62897.949219       0.0           24

[1333 rows x 4 columns]


### Create a numpy array from the csv file

In [10]:
markers_npy = markers_csv.to_numpy()

markers_npy

array([['Ripple', 522.94921875, 0.0, 29],
       ['Ripple', 522.94921875, 0.0, 30],
       ['Ripple', 522.94921875, 0.0, 31],
       ...,
       ['Ripple', 62897.94921875, 0.0, 22],
       ['Ripple', 62897.94921875, 0.0, 23],
       ['Ripple', 62897.94921875, 0.0, 24]], dtype=object)

## Join all the markers data into a single structured datatype
We want to group the markers by channel, so the shape of the structure datatype = (num_channels, [events])

The structured datatype will have the following fields:
- `label`: the type of event that occurred (can be a combination of 'Spike', 'Ripple' and 'Fast-Ripple')
- `position`: the position of the marker in milliseconds
- `duration`: the duration of the event in milliseconds

The index of each row corresponds to the channel the event is related to

In [11]:
from utils.io import preview_np_array

# Create an empty np array to store the markers
# markers_arr = [ np.empty(shape=(1), dtype=[('label', 'U64'), ('position', np.float32), ('duration', np.float32)]) for _ in range(num_channels)]
markers_arr = np.empty(num_channels).tolist()
for idx in range(num_channels):
    markers_arr[idx] = [np.empty(shape=(0), dtype=[('label', 'U64'), ('position', np.float32), ('duration', np.float32)])]

print("markers_arr: ", markers_arr)

markers_arr:  [[array([],
      dtype=[('label', '<U64'), ('position', '<f4'), ('duration', '<f4')])], [array([],
      dtype=[('label', '<U64'), ('position', '<f4'), ('duration', '<f4')])], [array([],
      dtype=[('label', '<U64'), ('position', '<f4'), ('duration', '<f4')])], [array([],
      dtype=[('label', '<U64'), ('position', '<f4'), ('duration', '<f4')])], [array([],
      dtype=[('label', '<U64'), ('position', '<f4'), ('duration', '<f4')])], [array([],
      dtype=[('label', '<U64'), ('position', '<f4'), ('duration', '<f4')])], [array([],
      dtype=[('label', '<U64'), ('position', '<f4'), ('duration', '<f4')])], [array([],
      dtype=[('label', '<U64'), ('position', '<f4'), ('duration', '<f4')])], [array([],
      dtype=[('label', '<U64'), ('position', '<f4'), ('duration', '<f4')])], [array([],
      dtype=[('label', '<U64'), ('position', '<f4'), ('duration', '<f4')])], [array([],
      dtype=[('label', '<U64'), ('position', '<f4'), ('duration', '<f4')])], [array([],
      

In [12]:
# Iterate all the marked events and add them to markers_arr
for rowIdx in range(len(markers_npy)):    
    currObj = markers_npy[rowIdx]
    channelIdx = currObj[3]

    currArr = markers_arr[channelIdx]
        
    currRow = np.array((currObj[0], currObj[1], currObj[2]), dtype=[('label', 'U64'), ('position', np.float32), ('duration', np.float32)])  # Create tuple (Label, Pos, Duration)
    # currArr.append(currRow)
    # Append the tuple to the numpy array
    currArr = np.append(currArr, currRow)
    
    # Update the array of the current channel    
    markers_arr[channelIdx] = currArr

print("Markers Array: ", markers_arr)

Markers Array:  [[array([],
      dtype=[('label', '<U64'), ('position', '<f4'), ('duration', '<f4')])], [array([],
      dtype=[('label', '<U64'), ('position', '<f4'), ('duration', '<f4')])], [array([],
      dtype=[('label', '<U64'), ('position', '<f4'), ('duration', '<f4')])], [array([],
      dtype=[('label', '<U64'), ('position', '<f4'), ('duration', '<f4')])], [array([],
      dtype=[('label', '<U64'), ('position', '<f4'), ('duration', '<f4')])], [array([],
      dtype=[('label', '<U64'), ('position', '<f4'), ('duration', '<f4')])], [array([],
      dtype=[('label', '<U64'), ('position', '<f4'), ('duration', '<f4')])], [array([],
      dtype=[('label', '<U64'), ('position', '<f4'), ('duration', '<f4')])], [array([],
      dtype=[('label', '<U64'), ('position', '<f4'), ('duration', '<f4')])], [array([],
      dtype=[('label', '<U64'), ('position', '<f4'), ('duration', '<f4')])], [array([],
      dtype=[('label', '<U64'), ('position', '<f4'), ('duration', '<f4')])], [array([],
    

###  Convert the array to a numpy array to allow writing to a .npy file

In [13]:
# Convert the 2D array to a 1D numpy array
final_markers_npy = np.array(markers_arr, dtype=object)

preview_np_array(final_markers_npy, "final_markers_npy")

final_markers_npy Shape: (86,).
Preview: [list([array([],
       dtype=[('label', '<U64'), ('position', '<f4'), ('duration', '<f4')])])
 list([array([],
       dtype=[('label', '<U64'), ('position', '<f4'), ('duration', '<f4')])])
 list([array([],
       dtype=[('label', '<U64'), ('position', '<f4'), ('duration', '<f4')])])
 list([array([],
       dtype=[('label', '<U64'), ('position', '<f4'), ('duration', '<f4')])])
 list([array([],
       dtype=[('label', '<U64'), ('position', '<f4'), ('duration', '<f4')])])
 ...
 list([array([],
       dtype=[('label', '<U64'), ('position', '<f4'), ('duration', '<f4')])])
 list([array([],
       dtype=[('label', '<U64'), ('position', '<f4'), ('duration', '<f4')])])
 list([array([],
       dtype=[('label', '<U64'), ('position', '<f4'), ('duration', '<f4')])])
 list([array([],
       dtype=[('label', '<U64'), ('position', '<f4'), ('duration', '<f4')])])
 list([array([],
       dtype=[('label', '<U64'), ('position', '<f4'), ('duration', '<f4')])])]


### Write the processed markers into a .npy file

In [14]:
file_name = f"{PATH_TO_FILE}seeg_{PATIENT_LABEL}_markers.npy"

EXPORT_MARKERS = True
if EXPORT_MARKERS:
    np.save(file_name, final_markers_npy)   # Save the data to a numpy file (not stored in git due to size)

## See what is the average number of annotated events per channel

In [15]:
ripple_counter = 0
fr_counter = 0

for marker in markers_npy:
    curr_label = marker[0]
    curr_pos = marker[1]

    if curr_label == 'Ripple':
        ripple_counter += 1
    elif curr_label == 'Fast Ripple':
        fr_counter += 1
    else:
        print(f"Unknown label: {curr_label}")

print(f"Ripple Count: {ripple_counter}")
print(f"Fast Ripple Count: {fr_counter}")

Ripple Count: 923
Fast Ripple Count: 410


In [16]:
# Ripple Frequency per minute and per channel
ripple_freq = ripple_counter / ((input_duration / 60) * num_channels)
fr_freq = fr_counter / ((input_duration / 60) * num_channels)

print(f"Ripple Frequency: {ripple_freq} ripples / minute / channel")
print(f"Fast Ripple Frequency: {fr_freq} fast ripples / minute / channel")

Ripple Frequency: 10.20447963993877 ripples / minute / channel
Fast Ripple Frequency: 4.532867445693278 fast ripples / minute / channel


### Find the channels with the most annotated events

In [17]:
# Build a dictionary with the channel idx and the number of annotated events
channel_event_count = {}
for idx in range(len(final_markers_npy)):
    channel_event_count[idx] = len(final_markers_npy[idx])
print("Channel Event Count: ", channel_event_count)

# Sort the dictionary by the number of events
sorted_channel_event_count = dict(sorted(channel_event_count.items(), key=lambda item: item[1], reverse=True))
print("Sorted Channel Event Count: ", sorted_channel_event_count)
# {ch_idx: num_events}

Channel Event Count:  {0: 1, 1: 1, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1, 7: 1, 8: 1, 9: 1, 10: 1, 11: 1, 12: 1, 13: 1, 14: 1, 15: 1, 16: 1, 17: 1, 18: 1, 19: 1, 20: 1, 21: 1, 22: 33, 23: 33, 24: 33, 25: 1, 26: 1, 27: 1, 28: 1, 29: 130, 30: 130, 31: 130, 32: 128, 33: 1, 34: 1, 35: 1, 36: 99, 37: 99, 38: 1, 39: 1, 40: 1, 41: 1, 42: 1, 43: 1, 44: 1, 45: 1, 46: 1, 47: 1, 48: 170, 49: 170, 50: 170, 51: 1, 52: 1, 53: 1, 54: 1, 55: 1, 56: 1, 57: 1, 58: 1, 59: 1, 60: 1, 61: 1, 62: 1, 63: 1, 64: 2, 65: 2, 66: 2, 67: 1, 68: 1, 69: 1, 70: 1, 71: 1, 72: 1, 73: 1, 74: 1, 75: 1, 76: 1, 77: 1, 78: 1, 79: 1, 80: 1, 81: 1, 82: 1, 83: 1, 84: 1, 85: 1}
Sorted Channel Event Count:  {48: 170, 49: 170, 50: 170, 29: 130, 30: 130, 31: 130, 32: 128, 36: 99, 37: 99, 22: 33, 23: 33, 24: 33, 64: 2, 65: 2, 66: 2, 0: 1, 1: 1, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1, 7: 1, 8: 1, 9: 1, 10: 1, 11: 1, 12: 1, 13: 1, 14: 1, 15: 1, 16: 1, 17: 1, 18: 1, 19: 1, 20: 1, 21: 1, 25: 1, 26: 1, 27: 1, 28: 1, 33: 1, 34: 1, 35: 1, 38: 1, 39: 1, 40: 1

---

## SEEG Data

In [18]:
print(f"data: {data['data']}")

print(f"Shape of data: {data['data'].shape}")
# Shape of the data is (channels, samples)
# Nº of channels = 960 (Each channel represents a different electrode)
# Nº of samples = 245760

data: [[   1.0633698     1.8608971    -1.8608971    -2.1267433    -0.5316849
  ...    0.26584435    1.0633717     1.860899     -1.0633717
    -1.0633707 ]
 [  34.293705     35.888763     37.21797      36.952133     33.76202
  ...    0.5316849    -0.7975273     3.4559546     1.3292141
    -2.3925838 ]
 [  -5.3168535    -6.380224     -7.1777525    -6.9119096    -6.114382
  ...    5.582696      6.114381      1.860899      4.5193253
     6.380224  ]
 [  31.369436     32.964493     35.091232     31.635279     32.69865
  ...   34.027863     34.2937       35.62292      35.357075
    34.027863  ]
 [ -37.483818    -36.952133    -37.217976    -34.559547    -36.952133
  ... -123.88269    -123.351      -123.61685    -124.14853
  -122.553474  ]
 ...
 [  10.10202       7.443596      8.241123      7.975277      3.4559555
  ...   -8.241127    -10.899551    -11.962917    -11.431232
   -12.228764  ]
 [  43.066517     45.72494      45.72494      44.66157      48.649208
  ...   58.21955      61.941345    

### Shape of the data
The shape of the data is (n_channels, n_samples)
- Number of channels: 960
- Number of samples: 245760

The **data corresponds** to the recordings of the SEEG signals from 960 channels, acquiring a total of 245760 samples for each channel.

In [19]:
recorded_data = data['data']

Each value of the `recorded_data` is a float number that represents the amplitude of the signal at that specific time (voltage). The voltage is measured in millivolts (mV)?? TODO: Check units

# Change the structure of the data

Let's change the structure of the data to a 2D array that is ordered by time. This way, we can use the input data of various channels together by following the time order. 

Therefore, let's **transform the shape from (num_channels, num_samples) to (num_samples, num_channels)**. This way, each row will represent a time point and contains the voltage values of all channels at that time point. 

It is not necessary to specify the time of each row since there is a designated sampling rate of the input.

The structure is exemplified below, with a total of 245760 rows:

| Channel 1       | Channel 2     | Channel ...     | Channel 960   |
|-----------------|---------------|-----------------|---------------|
| 3               | 1             | 3               | 1             |
| 8               | 0             | 7               | 15            |
| ...             | ...           | ...             | ...           |
| 14              | 5             | 3               | 1             |


## Select the channels to be used
For the sake of simplicity, we can define a list of channels to be used.

In [20]:
# channels_used: set = {1, 2, 3, 4, 5, 6, 7, 8}
channels_used = set(range(1, num_channels+1, 1))

print(channels_used)

{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86}


In [21]:
ordered_recorded_data = recorded_data.T     # Swap the structure of the recorded_data to (num_samples, num_channels)

ordered_recorded_data.shape

(129239, 86)

# Write the processed data to a .npy file

Finally, we write the processed data to a .npy file. This way, we can use it in the Spiking Neural Networks (SNN) model.

The .npy file is a binary file that contains the processed data in a numpy array format. This format is easy to read and write, and it is compatible with the numpy library.

In [22]:
file_name = f"{PATH_TO_FILE}seeg_{PATIENT_LABEL}.npy"

EXPORT_SEEG = True
if EXPORT_SEEG:
    np.save(file_name, ordered_recorded_data)   # Save the data to a numpy file (not stored in git due to size)