# Preprocess EEG data for temporal embedding

In [2]:
# import necessary libraries
import pickle as pk
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import ripser
import seaborn as sb
from rich import inspect, pretty, print
from rich.progress import track
from sklearn.metrics import pairwise_distances
from tda import (downsample, persistent_homology_loop,
                 topological_features_loop, trim, windowing)
from tqdm import tqdm

# Turn on rich's pretty-printing for values echoed by the notebook.
pretty.install()

# Seaborn theme applied to any figure drawn in this notebook.
sb.set_theme(context="paper", style="ticks")

# Project folders. They stay plain strings with a trailing slash because
# file names are appended to them directly inside f-strings.
data_dir = "../data/preprocessing/"
data_out_dir = "../data/functional_connectivity/"
results_dir = "../results/functional_connectivity/"

# Create every folder (including missing parents); existing ones are left as is.
for _folder in (results_dir, data_dir, data_out_dir):
    Path(_folder).mkdir(parents=True, exist_ok=True)

## Load data

In [3]:
# Load the per-condition EEG dictionary pickled under data_dir.
# (An earlier version read '../data/sixconditiondata.pkl' instead.)
# The with-block closes the file handle as soon as unpickling is done.
# NOTE(review): unpickling can execute arbitrary code -- only load files
# that come from a trusted source.
with open(f"{data_dir}EEG_per_condition_dict.pkl", "rb") as pkl_file:
    cropped6_data = pk.load(pkl_file)

## Divide into windows (and downsample if needed)

In [50]:
n_windows = 1  # how many equally-sized windows each condition is divided into
skip = 25  # downsampling factor applied to the input data

In [51]:
# Build one record per (subject, condition, window).
# prep_data maps "<subject>_<condition>_<window index>" to that record.
prep_data = {}

for key in cropped6_data:
    entry = cropped6_data[key]  # look the input record up once, not per field
    group = entry["group"]
    subject = entry["subject"]
    condition = entry["condition"]
    chs = entry["channels"]

    # Downsample first, then divide the result into n_windows windows.
    # (The imported `trim` helper is not applied here.)
    downsampled_series = downsample(entry["series"], skip=skip)
    windowed_series = windowing(downsampled_series, n_windows=n_windows)

    for i in range(n_windows):
        window_key = f"{subject}_{condition}_{i}"
        # NOTE(review): "file" holds the window key; the input record's own
        # "file" entry is not carried over -- confirm this is intended.
        # Key order below fixes the column order of the DataFrame built later.
        prep_data[window_key] = {
            "file": window_key,
            "group": group,
            "subject": subject,
            "condition": condition,
            "window": i,
            "series": windowed_series[i],
            "channels": chs,
        }

In [52]:
# Turn the nested dict into a table: every outer key becomes a row label
# and every inner key a column.
prep_data_df = pd.DataFrame.from_dict(data=prep_data, orient="index")

In [53]:
# Sanity check: shape of the series stored for one example window.
# A single .loc[row, column] lookup replaces the chained .loc[row][column].
prep_data_df.loc["07_ROE_0", "series"].shape

[1m([0m[1;36m16[0m, [1;36m999[0m[1m)[0m

In [54]:
# Echo the per-window table as the cell output for a visual check.
prep_data_df

Unnamed: 0,file,group,subject,condition,window,series,channels
01_ROE_0,01_ROE_0,high,01,ROE,0,"[[1.0079775810241698e-05, 6.223475456237792e-0...","[F7, F3, FZ, F4, F8, C3, C4, T7, CZ, T8, P3, P..."
01_RCE_0,01_RCE_0,high,01,RCE,0,"[[3.207792043685913e-06, 3.089810609817505e-06...","[F7, F3, FZ, F4, F8, C3, C4, T7, CZ, T8, P3, P..."
01_IND1_0,01_IND1_0,high,01,IND1,0,"[[-4.318517208099365e-06, -4.166403770446777e-...","[F7, F3, FZ, F4, F8, C3, C4, T7, CZ, T8, P3, P..."
01_IND2_0,01_IND2_0,high,01,IND2,0,"[[1.0585275888442992e-06, 2.9010703563690185e-...","[F7, F3, FZ, F4, F8, C3, C4, T7, CZ, T8, P3, P..."
01_NH_0,01_NH_0,high,01,NH,0,"[[-4.6376829147338864e-06, 9.220789074897765e-...","[F7, F3, FZ, F4, F8, C3, C4, T7, CZ, T8, P3, P..."
...,...,...,...,...,...,...,...
31_RCE_0,31_RCE_0,low,31,RCE,0,"[[3.0984611511230466e-06, 4.155732631683349e-0...","[F7, F3, FZ, F4, F8, C3, C4, T7, CZ, T8, P3, P..."
31_IND1_0,31_IND1_0,low,31,IND1,0,"[[8.021036148071288e-06, 3.1144559383392333e-0...","[F7, F3, FZ, F4, F8, C3, C4, T7, CZ, T8, P3, P..."
31_IND2_0,31_IND2_0,low,31,IND2,0,"[[7.2614860534667964e-06, 7.123832702636719e-0...","[F7, F3, FZ, F4, F8, C3, C4, T7, CZ, T8, P3, P..."
31_NH_0,31_NH_0,low,31,NH,0,"[[4.52498197555542e-08, 3.796555399894714e-07,...","[F7, F3, FZ, F4, F8, C3, C4, T7, CZ, T8, P3, P..."


## EEG data storage

In [55]:
# Pickle the per-window DataFrame under data_dir; the file name records the
# n_windows and skip values used to build it.
# The with-block ensures the data is flushed and the handle closed.
with open(
    f"{data_dir}EEG_{n_windows}wind_per_condition_skip_{skip}_df.pkl", "wb"
) as out_file:
    pk.dump(prep_data_df, out_file)

In [56]:
# prep_data_df.to_csv(f"{data_dir}EEG_{n_windows}wind_per_condition_skip_{skip}_df.csv", index=False)

In [111]:
# pk.dump(prep_data_df, open('../data/corrprepdatadf.pkl', 'wb'))