In [73]:
import os
import polars as pl
import mne

In [74]:
ROOT_PATH = "/home/bobby/repos/latent-neural-dynamics-modeling"
DATA_PATH = os.path.join(ROOT_PATH, "data")

In [75]:
participants = pl.read_csv(
    os.path.join(DATA_PATH, "participants.tsv"), separator="\t", null_values="n/a"
)

In [76]:
def list_files(folder_path: str, root_: bool = False) -> list:
    if root_:
        return os.listdir(folder_path)
    else:
        return os.listdir(os.path.join(DATA_PATH, folder_path))

In [77]:
participants = participants.with_columns(
    pl.col("participant_id")
    .map_elements(lambda pid: list_files(pid), return_dtype=pl.List(pl.String))
    .alias("session")
).explode(pl.col("session"))

## iEEG

In [6]:
participants_ieeg = participants.with_columns(
    pl.concat_str(
        [
            pl.lit(DATA_PATH),
            pl.col("participant_id"),
            pl.col("session"),
            pl.lit("ieeg"),
        ],
        separator="/",
    ).alias("ieeg_path"),
)

In [7]:
participants_ieeg = participants_ieeg.with_columns(
    pl.col("ieeg_path")
    .map_elements(
        lambda ieeg_path: list_files(ieeg_path, root_=True),
        return_dtype=pl.List(pl.String),
    )
    .alias("ieeg_file")
).explode(pl.col("ieeg_file"))

In [8]:
participants_ieeg = (
    participants_ieeg.with_columns(
        pl.col("ieeg_file").str.split(by="_").alias("splitted_file")
    )
    .with_columns(
        pl.col("splitted_file").list.get(-1).str.split(".").list.get(0).alias("type"),
        pl.col("splitted_file")
        .list.get(-1)
        .str.split(".")
        .list.get(-1)
        .alias("data_format"),
        pl.col("splitted_file").list.get(-2).alias("run"),
    )
    .drop("splitted_file")
)

In [9]:
channel_metadata_schema = pl.List(
    pl.Struct(
        [
            pl.Field("name", pl.Utf8),
            pl.Field("type", pl.Utf8),
            pl.Field("units", pl.Utf8),
            pl.Field("low_cutoff", pl.Float64),
            pl.Field("high_cutoff", pl.Float64),
            pl.Field("sampling_frequency", pl.Float64),
        ]
    )
)

In [10]:
def read_csv_(row: dict[str, str]) -> pl.Series:
    df = pl.read_csv(
        os.path.join(row["ieeg_path"], row["ieeg_file"]),
        separator="\t",
        null_values="n/a",
    )
    return df.to_struct()

In [11]:
channels_df = participants_ieeg.filter(
    (pl.col("type") == "channels") & (pl.col("data_format") == "tsv")
).select(
    "participant_id",
    "session",
    "run",
    pl.struct(["ieeg_path", "ieeg_file"])
    .map_elements(read_csv_, return_dtype=channel_metadata_schema)
    .alias("channels_info"),
)

In [12]:
participants_ieeg = participants_ieeg.join(
    channels_df, on=["participant_id", "session", "run"], how="left"
).filter(~((pl.col("type") == "channels") & (pl.col("data_format") == "tsv")))

In [13]:
events_schema = pl.List(
    pl.Struct(
        [
            pl.Field("onset", pl.Float64),
            pl.Field("duration", pl.Float64),
            pl.Field("trial_tyoe", pl.Float64),
            pl.Field("value", pl.Int64),
            pl.Field("sample", pl.Int64),
        ]
    )
)

In [14]:
events_df = participants_ieeg.filter(
    (pl.col("type") == "events") & (pl.col("data_format") == "tsv")
).select(
    "participant_id",
    "session",
    "run",
    pl.struct(["ieeg_path", "ieeg_file"])
    .map_elements(read_csv_, return_dtype=events_schema)
    .alias("events"),
)

In [15]:
participants_ieeg = participants_ieeg.join(
    events_df, on=["participant_id", "session", "run"], how="left"
).filter(~((pl.col("type") == "events") & (pl.col("data_format") == "tsv")))

In [16]:
participants_ieeg = participants_ieeg.filter(~(pl.col("data_format") == "json"))

In [17]:
recording_schema = pl.Struct(
    [
        pl.Field(
            "header",
            pl.Struct(
                [
                    pl.Field("n_channels", pl.Int64),
                    pl.Field("ch_names", pl.List(pl.Utf8)),
                    pl.Field("sfreq", pl.Float64),
                ]
            ),
        ),
        pl.Field(
            "markers",
            pl.List(
                pl.Struct(
                    [  # A DataFrame is a list of structs
                        pl.Field("sample", pl.Int64),
                        pl.Field("onset_sec", pl.Float64),
                        pl.Field("description", pl.Utf8),
                    ]
                )
            ),
        ),
    ]
)

In [18]:
def parse_brainvision_recording(row: dict[str, str]) -> dict | None:

    vhdr_path = os.path.join(row["ieeg_path"], row["ieeg_file"])

    if not os.path.exists(vhdr_path):
        return None

    raw = mne.io.read_raw_brainvision(vhdr_path, preload=False, verbose=False)

    header_info = {
        "n_channels": len(raw.ch_names),
        "ch_names": raw.ch_names,
        "sfreq": raw.info["sfreq"],
    }

    marker_df = None
    try:
        events, event_id = mne.events_from_annotations(raw, verbose=False)
        if len(events) > 0:
            id_to_desc = {v: k for k, v in event_id.items()}
            marker_df = pl.DataFrame(
                {
                    "sample": events[:, 0],
                    "onset_sec": events[:, 0] / raw.info["sfreq"],
                    "description": [id_to_desc[code] for code in events[:, 2]],
                }
            ).to_dicts()
    except ValueError:
        pass

    return {
        "header": header_info,
        "markers": marker_df,
    }

In [19]:
headers_markers_df = participants_ieeg.filter(
    (pl.col("type") == "ieeg") & (pl.col("data_format") == "vhdr")
).select(
    "participant_id",
    "session",
    "run",
    pl.struct(["ieeg_path", "ieeg_file"])
    .map_elements(parse_brainvision_recording, return_dtype=recording_schema)
    .alias("recording_meta"),
    pl.col("ieeg_file").alias("ieeg_headers_file"),
)

In [20]:
participants_ieeg = participants_ieeg.join(
    headers_markers_df, on=["participant_id", "session", "run"], how="left"
).filter(~((pl.col("type") == "ieeg") & ~(pl.col("data_format") == "eeg")))

In [21]:
participants_ieeg = participants_ieeg.drop(
    "type", "data_format", "channels_info_right", strict=False
)

In [70]:
def band_pass_resample(ieeg_headers_file: str) -> str | None:
    sfreq = 1000
    low_freq = 3
    high_freq = 100
    os.makedirs("./resampled", exist_ok=True)
    ieeg_file = f"./resampled/{ieeg_headers_file.split('/')[-1].split('.')[0]}.fif"

    if not os.path.exists(ieeg_headers_file):
        return f"no {ieeg_headers_file}"

    try:
        raw = mne.io.read_raw_brainvision(
            ieeg_headers_file, preload=True, verbose=False
        )

        raw.notch_filter(freqs=[50, 100], verbose=False)
        raw.filter(l_freq=low_freq, h_freq=high_freq)
        raw.resample(sfreq=sfreq, verbose=False)

        raw.save(ieeg_file, overwrite=True, verbose=False)

        return "SAVED & RESAMPLED"
    except Exception as e:
        return "ERROR " + str(e)

In [71]:
participants_ieeg = participants_ieeg.with_columns(
    pl.concat_str(pl.col("ieeg_path"), pl.col("ieeg_headers_file"), separator="/")
    .map_elements(band_pass_resample, return_dtype=pl.String)
    .alias("saved")
)

In [72]:
participants_ieeg["saved"][0]

'SAVED & RESAMPLED'

In [39]:
participants_ieeg.write_parquet(
    "./participants_ieeg", partition_by=["participant_id", "session"]
)

In [55]:
participants_ieeg = pl.read_parquet("./participants_ieeg")

In [41]:
participants_ieeg

participant_id,age,sex,hand,weight,height,session,ieeg_path,ieeg_file,run,channels_info,events,recording_meta,ieeg_headers_file
str,str,str,str,str,str,str,str,str,str,list[struct[6]],list[struct[5]],struct[2],str
"""sub-PDI4""",,,,,,"""ses-3""","""/home/bobby/repos/latent-neura…","""sub-PDI4_ses-3_task-copydraw_r…","""run-7""","[{""LFP_1"",""DBS"",""V"",0.0,150.0,300.0}, {""LFP_2"",""DBS"",""V"",0.0,150.0,300.0}, … {""EOG_4"",""EOG"",""V"",0.0,150.0,300.0}]","[{6.91,1.652091,null,25,2073}, {8.563333,9.126364,null,1,2569}, … {200.97,17.894227,null,15,60291}]","{{24,[""LFP_1"", ""LFP_2"", … ""EOG_4""],300.0},[{2073,6.91,""Stimulus/S 25""}, {2569,8.563333,""Stimulus/S 1""}, … {60291,200.97,""Stimulus/S 15""}]}","""sub-PDI4_ses-3_task-copydraw_r…"
"""sub-PDI4""",,,,,,"""ses-3""","""/home/bobby/repos/latent-neura…","""sub-PDI4_ses-3_task-copydraw_r…","""run-11""","[{""LFP_1"",""DBS"",""V"",0.0,150.0,300.0}, {""LFP_2"",""DBS"",""V"",0.0,150.0,300.0}, … {""EOG_4"",""EOG"",""V"",0.0,150.0,300.0}]","[{25.2,3.1705,null,25,7560}, {28.37,9.000364,null,1,8511}, … {222.836667,18.552,null,15,66851}]","{{24,[""LFP_1"", ""LFP_2"", … ""EOG_4""],300.0},[{7560,25.2,""Stimulus/S 25""}, {8511,28.37,""Stimulus/S 1""}, … {66851,222.836667,""Stimulus/S 15""}]}","""sub-PDI4_ses-3_task-copydraw_r…"
"""sub-PDI4""",,,,,,"""ses-3""","""/home/bobby/repos/latent-neura…","""sub-PDI4_ses-3_task-copydraw_r…","""run-3""","[{""LFP_1"",""DBS"",""V"",0.0,150.0,300.0}, {""LFP_2"",""DBS"",""V"",0.0,150.0,300.0}, … {""EOG_4"",""EOG"",""V"",0.0,150.0,300.0}]","[{7.226667,4.922045,null,25,2168}, {12.15,9.118227,null,1,3645}, … {193.126667,56.071045,null,15,57938}]","{{24,[""LFP_1"", ""LFP_2"", … ""EOG_4""],300.0},[{2168,7.226667,""Stimulus/S 25""}, {3645,12.15,""Stimulus/S 1""}, … {57938,193.126667,""Stimulus/S 15""}]}","""sub-PDI4_ses-3_task-copydraw_r…"
"""sub-PDI4""",,,,,,"""ses-3""","""/home/bobby/repos/latent-neura…","""sub-PDI4_ses-3_task-copydraw_r…","""run-12""","[{""LFP_1"",""DBS"",""V"",0.0,150.0,300.0}, {""LFP_2"",""DBS"",""V"",0.0,150.0,300.0}, … {""EOG_4"",""EOG"",""V"",0.0,150.0,300.0}]","[{10.873333,2.071727,null,25,3262}, {12.946667,8.972773,null,1,3884}, … {218.49,175.53,null,15,65547}]","{{24,[""LFP_1"", ""LFP_2"", … ""EOG_4""],300.0},[{3262,10.873333,""Stimulus/S 25""}, {3884,12.946667,""Stimulus/S 1""}, … {65547,218.49,""Stimulus/S 15""}]}","""sub-PDI4_ses-3_task-copydraw_r…"
"""sub-PDI4""",,,,,,"""ses-3""","""/home/bobby/repos/latent-neura…","""sub-PDI4_ses-3_task-copydraw_r…","""run-2""","[{""LFP_1"",""DBS"",""V"",0.0,150.0,300.0}, {""LFP_2"",""DBS"",""V"",0.0,150.0,300.0}, … {""EOG_4"",""EOG"",""V"",0.0,150.0,300.0}]","[{8.006667,1.752273,null,25,2402}, {9.756667,9.015818,null,1,2927}, … {185.78,32.906273,null,15,55734}]","{{24,[""LFP_1"", ""LFP_2"", … ""EOG_4""],300.0},[{2402,8.006667,""Stimulus/S 25""}, {2927,9.756667,""Stimulus/S 1""}, … {55734,185.78,""Stimulus/S 15""}]}","""sub-PDI4_ses-3_task-copydraw_r…"
"""sub-PDI4""",,,,,,"""ses-3""","""/home/bobby/repos/latent-neura…","""sub-PDI4_ses-3_task-copydraw_r…","""run-10""","[{""LFP_1"",""DBS"",""V"",0.0,150.0,300.0}, {""LFP_2"",""DBS"",""V"",0.0,150.0,300.0}, … {""EOG_4"",""EOG"",""V"",0.0,150.0,300.0}]","[{10.396667,2.184591,null,25,3119}, {12.58,8.995864,null,1,3774}, … {204.013333,244.918636,null,15,61204}]","{{24,[""LFP_1"", ""LFP_2"", … ""EOG_4""],300.0},[{3119,10.396667,""Stimulus/S 25""}, {3774,12.58,""Stimulus/S 1""}, … {61204,204.013333,""Stimulus/S 15""}]}","""sub-PDI4_ses-3_task-copydraw_r…"
"""sub-PDI4""",,,,,,"""ses-3""","""/home/bobby/repos/latent-neura…","""sub-PDI4_ses-3_task-copydraw_r…","""run-4""","[{""LFP_1"",""DBS"",""V"",0.0,150.0,300.0}, {""LFP_2"",""DBS"",""V"",0.0,150.0,300.0}, … {""EOG_4"",""EOG"",""V"",0.0,150.0,300.0}]","[{6.033333,2.190818,null,25,1810}, {8.223333,9.112364,null,1,2467}, … {207.183333,29.352909,null,15,62155}]","{{24,[""LFP_1"", ""LFP_2"", … ""EOG_4""],300.0},[{1810,6.033333,""Stimulus/S 25""}, {2467,8.223333,""Stimulus/S 1""}, … {62155,207.183333,""Stimulus/S 15""}]}","""sub-PDI4_ses-3_task-copydraw_r…"
"""sub-PDI4""",,,,,,"""ses-3""","""/home/bobby/repos/latent-neura…","""sub-PDI4_ses-3_task-copydraw_r…","""run-5""","[{""LFP_1"",""DBS"",""V"",0.0,150.0,300.0}, {""LFP_2"",""DBS"",""V"",0.0,150.0,300.0}, … {""EOG_4"",""EOG"",""V"",0.0,150.0,300.0}]","[{8.11,4.156818,null,25,2433}, {12.266667,9.090409,null,1,3680}, … {199.966667,24.289955,null,15,59990}]","{{24,[""LFP_1"", ""LFP_2"", … ""EOG_4""],300.0},[{2433,8.11,""Stimulus/S 25""}, {3680,12.266667,""Stimulus/S 1""}, … {59990,199.966667,""Stimulus/S 15""}]}","""sub-PDI4_ses-3_task-copydraw_r…"
"""sub-PDI4""",,,,,,"""ses-3""","""/home/bobby/repos/latent-neura…","""sub-PDI4_ses-3_task-copydraw_r…","""run-8""","[{""LFP_1"",""DBS"",""V"",0.0,150.0,300.0}, {""LFP_2"",""DBS"",""V"",0.0,150.0,300.0}, … {""EOG_4"",""EOG"",""V"",0.0,150.0,300.0}]","[{9.426667,1.755,null,25,2828}, {11.183333,9.032591,null,1,3355}, … {207.833333,23.003455,null,15,62350}]","{{24,[""LFP_1"", ""LFP_2"", … ""EOG_4""],300.0},[{2828,9.426667,""Stimulus/S 25""}, {3355,11.183333,""Stimulus/S 1""}, … {62350,207.833333,""Stimulus/S 15""}]}","""sub-PDI4_ses-3_task-copydraw_r…"
"""sub-PDI4""",,,,,,"""ses-3""","""/home/bobby/repos/latent-neura…","""sub-PDI4_ses-3_task-copydraw_r…","""run-6""","[{""LFP_1"",""DBS"",""V"",0.0,150.0,300.0}, {""LFP_2"",""DBS"",""V"",0.0,150.0,300.0}, … {""EOG_4"",""EOG"",""V"",0.0,150.0,300.0}]","[{12.356667,3.842682,null,25,3707}, {16.2,9.009409,null,1,4860}, … {203.396667,30.345227,null,15,61019}]","{{24,[""LFP_1"", ""LFP_2"", … ""EOG_4""],300.0},[{3707,12.356667,""Stimulus/S 25""}, {4860,16.2,""Stimulus/S 1""}, … {61019,203.396667,""Stimulus/S 15""}]}","""sub-PDI4_ses-3_task-copydraw_r…"


## Motion

In [78]:
participants_motion = participants.with_columns(
    pl.concat_str(
        [
            pl.lit(DATA_PATH),
            pl.col("participant_id"),
            pl.col("session"),
            pl.lit("motion"),
        ],
        separator="/",
    ).alias("motion_path"),
)

In [79]:
participants_motion = participants_motion.with_columns(
    pl.col("motion_path")
    .map_elements(
        lambda motion_path: list_files(motion_path, root_=True),
        return_dtype=pl.List(pl.String),
    )
    .alias("motion_file")
).explode(pl.col("motion_file"))

In [81]:
participants_motion = (
    participants_motion.with_columns(
        pl.col("motion_file").str.split(by="_").alias("splitted_file")
    )
    .with_columns(
        pl.col("splitted_file")
        .list.get(-1)
        .str.split(".")
        .list.get(-1)
        .alias("data_format"),
        pl.col("splitted_file").list.get(-3).alias("chunk"),
        pl.col("splitted_file").list.get(-4).alias("run"),
    )
    .drop("splitted_file")
)

In [85]:
participants_motion = participants_motion.filter(pl.col("data_format") != "json").drop("data_format")

In [86]:
participants_motion

participant_id,age,sex,hand,weight,height,session,motion_path,motion_file,chunk,run
str,str,str,str,str,str,str,str,str,str,str
"""sub-PDI4""",,,,,,"""ses-3""","""/home/bobby/repos/latent-neura…","""sub-PDI4_ses-3_task-copydraw_r…","""chunk-09""","""run-03"""
"""sub-PDI4""",,,,,,"""ses-3""","""/home/bobby/repos/latent-neura…","""sub-PDI4_ses-3_task-copydraw_r…","""chunk-04""","""run-09"""
"""sub-PDI4""",,,,,,"""ses-3""","""/home/bobby/repos/latent-neura…","""sub-PDI4_ses-3_task-copydraw_r…","""chunk-05""","""run-05"""
"""sub-PDI4""",,,,,,"""ses-3""","""/home/bobby/repos/latent-neura…","""sub-PDI4_ses-3_task-copydraw_r…","""chunk-03""","""run-07"""
"""sub-PDI4""",,,,,,"""ses-3""","""/home/bobby/repos/latent-neura…","""sub-PDI4_ses-3_task-copydraw_r…","""chunk-11""","""run-01"""
…,…,…,…,…,…,…,…,…,…,…
"""sub-PDI4""",,,,,,"""ses-3""","""/home/bobby/repos/latent-neura…","""sub-PDI4_ses-3_task-copydraw_r…","""chunk-02""","""run-05"""
"""sub-PDI4""",,,,,,"""ses-3""","""/home/bobby/repos/latent-neura…","""sub-PDI4_ses-3_task-copydraw_r…","""chunk-11""","""run-01"""
"""sub-PDI4""",,,,,,"""ses-3""","""/home/bobby/repos/latent-neura…","""sub-PDI4_ses-3_task-copydraw_r…","""chunk-06""","""run-11"""
"""sub-PDI4""",,,,,,"""ses-3""","""/home/bobby/repos/latent-neura…","""sub-PDI4_ses-3_task-copydraw_r…","""chunk-11""","""run-07"""
