In [4]:
import pandas as pd
import numpy as np
import requests
from typing import List
from sklearn.model_selection import train_test_split

In [None]:
# Download the original dataset from the UCI archive and store the response
# bytes unmodified under data/raw, where later cells read it from disk.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00388/data.csv"

# A timeout keeps the cell from hanging forever on a stalled connection.
r = requests.get(url, timeout=60)
# Fail loudly on 4xx/5xx responses; without this an HTML error page would be
# written to data.csv and only surface later as a confusing parsing problem.
r.raise_for_status()

with open("../../data/raw/data.csv", 'wb') as f:
    f.write(r.content)

In [None]:
def assign_split_col(df: pd.DataFrame, col: str, name_list: List[str], pat: str=None):
    """Split a string column into pieces and attach them as new named columns.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; it is not modified.
    col : str
        Name of the string column to split.
    name_list : List[str]
        Names for the resulting pieces, in left-to-right order. Names and
        pieces are paired positionally; whichever side is longer has its
        surplus entries ignored.
    pat : str, optional
        Separator forwarded to ``Series.str.split``. ``None`` (the default)
        leaves the choice of separator to pandas.

    Returns
    -------
    pd.DataFrame
        A new frame holding the original columns plus one column per
        (name, piece) pair.
    """
    pieces = df[col].str.split(pat, expand=True)

    # zip() stops at the shorter of the two sequences, so unmatched names or
    # unmatched pieces are simply left out.
    new_columns = {
        name: pieces.iloc[:, position]
        for position, name in zip(range(pieces.shape[1]), name_list)
    }

    return df.copy().assign(**new_columns)

In [None]:
# Load the raw CSV saved by the download cell. No index column is specified,
# so the leading ID column is expected to come through as a regular column
# (referenced later as "Unnamed: 0").
df = pd.read_csv(filepath_or_buffer="../../data/raw/data.csv")

In [None]:
# Split the composite ID column ("Unnamed: 0") into separate variables, then
# recode the outcome into Seizure and non-Seizure: every label above 1 is
# replaced by 0, other labels are left as they are.
edited_df = (
    assign_split_col(
        df,
        col="Unnamed: 0",
        name_list=["nth_chunk", "File", "Participant"],
        pat=".",
    )
    # Keep only rows whose ID yielded a third ("Participant") piece.
    .loc[lambda frame: frame["Participant"].notna()]
    .sort_values(by="nth_chunk")
    # Of the ID-derived columns, only "Participant" is carried forward.
    .drop(columns=["Unnamed: 0", "nth_chunk", "File"])
    .assign(y=lambda frame: frame["y"].mask(frame["y"] > 1, 0))
)

# 70/30 split with a fixed seed, stratified on the (Participant, y) pair.
train, validation = train_test_split(
    edited_df,
    stratify=edited_df[["Participant", "y"]],
    test_size=0.3,
    random_state=0,
)

# Persist both partitions without writing the row index.
train.to_csv("../../data/interim/train.csv", index=False)
validation.to_csv("../../data/interim/validation.csv", index=False)