In [1]:
import numpy as np
import pandas as pd
from collections import defaultdict
from tqdm import tqdm

In [3]:

#contains all information related to patients 
path = './DataSet/'
metadata = pd.read_csv(path + 'tdcsfog_metadata.csv')
#select the off_medication condition, as the reference paper worked
off_medication = metadata[metadata['Medication'] == 'off']
#collect the information(subject_id) related to each patient
#convert subjects to list to get each one using the subject file 
subjects_list = off_medication['Subject'].unique().tolist()
subjects_list

#After that, we can access the data for each patient using Id column in metadata file
#so firstly we combine the id files for each patient
#key : paitent , values : list of his records!!

subjects_info = defaultdict(list)
for patient in subjects_list:
    bool_temp = off_medication["Subject"] == patient
    subjects_info[patient] = off_medication[bool_temp]["Id"].tolist()
    #print(patient)

subjects_info

defaultdict(list,
            {'231c3b': ['01d0fe7266',
              '0974bf9e9f',
              '32685c1cb7',
              '54ac78a467',
              '758ddee3f7',
              '8b7447aec8',
              '93dc071d54',
              'af012f2ceb',
              'c2c0d73c35',
              'db8f589217',
              'de52ef6ef4',
              'e9de6fd9b4'],
             '54ee6e': ['03c9d8dbfa',
              '04e10e0797',
              '39748207e6',
              'c296f8bc53',
              'e5b50f097a'],
             '242a3e': ['053e7a7261',
              '0d36278c02',
              '15219ff4dc',
              '5d320ade20',
              '6674377054',
              '6dc32464d9',
              'ab366d69a2',
              'bbe711ca75',
              'd63e3d09dc',
              'db0221f462',
              'e1e992e107',
              'e5d573a30f'],
             '4ca9b3': ['05e892544d',
              '1a302f432f',
              '310c0b04e3',
              '43dc843208',
              '

In [4]:
n  =2 
list(subjects_info.keys())[:n]

['231c3b', '54ee6e']

In [5]:
def Npatients_data(Parent_folder, n = 1): 
    # in this function, we try to attach the patient Id for each record in all visits
    #this methed helps alot to access the data using pandas functions which is very fast
    #then we combine all these files in one file for later usage
    subject_ids = list(subjects_info.keys())[:n]
    SubjectsVisitsList = []
    for subject in tqdm(subject_ids):
        #for each subject,we need to aggregate his visits!!
        #SubjectVisitList  = []
        for file in subjects_info[subject]:#
            visit_path = Parent_folder+f'{file}.csv'
            subjectVisit = pd.read_csv(visit_path)
            # let's attach the patient id for this readings
            subjectVisit['patientID'] = subject
            SubjectsVisitsList.append(subjectVisit )

    npatients_data = pd.concat(SubjectsVisitsList, axis = 0) 
    #saving this file for later usage!
    npatients_data.to_csv(Parent_folder+f'{n}_patients_data.csv')
    return npatients_data


In [6]:
#combine data for n patients: 
# in this code, we try to make one file .csv contained all informtion for selected patients
n = 37

Parent_folder = path + './Off_Medication_Data/'
data = Npatients_data(Parent_folder, n = n)

100%|██████████| 37/37 [00:04<00:00,  7.95it/s]


In [None]:


import pandas as pd
from tqdm import tqdm

def timeSeriesData(df,
                   window=128,
                   shift=16,
                   forecasting_period=128,
                   label_cols=('StartHesitation', 'Turn', 'Walking'),
                   parent_folder=None,
                   save_csv=True):
    """
    Prepare time-series data for forecasting.
    Input  = past `window` steps
    Output = next `forecasting_period` steps (multi-output forecasting)

    df must contain: patientID, AccV, AccML, AccAP, and the label columns.
    """

    subjects = []
    sequences = []
    labels = []

    # Loop per patient
    for subject_id, patient_data in tqdm(df.groupby('patientID'), desc="Processing Patients"):
        print(f"Patient {subject_id} samples: {len(patient_data)}")

        # Features
        features = patient_data[['AccV', 'AccML', 'AccAP']].values

        # Targets
        target = patient_data[list(label_cols)].values

        # Full window = past window + future window
        total_len = window + forecasting_period

        # Sliding window
        for i in range(0, len(features) - total_len + 1, shift):

            # Past input
            seq_x = features[i:i + window, :]

            # Future labels (forecast window)
            seq_y = target[i + window : i + window + forecasting_period, :]

            sequences.append(seq_x)
            labels.append(seq_y)
            subjects.append(subject_id)

        print(f"Total sequences so far: {len(sequences)}")

    # Create DataFrame
    Seq_df = pd.DataFrame({
        'sequence': sequences,
        'label': labels,
        'subject': subjects
    })

    # Convert arrays → list (for CSV)
    Seq_df_to_save = Seq_df.copy()
    Seq_df_to_save['sequence'] = Seq_df_to_save['sequence'].apply(lambda a: a.tolist())
    Seq_df_to_save['label'] = Seq_df_to_save['label'].apply(lambda a: a.tolist())

    # Save
    if save_csv:
        assert parent_folder is not None, "parent_folder must be given if save_csv=True"
        out_path = parent_folder + 'SequencesData.xlsx'
        print(f"Saving Excel file to: {out_path}")
        Seq_df_to_save.to_excel(out_path, index=False)

    return Seq_df, df



In [8]:
#df  = timeSeriesData(df =data ,window = 256,shift = 128, label_cols = ['StartHesitation', 'Turn' , 'Walking'], parent_folder= Parent_folder )

Seq_df, df = timeSeriesData(
    df=data,
    window=256,
    shift=128,
    forecasting_period=128,
    label_cols=['StartHesitation', 'Turn', 'Walking'],
    parent_folder=Parent_folder
)


Processing Patients: 100%|██████████| 37/37 [00:00<00:00, 104.86it/s]

Patient 07285e samples: 66588
Total sequences so far: 518
Patient 194d1d samples: 186621
Total sequences so far: 1973
Patient 220a17 samples: 65818
Total sequences so far: 2485
Patient 231c3b samples: 75847
Total sequences so far: 3075
Patient 242a3e samples: 83182
Total sequences so far: 3722
Patient 24a59d samples: 39367
Total sequences so far: 4027
Patient 251738 samples: 53234
Total sequences so far: 4440
Patient 2a39f8 samples: 18302
Total sequences so far: 4580
Patient 2c98f7 samples: 46390
Total sequences so far: 4940
Patient 2d57c2 samples: 483239
Total sequences so far: 8713
Patient 31d269 samples: 78851
Total sequences so far: 9327
Patient 364459 samples: 40180
Total sequences so far: 9638
Patient 3b2403 samples: 84052
Total sequences so far: 10292
Patient 3b2b7a samples: 118503
Total sequences so far: 11215
Patient 48fd62 samples: 24959
Total sequences so far: 11407
Patient 4b39ac samples: 75743
Total sequences so far: 11996
Patient 4ca9b3 samples: 100669
Total sequences so 




Saving Excel file to: ./DataSet/./Off_Medication_Data/SequencesData.xlsx


In [9]:
df 

Unnamed: 0,Time,AccV,AccML,AccAP,StartHesitation,Turn,Walking,patientID
0,0,-8.741742,0.447673,-3.782152,0,0,0,231c3b
1,1,-8.750978,0.447704,-3.761971,0,0,0,231c3b
2,2,-8.750704,0.443296,-3.759559,0,0,0,231c3b
3,3,-8.743981,0.445426,-3.750533,0,0,0,231c3b
4,4,-8.747818,0.441290,-3.738788,0,0,0,231c3b
...,...,...,...,...,...,...,...,...
62315,62315,-9.836408,-0.864804,-0.869158,0,0,0,194d1d
62316,62316,-9.841402,-0.840607,-0.867648,0,0,0,194d1d
62317,62317,-9.844992,-0.811945,-0.878778,0,0,0,194d1d
62318,62318,-9.842589,-0.814921,-0.875801,0,0,0,194d1d


In [10]:
# now based on the results csv file which created from (features) [window size of 3 features, for example 0-255] with (label)[ the three states of [start, turn , walking] which crossponding to the featues  0-255
# now let combine these three states into one label based on: 
# if any one in each state ==> 1 , else zero: 


In [11]:
Parent_folder = 'C:\\Users\\Student\\Desktop\\Abouhashem\\DeepLearningProject\\'
data  = pd.read_csv(Parent_folder  + 'SequencesData.csv')

In [12]:
data.head()

Unnamed: 0,sequence,label,subject
0,"[[-9.49957715154697, -0.168283209722647, -2.97...","[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [...",07285e
1,"[[-9.49797010567625, -0.16071738881562, -2.971...","[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [...",07285e
2,"[[-9.50505427008857, -0.153830715044685, -2.96...","[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [...",07285e
3,"[[-9.50463907478176, -0.170416978408638, -2.96...","[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [...",07285e
4,"[[-9.49976189338822, -0.168138558490196, -2.97...","[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [...",07285e


In [13]:
import pandas as pd
import numpy as np

import pandas as pd
import numpy as np
import ast  # for literal_eval

def combine_labels(csv_path , save_path=None):
    """
    Combine the three label channels [StartHesitation, Turn, Walking]
    into one binary label per timestep: 1 if any of the three is 1,
    otherwise 0.

    Assumes the CSV was written by `timeSeriesData`, where 'sequence'
    and 'label' columns are string representations of Python lists.

    Returns
    -------
    combined_df : pd.DataFrame
        'sequence' : np.array (T, 3)
        'label'    : np.array (T,)   # combined binary label per timestep
        'subject'
    """
    df = pd.read_csv(csv_path)

    # Turn back into numpy arrays
    df['sequence'] = df['sequence'].apply(
        lambda s: np.array(ast.literal_eval(s), dtype=float)
    )
    df['label'] = df['label'].apply(
        lambda s: np.array(ast.literal_eval(s), dtype=int)
    )

    combined_labels = []
    for lbl in df['label']:
        # lbl shape: (window, 3)
        combined = (lbl.sum(axis=1) > 0).astype(int)  # shape (window,)
        combined_labels.append(combined)

    combined_df = pd.DataFrame({
        'sequence': df['sequence'],
        'label': combined_labels,
        'subject': df['subject'],
    })

    if save_path is not None:
        # If you want the combined CSV to also be parseable later, save as lists
        save_df = combined_df.copy()
        save_df['sequence'] = save_df['sequence'].apply(lambda a: a.tolist())
        save_df['label']    = save_df['label'].apply(lambda a: a.tolist())
        save_df.to_csv(save_path, index=False)
        print(f"✅ Combined label CSV saved to: {save_path}")

    return combined_df


In [15]:
# 2) Later, combine labels from the saved CSV
combined_df = combine_labels(
    csv_path=Parent_folder + 'SequencesData.csv',
    save_path=Parent_folder + 'SequencesData_combined.csv'
)

✅ Combined label CSV saved to: C:\Users\Student\Desktop\Abouhashem\DeepLearningProject\SequencesData_combined.csv


In [16]:
import pandas as pd
import numpy as np
import ast  # <- important for parsing list-strings


def make_window_labels(
    csv_path,
    ratio=0.3,
    drop_hybrid=False,
    save_path=None
):
    """
    From the combined CSV (sequence + per-timestep binary labels),
    create a new dataset with ONE label per window based on the
    percentage of ones.

    Assumes the CSV was written by `combine_labels`, where 'sequence'
    and 'label' are stored as Python-list strings.

    Parameters
    ----------
    csv_path : str
        Path to the combined CSV (output of your previous combine step).
    ratio : float, default=0.3
        Threshold ratio. If (ones / window_len) > ratio -> window_label = 1,
        else window_label = 0.
    drop_hybrid : bool, default=False
        If True, windows that have BOTH 0s and 1s but
        (ones / window_len) < ratio are REMOVED from the dataset.
        (i.e., "hybrid" but not FoG-dominant windows).
    save_path : str or None
        If provided, save the resulting DataFrame to this CSV.

    Returns
    -------
    out_df : pd.DataFrame
        Columns:
            - 'sequence'      : (T, 3) accel data
            - 'frame_labels'  : (T,) per-timestep binary labels
            - 'window_label'  : scalar 0/1 per window
            - 'subject'       : patient ID
    """
    assert 0.0 <= ratio <= 1.0, "ratio must be between 0 and 1"

    # Load CSV
    df = pd.read_csv(csv_path)

    # Parse columns back to numpy arrays
    df['sequence'] = df['sequence'].apply(
        lambda s: np.array(ast.literal_eval(s), dtype=float)  # shape (T, 3)
    )
    df['label'] = df['label'].apply(
        lambda s: np.array(ast.literal_eval(s), dtype=int)    # shape (T,)
    )

    sequences = []
    frame_labels = []
    window_labels = []
    subjects = []

    dropped_hybrid = 0

    for seq, lbl, subj in zip(df['sequence'], df['label'], df['subject']):
        lbl = np.asarray(lbl, dtype=int)
        n = len(lbl)
        n_ones = int(lbl.sum())
        perc_ones = n_ones / n if n > 0 else 0.0

        # Hybrid window: mix of zeros and ones
        is_hybrid = (n_ones > 0) and (n_ones < n)

        # Optionally drop hybrid windows that are not FoG-dominant
        if drop_hybrid and is_hybrid and (perc_ones < ratio):
            dropped_hybrid += 1
            continue

        # Assign window-level label
        # "more than a specific ratio" -> >
        win_label = 1 if perc_ones > ratio else 0

        sequences.append(seq)
        frame_labels.append(lbl)
        window_labels.append(win_label)
        subjects.append(subj)

    out_df = pd.DataFrame({
        'sequence': sequences,
        'frame_labels': frame_labels,  # keep per-timestep labels
        'window_label': window_labels, # new scalar label per window
        'subject': subjects
    })

    if save_path is not None:
        # Save arrays as lists so they can be parsed again later
        save_df = out_df.copy()
        save_df['sequence']     = save_df['sequence'].apply(lambda a: a.tolist())
        save_df['frame_labels'] = save_df['frame_labels'].apply(lambda a: a.tolist())
        save_df.to_csv(save_path, index=False)
        print(f"✅ Saved window-level dataset to: {save_path}")
        print(f"   Dropped hybrid windows: {dropped_hybrid}")
    else:
        print(f"Processed windows: {len(out_df)} (dropped hybrid={dropped_hybrid})")

    return out_df


In [17]:
# 3) Turn each window into a single label based on ratio
window_df = make_window_labels(
    csv_path=Parent_folder + 'SequencesData_combined.csv',
    ratio=0.6,
    drop_hybrid=False,#True,  # or False
    save_path=Parent_folder + 'SequencesData_window_labels.csv'
)

✅ Saved window-level dataset to: C:\Users\Student\Desktop\Abouhashem\DeepLearningProject\SequencesData_window_labels.csv
   Dropped hybrid windows: 0


In [19]:
data = pd.read_csv(Parent_folder + "SequencesData_window_labels.csv")

In [20]:
data.head()

Unnamed: 0,sequence,frame_labels,window_label,subject
0,"[[-9.49957715154697, -0.168283209722647, -2.97...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,07285e
1,"[[-9.49797010567625, -0.16071738881562, -2.971...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,07285e
2,"[[-9.50505427008857, -0.153830715044685, -2.96...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,07285e
3,"[[-9.50463907478176, -0.170416978408638, -2.96...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,07285e
4,"[[-9.49976189338822, -0.168138558490196, -2.97...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,07285e


In [21]:
data.columns

Index(['sequence', 'frame_labels', 'window_label', 'subject'], dtype='object')

In [22]:
cols = ['sequence',  'window_label', 'subject']

In [23]:
data = data[cols]

In [24]:
data.head()

Unnamed: 0,sequence,window_label,subject
0,"[[-9.49957715154697, -0.168283209722647, -2.97...",0,07285e
1,"[[-9.49797010567625, -0.16071738881562, -2.971...",0,07285e
2,"[[-9.50505427008857, -0.153830715044685, -2.96...",0,07285e
3,"[[-9.50463907478176, -0.170416978408638, -2.96...",0,07285e
4,"[[-9.49976189338822, -0.168138558490196, -2.97...",0,07285e


In [25]:
data['sequence']

0        [[-9.49957715154697, -0.168283209722647, -2.97...
1        [[-9.49797010567625, -0.16071738881562, -2.971...
2        [[-9.50505427008857, -0.153830715044685, -2.96...
3        [[-9.50463907478176, -0.170416978408638, -2.96...
4        [[-9.49976189338822, -0.168138558490196, -2.97...
                               ...                        
21853    [[-9.53926391099992, -0.596079464888405, -1.02...
21854    [[-9.52602634238134, -0.573701769707145, -1.13...
21855    [[-9.52903192364183, -0.556135882998697, -1.05...
21856    [[-9.53834891763892, -0.549401295728269, -1.08...
21857    [[-9.52223145606167, -0.553883433255421, -1.16...
Name: sequence, Length: 21858, dtype: object

In [26]:
data['window_label']

0        0
1        0
2        0
3        0
4        0
        ..
21853    0
21854    0
21855    0
21856    0
21857    0
Name: window_label, Length: 21858, dtype: int64

In [27]:
len(data[ data['window_label'] == 1 ]) #FOG

8124

In [28]:
len(data[ data['window_label'] == 0 ])# Non FOG

13734

In [29]:
import pandas as pd
import numpy as np
import ast
from scipy.signal import butter, filtfilt

# 1) Design filters
def design_filters(fs=128.0):
    """
    Design the 2nd-order low-pass (15 Hz) and 3rd-order high-pass (0.2 Hz)
    Butterworth filters for sampling rate fs.
    """
    nyq = fs / 2.0

    # 2nd order low-pass at 15 Hz
    low_cut = 15.0 / nyq
    b_lp, a_lp = butter(N=2, Wn=low_cut, btype='low', analog=False)

    # 3rd order high-pass at 0.2 Hz
    high_cut = 0.2 / nyq
    b_hp, a_hp = butter(N=3, Wn=high_cut, btype='high', analog=False)

    return (b_lp, a_lp), (b_hp, a_hp)


# 2) Helper to parse 'sequence' column back to a numpy array
def parse_sequence(seq_obj):
    """
    Turn a 'sequence' cell into a numpy array of shape (T, 3).
    Handles both:
      - string representation like "[[...],[...],...]"
      - already-loaded Python lists/arrays.
    """
    if isinstance(seq_obj, np.ndarray):
        arr = seq_obj.astype(float)
    elif isinstance(seq_obj, list):
        arr = np.asarray(seq_obj, dtype=float)
    else:
        # assume string from CSV
        arr = np.array(ast.literal_eval(seq_obj), dtype=float)

    # ensure 2D: (T, 3)
    if arr.ndim == 1:
        arr = arr.reshape(-1, 1)
    return arr


# 3) Apply bandpass (low-pass + high-pass) to every window
def apply_filters_to_sequences(df, fs=128.0, save_path=None):
    """
    Apply 15 Hz low-pass (2nd order) then 0.2 Hz high-pass (3rd order)
    Butterworth filters to each window's 3-axis accelerometer signal.
    
    df must have:
        - 'sequence'      : sequence (T, 3) stored as list/array/string
        - 'window_label'  : 0/1 per window
        - 'subject'       : patient ID

    Returns a new DataFrame with filtered sequences.
    """
    (b_lp, a_lp), (b_hp, a_hp) = design_filters(fs)

    df_out = df.copy()

    def _filter_one(seq_obj):
        x = parse_sequence(seq_obj)           # shape (T, 3)
        # Low-pass then high-pass along time axis
        x_lp = filtfilt(b_lp, a_lp, x, axis=0)
        x_bp = filtfilt(b_hp, a_hp, x_lp, axis=0)
        return x_bp

    df_out['sequence'] = df_out['sequence'].apply(_filter_one)

    if save_path is not None:
        # convert arrays to lists so we can save to CSV
        df_save = df_out.copy()
        df_save['sequence'] = df_save['sequence'].apply(lambda a: a.tolist())
        df_save.to_csv(save_path, index=False)
        print(f"✅ Saved filtered sequences to: {save_path}")

    return df_out


In [30]:
# Apply the filters with fs=128 Hz
df_filtered = apply_filters_to_sequences(
    data,
    fs=128.0,
    save_path=Parent_folder + "SequencesData_window_labels_filtered.csv"
)

✅ Saved filtered sequences to: C:\Users\Student\Desktop\Abouhashem\DeepLearningProject\SequencesData_window_labels_filtered.csv


In [31]:
seq = df_filtered['sequence'][0]  # shape (128, 3)

In [32]:
seq.shape

(256, 3)

In [33]:
seq 

array([[-1.60873095e-03,  2.36066583e-03,  3.02996931e-04],
       [-2.07691711e-03,  4.55961056e-03, -1.71824418e-03],
       [-1.99658327e-03,  5.99450534e-03, -3.61723713e-03],
       [-1.41327135e-03,  6.21116340e-03, -5.03638134e-03],
       [-7.71752851e-04,  5.18821597e-03, -5.59652059e-03],
       [-4.19607831e-04,  3.42968980e-03, -5.21846236e-03],
       [-4.11511338e-04,  1.82970486e-03, -4.20787630e-03],
       [-6.61825654e-04,  1.07102962e-03, -3.14949704e-03],
       [-1.17359396e-03,  1.15137138e-03, -2.61820341e-03],
       [-1.93308631e-03,  1.65183814e-03, -2.84224337e-03],
       [-2.59861904e-03,  2.32647300e-03, -3.71016887e-03],
       [-2.66961712e-03,  3.29101740e-03, -5.12422287e-03],
       [-2.05763284e-03,  4.74699659e-03, -7.06510613e-03],
       [-1.19043408e-03,  6.61548693e-03, -9.24332928e-03],
       [-4.92743680e-04,  8.46220663e-03, -1.11126181e-02],
       [ 2.42297043e-05,  9.76367530e-03, -1.23287986e-02],
       [ 5.98618815e-04,  1.02086133e-02

In [34]:
df_filtered.head()

Unnamed: 0,sequence,window_label,subject
0,"[[-0.0016087309472044842, 0.002360665832874116...",0,07285e
1,"[[-0.002828209376519699, -0.001741191604220398...",0,07285e
2,"[[-0.0009826247031032818, -0.00563225855124668...",0,07285e
3,"[[0.000900899003014051, 0.0009491892832060866,...",0,07285e
4,"[[0.0004069949071238669, 0.002595688129624597,...",0,07285e


In [35]:
len(df_filtered['subject'].value_counts())

37

In [36]:
len(df_filtered['subject'])

21858

In [37]:
###Dividing the data into training and testing based on Patient indpentendent" 
import pandas as pd
import numpy as np

def split_patient_independent(
    df,
    test_ratio=0.1,
    random_state=42,
    save_prefix=None
):
    """
    Split window-level data into train/test in a patient-independent way.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain columns:
            - 'sequence'     : (T, 3) window data (array or list)
            - 'window_label' : scalar 0/1 per window
            - 'subject'      : patient ID
    test_ratio : float, default=0.1
        Target fraction of windows in the test set (e.g., 0.1 = 10%).
    random_state : int
        Seed for reproducible shuffling of subjects.
    save_prefix : str or None
        If given, will save:
            f"{save_prefix}_train.csv"
            f"{save_prefix}_test.csv"

    Returns
    -------
    train_df, test_df : (pd.DataFrame, pd.DataFrame)
        Patient-independent splits.
    """
    assert 0.0 < test_ratio < 1.0, "test_ratio must be between 0 and 1"

    # Count windows per subject
    counts = df.groupby('subject').size()
    total_windows = int(counts.sum())
    target_test_windows = test_ratio * total_windows

    print(f"Total windows:       {total_windows}")
    print(f"Target test windows: {target_test_windows:.1f} (~{test_ratio*100:.1f}%)")

    # Shuffle subjects
    rng = np.random.RandomState(random_state)
    subjects = counts.index.to_numpy()
    rng.shuffle(subjects)

    test_subjects = []
    cumulative = 0

    # Greedily add subjects to test until we reach target_test_windows
    for subj in subjects:
        if cumulative >= target_test_windows:
            break
        test_subjects.append(subj)
        cumulative += counts[subj]

    print(f"Chosen test subjects: {test_subjects}")
    print(f"Actual test windows:  {cumulative} "
          f"({cumulative / total_windows * 100:.2f}%)")

    # Build masks
    is_test = df['subject'].isin(test_subjects)
    test_df = df[is_test].reset_index(drop=True)
    train_df = df[~is_test].reset_index(drop=True)

    print(f"Train windows: {len(train_df)} "
          f"({len(train_df) / total_windows * 100:.2f}%)")
    print(f"Test windows:  {len(test_df)} "
          f"({len(test_df) / total_windows * 100:.2f}%)")

    if save_prefix is not None:
        # Note: sequences are arrays/lists; pandas will store them as strings in CSV
        train_df.to_pickle(f"{save_prefix}_train.pkl")
        test_df.to_pickle(f"{save_prefix}_test.pkl")

        print(f"✅ Saved: {save_prefix}_train.pkl")
        print(f"✅ Saved: {save_prefix}_test.pkl")

    return train_df, test_df



In [38]:

train_df = pd.read_pickle("FoG_windows_train.pkl")
test_df  = pd.read_pickle("FoG_windows_test.pkl")


In [39]:
train_df['sequence']

0        [[-0.0016087309472044842, 0.002360665832874116...
1        [[-0.002828209376519699, -0.001741191604220398...
2        [[-0.0009826247031032818, -0.00563225855124668...
3        [[0.000900899003014051, 0.0009491892832060866,...
4        [[0.0004069949071238669, 0.002595688129624597,...
                               ...                        
19875    [[-0.004181495873876922, -0.019645960119555265...
19876    [[0.011228856518335533, 0.00859156913929681, -...
19877    [[0.002034287557147306, -0.0004327593307638737...
19878    [[0.0037749095346735553, -0.005622713657329966...
19879    [[-0.005214028866865398, 0.0063796750587285745...
Name: sequence, Length: 19880, dtype: object

In [40]:
train_df['sequence'].values

array([array([[-1.60873095e-03,  2.36066583e-03,  3.02996931e-04],
              [-2.07691711e-03,  4.55961056e-03, -1.71824418e-03],
              [-1.99658327e-03,  5.99450534e-03, -3.61723713e-03],
              [-1.41327135e-03,  6.21116340e-03, -5.03638134e-03],
              [-7.71752851e-04,  5.18821597e-03, -5.59652059e-03],
              [-4.19607831e-04,  3.42968980e-03, -5.21846236e-03],
              [-4.11511338e-04,  1.82970486e-03, -4.20787630e-03],
              [-6.61825654e-04,  1.07102962e-03, -3.14949704e-03],
              [-1.17359396e-03,  1.15137138e-03, -2.61820341e-03],
              [-1.93308631e-03,  1.65183814e-03, -2.84224337e-03],
              [-2.59861904e-03,  2.32647300e-03, -3.71016887e-03],
              [-2.66961712e-03,  3.29101740e-03, -5.12422287e-03],
              [-2.05763284e-03,  4.74699659e-03, -7.06510613e-03],
              [-1.19043408e-03,  6.61548693e-03, -9.24332928e-03],
              [-4.92743680e-04,  8.46220663e-03, -1.11126181e-

In [41]:
# prepraring the data as Pytorch dataset and and data-loader
#
train_df, test_df = split_patient_independent(
    df_filtered,
    test_ratio=0.1,
    random_state=42,
    save_prefix=Parent_folder + "FoG_windows"
)



Total windows:       21858
Target test windows: 2185.8 (~10.0%)
Chosen test subjects: ['4dc2f8', '3b2b7a', '242a3e']
Actual test windows:  2488 (11.38%)
Train windows: 19880 (90.95%)
Test windows:  1978 (9.05%)
✅ Saved: C:\Users\Student\Desktop\Abouhashem\DeepLearningProject\FoG_windows_train.pkl
✅ Saved: C:\Users\Student\Desktop\Abouhashem\DeepLearningProject\FoG_windows_test.pkl


In [42]:
train_df.head()

Unnamed: 0,sequence,window_label,subject
0,"[[-0.0016087309472044842, 0.002360665832874116...",0,07285e
1,"[[-0.002828209376519699, -0.001741191604220398...",0,07285e
2,"[[-0.0009826247031032818, -0.00563225855124668...",0,07285e
3,"[[0.000900899003014051, 0.0009491892832060866,...",0,07285e
4,"[[0.0004069949071238669, 0.002595688129624597,...",0,07285e


In [48]:
len(test_df['window_label'] ==0)

1978

In [49]:
len(test_df['window_label'] ==1)

1978

In [43]:
len(train_df['subject'].value_counts()) #training patients

34

In [44]:
len(test_df['subject'].value_counts()) #Testing patients

3

In [45]:
#so our data set is ready for applying the machine learning model.