In [1]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import os

In [2]:
cd /scratch/npr264/BioDeepL/dreamt/physionet.org/files/dreamt/2.0.0/data_64Hz

/scratch/npr264/BioDeepL/dreamt/physionet.org/files/dreamt/2.0.0/data_64Hz


In [4]:
df = pd.read_csv('S002_whole_df.csv')

In [5]:
df

Unnamed: 0,TIMESTAMP,BVP,ACC_X,ACC_Y,ACC_Z,TEMP,EDA,HR,IBI,Sleep_Stage,Obstructive_Apnea,Central_Apnea,Hypopnea,Multiple_Events
0,0.000000,5.14,31.0,8.0,55.0,35.53,0.073005,49.00,,P,,,,
1,0.015625,4.28,31.0,8.0,55.0,35.53,0.073005,49.00,,P,,,,
2,0.031250,3.51,31.0,8.0,55.0,35.53,0.073005,49.00,,P,,,,
3,0.046875,3.02,31.0,8.0,55.0,35.53,0.073005,49.00,,P,,,,
4,0.062500,2.94,28.0,8.0,55.0,35.53,0.073005,49.00,,P,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2013692,31463.937500,120.17,-33.0,-24.0,50.0,35.37,0.142168,78.13,1.046875,W,,,,
2013693,31463.953125,106.78,-33.0,-24.0,50.0,35.37,0.142168,78.13,1.046875,W,,,,
2013694,31463.968750,98.52,-33.0,-24.0,51.0,35.37,0.142168,78.13,1.046875,W,,,,
2013695,31463.984375,95.02,-33.0,-24.0,51.0,35.37,0.142168,78.13,1.046875,W,,,,


## Initial downsampling

In [6]:
acc_df = df.iloc[::2].reset_index(drop=True)

In [5]:
acc_df = df[['TIMESTAMP', 'ACC_X', 'ACC_Y', 'ACC_Z']].iloc[::2].reset_index(drop=True)

In [9]:
five_min_df = df[df['TIMESTAMP'] % 300 == 0].reset_index(drop=True)
hrtemp_df = five_min_df[['TIMESTAMP', 'TEMP', 'HR']]

In [10]:
hrtemp_df

Unnamed: 0,TIMESTAMP,TEMP,HR
0,0.0,35.53,49.00
1,300.0,36.13,72.10
2,600.0,36.23,75.10
3,900.0,36.33,72.45
4,1200.0,36.00,68.32
...,...,...,...
100,30000.0,34.33,82.05
101,30300.0,34.21,68.62
102,30600.0,35.16,64.05
103,30900.0,35.23,64.22


In [6]:
acc_df

Unnamed: 0,TIMESTAMP,ACC_X,ACC_Y,ACC_Z
0,0.00000,31.0,8.0,55.0
1,0.03125,31.0,8.0,55.0
2,0.06250,28.0,8.0,55.0
3,0.09375,29.0,8.0,55.0
4,0.12500,31.0,8.0,53.0
...,...,...,...,...
1006844,31463.87500,-33.0,-24.0,50.0
1006845,31463.90625,-33.0,-24.0,50.0
1006846,31463.93750,-33.0,-24.0,50.0
1006847,31463.96875,-33.0,-24.0,51.0


In [10]:
acc_df.to_csv('/scratch/npr264/BioDeepL/project/acc_data.csv', index = False)

In [15]:
# Sleep stage mapping as before

def safe_float(value, default=np.nan):
    """
    Safely converts a value to a float.
    If the conversion fails, returns a default value.
    """
    try:
        return float(value)
    except (ValueError):
        return np.nan

SLEEP_STAGE_MAPPING = {
    "W": 0,    # Wake
    "N1": 1,   # non-REM stage 1
    "N2": 2,   # non-REM stage 2
    "N3": 3,   # non-REM stage 3
    "R": 4,    # REM
    "Missing": -1  # Missing label
}

def forward_fill(x):
    """
    Performs forward fill on a tensor.
    If x is 1D (shape [T]), it is temporarily unsqueezed to [T, 1].
    Assumes the first value is valid, or fills it with zero if needed.
    """
    single_channel = False
    if x.dim() == 1:
        x = x.unsqueeze(1)
        single_channel = True

    T, C = x.shape
    for c in range(C):
        if torch.isnan(x[0, c]):
            x[0, c] = 0.0
        for t in range(1, T):
            if torch.isnan(x[t, c]):
                x[t, c] = x[t - 1, c]
    if single_channel:
        x = x.squeeze(1)
    return x

numeric_columns = [
    'TIMESTAMP', 'BVP', 'ACC_X', 'ACC_Y', 'ACC_Z', 'TEMP',
    'EDA', 'HR', 'IBI'
]
converters = {col: safe_float for col in numeric_columns}

class SleepDataset(Dataset):
    def __init__(self, subjects_list, data_dir, x_values ,max_length=2493810,debug=False):
        """ x_values = 'acc' or 'TEMPBVP'"""
        self.subjects = [{} for _ in range(len(subjects_list))]
        self.x_values = x_values
        if x_values == 'acc':
            downsample_freq=32
            cols = ['ACC_X', 'ACC_Y', 'ACC_Z']
        elif x_values == 'TEMPBVP':
            downsample_freq = 0.2
            cols = ['TEMP', 'BVP']
        else:
            print(x_values = 'acc' or 'TEMPBPV')
            return
        self.downsample = int(64 // downsample_freq)  # Downsample factor
        max_length = int(max_length // self.downsample)
        self.max_length = max_length
            
        all_cols = ['TIMESTAMP']+ cols
        #print(all_cols)
        
        for subjectNo, SID in enumerate(subjects_list):
            # Load the data for each subject
            file_path = os.path.join(data_dir, f"{SID}_whole_df.csv")
            if os.path.exists(file_path):
                df = pd.read_csv(
                    file_path,
                    dtype={'Sleep_Stage': 'category'},
                    converters=converters,
                    low_memory=True
                )
                if debug:
                    print(f"loaded data for {SID}:")

                # Downsample the data if needed
                if self.downsample != 1:
                    df = df.iloc[::self.downsample].reset_index(drop=True)
                    if debug:
                        print(f"After downsampling by factor {self.downsample}, rows: {len(df)}")
                
                df = df[df['Sleep_Stage'] != 'P'] # remove data before PSG start
                for col in all_cols:
                    #print(df.columns)
                    df[col] = pd.to_numeric(df[col], errors='coerce')
                df_X = df[all_cols].copy()
                # Normalize the features (z-score normalization per subject)
                columns_to_normalize = cols  # Exclude TIMESTAMP
                df_X[columns_to_normalize] = (df_X[columns_to_normalize] - df_X[columns_to_normalize].mean()) / df_X[columns_to_normalize].std()
                df['Sleep_Stage'] = df['Sleep_Stage'].astype(str).str.strip()
                df_Y = df['Sleep_Stage'].map(SLEEP_STAGE_MAPPING)
                
                # Pad/truncate the data to the downsampled max_length
                if len(df_X) > max_length:
                    if debug:
                        print(f"Truncating data for {SID} from {len(df_X)} to {max_length} samples.")
                    df_X = df_X.iloc[:max_length]
                    df_Y = df_Y.iloc[:max_length]
                else:
                    padding_length = max_length - len(df_X)
                    padding = pd.DataFrame(np.nan, index=np.arange(padding_length), columns=df_X.columns)
                    df_X = pd.concat([df_X, padding], ignore_index=True)
                    print(df_X.columns)
                    df_Y = pd.concat([df_Y, pd.Series([-1] * padding_length)], ignore_index=True)
                self.subjects[subjectNo] = {
                    'data': df_X.values.astype(np.float32),  # shape: [T, C]
                    'labels': df_Y.to_numpy(),                 # shape: [T]
                    'SID': SID
                }
                if debug:
                    print(f"Data shape for {SID}: {df_X.shape}, Labels shape: {df_Y.shape}")
            else:
                warning(f"File {file_path} does not exist. Skipping subject {SID}.")
    def __len__(self):
        return len(self.subjects)

    def __getitem__(self, idx):
        subject = self.subjects[idx]
        data = torch.tensor(subject['data'], dtype=torch.float32)
        labels = torch.tensor(subject['labels'], dtype=torch.long)

        data = forward_fill(data) # fill NaNs with previous values
        labels = forward_fill(labels) # fill NaNs with previous values
        return data, labels

In [16]:
# Example usage
subject_ids = ["S002", "S003", "S004"]
data_directory = "/scratch/npr264/BioDeepL/dreamt/physionet.org/files/dreamt/2.0.0/data_64Hz"  
demo_dataset = SleepDataset(subjects_list=subject_ids,
                                 data_dir=data_directory, x_values='TEMPBVP')

# print("Total samples in sliding-window dataset:", len(demo_dataset))
# sample, label, sid = demo_dataset[10]
# print(f"Sample shape: {sample.shape} (epoch_samples, num_chans), Label: {label}, Subject ID: {sid}")

Index(['TIMESTAMP', 'TEMP', 'BVP'], dtype='object')
Index(['TIMESTAMP', 'TEMP', 'BVP'], dtype='object')
Index(['TIMESTAMP', 'TEMP', 'BVP'], dtype='object')


In [8]:
data, labels = demo_dataset[0]

In [9]:
labels.shape

torch.Size([7817])

In [10]:
data.shape

torch.Size([7817, 3])

In [11]:
#soemthing wrong here too need to fix
#the bpv and temp should be each their own thing no?
data[:,0].shape

torch.Size([7817])

In [14]:
data

tensor([[ 9.1762e+03, -2.1243e+00,  1.3196e-01],
        [ 9.1812e+03, -2.1069e+00, -7.6298e-03],
        [ 9.1862e+03, -2.1243e+00,  1.2635e-01],
        ...,
        [ 3.1461e+04,  7.3471e-01, -9.3457e-01],
        [ 3.1461e+04,  7.3471e-01, -9.3457e-01],
        [ 3.1461e+04,  7.3471e-01, -9.3457e-01]])

In [55]:
class SleepChunkDataset(Dataset):
    def __init__(self, subjects_list, data_dir, x_values, chunk_duration=600, chunk_stride=300, debug=True):
        """
        Args:
            subjects_list (list): List of subject IDs, e.g. ["SID1", "SID2", ...].
            data_dir (str): Directory where files like "SID_whole_df.csv" are stored.
            chunk_duration (int): Chunk length in seconds (default 600 s for 10 minutes).
            chunk_stride (int): Time in seconds to step forward between chunks (default 300 s, for 50% overlap).
            downsample_freq (int): Desired sampling frequency after downsampling (original data are at 64 Hz).
            debug (bool): If True, print status messages.
        """
        self.x_values = x_values
        if x_values == 'acc':
            downsample_freq=32
            cols = ['ACC_X', 'ACC_Y', 'ACC_Z']
        elif x_values == 'TEMPBVP':
            downsample_freq = 0.2
            cols = ['TEMP', 'BVP']
        else:
            print(x_values = 'acc' or 'TEMPBPV')
            return
        self.downsample = int(64 // downsample_freq)  # Downsample factor
            
        all_cols = ['TIMESTAMP']+ cols
        self.chunks = []  # List to store each generated chunk (with its corresponding data, labels, and SID)
        # Effective sampling rate after downsampling becomes downsample_freq Hz.
        self.chunk_length = int(chunk_duration * downsample_freq)
        self.stride = int(chunk_stride * downsample_freq)

        for SID in subjects_list:
            file_path = os.path.join(data_dir, f"{SID}_whole_df.csv")
            if os.path.exists(file_path):
                df = pd.read_csv(file_path, dtype={'Sleep_Stage': 'category'}, converters=converters, low_memory=True)
                if debug:
                    print(f"Loaded data for subject {SID}")
                
                # Downsample: every self.downsample-th row
                if self.downsample != 1:
                    df = df.iloc[::self.downsample].reset_index(drop=True)
                    if debug:
                        print(f"After downsampling (factor {self.downsample}), rows: {len(df)}")
                
                # Remove rows with "Preparation" phase if labeled 'P'
                df = df[df['Sleep_Stage'] != 'P']

                # Ensure numeric conversion for required columns
                for col in all_cols:
                    #print(df.columns)
                    df[col] = pd.to_numeric(df[col], errors='coerce')
                
                df_X = df[all_cols].copy()
                # Normalize the features (z-score normalization per subject)
                columns_to_normalize = cols  # Exclude TIMESTAMP
                df_X[columns_to_normalize] = (df_X[columns_to_normalize] - df_X[columns_to_normalize].mean()) / df_X[columns_to_normalize].std()
                df['Sleep_Stage'] = df['Sleep_Stage'].astype(str).str.strip()
                df_Y = df['Sleep_Stage'].map(SLEEP_STAGE_MAPPING)
                
                # Convert features and labels to numpy arrays
                data_arr = df_X.values.astype(np.float32)  # shape: [T, C]
                labels_arr = df_Y.to_numpy()                # shape: [T]
                T = data_arr.shape[0]

                # If the record is too short (less than one chunk), pad it with NaNs (-1 for labels)
                if T < self.chunk_length:
                    pad_size = self.chunk_length - T
                    padding_data = np.full((pad_size, data_arr.shape[1]), np.nan, dtype=np.float32)
                    data_arr = np.concatenate([data_arr, padding_data], axis=0)
                    padding_labels = np.full((pad_size,), -1)
                    labels_arr = np.concatenate([labels_arr, padding_labels], axis=0)
                    T = self.chunk_length  # update length

                # Slide a window over the data with the defined stride to create overlapping chunks
                for start in range(0, T - self.chunk_length + 1, self.stride):
                    end = start + self.chunk_length
                    chunk_data = data_arr[start:end, :]
                    chunk_labels = labels_arr[start:end]
                    self.chunks.append({
                        'data': chunk_data,
                        'labels': chunk_labels,
                        'SID': SID
                    })
                if debug:
                    num_chunks = (T - self.chunk_length) // self.stride + 1
                    print(f"Subject {SID}: {T} samples processed, generated {num_chunks} chunks")
            else:
                print(f"File {file_path} does not exist. Skipping subject {SID}")

    def __len__(self):
        return len(self.chunks)

    def __getitem__(self, idx):
        chunk = self.chunks[idx]
        data = torch.tensor(chunk['data'], dtype=torch.float32)
        labels = torch.tensor(chunk['labels'], dtype=torch.long)
        # Use forward_fill to replace any NaNs with previous values.
        data = forward_fill(data)
        labels = forward_fill(labels)
        return data, labels

In [56]:
# Example usage
subject_ids = ["S002", "S003", "S004"]
data_directory = "/scratch/npr264/BioDeepL/dreamt/physionet.org/files/dreamt/2.0.0/data_64Hz"  
demo_dataset = SleepChunkDataset(subjects_list=subject_ids,
                                 data_dir=data_directory, x_values='TEMPBVP')

Loaded data for subject S002
After downsampling (factor 319), rows: 6313
Subject S002: 4472 samples processed, generated 73 chunks
Loaded data for subject S003
After downsampling (factor 319), rows: 6433
Subject S003: 4989 samples processed, generated 82 chunks
Loaded data for subject S004
After downsampling (factor 319), rows: 6214
Subject S004: 5032 samples processed, generated 82 chunks


In [57]:
82+82+73

237

In [54]:
len(demo_dataset)

237

In [43]:
data, labels = demo_dataset[0]

In [50]:
len(demo_dataset.chunks)

237

In [53]:
demo_dataset.chunks[0]['data'].shape

(120, 3)