In [95]:
import os
import numpy as np
from sklearn.decomposition import PCA
import random
import matplotlib.pyplot as plt
import sys
sys.path.insert(1, '../src/')
from config import raw_data_path, univariate_data_path, processed_data_path
from scipy.signal import resample
from preprocessing_modules import EHGRecord, trim_target, filter_target, z_normalize_target, check_normalize_target, remove_records, z_normalize_signals



In [96]:
dataset = 'tpehgt'
print('Now processing: ', dataset)
data_dir = os.path.join(processed_data_path, dataset + "_preprocessed.npy")
data = np.load(data_dir, allow_pickle=True)
print(len(data))
print(data[0])

Now processing:  tpehgt
31
{'record_name': 'tpehgt_n001', 'signal': array([[-0.41324028,  0.08254295,  2.80689661],
       [-0.50354411,  0.14420602,  2.48570512],
       [-0.5817202 ,  0.20468731,  2.16917653],
       ...,
       [ 0.01856902, -1.57993672, -0.12397152],
       [ 0.02478136, -1.5803039 , -0.13328576],
       [ 0.0302683 , -1.57971707, -0.14092694]]), 'metadata': {'fs': 20, 'sig_len': 35300, 'n_sig': 8, 'base_date': None, 'base_time': None, 'units': ['mV', 'mV', 'mV', 'mV', 'mV', 'mV', 'mV', 'mV'], 'comments': ['Comments:', 'RecID\ttpehgt_n001', 'RecType Non-pregnant', 'Gestation N/A', 'Rectime N/A', 'Age None', 'Parity None', 'Abortions None', 'Weight None', 'Placental_position N/A', 'Smoker None']}}


In [97]:
# raw_data_dir = os.path.join(raw_data_path, dataset + "_data.npy")
# print(raw_data_dir)
# raw_data = np.load(data_dir, allow_pickle=True)
# print(len(raw_data))

In [98]:
# print(raw_data[0])

In [99]:
def convert_target_to_univariate(data):
    """
    Converts a list of multivariate time series records to univariate ones.
    
    Parameters:
    - data: list of dicts, each with keys: 'record_name', 'signal', 'fs', 'preterm'
    
    Returns:
    - List of univariate records
    """
    univariate_data = []
    
    for record in data:
        signal = record['signal']  # Shape: (T, C)
        fs = record['fs']
        preterm = record['preterm']
        base_name = record['record_name']
        
        num_channels = signal.shape[1]
        
        for i in range(num_channels):
            univariate_signal = signal[:, i]
            univariate_data.append({
                'record_name': f"{base_name}-chan{i}",
                'signal': univariate_signal,
                'fs': fs,
                'preterm': preterm
            })
    
    return univariate_data

def convert_source_to_univariate(data):
    """
    Converts a list of multivariate time series records to univariate ones.
    
    Parameters:
    - data: list of dicts, each with keys: 'record_name', 'signal', 'fs', 'preterm'
    
    Returns:
    - List of univariate records
    """
    univariate_data = []
    
    for record in data:
        signal = record['signal']  # Shape: (T, C)
        fs = record['metadata']['fs']
        base_name = record['record_name']
        metadata = record['metadata']
        
        num_channels = signal.shape[1]
        
        for i in range(num_channels):
            univariate_signal = signal[:, i]
            univariate_data.append({
                'record_name': f"{base_name}-chan{i}",
                'signal': univariate_signal,
                'fs': fs,
                'metadata': metadata,
            })
    
    return univariate_data


In [100]:
# save_dir = os.path.join(univariate_data_path, dataset + "_univariate_raw.npy")
# np.save(save_dir, np.array(univariate_raw, dtype=object))

In [101]:
def downsample_target(univariate_data, target_fs=20):
    """
    Downsamples all signals OF TARGET SET in univariate_data to the target frequency (default: 20Hz).
    
    Parameters:
    - univariate_data (list of dicts): Each entry should have 'metadata' with 'fs' (sampling rate) and 'signal'.
    - target_fs (int): The target sampling frequency (default is 20 Hz).
    
    Returns:
    - downsampled_data (list of dicts): Same structure as input but with downsampled signals.
    """
    downsampled_data = []
    
    for entry in univariate_data:
        original_fs = entry['fs']
        print('óriginaloriginal_fs', original_fs)
        signal = entry['signal']
        
        # Compute the new length after downsampling
        new_length = int(len(signal) * target_fs / original_fs)
        print(target_fs, original_fs)
        print(len(signal), new_length)
        
        # Resample the signal
        downsampled_signal = resample(signal, new_length)
        print('old signal lengt', len(signal))
        print('new length', len(downsampled_signal))
        # Store the downsampled signal with updated metadata
        downsampled_entry = {
            'fs': target_fs,
            'signal': downsampled_signal,
            'record_name': entry['record_name'],
            'preterm': entry['preterm']
        }
        downsampled_data.append(downsampled_entry)
    downsampled_data = z_normalize_target(downsampled_data)
    save_dir = os.path.join(univariate_data_path, dataset + "_univariate_no_PCA.npy")
    np.save(save_dir, np.array(downsampled_data, dtype=object))

    print(f"univariate no pca transformation complete. Saved as '{save_dir}'.")
    return(downsampled_data)

def downsample_signal(univariate_data, target_fs=20):
    """
    Downsamples all signals in univariate_data to the target frequency (default: 20Hz).
    
    Parameters:
    - univariate_data (list of dicts): Each entry should have 'metadata' with 'fs' (sampling rate) and 'signal'.
    - target_fs (int): The target sampling frequency (default is 20 Hz).
    
    Returns:
    - downsampled_data (list of dicts): Same structure as input but with downsampled signals.
    """
    downsampled_data = []
    
    for entry in univariate_data:
        original_fs = entry['fs']
        signal = entry['signal']
        
        # Compute the new length after downsampling
        new_length = int(len(signal) * target_fs / original_fs)
        print(target_fs, original_fs)
        print(len(signal), new_length)
        
        # Resample the signal
        downsampled_signal = resample(signal, new_length)
        print('old signal lengt', len(signal))
        print('new length', len(downsampled_signal))
        # Store the downsampled signal with updated metadata
        downsampled_entry = {
            'metadata': {**entry['metadata'], 'fs': target_fs},
            'signal': downsampled_signal,
            'record_name': entry['record_name']
        }
        downsampled_data.append(downsampled_entry)
    
    downsampled_data = z_normalize_signals(downsampled_data)
    save_dir = os.path.join(univariate_data_path, dataset + "_univariate_no_PCA.npy")
    np.save(save_dir, np.array(downsampled_data, dtype=object))

    print(f"univariate no pca transformation complete. Saved as '{save_dir}'.")
    
    return downsampled_data
    
if dataset == 'target':
    univariate_dataset = convert_target_to_univariate(data)
    downsampled_data = downsample_target(univariate_dataset)
else: 
    univariate_dataset = convert_source_to_univariate(data)
    downsampled_data = downsample_signal(univariate_dataset)


20 20
32900 32900
old signal lengt 32900
new length 32900
20 20
32900 32900
old signal lengt 32900
new length 32900
20 20
32900 32900
old signal lengt 32900
new length 32900
20 20
32880 32880
old signal lengt 32880
new length 32880
20 20
32880 32880
old signal lengt 32880
new length 32880
20 20
32880 32880
old signal lengt 32880
new length 32880
20 20
32900 32900
old signal lengt 32900
new length 32900
20 20
32900 32900
old signal lengt 32900
new length 32900
20 20
32900 32900
old signal lengt 32900
new length 32900
20 20
33060 33060
old signal lengt 33060
new length 33060
20 20
33060 33060
old signal lengt 33060
new length 33060
20 20
33060 33060
old signal lengt 33060
new length 33060
20 20
32800 32800
old signal lengt 32800
new length 32800
20 20
32800 32800
old signal lengt 32800
new length 32800
20 20
32800 32800
old signal lengt 32800
new length 32800
20 20
33600 33600
old signal lengt 33600
new length 33600
20 20
33600 33600
old signal lengt 33600
new length 33600
20 20
33600 33

In [102]:

data_dir = os.path.join(univariate_data_path, dataset + "_univariate_no_PCA.npy")
data = np.load(data_dir, allow_pickle=True)
print(len(data))

93


In [103]:
print(data[0])
print(len(data[0]['signal']))

{'record_name': 'tpehgt_n001-chan0', 'signal': array([[-0.41325038],
       [-0.50355642],
       [-0.58173442],
       ...,
       [ 0.01856947],
       [ 0.02478197],
       [ 0.03026904]]), 'metadata': {'fs': 20, 'sig_len': 35300, 'n_sig': 8, 'base_date': None, 'base_time': None, 'units': ['mV', 'mV', 'mV', 'mV', 'mV', 'mV', 'mV', 'mV'], 'comments': ['Comments:', 'RecID\ttpehgt_n001', 'RecType Non-pregnant', 'Gestation N/A', 'Rectime N/A', 'Age None', 'Parity None', 'Abortions None', 'Weight None', 'Placental_position N/A', 'Smoker None']}}
32900
