Merges all source data into one .npy file

In [1]:
import os
import numpy as np
import sys
sys.path.insert(1, '../src/')
from config import raw_data_path, univariate_data_path, processed_data_path

In [2]:

# Names of the source datasets. For each one, a file called
# "<name>_univariate.npy" is looked up inside `univariate_data_path`.
datasets = ['ehgdb1', 'ehgdb2', 'icehgds', 'nifeadb', 'ninfea', 'tpehgdb', 'tpehgt']
data_list = []

for dataset in datasets:
    file_path = os.path.join(univariate_data_path, dataset + "_univariate.npy")

    # Guard clause: a missing file is reported and skipped (best effort),
    # so the merge below only contains the datasets that were found.
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        continue

    # allow_pickle=True lets NumPy unpickle object arrays; pickled files can
    # execute arbitrary code on load, so only point this at trusted files.
    data = np.load(file_path, allow_pickle=True)
    data_list.append(data)
    print(f"Loaded {dataset}, shape: {data.shape}")

# Stack everything that was loaded along the first axis and persist it next
# to the per-dataset files. Nothing is written when no file could be loaded.
if not data_list:
    print("No data loaded. Check file paths.")
else:
    # Concatenation assumes every loaded array has a compatible structure.
    merged_data = np.concatenate(data_list, axis=0)
    output_file = os.path.join(univariate_data_path, "merged_univariate.npy")

    np.save(output_file, merged_data)
    print(f"Saved merged dataset to {output_file}, shape: {merged_data.shape}")



Loaded ehgdb1, shape: (61,)
Loaded ehgdb2, shape: (62,)
Loaded icehgds, shape: (126,)
Loaded nifeadb, shape: (26,)
Loaded ninfea, shape: (60,)
Loaded tpehgdb, shape: (300,)
Loaded tpehgt, shape: (31,)
Saved merged dataset to ../data/univariate/merged_univariate.npy, shape: (666,)


In [3]:


# Sanity check: read the merged file back from disk and preview its contents.
merged_file = os.path.join(univariate_data_path, "merged_univariate.npy")

if not os.path.exists(merged_file):
    print(f"File not found: {merged_file}")
else:
    # allow_pickle=True lets NumPy unpickle object arrays; only load trusted files.
    merged_data = np.load(merged_file, allow_pickle=True)

    # Basic statistics
    print(f"Dataset loaded successfully: {merged_file}")
    print(f"Total instances: {len(merged_data)}")

    # Preview at most the first three records as a "header".
    print("\nFirst 3 entries:")
    for i in range(min(3, len(merged_data))):
        entry = merged_data[i]
        print(f"\nEntry {i+1}:")
        print(f"  Record Name: {entry['record_name']}")
        print(f"  Signal Shape: {entry['signal'].shape}")
        # Prints the whole record as-is, which can produce very long output.
        print(entry)


Dataset loaded successfully: ../data/univariate/merged_univariate.npy
Total instances: 666

First 3 entries:

Entry 1:
  Record Name: ice001_l_1of1
  Signal Shape: (7600, 1)
{'record_name': 'ice001_l_1of1', 'signal': array([[-1.7358303 ],
       [-0.30347557],
       [-0.40749874],
       ...,
       [-3.09738299],
       [-2.90981482],
       [-3.22768386]], shape=(7600, 1)), 'metadata': {'fs': 20, 'sig_len': 100000, 'n_sig': 16, 'base_date': None, 'base_time': None, 'units': ['mV', 'mV', 'mV', 'mV', 'mV', 'mV', 'mV', 'mV', 'mV', 'mV', 'mV', 'mV', 'mV', 'mV', 'mV', 'mV'], 'comments': ['Info:', 'ID:ice001', 'Record type:labour', 'Record number:1/1', 'Age(years):31', 'BMI before pregnancy:23.3', 'BMI at recording:27.6', 'Gravidity:3', 'Parity:2', 'Previous caesarean:No', 'Placental position:Fundus', 'Gestational age at recording(w/d):39/3', 'Gestational age at delivery:39/3', 'Mode of delivery:Vaginal', 'Synthetic oxytocin use in labour:No', 'Epidural during labour:No', 'Comments for re