In [1]:
import sys
import os
import numpy as np
sys.path.insert(1, '../src/')
from config import raw_data_path, univariate_data_path, processed_data_path, models_path
from preprocessing_modules import EHGRecord, trim_target, filter_target
import scipy.io
import pandas as pd

In [32]:

mat_files_dir = os.path.join('..', '..', '..', '..', '..', 'projects', 'prjs1386', 'mat_files')

ehg_records = []
for file_name in os.listdir(mat_files_dir):
    if file_name.endswith(".mat"):  # Process only .mat files
        file_path = os.path.join(mat_files_dir, file_name)  # Full path
        
        # Pass the full file path as record_name
        ehg_record = EHGRecord(file_path)
        ehg_records.append(ehg_record)

# Print summary of extracted records
print(f"Extracted {len(ehg_records)} EHG records:")
for record in ehg_records[:5]:  # Print first 5 records as a preview
    print(record)


Extracted 167 EHG records:
<preprocessing_modules.EHGRecord object at 0x148db2049350>
<preprocessing_modules.EHGRecord object at 0x148d8b0f5090>
<preprocessing_modules.EHGRecord object at 0x148d866db8d0>
<preprocessing_modules.EHGRecord object at 0x148d8aafff50>
<preprocessing_modules.EHGRecord object at 0x148db2048fd0>


In [33]:
print(len(ehg_records))

167


In [34]:
print(ehg_records[0].record_name)
print(ehg_records[0].ehg_signals)

../../../../../projects/prjs1386/mat_files/Hopper-2023_05_25_12_03_38-0000010090-0003.mat
[[-1.14440918e-01 -1.14440918e-01  0.00000000e+00  1.14440918e-01
   0.00000000e+00  1.14440918e-01]
 [ 9.84191895e+00  9.95635986e+00 -1.00708008e+01 -1.99127197e+01
   1.14440918e-01 -2.00271606e+01]
 [ 1.31607056e+01  1.32751465e+01 -1.33895874e+01 -2.65502930e+01
   1.14440918e-01 -2.66647339e+01]
 ...
 [ 1.43051147e+02 -1.87088013e+03  1.58718115e+03  1.44413000e+03
  -2.01393127e+03  3.45806128e+03]
 [ 1.82579041e+03 -1.36024475e+03 -1.47365576e+03 -3.29944629e+03
  -3.18603516e+03 -1.13411011e+02]
 [ 1.91116333e+03 -9.61761475e+02 -1.79821021e+03 -3.70937354e+03
  -2.87292480e+03 -8.36448730e+02]]


In [35]:
import os
import numpy as np
import pandas as pd

# Path to clinical data CSV
clinical_data_path = os.path.join('..', '..', '..', '..', '..', 'projects', 'prjs1386', 'clinical_data_cocoon.csv')

# Load clinical data
clinical_df = pd.read_csv(clinical_data_path)
clinical_df.replace({'ga_weeks_at_birth': {'nan': np.nan, 'NaN': np.nan, '': np.nan}}, inplace=True)

# Convert 'ga_weeks_at_birth' to preterm labels, ensuring None for missing values
def get_preterm_label(ga_weeks):
    if pd.isna(ga_weeks):  # Explicitly check for missing values
        return None
    return 1 if ga_weeks < 37 else 0

# Fill missing sampling frequencies with 128
clinical_df['fs'] = clinical_df['sampling_freq'].fillna(128)

# Create mappings for lookup
record_to_preterm = {row['sensor_record_id']: get_preterm_label(row['ga_weeks_at_birth']) for _, row in clinical_df.iterrows()}
record_to_fs = {row['sensor_record_id']: row['fs'] for _, row in clinical_df.iterrows()}

# Prepare data for .npy file
ehg_data = []
for record in ehg_records:
    # Extract only the relevant part of the record name
    record_name = os.path.basename(record.record_name).replace('.mat', '')

    # Ensure missing preterm values are correctly assigned
    preterm = record_to_preterm.get(record_name, None)  # Now properly assigns None
    fs = record_to_fs.get(record_name, 128)  # Default to 128 if missing

    # Append as a tuple
    ehg_data.append((record_name, record.ehg_signals, preterm, fs))

# Convert to structured NumPy array
dtype = [('record_name', 'U100'), ('signal', 'O'), ('preterm', 'O'), ('fs', 'int')]
structured_array = np.array(ehg_data, dtype=dtype)

# Save the updated dataset
output_path = os.path.join(raw_data_path, 'target_data.npy')
np.save(output_path, structured_array)

print(f"Saved {len(ehg_data)} records to {output_path}")

# Check if missing preterm values match the expected count
missing_preterm_records = [record for record in structured_array if record['preterm'] is None]
print(f"Records with missing 'preterm': {len(missing_preterm_records)} (Expected: 27)")


Saved 167 records to ../data/raw/target_data.npy
Records with missing 'preterm': 24 (Expected: 27)


In [5]:
import numpy as np

# Load the dataset
ehg_dataset_path = os.path.join(raw_data_path, 'target_data.npy')
ehg_data = np.load(ehg_dataset_path, allow_pickle=True)

# Filter records where 'preterm' is None
missing_preterm_records = [record for record in ehg_data if record['preterm'] is None]

# Print results
# print(f"Total records with missing label: {len(missing_preterm_records)}")

# for record in missing_preterm_records:  # Show only first 5 for preview
#     print(f"Record Name: {record['record_name']}")

# print(f"Missing values in 'ga_weeks_at_birth': {clinical_df['ga_weeks_at_birth'].isna().sum()}")
print(type(ehg_data))
print(ehg_data[0])


<class 'numpy.ndarray'>
('Hopper-2023_05_25_12_03_38-0000010090-0003', array([[-1.14440918e-01, -1.14440918e-01,  0.00000000e+00,
         1.14440918e-01,  0.00000000e+00,  1.14440918e-01],
       [ 9.84191895e+00,  9.95635986e+00, -1.00708008e+01,
        -1.99127197e+01,  1.14440918e-01, -2.00271606e+01],
       [ 1.31607056e+01,  1.32751465e+01, -1.33895874e+01,
        -2.65502930e+01,  1.14440918e-01, -2.66647339e+01],
       ...,
       [ 1.43051147e+02, -1.87088013e+03,  1.58718115e+03,
         1.44413000e+03, -2.01393127e+03,  3.45806128e+03],
       [ 1.82579041e+03, -1.36024475e+03, -1.47365576e+03,
        -3.29944629e+03, -3.18603516e+03, -1.13411011e+02],
       [ 1.91116333e+03, -9.61761475e+02, -1.79821021e+03,
        -3.70937354e+03, -2.87292480e+03, -8.36448730e+02]],
      shape=(436608, 6), dtype=float32), 0, 128)


In [37]:
import os
import pandas as pd

# Paths
mat_files_dir = os.path.join('..', '..', '..', '..', '..', 'projects', 'prjs1386', 'mat_files')
clinical_data_path = os.path.join('..', '..', '..', '..', '..', 'projects', 'prjs1386', 'clinical_data_cocoon.csv')

# Get record names from ehg_records (Extract only the filename without extension)
mat_file_records = {os.path.basename(record.record_name).replace('.mat', '') for record in ehg_records}

# Load CSV and extract record names
clinical_df = pd.read_csv(clinical_data_path)

csv_records = set(clinical_df['sensor_record_id'].dropna().astype(str))

# Find mismatches
mat_not_in_csv = mat_file_records - csv_records
csv_not_in_mat = csv_records - mat_file_records

# Print results
print(f"Records in .mat files but NOT in CSV ({len(mat_not_in_csv)}):")
for record in sorted(mat_not_in_csv):
    print(f"- {record}")

print(f"\nRecords in CSV but NOT in .mat files ({len(csv_not_in_mat)}):")
for record in sorted(csv_not_in_mat):
    print(f"- {record}")


Records in .mat files but NOT in CSV (4):
- Hopper-2023_01_11_10_44_42-0000010090-0002
- Hopper-2023_01_11_11_08_48-0000010090-0003
- Hopper-2023_02_14_08_43_21-0000010090-0001
- Hopper-2023_11_01_17_18_35-0000010181-0003

Records in CSV but NOT in .mat files (0):
