## Computing in Cardiology challenge 2021

https://physionet.org/content/challenge-2021/1.0.3/#files-panel

In [None]:
# Mirror the challenge files below this URL: -r recurse, -N skip files that are
# already up to date, -c resume partial downloads, -np never ascend to the parent.
!wget -r -N -c -np https://physionet.org/files/challenge-2021/1.0.3/

In [2]:
import json
import numpy as np
import os
import pandas as pd
import wfdb
from scipy.signal import resample
from tqdm import tqdm

## SNOMED IDs

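The Computing in Cardiology Challenge 2021 encodes each diagnosis as a [SNOMED ID](http://bioportal.bioontology.org/ontologies/SNOMEDCT). The mapping below collapses the IDs we need into six labels: AFIB, AFLT, RBBB, LBBB, 1dAVB and NORM.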

In [42]:
# SNOMED ID -> label lookup, built from a table grouped by target label so that
# every code collapsing into the same label sits side by side.
snomed_ids = {
    code: label
    for label, codes in (
        ('AFIB', ('164889003',)),
        ('AFLT', ('164890007',)),
        ('RBBB', (
            '59118001',
            '713427006',  # complete
            '713426002',  # incomplete
        )),
        ('LBBB', (
            '164909002',
            '733534002',  # complete
            '251120003',  # incomplete
        )),
        ('1dAVB', ('270492004',)),
        ('NORM', ('426783006',)),
    )
    for code in codes
}

## Utility method

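1) Skip records that are shorter than 10 s, sampled below 100 Hz, or do not have 12 leads
2) Skip records with no diagnosis among the SNOMED IDs listed above
3) Resample to 100 Hz
4) Split into consecutive 10 s windows (the first usable window is the main record; later windows are stored as extras)
5) Save each window as a WFDB record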

In [37]:
def process_ecg(patient_file, save_as, target_freq=100, target_s=10, create_extra=('LBBB', 'AFLT', '1dAVB')):
    """Resample one challenge record and save it as fixed-length windows.

    The record is skipped when its sampling rate is below ``target_freq``, it
    is shorter than ``target_s`` seconds, it does not have 12 signals, or none
    of its ``Dx`` codes appear in ``snomed_ids``. Otherwise the signal is
    resampled to ``target_freq`` and cut into consecutive, non-overlapping
    windows of ``target_s`` seconds starting at the first sample. A window is
    dropped when any lead in it is entirely zero or more than half NaN.

    The first usable window is written to ``save_as``. Later usable windows
    are written into an ``extra`` sub-directory next to ``save_as``, with the
    window index appended to the record name. Later windows are only produced
    for multi-label records, or for single-label records whose label is
    listed in ``create_extra``.

    Args:
        patient_file: Path of the source WFDB record, without extension.
        save_as: Output path (directory + record name, without extension).
        target_freq: Output sampling frequency in Hz.
        target_s: Window length in seconds.
        create_extra: Labels for which single-label records also get their
            later windows saved.

    Returns:
        bool: True if at least one window was written, False if the record
        was skipped or every window was dropped.
    """
    # Read the record
    record = wfdb.rdrecord(patient_file)
    data = record.p_signal  # Shape: (5000, 12) for a 10s recording at 500Hz

    # --- Step 1: Extract record details ---
    # Header comments look like "Key: value"; keep them in a dict.
    details = {}
    for comment in record.comments:
        if ':' in comment:
            key, value = comment.split(':', 1)
            details[key.strip()] = value.strip()

    original_freq = record.fs
    original_len = data.shape[0]
    original_s = original_len / original_freq

    patient_age = details.get("Age")
    patient_sex = details.get("Sex")
    # Default to an empty string so a record without a Dx comment is skipped
    # below instead of failing on a missing .split().
    dx_codes = [dx.strip() for dx in details.get("Dx", "").split(",")]

    # --- Step 2: Skip records having low freq, duration or diagnosis other than specified ---
    # A set collapses duplicates, e.g. complete + incomplete RBBB both map to RBBB.
    diagnosis = {snomed_ids[dx] for dx in dx_codes if dx in snomed_ids}
    if (
        original_freq < target_freq or
        original_s < target_s or
        record.n_sig != 12 or
        not diagnosis
    ):
        return False

    # NORM should only be present if no other condition diagnosed
    if len(diagnosis) > 1:
        diagnosis.discard('NORM')
    diagnosis = sorted(diagnosis)

    # --- Step 3: Resample and split into windows ---
    # Floor division on the product avoids losing a sample to float rounding
    # in (len / fs) * target_freq.
    num_target_samples = int(original_len * target_freq // original_freq)
    data_downsampled = resample(data, num_target_samples)

    # Consecutive windows from the start of the recording
    # (10s * 100Hz = 1000 samples each with the defaults).
    crop_len = target_s * target_freq
    out_dir = os.path.dirname(save_as)
    base_name = os.path.basename(save_as)
    extra_dir = os.path.join(out_dir, 'extra')
    windows_saved = 0
    for i in range(num_target_samples // crop_len):
        data_cropped = data_downsampled[i*crop_len:(i+1)*crop_len]

        # Skip periods having missing leads: a lead that is all zeros, or
        # more than half NaN, within this window.
        flat_lead = (data_cropped == 0).all(axis=0)
        nan_lead = np.isnan(data_cropped).sum(axis=0) > crop_len // 2
        if (flat_lead | nan_lead).any():
            continue

        # The first saved window is the main record; later ones are extras.
        is_extra = windows_saved > 0
        write_dir = extra_dir if is_extra else out_dir
        if write_dir:
            os.makedirs(write_dir, exist_ok=True)

        # write_dir keeps the process working directory untouched, so a
        # failure inside wrsamp cannot leave the notebook in another cwd.
        wfdb.wrsamp(
            record_name=f'{base_name}_{i}' if is_extra else base_name,
            fs=target_freq,
            units=record.units,
            sig_name=record.sig_name,
            p_signal=data_cropped,
            fmt=['16'] * record.n_sig,
            comments=[
                f"Age: {patient_age}",
                f"Sex: {patient_sex}",
                f"Diagnosis: {','.join(diagnosis)}",
            ],
            write_dir=write_dir,
        )
        windows_saved += 1

        # Single-label records only get extra windows for the listed labels.
        if len(diagnosis) == 1 and diagnosis[0] not in create_extra:
            break

    return windows_saved > 0

## Preprocess

In [None]:
# NOTE(review): machine-specific absolute path. The following cells build their
# input and output paths from os.getcwd(), so adjust this to your own checkout.
%cd /home/nepython/projects/COMP6011/Task3

In [4]:
# Input and output locations, all resolved against the current working directory.
root_dir, results_dir = (
    os.path.join(os.getcwd(), relative)
    for relative in (
        'physionet.org/files/challenge-2021/1.0.3/training/',
        'challenge-2021',
    )
)
# Windows cut after the first 10s one are stored here.
extra_dir = os.path.join(results_dir, 'extra')

os.makedirs(results_dir, exist_ok=True)
os.makedirs(extra_dir, exist_ok=True)

In [43]:
# Walk root_dir/dataset/group/ and convert every record that has a .hea header.
for dataset in os.listdir(root_dir):
    dataset_path = os.path.join(root_dir, dataset)
    # Keep real dataset folders only; ignore notebook checkpoint directories.
    if dataset.startswith('.ipynb') or not os.path.isdir(dataset_path):
        continue

    for group in tqdm(os.listdir(dataset_path), desc=dataset):
        group_path = os.path.join(dataset_path, group)
        if group.startswith('.ipynb') or not os.path.isdir(group_path):
            continue

        # One header file per record; the record id is the name before the first dot.
        record_ids = [
            entry.split('.')[0]
            for entry in os.listdir(group_path)
            if entry.endswith('.hea')
        ]
        for pid in record_ids:
            patient_file = os.path.join(group_path, pid)
            # Output names are prefixed with dataset and group.
            process_ecg(
                patient_file=patient_file,
                save_as=f'{results_dir}/{dataset}_{group}_{pid}',
                target_freq=100,
                target_s=10,
            )

st_petersburg_incart: 100%|███████████████████████████████████████████████████████████████████| 2/2 [00:02<00:00,  1.07s/it]
ptb-xl: 100%|███████████████████████████████████████████████████████████████████████████████| 23/23 [02:22<00:00,  6.17s/it]
cpsc_2018_extra: 100%|████████████████████████████████████████████████████████████████████████| 5/5 [00:07<00:00,  1.60s/it]
cpsc_2018: 100%|██████████████████████████████████████████████████████████████████████████████| 8/8 [00:39<00:00,  4.96s/it]
ptb: 100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:04<00:00,  2.28s/it]
ningbo: 100%|███████████████████████████████████████████████████████████████████████████████| 36/36 [02:08<00:00,  3.57s/it]
chapman_shaoxing: 100%|█████████████████████████████████████████████████████████████████████| 12/12 [00:38<00:00,  3.20s/it]
georgia: 100%|██████████████████████████████████████████████████████████████████████████████| 12/12 [00:35<00:00,  2.98s/it]


## View distribution
### 1. Original records

In [44]:
# Tally the label combinations of the primary records in results_dir and
# collect one {'filename_lr', 'scp_codes'} entry per record.
diagnosis_list = []
diagnosis_freq = {}

for pid in tqdm(os.listdir(results_dir), desc='Cases'):
    if pid.startswith('.ipynb') or not pid.endswith('.hea'):
        continue
    patient_file = os.path.join(results_dir, os.path.splitext(pid)[0])
    # Only the header comments are needed, so read the header alone instead
    # of loading the full signal for every record.
    header = wfdb.rdheader(patient_file)
    # process_ecg writes the comments in the order Age, Sex, Diagnosis.
    labels = header.comments[2].split(': ', 1)[1]
    diagnosis_freq[labels] = diagnosis_freq.get(labels, 0) + 1
    diagnosis_list.append({
        'filename_lr': patient_file,
        'scp_codes': labels.split(',')
    })

Cases: 100%|████████████████████████████████████████████████████████████████████████| 98166/98166 [00:51<00:00, 1908.66it/s]


In [45]:
# Sort by values
diagnosis_freq = {k: v for k, v in sorted(diagnosis_freq.items(), key=lambda item: item[1], reverse=True)}
diagnosis_freq

{'NORM': 26294,
 'AFLT': 7239,
 'RBBB': 4741,
 'AFIB': 4382,
 '1dAVB': 2948,
 'LBBB': 1063,
 'AFLT,RBBB': 818,
 'AFIB,RBBB': 593,
 '1dAVB,RBBB': 324,
 'AFIB,LBBB': 204,
 '1dAVB,LBBB': 203,
 'AFLT,LBBB': 166,
 'AFIB,AFLT': 29,
 'LBBB,RBBB': 22,
 '1dAVB,AFIB': 12,
 'AFIB,LBBB,RBBB': 9,
 '1dAVB,AFIB,RBBB': 9,
 '1dAVB,AFLT': 9,
 'AFLT,LBBB,RBBB': 7,
 '1dAVB,LBBB,RBBB': 3,
 '1dAVB,AFIB,LBBB': 3,
 'AFIB,AFLT,RBBB': 2,
 '1dAVB,AFLT,RBBB': 1,
 'AFIB,AFLT,LBBB': 1}

### 2. Additional records

In [46]:
# Tally the label combinations of the additional windows stored in extra_dir
# and collect one {'filename_lr', 'scp_codes'} entry per record.
diagnosis_list_extra = []
diagnosis_freq_extra = {}

for pid in tqdm(os.listdir(extra_dir), desc='Cases'):
    if pid.startswith('.ipynb') or not pid.endswith('.hea'):
        continue
    patient_file = os.path.join(extra_dir, os.path.splitext(pid)[0])
    # Only the header comments are needed, so read the header alone instead
    # of loading the full signal for every record.
    header = wfdb.rdheader(patient_file)
    # process_ecg writes the comments in the order Age, Sex, Diagnosis.
    labels = header.comments[2].split(': ', 1)[1]
    diagnosis_freq_extra[labels] = diagnosis_freq_extra.get(labels, 0) + 1

    diagnosis_list_extra.append({
        'filename_lr': patient_file,
        'scp_codes': labels.split(',')
    })

Cases: 100%|████████████████████████████████████████████████████████████████████████████| 750/750 [00:00<00:00, 1730.84it/s]


In [47]:
# Sort by values
diagnosis_freq_extra = {k: v for k, v in sorted(diagnosis_freq_extra.items(), key=lambda item: item[1], reverse=True)}
diagnosis_freq_extra

{'1dAVB': 181,
 'LBBB': 81,
 'AFIB,RBBB': 42,
 'AFLT': 38,
 '1dAVB,RBBB': 15,
 'AFIB,LBBB': 9,
 '1dAVB,AFIB': 5,
 '1dAVB,LBBB': 4}

## Save

1. Save the diagnoses as a DataFrame (written to `records.csv`) for easier retrieval
2. Reduce class imbalance as much as possible by capping the number of records sampled per label

In [54]:
from collections import defaultdict
import random

# Primary and extra records together form the candidate pool.
df = pd.DataFrame(diagnosis_list + diagnosis_list_extra)

# Step 1: For every label, collect the indices of the rows that carry it
label_to_indices = defaultdict(list)
for idx, labels in zip(df.index, df['scp_codes']):
    for label in labels:
        label_to_indices[label].append(idx)

# Step 2: Draw at most MAX_SAMPLES rows per label
# Set your max per class
MAX_SAMPLES = 20000
random.seed(42)
selected_indices = set()
for indices in label_to_indices.values():
    quota = min(MAX_SAMPLES, len(indices))
    # Rows drawn for several labels are kept once; a multi-label row drawn for
    # one label still counts towards every other label it carries.
    selected_indices.update(random.sample(indices, quota))

# Step 3: Build the balanced dataframe and write it next to the records
picked_rows = df.loc[list(selected_indices)]
balanced_df = picked_rows.reset_index(drop=True)
balanced_df.to_csv(f'{results_dir}/records.csv', index=False)

In [55]:
from collections import Counter

# Per-label totals after balancing; a multi-label row counts once per label.
counter = Counter(
    label
    for labels in balanced_df['scp_codes']
    for label in labels
)

# Each label was drawn at most MAX_SAMPLES times on its own, but rows drawn
# for other labels can add to its total.
print(counter)

Counter({'NORM': 20000, 'AFLT': 8310, 'RBBB': 6586, 'AFIB': 5300, '1dAVB': 3717, 'LBBB': 1775})


In [56]:
# Number of records per exact label combination.
balanced_df['scp_codes'].value_counts()

scp_codes
[NORM]                 20000
[AFLT]                  7277
[RBBB]                  4741
[AFIB]                  4382
[1dAVB]                 3129
[LBBB]                  1144
[AFLT, RBBB]             818
[AFIB, RBBB]             635
[1dAVB, RBBB]            339
[AFIB, LBBB]             213
[1dAVB, LBBB]            207
[AFLT, LBBB]             166
[AFIB, AFLT]              29
[LBBB, RBBB]              22
[1dAVB, AFIB]             17
[AFIB, LBBB, RBBB]         9
[1dAVB, AFIB, RBBB]        9
[1dAVB, AFLT]              9
[AFLT, LBBB, RBBB]         7
[1dAVB, LBBB, RBBB]        3
[1dAVB, AFIB, LBBB]        3
[AFIB, AFLT, RBBB]         2
[1dAVB, AFLT, RBBB]        1
[AFIB, AFLT, LBBB]         1
Name: count, dtype: int64

## View record (optional)

In [7]:
# Inspect one converted record: pretty-print every header field as JSON.
patient_file = 'challenge-2021/chapman_shaoxing_g1_JS00001'
record = wfdb.rdrecord(patient_file)
# Work on a shallow copy: popping from record.__dict__ itself would delete
# the p_signal attribute from the record object.
header_info = dict(vars(record))
# The signal array is not JSON-serialisable, so set it aside first.
signal_data = header_info.pop('p_signal')

pretty_json = json.dumps(header_info, indent=4)
print(pretty_json)

{
    "record_name": "chapman_shaoxing_g1_JS00001",
    "n_sig": 12,
    "fs": 100,
    "counter_freq": null,
    "base_counter": null,
    "sig_len": 1000,
    "base_time": null,
    "base_date": null,
    "comments": [
        "Age: 85",
        "Sex: Male",
        "Diagnosis: AFIB"
    ],
    "sig_name": [
        "I",
        "II",
        "III",
        "aVR",
        "aVL",
        "aVF",
        "V1",
        "V2",
        "V3",
        "V4",
        "V5",
        "V6"
    ],
    "d_signal": null,
    "e_p_signal": null,
    "e_d_signal": null,
    "file_name": [
        "chapman_shaoxing_g1_JS00001.dat",
        "chapman_shaoxing_g1_JS00001.dat",
        "chapman_shaoxing_g1_JS00001.dat",
        "chapman_shaoxing_g1_JS00001.dat",
        "chapman_shaoxing_g1_JS00001.dat",
        "chapman_shaoxing_g1_JS00001.dat",
        "chapman_shaoxing_g1_JS00001.dat",
        "chapman_shaoxing_g1_JS00001.dat",
        "chapman_shaoxing_g1_JS00001.dat",
        "chapman_shaoxing_g1_JS0000

In [44]:
# The p_signal array popped from the record's attributes in the cell above.
signal_data

array([[-0.15361226,  0.13458555,  0.28763639, ...,  0.40632207,
         0.41141828,  0.34716744],
       [-0.29090262,  0.27193392,  0.56277814, ...,  0.82077116,
         0.81770061,  0.56019897],
       [-0.29121782,  0.06234323,  0.35388257, ...,  0.26987981,
         0.2738889 ,  0.16185599],
       ...,
       [ 0.00417983, -0.04024062, -0.04436006, ..., -0.16216826,
        -0.15978517,  0.07163244],
       [ 0.03467205,  0.00730972, -0.0273679 , ..., -0.10702476,
        -0.10314703,  0.13433581],
       [ 0.00798965, -0.07061932, -0.07853577, ..., -0.2615811 ,
        -0.26472979,  0.09182279]])