# MIT-BIH Arrhythmia Database (_mitdb_)

Part of the ECG Database Collection:

| Short Name | Long Name |
| :--- | :--- |
| _mitdb_ | MIT-BIH Arrhythmia Database |
| _svdb_ | MIT-BIH Supraventricular Arrhythmia Database |
| _ltdb_ | MIT-BIH Long-Term ECG Database |

[Documentation](https://wfdb.readthedocs.io/en/latest) of the `wfdb` package.

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import wfdb
import os
from typing import Final
from collections.abc import Callable
import matplotlib.pyplot as plt
from config import data_raw_folder, data_processed_folder
from timeeval import Datasets
from IPython.display import display, Markdown, Latex

In [2]:
# Name of this dataset collection; also used as folder name for the processed files.
dataset_collection_name = "MITDB"
# Raw WFDB records are read from `source_folder`; processed CSV files end up below `target_folder`.
source_folder = os.path.join(data_raw_folder, "MIT-BIH Arrhythmia DB")
target_folder = data_processed_folder

from pathlib import Path
# Show the resolved absolute locations so that a wrong working directory is spotted early.
print(
    f"Looking for source datasets in {Path(source_folder).absolute()} and\n"
    f"saving processed datasets in {Path(target_folder).absolute()}"
)

Looking for source datasets in /home/projects/akita/data/benchmark-data/data-raw/MIT-BIH Arrhythmia DB and
saving processed datasets in /home/projects/akita/data/benchmark-data/data-processed


In [3]:
def load_dataset_names() -> list[str]:
    """Return the record names listed in the ``RECORDS`` file of the collection.

    The file is read from ``source_folder`` and is expected to hold one record
    name per line. Surrounding whitespace is stripped and blank lines are
    skipped, so a trailing empty line does not produce an empty record name
    (which would later be turned into a bogus record path).

    Returns:
        The record names in file order.

    Raises:
        OSError: If the ``RECORDS`` file cannot be opened.
    """
    with open(os.path.join(source_folder, "RECORDS"), "r") as f:
        return [name for name in (line.strip() for line in f) if name]

In [4]:
# Annotation symbols, grouped by how they are treated during labeling.
# Beats considered normal (normal, paced, left/right bundle branch block beat).
ann_normal = ["N", "/", "L", "R"]
# Anomalous beats; labeled with the combined (double) window.
ann_beat = ["F", "f", "S", "A", "a", "V", "J", "j", "E", "e"]
# Non-conducted P-wave: anomalous from the annotation until the next beat window starts.
ann_no_beat = ["x"]
# Start and end marker of a ventricular flutter/fibrillation section.
ann_fibr_start = "["
ann_fibr_end = "]"
# All fibrillation-related symbols (`!` marks a flutter wave inside a section).
ann_fibr = [ann_fibr_start, "!", ann_fibr_end]
# External anomalies (unclassifiable beat, isolated QRS-like artifact); single window.
ann_ext = ["Q", "|"]
# Dropped before any processing (rhythm change, signal quality change, tape slippage).
ann_ignore = ["+", "~", '"']

def transform_and_label(source_file: str, target: str) -> int:
    """Transform one WFDB record into a labeled CSV time series.

    The record ``source_file`` (path without file suffix) and its ``atr``
    annotations are read with ``wfdb``. Anomaly windows are derived from the
    annotations, an ``is_anomaly`` column (0/1) is added to the signal data,
    the integer sample index is replaced by a ``timestamp`` index
    reconstructed from the sampling frequency, and the result is written to
    ``target`` as CSV. Progress is reported with ``print``.

    Window construction (all positions are sample indices):

    - The beat window size is the median distance between two consecutive
      normal beats, made odd so that the window can be centered on a beat.
    - Fibrillation: one window from the i-th ``[`` to the i-th ``]``.
    - External anomalies (``ann_ext``): one beat window centered on the
      annotation.
    - Anomalous beats (``ann_beat``): from ``min(own start, end of previous
      window)`` to ``max(own end, start of next window)``.
    - Missing beats (``ann_no_beat``): from the annotation to the start of
      the window of the next annotation.

    Args:
        source_file: Path of the WFDB record without file suffix.
        target: Path of the CSV file to write.

    Returns:
        The number of samples in the record (``record.sig_len``).

    Raises:
        AssertionError: If record and annotations report different sampling
            frequencies.
    """
    print(f"Transforming {os.path.basename(source_file)}")
    # load dataset
    record = wfdb.rdrecord(source_file)
    df_record = pd.DataFrame(record.p_signal, columns=record.sig_name)
    print(f"  record {record.file_name[0]} loaded")

    # load annotation file
    atr = wfdb.rdann(source_file, "atr")
    assert record.fs == atr.fs, "Sample frequency of records and annotations does not match!"
    df_annotation = pd.DataFrame({"position": atr.sample, "label": atr.symbol})
    # remove ignored annotations
    df_annotation = df_annotation[~df_annotation["label"].isin(ann_ignore)]
    df_annotation = df_annotation.reset_index(drop=True)
    print(f"  {len(df_annotation)}/{atr.ann_len} beat annotations for {source_file} loaded (others were ignored)")

    # calculate normal beat length
    print("  preparing windows for labeling...")
    # pair every annotation with its predecessor and keep only normal-normal pairs
    df_normal_beat = df_annotation.copy()
    df_normal_beat["prev_position"] = df_annotation["position"].shift()
    df_normal_beat["prev_label"] = df_annotation["label"].shift()
    df_normal_beat = df_normal_beat[(df_normal_beat["label"].isin(ann_normal)) & (df_normal_beat["prev_label"].isin(ann_normal))]
    df_normal_beat = df_normal_beat.drop(columns=["label", "prev_label"])
    s_normal_beat_lengths = df_normal_beat["position"] - df_normal_beat["prev_position"]
    print(f"    normal beat distance samples = {len(s_normal_beat_lengths)}")
    normal_beat_length = s_normal_beat_lengths.median()
    # an odd window size allows centering the window on the annotation
    if (normal_beat_length % 2) == 0:
        normal_beat_length += 1
    beat_window_size = int(normal_beat_length)
    beat_window_margin = (beat_window_size - 1)//2
    del df_normal_beat
    del s_normal_beat_lengths
    print(f"    window size = {beat_window_size}")
    print(f"    window margins (left and right) = {beat_window_margin}")

    # calculate beat windows
    ## ~ and other annotations are ignored!
    ## for fibrillation
    # we only need start and end marked with `[` and `]` respectively
    s_fibr_start = df_annotation.loc[df_annotation["label"] == ann_fibr_start, "position"]
    # keep the annotation index of the `[` markers so that all window frames share one index
    s_index = s_fibr_start.index
    s_fibr_start = s_fibr_start.reset_index(drop=True)
    s_fibr_end = df_annotation.loc[df_annotation["label"] == ann_fibr_end, "position"]
    s_fibr_end = s_fibr_end.reset_index(drop=True)
    # starts and ends are paired by their ordinal position
    df_fibr = pd.DataFrame({"index": s_index, "window_start": s_fibr_start, "window_end": s_fibr_end})
    df_fibr = df_fibr.set_index("index")
    df_fibr["position"] = df_fibr["window_start"]
    print(f"    {len(df_fibr)} windows for fibrillation anomalies ({','.join(ann_fibr)})")
    ## for external anomalies
    # a single beat window centered on the annotation, clipped to the record
    df_ext = df_annotation[df_annotation["label"].isin(ann_ext)].copy()
    df_ext["window_start"] = np.maximum(0, df_ext["position"]-beat_window_margin)
    df_ext["window_end"] = np.minimum(record.sig_len - 1, df_ext["position"]+beat_window_margin)
    df_ext = df_ext[["position", "window_start", "window_end"]]
    print(f"    {len(df_ext)} windows for external anomalies ({','.join(ann_ext)})")
    ## anomalous beats
    # exclude additional non-beat annotations
    df_svf = df_annotation[~df_annotation["label"].isin(["|", ann_fibr_start, ann_fibr_end])].copy()
    df_svf["position_next"] = df_svf["position"].shift(-1)
    df_svf["position_prev"] = df_svf["position"].shift(1)
    df_svf = df_svf[df_svf["label"].isin(ann_beat)]
    # union of the beat's own window and the gap to the neighboring beat windows
    # NOTE(review): for the first/last annotation the shifted position is NaN,
    # which propagates into the window bounds; such windows are not labeled below.
    df_svf["window_start"] = np.maximum(0, np.minimum(df_svf["position"].values-beat_window_margin, df_svf["position_prev"].values+beat_window_margin))
    df_svf["window_end"] = np.minimum(record.sig_len - 1, np.maximum(df_svf["position"].values+beat_window_margin, df_svf["position_next"].values-beat_window_margin))
    df_svf = df_svf[["position", "window_start", "window_end"]]
    print(f"    {len(df_svf)} windows for anomalous beats ({','.join(ann_beat)})")
    # missing beats
    df_no_beat = df_annotation[df_annotation["label"].isin(ann_no_beat)].drop(columns=["label"]).copy()
    df_no_beat["window_start"] = df_no_beat["position"]
    if not df_no_beat.empty:
        # lookup table: window start of every annotation that is not a missing beat
        df_normal_windows = df_annotation[df_annotation["label"].isin(ann_normal)].copy()
        df_normal_windows = df_normal_windows.drop(columns=["label"])
        df_normal_windows["window_start"] = np.maximum(0, df_normal_windows["position"]-beat_window_margin)
        df_normal_windows["window_end"] = np.minimum(record.sig_len - 1, df_normal_windows["position"]+beat_window_margin)
        df_lut = df_annotation[~df_annotation["label"].isin(ann_no_beat)].merge(pd.concat([df_ext, df_svf, df_fibr, df_normal_windows]), on="position", how="left")
        def find_next_window_start(pos: int):
            # NOTE(review): `.iloc[0]` raises IndexError if no annotation follows `pos`.
            next_window_start = df_lut.loc[df_lut["position"] > pos, "window_start"].iloc[0]
            # never let the window end before the annotation itself
            return max(pos, next_window_start)
        df_no_beat["window_end"] = df_no_beat["position"].transform(find_next_window_start)
        del df_normal_windows
        del df_lut
    else:
        df_no_beat["window_end"] = df_no_beat["position"]
    print(f"    {len(df_no_beat)} windows for missing beats ({','.join(ann_no_beat)})")
    ## merge
    df_windows = pd.concat([df_ext, df_svf, df_fibr, df_no_beat])
    df_windows.sort_index(inplace=True)
    print("  ...done.")

    # add labels based on anomaly windows
    print("  labeling")
    # Labels are collected in a plain NumPy array and assigned as a column
    # afterwards. This avoids writing through `Series.values` (not a writable
    # view under pandas Copy-on-Write), avoids building a full boolean mask per
    # window, and also works if there is no anomaly window at all.
    n_samples = len(df_record)
    is_anomaly = np.zeros(n_samples, dtype=np.int64)
    for t1, t2 in zip(df_windows["window_start"], df_windows["window_end"]):
        # a window with an undefined bound never matched any sample before either
        if pd.isna(t1) or pd.isna(t2):
            continue
        first = max(0, int(np.ceil(t1)))
        last = min(n_samples - 1, int(np.floor(t2)))
        if first <= last:
            # window bounds are inclusive
            is_anomaly[first:last + 1] = 1
    df_record["is_anomaly"] = is_anomaly

    # reconstruct timestamps and set as index
    print("  reconstructing timestamps")
    # sample number * sample interval in nanoseconds, counted from the Unix epoch
    df_record["timestamp"] = pd.to_datetime(df_record.index.values * 1e+9/record.fs, unit='ns')
    df_record = df_record.set_index("timestamp")
    df_record.to_csv(target)
    print(f"Dataset {os.path.basename(source_file)} transformed and saved!")

    # return dataset length
    return record.sig_len

In [5]:
# shared by all datasets
# Metadata that is identical for every dataset of this collection.
dataset_type = "real"
input_type = "multivariate"
datetime_index = True
train_type = "unsupervised"
train_is_normal = False

# Processed files are stored in <target_folder>/<input_type>/<collection name>.
dataset_subfolder = os.path.join(input_type, dataset_collection_name)
target_subfolder = os.path.join(target_folder, dataset_subfolder)
try:
    os.makedirs(target_subfolder)
except FileExistsError:
    print(f"Directories {target_subfolder} already exist")
else:
    print(f"Created directories {target_subfolder}")

# Dataset manager that keeps the metadata of all processed datasets.
dm = Datasets(target_folder)

Directories /home/projects/akita/data/benchmark-data/data-processed/multivariate/MITDB already exist


In [6]:
# dataset transformation
# Transformation applied to every record: (source record, target CSV path) -> dataset length.
transform_file: Callable[[str, str], int] = transform_and_label

for dataset_name in load_dataset_names():
    filename = f"{dataset_name}.test.csv"
    # location of the CSV relative to the target folder (stored in the metadata)
    path = os.path.join(dataset_subfolder, filename)
    target_filepath = os.path.join(target_subfolder, filename)
    # WFDB record path: intentionally no file suffix (.dat)
    source_file = os.path.join(source_folder, dataset_name)

    # transform the record, label it, and write the CSV file
    dataset_length = transform_file(source_file, target_filepath)
    print(f"Processed source dataset {source_file} -> {target_filepath}")

    # register the dataset; there is no training file, only a test file
    dm.add_dataset(
        (dataset_collection_name, dataset_name),
        train_path=None,
        test_path=path,
        dataset_type=dataset_type,
        datetime_index=datetime_index,
        split_at=None,
        train_type=train_type,
        train_is_normal=train_is_normal,
        input_type=input_type,
        dataset_length=dataset_length,
    )

# persist the metadata of all datasets registered above
dm.save()

Transforming 100
  record 100.dat loaded
  2273/2274 beat annotations for /home/projects/akita/data/benchmark-data/data-raw/MIT-BIH Arrhythmia DB/100 loaded (others were ignored)
  preparing windows for labeling...
    normal beat distance samples = 2204
    window size = 287
    window margins (left and right) = 143
    0 windows for fibrillation anomalies ([,!,])
    0 windows for external anomalies (Q,|)
    34 windows for anomalous beats (F,f,S,A,a,V,J,j,E,e)
    0 windows for missing beats (x)
  ...done.
  labeling
  reconstructing timestamps
Dataset 100 transformed and saved!
Processed source dataset /home/projects/akita/data/benchmark-data/data-raw/MIT-BIH Arrhythmia DB/100 -> /home/projects/akita/data/benchmark-data/data-processed/multivariate/MITDB/100.test.csv
Transforming 101
  record 101.dat loaded
  1869/1874 beat annotations for /home/projects/akita/data/benchmark-data/data-raw/MIT-BIH Arrhythmia DB/101 loaded (others were ignored)
  preparing windows for labeling...
    

In [8]:
# Refresh the dataset manager, then show only the entries of this collection
# (first index level) with all of their datasets (second index level).
dm.refresh()
dm.df().loc[pd.IndexSlice[dataset_collection_name:dataset_collection_name, :]]

Unnamed: 0_level_0,Unnamed: 1_level_0,train_path,test_path,dataset_type,datetime_index,split_at,train_type,train_is_normal,input_type,length
collection_name,dataset_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
MITDB,100,,multivariate/MITDB/100.test.csv,real,True,,unsupervised,False,multivariate,650000
MITDB,101,,multivariate/MITDB/101.test.csv,real,True,,unsupervised,False,multivariate,650000
MITDB,102,,multivariate/MITDB/102.test.csv,real,True,,unsupervised,False,multivariate,650000
MITDB,103,,multivariate/MITDB/103.test.csv,real,True,,unsupervised,False,multivariate,650000
MITDB,104,,multivariate/MITDB/104.test.csv,real,True,,unsupervised,False,multivariate,650000
MITDB,105,,multivariate/MITDB/105.test.csv,real,True,,unsupervised,False,multivariate,650000
MITDB,106,,multivariate/MITDB/106.test.csv,real,True,,unsupervised,False,multivariate,650000
MITDB,107,,multivariate/MITDB/107.test.csv,real,True,,unsupervised,False,multivariate,650000
MITDB,108,,multivariate/MITDB/108.test.csv,real,True,,unsupervised,False,multivariate,650000
MITDB,109,,multivariate/MITDB/109.test.csv,real,True,,unsupervised,False,multivariate,650000


In [9]:
# Load one processed dataset (record "207") through the dataset manager to inspect the result.
dm.get_dataset_df((dataset_collection_name, "207"))

Unnamed: 0,timestamp,MLII,V1,is_anomaly
0,1970-01-01 00:00:00.000000000,-0.215,0.095,0
1,1970-01-01 00:00:00.002777777,-0.215,0.095,0
2,1970-01-01 00:00:00.005555555,-0.215,0.095,0
3,1970-01-01 00:00:00.008333333,-0.215,0.095,0
4,1970-01-01 00:00:00.011111111,-0.215,0.095,0
...,...,...,...,...
649995,1970-01-01 00:30:05.541666666,-1.245,-0.540,0
649996,1970-01-01 00:30:05.544444444,-1.230,-0.525,0
649997,1970-01-01 00:30:05.547222222,-1.190,-0.465,0
649998,1970-01-01 00:30:05.550000000,-1.135,-0.400,0


## Dataset transformation walk-through

In [None]:
def print_obj_attr(obj, name="Object"):
    """Print `name`, then one `key value` line per instance attribute of `obj`, then an empty line."""
    print(name)
    for attr, value in vars(obj).items():
        print(attr, value)
    print("")
# names of all records of the collection, in the order of the RECORDS file
records = load_dataset_names()

### Load and parse dataset

In [None]:
# look up the list position of record "219" (records are addressed by position below)
records.index("219")

In [None]:
# dataset
# NOTE(review): 37 is presumably the position of record "219" found by the lookup above - confirm.
record = wfdb.rdrecord(os.path.join(source_folder, records[37]))
#print_obj_attr(record, "Record object")

# one column per signal, one row per sample; the default integer index is the sample number
df_record = pd.DataFrame(record.p_signal, columns=record.sig_name)
df_record

Add timestamp information based on sample interval ($$[fs] = samples/second$$):

In [None]:
# `record.fs` is the sampling frequency, so one sample spans 1e9/fs nanoseconds
display(Latex(f"Samples per second: $$fs = {record.fs} \\frac{{1}}{{s}}$$"))
display(Markdown(f"This gives a sample interval of {1e+9/record.fs} nanoseconds"))
# sample number * sample interval gives the offset from the Unix epoch in nanoseconds
df_record["timestamp"] = pd.to_datetime(df_record.index.values * 1e+9/record.fs, unit='ns')
df_record

### Load and parse annotations

In [None]:
# find all annotations
# Maps every annotation symbol to the names of the records in which it occurs.
annotations = {}
for r in records:
    atr = wfdb.rdann(os.path.join(source_folder, r), "atr")
    df_annotation = pd.DataFrame(atr.symbol, index=atr.sample, columns=["Label"])
    for an in df_annotation["Label"].unique():
        annotations.setdefault(an, set()).add(atr.record_name)

# Sets have no defined iteration order; sort the record names so that the
# displayed overview is reproducible between runs.
for an in annotations:
    annotations[an] = ", ".join(sorted(annotations[an]))
annotations

Annotations

| Annotation | Description |
| :--------- | :---------- |
|| **Considered normal** |
| `N` | Normal beat |
| `/` | Paced beat (normal beat if pacemaker is used) |
| `L` | Left bundle branch block beat (also normal?) |
| `R` | Right bundle branch block beat (also normal?)  |
|| **Anomalous beats** (use double-window labeling) |
| `F` | Fusion of ventricular and normal beat |
| `f` | Fusion of paced and normal beat |
| `S` | Supraventricular premature or ectopic beat |
| `A` | Atrial premature beat |
| `a` | Aberrated atrial premature beat |
| `V` | Premature ventricular contraction |
| `J` | Nodal (junctional) premature beat |
| `j` | Nodal (junctional) escape beat |
| `E` | Ventricular escape beat |
| `e` | Atrial escape beat |
|| **Anomaly from `x` until next beat window start** |
| `x` | Non-conducted P-wave (blocked APC) (no beat follows annotation) |
|| **Entire section of fibrillation is regarded anomalous** (a single window from `[` to `]`) |
| `[` | Start of ventricular flutter/fibrillation |
| `!` | Ventricular flutter wave |
| `]` | End of ventricular flutter/fibrillation |
|| **External anomalies** (single window labeling) |
| `Q` | Unclassifiable beat |
| `\|` | Isolated QRS-like artifact |
|| **Ignored, because they are hard to parse and to label** |
| `+` | Rhythm change |
| `~` | Change in signal quality (usually noise level changes) |
| `"` | Tape slippage (unknown; variable length) |

In [None]:
# Annotation symbols, grouped as in the table above.
# Beats considered normal.
ann_normal = ["N", "/", "L", "R"]
# Anomalous beats (double-window labeling).
ann_beat = ["F", "f", "S", "A", "a", "V", "J", "j", "E", "e"]
# Missing beat: anomalous from the annotation until the next beat window starts.
ann_no_beat = ["x"]
# Start and end marker of a ventricular flutter/fibrillation section.
ann_fibr_start = "["
ann_fibr_end = "]"
ann_fibr = [ann_fibr_start, "!", ann_fibr_end]
# External anomalies (single-window labeling).
ann_ext = ["Q", "|"]
# Annotations that are dropped before any processing.
ann_ignore = ["+", "~", '"']

In [None]:
# Read the annotations of the same record that was loaded into `record`.
atr = wfdb.rdann(os.path.join(source_folder, records[37]), "atr")
#print_obj_attr(atr, "Annotation object")
assert record.fs == atr.fs, "Sample frequency of records and annotations does not match!"

# One row per annotation: sample position and annotation symbol.
df_annotation = (
    pd.DataFrame(atr.symbol, index=atr.sample, columns=["Label"])
    .reset_index()
    .set_axis(["position", "label"], axis=1)
)
# number of annotations per symbol
df_annotation.groupby("label").count()

### Remove ignored annotations

In [None]:
# Drop all ignored symbols and renumber the remaining annotations from 0.
df_annotation = df_annotation[~df_annotation["label"].isin(ann_ignore)].reset_index(drop=True)
# remaining annotations per symbol
df_annotation.groupby("label").count()

### Calculate beat window

We assume that the normal beats (e.g. annotated with `N`) occur at regular intervals and that the expert annotations (from the dataset) are located exactly in the middle of a beat window.
A beat window is a fixed length subsequence of the time series and shows a heart beat in its direct (local) context.

We calculate the beat window length for each dataset based on the median distance between normal beats.
The index (autoincrementing integers) serves as the measurement unit.

We perform a shifted-by-one self-join and filter out all beat pairs that contain anomalous beats.
We want to calculate the beat windows only based on the normal beats.
We then calculate the distance between two neighboring heart beats:

In [None]:
# Pair every annotation with its predecessor (shifted-by-one self-join).
df_normal_beat = df_annotation.copy()
df_normal_beat["prev_position"] = df_annotation["position"].shift()
df_normal_beat["prev_label"] = df_annotation["label"].shift()
# Keep only pairs in which both beats are normal.
df_normal_beat = df_normal_beat[(df_normal_beat["label"].isin(ann_normal)) & (df_normal_beat["prev_label"].isin(ann_normal))]
df_normal_beat = df_normal_beat.drop(columns=["label", "prev_label"])
# Distance between the two beats of a pair, measured in samples.
df_normal_beat["length"] = df_normal_beat["position"] - df_normal_beat["prev_position"]
df_normal_beat.describe()

The median of all normal beat lengths is the beat window size.
We require the beat window size to be odd.
This allows us to center the window at the beat annotation.

In [None]:
normal_beat_length = df_normal_beat["length"].median()
# make the window size odd so that the window can be centered on the annotation
if (normal_beat_length%2) == 0:
    normal_beat_length += 1
beat_window_size = int(normal_beat_length)
# number of samples to the left and to the right of the window center
beat_window_margin = (beat_window_size - 1)//2
print(f"window size = {beat_window_size}\nwindow margins (left and right) = {beat_window_margin}")

### Calculate anomalous windows

The experts from PhysioNet annotated only the beats themselves with a label, but the actual anomaly also comprises the surroundings of the beat.

We assume that anomalous beats (such as `V` or `F`; see table above) require looking at a window around the actual beat as being anomalous.
External anomalies (such as `|`; see table above) also mark a window around it as anomalous, because those artefacts comprise multiple points.

We completely ignore `~`, `"`, and `+` annotations, which indicate signal quality or rhythm changes, because they are not relevant for our analysis.

We automatically label a variable-sized window around an annotated beat as an anomalous subsequence using the following technique:

1. For anomalous annotations (such as `S`, `V`, or `F` annotations):
   - Remove `"`, `~`, `+`, `[`, `]`, and `|` annotations
   - Calculate anomaly window using `beat_window_size` aligned with its center on the beat annotation.
   - Calculate end of previous beat window _e_ and beginning of next beat window _b_.
     Use _e_ as beginning and _b_ as end for a second anomaly window.
   - Mark the union of both anomaly windows' points as anomalous.
2. For `|` and `Q` annotations, mark all points of an anomaly window centered on the annotation as anomalous.
3. For `[`, `!`, and `]` annotations, mark all points within the region from `[` until `]` as anomalous.
4. For `x` annotations, mark the annotated and all following points until the beginning of the next beat window as anomalous.
5. Mark all other points as normal.

> **Explain, why we used the combined windows for anomalous beats!!**
>
> - pattern/shape of signal may be ok
> - but we consider distance to other beats also
> - if too narrow or too far away, it's also anomalous

The figure shows an anomalous beat with its anomaly window (in red) and the windows of its previous and subsequent normal beats (in green).
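We mark all points in the interval $$[\min(W_{end}, X_{start}), \max(X_{end}, Y_{start})]$$ as anomalous, where $X$ is the window of the anomalous beat and $W$ and $Y$ are the windows of the previous and the next beat, respectively.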

In [None]:
# annotation index of the first fibrillation start marker
# (`.iloc[0]` raises an IndexError if the record contains no such marker)
name = df_annotation[df_annotation["label"] == ann_fibr_start].iloc[0].name
# show all annotations from that marker onwards
df_annotation[df_annotation.index >= name]

In [None]:
# reverse lookup from timestamp to annotation index in df_beat
p = df_record[df_record["timestamp"] == "1970-01-01 00:11:03.000"].index.values[0]
df_annotation[df_annotation["position"] >= p].index[0]

In [None]:
def plot_window(pos, color="blue", **kvs):
    """Shade the beat window centered at sample `pos` as a vertical span.

    The span reaches `beat_window_margin` samples to both sides of `pos`;
    additional keyword arguments are passed on to `plt.axvspan`.
    """
    plt.axvspan(
        pos - beat_window_margin,
        pos + beat_window_margin,
        color=color,
        alpha=0.5,
        **kvs,
    )


# annotation index of the beat to visualize
index = 39

beat_n = df_annotation.loc[index, "position"]
print("Selected beat is annotated as", df_annotation.loc[index, "label"])
print("with timestamp", df_record.loc[beat_n, "timestamp"])
# plot 1000 samples to both sides of the selected beat
ax = df_record.iloc[beat_n-1000:beat_n+1000].plot(kind='line', y=["MLII", "V1"], use_index=True, figsize=(20,10))
# beat windows of the previous (W), the selected (X), and the next (Y) annotation
plot_window(df_annotation.loc[index-1, "position"], label="$W$")
plot_window(beat_n, color="orange", label="$X$")
plot_window(df_annotation.loc[index+1, "position"], label="$Y$")

# write the annotation symbols of all annotations in the plotted range below the signal
labels = df_annotation[(df_annotation["position"] > beat_n-1000) & (df_annotation["position"] < beat_n+1000)]
for i, (position, label) in labels.iterrows():
    plt.text(position, -1.2, label)
plt.legend()
plt.show()

#### Windows for fibrillation

In [None]:
# we only need start and end marked with `[` and `]` respectively
s_fibr_start = df_annotation.loc[df_annotation["label"] == ann_fibr_start, "position"]
# remember the annotation index of the start markers; it becomes the index of the windows
s_index = s_fibr_start.index
s_fibr_start = s_fibr_start.reset_index(drop=True)

s_fibr_end = df_annotation.loc[df_annotation["label"] == ann_fibr_end, "position"]
s_fibr_end = s_fibr_end.reset_index(drop=True)

# starts and ends are paired by their ordinal position (i-th `[` with i-th `]`)
df_fibr = pd.DataFrame({"index": s_index, "window_start": s_fibr_start, "window_end": s_fibr_end})
df_fibr = df_fibr.set_index("index")
# the window is anchored at the start marker
df_fibr["position"] = df_fibr["window_start"]
df_fibr

#### Windows for external anomalies

In [None]:
# One beat window centered on every external anomaly annotation. The window is
# clipped to the valid sample range of the record, as in `transform_and_label`.
df_ext = df_annotation[df_annotation["label"].isin(ann_ext)].copy()
df_ext["window_start"] = np.maximum(0, df_ext["position"]-beat_window_margin)
df_ext["window_end"] = np.minimum(record.sig_len - 1, df_ext["position"]+beat_window_margin)
df_ext = df_ext[["position", "window_start", "window_end"]]
df_ext.head()

#### Windows for anomalous beats

In [None]:
# exclude additional non-beat annotations
df_tmp = df_annotation[~df_annotation["label"].isin(["|", ann_fibr_start, ann_fibr_end])].copy()
df_tmp["position_next"] = df_tmp["position"].shift(-1)
df_tmp["position_prev"] = df_tmp["position"].shift(1)
#df_tmp = df_tmp[(df_tmp["position_prev"].notnull()) & (df_tmp["position_next"].notnull())]
df_tmp = df_tmp[df_tmp["label"].isin(ann_beat)]
df_tmp["window_start"] = np.minimum(df_tmp["position"].values-beat_window_margin, df_tmp["position_prev"].values+beat_window_margin)
df_tmp["window_end"] = np.maximum(df_tmp["position"].values+beat_window_margin, df_tmp["position_next"].values-beat_window_margin)
df_svf = df_tmp[["position", "window_start", "window_end"]]
df_tmp.groupby("label").count()

#### Windows for missing beats (such as `x` annotation)

In [None]:
# Beat windows of all normal beats, clipped to the valid sample range of the
# record as in `transform_and_label`.
df_normal_windows = df_annotation[df_annotation["label"].isin(ann_normal)].copy()
df_normal_windows = df_normal_windows.drop(columns=["label"])
df_normal_windows["window_start"] = np.maximum(0, df_normal_windows["position"]-beat_window_margin)
df_normal_windows["window_end"] = np.minimum(record.sig_len - 1, df_normal_windows["position"]+beat_window_margin)

# Lookup table: every annotation that is not a missing beat, together with its
# window (if one was calculated for its position).
df_lut = df_annotation[~df_annotation["label"].isin(ann_no_beat)].merge(pd.concat([df_ext, df_svf, df_fibr, df_normal_windows]), on="position", how="left")
df_lut

In [None]:
def find_next_window_start(pos: int):
    # window start of the first annotation after `pos`
    # (`.iloc[0]` raises an IndexError if no annotation follows `pos`)
    next_window_start = df_lut.loc[df_lut["position"] > pos, "window_start"].iloc[0]
    # the window must not end before the annotation itself
    return max(pos, next_window_start)

# a missing-beat window starts at the annotation and ends where the next window starts
df_no_beat = df_annotation[df_annotation["label"].isin(ann_no_beat)].drop(columns=["label"]).copy()
df_no_beat["window_start"] = df_no_beat["position"]
df_no_beat["window_end"] = df_no_beat["position"].transform(find_next_window_start)
df_no_beat.head()

#### Merge everything together

In [None]:
# all anomaly windows in one frame, ordered by annotation index
df_windows = pd.concat([df_ext, df_svf, df_fibr, df_no_beat])
df_windows.sort_index(inplace=True)
df_windows

In [None]:
# annotation index of the anomaly window to visualize
index = 798

# positions are used for positional slicing below and therefore must be integers
beat = int(df_windows.loc[index, "position"])
start = df_windows.loc[index, "window_start"]
end = df_windows.loc[index, "window_end"]
# `df_windows` shares its index with `df_annotation`, so the label is looked up there
print("Selected beat is annotated as", df_annotation.loc[index, "label"])
print("with timestamp", df_record.loc[beat, "timestamp"])
# plot all signals of the record, 500 samples to both sides of the beat
ax = df_record.iloc[beat-500:beat+500].plot(kind='line', y=list(record.sig_name), use_index=True, figsize=(20,10))
# upper half: resulting normal/anomalous regions
plt.axvspan(beat-500, start-1, color="green", alpha=0.5, label="normal region 1", ymin=.5)
plt.axvspan(start, end, color="red", alpha=0.5, label="anomalous region", ymin=.5)
plt.axvspan(end+1, beat+500, color="green", alpha=0.5, label="normal region 2", ymin=.5)
# lower half: beat windows of the previous (W), the selected (X), and the next (Y) annotation
# NOTE(review): neighbors are taken from `df_annotation`, which may still contain
# `|`, `[`, and `]` annotations that the window calculation skips.
plot_window(df_annotation.loc[index-1, "position"], label="$W$", ymax=.5)
plot_window(beat, color="orange", label="$X$", ymax=.5)
plot_window(df_annotation.loc[index+1, "position"], label="$Y$", ymax=.5)
plt.legend()
plt.show()

### Add labels

In [None]:
df = df_record.copy()

# Collect the labels in a plain NumPy array and assign it as a column
# afterwards. Writing through `df["is_anomaly"].values` relies on `.values`
# being a writable view of the column, which is not the case under pandas
# Copy-on-Write.
is_anomaly = np.zeros(len(df), dtype=np.int64)
for t1, t2 in zip(df_windows["window_start"], df_windows["window_end"]):
    # windows with an undefined bound do not match any sample
    if pd.isna(t1) or pd.isna(t2):
        continue
    first = max(0, int(np.ceil(t1)))
    last = min(len(df) - 1, int(np.floor(t2)))
    if first <= last:
        # window bounds are inclusive
        is_anomaly[first:last + 1] = 1
df["is_anomaly"] = is_anomaly

#df = df.set_index("timestamp")
df[df["is_anomaly"] == 1]

In [None]:
# sample number around which the labeled signal is shown
index = 370176
snippet_size = 1500

# clamp the snippet to the valid sample range
start = max(0, index - snippet_size//2)
end = min(len(df), index + snippet_size//2)
# label-based slicing: both bounds are inclusive
df_show = df.loc[start:end]
df_show.plot(kind='line', y=["MLII", "V1", "is_anomaly"], use_index=True, figsize=(20,10))

# write the annotation symbols of all annotations in the snippet below the signal
labels = df_annotation[(df_annotation["position"] >= start) & (df_annotation["position"] <= end)]
for i, (position, label) in labels.iterrows():
    plt.text(position, -2, label)
plt.legend()
plt.show()

## Experimentation

In [None]:
# NOTE(review): this joins on the row index, but the index of `df_annotation` is the
# annotation number (after `reset_index`), not the sample position - presumably this
# cell predates that change; confirm before relying on the result.
df = pd.merge(df_record, df_annotation, left_index=True, right_index=True, how="outer")
#df = df.fillna(value={"Label": ".", "is_anomaly": 0})
# NOTE(review): neither input frame gets an `is_anomaly` column in the walk-through above - verify.
df.groupby(["is_anomaly"]).count()

In [None]:
# NOTE(review): the annotation column is named "label" (lowercase) in the cells above;
# "Label" looks like a leftover from an earlier version - confirm.
df[df["Label"].notna()]

In [None]:
import matplotlib.pyplot as plt
# show the samples 27000 to 28000 (label-based slice, both bounds inclusive)
df_show = df.loc[27000:28000]
# NOTE(review): the record loaded above is plotted with the columns "MLII" and "V1" elsewhere;
# 'ECG1'/'ECG2' presumably belong to a different dataset - confirm.
df_show.plot(kind='line', y=['ECG1', 'ECG2', 'is_anomaly'], use_index=True, figsize=(20,10))
plt.show()

In [None]:
df = pd.read_csv(os.path.join(dataset_subfolder, "800.test.csv"), index_col="timestamp")
df.loc["1970-01-01 00:21:20":"1970-01-01 00:21:40"].plot(figsize=(20,10))
plt.show()