In [2]:
import os
import wfdb
import pandas as pd

data_dir = 'mit-bih-arrhythmia-database-1.0.0'
record_names = [f.split('.')[0] for f in os.listdir(data_dir) if f.endswith('.dat')]
record_names = list(set(record_names))

all_data = []

for record in record_names:
    rec = wfdb.rdrecord(os.path.join(data_dir, record))
    signals = rec.p_signal
    leads = rec.sig_name
    try:
        ann = wfdb.rdann(os.path.join(data_dir, record), 'atr')
        ann_samples = ann.sample
        ann_symbols = ann.symbol
    except Exception:
        ann_samples = []
        ann_symbols = []
    df = pd.DataFrame(signals, columns=leads)
    df['sample'] = df.index
    df['annotation'] = ''
    df.loc[ann_samples, 'annotation'] = ann_symbols
    df['record'] = record  # Add record identifier
    all_data.append(df)

# Concatenate all records
combined_df = pd.concat(all_data, ignore_index=True)
combined_df.to_csv('mitdb_all_patients.csv', index=False)


In [3]:
df = pd.read_csv('mitdb_all_patients.csv')
df.head(10)

Unnamed: 0,V5,V2,sample,annotation,record,MLII,V1,V4
0,-0.2,0.005,0,,102,,,
1,-0.2,0.005,1,,102,,,
2,-0.2,0.005,2,,102,,,
3,-0.2,0.005,3,,102,,,
4,-0.2,0.005,4,,102,,,
5,-0.2,0.005,5,,102,,,
6,-0.2,0.005,6,,102,,,
7,-0.2,0.005,7,,102,,,
8,-0.19,0.005,8,,102,,,
9,-0.175,0.005,9,,102,,,


In [41]:
# print unique number of records
print(f"Unique records: {df['record'].nunique()}")

# print value counts of annotations
print(f"Annotation Counts:\n{df['annotation'].value_counts()}")

Unique records: 48
Annotation Counts:
annotation
N    75052
L     8075
R     7259
V     7130
/     7028
A     2546
+     1291
f      982
F      803
~      616
!      472
"      437
j      229
x      193
a      150
|      132
E      106
J       83
Q       33
e       16
[        6
]        6
S        2
Name: count, dtype: int64


In [8]:
# count the nulls for each column
df.isnull().sum()

V5            27950000
V2            28600000
sample               0
annotation    31087353
record               0
MLII           1300000
V1             5200000
V4            30550000
dtype: int64