# Libraries

In [None]:
import matplotlib.pyplot as plt
import pandas 		     as pd


# Metadata

In [None]:
# Directory that holds the dataset files; exams.csv is read from here.
dataFolder = "../data/CODE15"
# Value of the trace_file column used to select the subset analysed in the
# "Exploratory data analysis of subset" section.
traceFile  = "exams_part1.hdf5"

# Data dictionary

1. **exam_id**: id used for identifying the exam;
1. **age**: patient age in years at the moment of the exam;
1. **is_male**: true if the patient is male;
1. **nn_predicted_age**: age predicted for the patient by a neural network, as described in the paper "Deep neural network estimated electrocardiographic-age as a mortality predictor" below.
1. **1dAVb**: Whether or not the patient has 1st degree AV block;
1. **RBBB**: Whether or not the patient has right bundle branch block;
1. **LBBB**: Whether or not the patient has left bundle branch block;
1. **SB**: Whether or not the patient has sinus bradycardia;
1. **AF**: Whether or not the patient has atrial fibrillation;
1. **ST**: Whether or not the patient has sinus tachycardia;
1. **patient_id**: id used for identifying the patient;
1. **normal_ecg**: True if the automatic annotation system says it is a normal ECG;
1. **death**: true if the patient dies in the follow-up time. This data is available only in the first exam of the patient. Other exams will have this as an empty field;
1. **timey**: if the patient dies it is the time to the death of the patient. If not, it is the follow-up time. This data is available only in the first exam of the patient. Other exams will have this as an empty field;
1. **trace_file**: identifies the hdf5 file in which the record corresponding to this patient is located.


available at: https://zenodo.org/records/4916206

# Load dataset

In [None]:
# Read exams.csv from dataFolder into a DataFrame.
exams = pd.read_csv(f"{ dataFolder }/exams.csv")

In [None]:
# Preview the first rows of the loaded table.
exams.head()

In [None]:
# (number of rows, number of columns) of the loaded table.
exams.shape

# Preprocessing the dataset

In [None]:
# Remove the predicted-age and follow-up columns from the table.
removedColumns = ['nn_predicted_age', 'death', 'timey']
exams = exams.drop(columns = removedColumns)

In [None]:
# Preview the table again, after the column removal.
exams.head()

# Exploratory data analysis

Number of unique patients

In [None]:
# Shape of the array of distinct patient_id values, i.e. (unique patients,).
exams["patient_id"].unique().shape

Proportion of women and men

In [None]:
# Bar chart of the percentage of exams per is_male value; each bar is
# annotated with the absolute number of exams it represents.
plt.title("Proportion of men and women")

# value_counts() sorts by frequency, not by value, so its order cannot be
# paired with fixed labels. Pin one explicit order and derive the bar labels,
# heights and annotations from it so they always describe the same group.
sexLabels = {True: "Men", False: "Women"}

# fill_value = 0 keeps the chart working when one of the values is absent.
isMaleValueCounts = exams["is_male"] \
    .value_counts() \
    .reindex(list(sexLabels), fill_value = 0)

bars = plt.bar(
    x         = list(sexLabels.values()),
    # Percentage over the non-null is_male entries.
    height    = isMaleValueCounts * 100 / exams["is_male"].count(),
    color     = ["orange", "limegreen"],
    edgecolor = "black"
)

# Write the absolute count centred on top of each bar.
for count, bar in zip(isMaleValueCounts, bars):
    plt.text(
        x  = bar.get_x() + bar.get_width() / 2,
        y  = bar.get_height(),
        s  = count,
        ha = 'center',
        va = 'bottom'
    )


plt.xlabel("Sex")
plt.ylabel("Proportion (%)")

plt.tight_layout()
plt.show()

plt.close()

Disease proportion

In [None]:
# Columns of the exams table that are summed and plotted as diseases.
diseases = ['1dAVb', 'RBBB', 'LBBB', 'SB', 'ST', 'AF']

In [None]:
# Bar chart of each disease column's sum as a percentage of the exams that
# have a patient_id; every bar is annotated with the column sum itself.
diseasesSum = exams[diseases].sum()
examCount = exams['patient_id'].count()
diseasePercentages = diseasesSum * 100 / examCount

plt.title("Proportion of disease")
plt.xlabel("Disease")
plt.ylabel("Proportion (%)")

bars = plt.bar(
    diseases,
    diseasePercentages,
    color = plt.cm.tab20.colors,
    edgecolor = "black"
)

# diseasesSum is indexed in the same order as `diseases`, so the bars and the
# (name, total) pairs line up one to one.
for (disease, total), bar in zip(diseasesSum.items(), bars):
    barCentre = bar.get_x() + bar.get_width() / 2
    plt.text(barCentre, bar.get_height(), total, ha = 'center', va = 'bottom')

plt.tight_layout()
plt.show()

plt.close()

Proportion of Normal ECG

In [None]:
# Bar chart of the percentage of exams per normal_ecg value; each bar is
# annotated with the absolute number of exams it represents.
plt.title("Proportion of normal ECG")

# value_counts() sorts by frequency, not by value, so its order cannot be
# paired with fixed labels. Pin one explicit order and derive the bar labels,
# heights and annotations from it so they always describe the same group.
normalLabels = {False: "No", True: "Yes"}

# fill_value = 0 keeps the chart working when one of the values is absent.
isNormalValueCounts = exams["normal_ecg"] \
    .value_counts() \
    .reindex(list(normalLabels), fill_value = 0)

bars = plt.bar(
    x         = list(normalLabels.values()),
    # Percentage over the non-null normal_ecg entries.
    height    = isNormalValueCounts * 100 / exams["normal_ecg"].count(),
    color     = ["orange", "limegreen"],
    edgecolor = "black"
)

# Write the absolute count centred on top of each bar.
for count, bar in zip(isNormalValueCounts, bars):
    plt.text(
        x  = bar.get_x() + bar.get_width() / 2,
        y  = bar.get_height(),
        s  = count,
        ha = 'center',
        va = 'bottom'
    )

plt.xlabel("Is normal?")
plt.ylabel("Proportion (%)")

plt.tight_layout()
plt.show()

plt.close()

Histogram of ages at first examination

In [None]:
# Keep one row per patient: the first row found for each patient_id, in the
# table's current order. The patient_id column itself is then dropped.
firstExamPerPatient = exams.drop_duplicates(subset = 'patient_id', keep = 'first')
uniquePacientsExams = firstExamPerPatient.drop(columns = 'patient_id')

uniquePacientsExams.head()

In [None]:
# Age column of the one-row-per-patient table; input of the age histograms.
uniquePacientsAges = uniquePacientsExams['age']

In [None]:
# Two stacked histograms of patient age sharing the x axis: the percentage of
# patients per age bin (top) and the cumulative percentage (bottom).
figure, axes = plt.subplots(
    nrows  = 2,
    ncols  = 1,
    sharex = True
)

figure.suptitle("Histogram of ages at first examination")

# density = True would plot probability density (fraction per unit of age),
# which does not match the "(%)" axis labels. Giving every patient a weight of
# 100 / N makes each bar height the percentage of patients in that bin.
ages = uniquePacientsAges.dropna()
percentWeights = [100 / max(len(ages), 1)] * len(ages)

axes[0].hist(
    x       = ages,
    bins    = 25,
    weights = percentWeights
)
# Same bins, accumulated: the last bar reaches 100 %.
axes[1].hist(
    x          = ages,
    bins       = 25,
    weights    = percentWeights,
    cumulative = True
)

axes[1].set_xlabel("Age")

axes[0].set_ylabel("Frequency (%)")
axes[1].set_ylabel("Cumulative frequency (%)")

plt.tight_layout()
plt.show()

plt.close()

# Exploratory data analysis of subset

The subset

In [None]:
# Keep only the exams whose trace_file column equals traceFile.
isInTraceFile = exams['trace_file'] == traceFile
examsSubset = exams[isInTraceFile]

Number of unique patients

In [None]:
# Shape of the array of distinct patient_id values in the subset,
# i.e. (unique patients,).
examsSubset["patient_id"].unique().shape

Proportion of women and men

In [None]:
# Bar chart of the percentage of subset exams per is_male value; each bar is
# annotated with the absolute number of exams it represents.
plt.title("Proportion of men and women")

# value_counts() sorts by frequency, not by value, so its order cannot be
# paired with fixed labels. Pin one explicit order and derive the bar labels,
# heights and annotations from it so they always describe the same group.
sexLabels = {True: "Men", False: "Women"}

# fill_value = 0 keeps the chart working when one of the values is absent.
isMaleValueCounts = examsSubset["is_male"] \
    .value_counts() \
    .reindex(list(sexLabels), fill_value = 0)

bars = plt.bar(
    x         = list(sexLabels.values()),
    # Percentage over the non-null is_male entries of the subset.
    height    = isMaleValueCounts * 100 / examsSubset["is_male"].count(),
    color     = ["orange", "limegreen"],
    edgecolor = "black"
)

# Write the absolute count centred on top of each bar.
for count, bar in zip(isMaleValueCounts, bars):
    plt.text(
        x  = bar.get_x() + bar.get_width() / 2,
        y  = bar.get_height(),
        s  = count,
        ha = 'center',
        va = 'bottom'
    )


plt.xlabel("Sex")
plt.ylabel("Proportion (%)")

plt.tight_layout()
plt.show()

plt.close()

Disease proportion

In [None]:
# Columns of the subset that are summed and plotted as diseases.
diseases = ['1dAVb', 'RBBB', 'LBBB', 'SB', 'ST', 'AF']

In [None]:
# Bar chart of each disease column's sum as a percentage of the subset exams
# that have a patient_id; every bar is annotated with the column sum itself.
diseasesSum = examsSubset[diseases].sum()
examCount = examsSubset['patient_id'].count()
diseasePercentages = diseasesSum * 100 / examCount

plt.title("Proportion of disease")
plt.xlabel("Disease")
plt.ylabel("Proportion (%)")

bars = plt.bar(
    diseases,
    diseasePercentages,
    color = plt.cm.tab20.colors,
    edgecolor = "black"
)

# diseasesSum is indexed in the same order as `diseases`, so the bars and the
# (name, total) pairs line up one to one.
for (disease, total), bar in zip(diseasesSum.items(), bars):
    barCentre = bar.get_x() + bar.get_width() / 2
    plt.text(barCentre, bar.get_height(), total, ha = 'center', va = 'bottom')

plt.tight_layout()
plt.show()

plt.close()

Proportion of Normal ECG

In [None]:
# Bar chart of the percentage of subset exams per normal_ecg value; each bar
# is annotated with the absolute number of exams it represents.
plt.title("Proportion of normal ECG")

# value_counts() sorts by frequency, not by value, so its order cannot be
# paired with fixed labels. Pin one explicit order and derive the bar labels,
# heights and annotations from it so they always describe the same group.
normalLabels = {False: "No", True: "Yes"}

# fill_value = 0 keeps the chart working when one of the values is absent.
isNormalValueCounts = examsSubset["normal_ecg"] \
    .value_counts() \
    .reindex(list(normalLabels), fill_value = 0)

bars = plt.bar(
    x         = list(normalLabels.values()),
    # Percentage over the non-null normal_ecg entries of the subset.
    height    = isNormalValueCounts * 100 / examsSubset["normal_ecg"].count(),
    color     = ["orange", "limegreen"],
    edgecolor = "black"
)

# Write the absolute count centred on top of each bar.
for count, bar in zip(isNormalValueCounts, bars):
    plt.text(
        x  = bar.get_x() + bar.get_width() / 2,
        y  = bar.get_height(),
        s  = count,
        ha = 'center',
        va = 'bottom'
    )


plt.xlabel("Is normal?")
plt.ylabel("Proportion (%)")

plt.tight_layout()
plt.show()

plt.close()

Histogram of ages at first examination

In [None]:
# Keep one row per patient of the subset: the first row found for each
# patient_id, in the table's current order. The patient_id column itself is
# then dropped.
firstExamPerPatient = examsSubset.drop_duplicates(subset = 'patient_id', keep = 'first')
uniquePacientsExams = firstExamPerPatient.drop(columns = 'patient_id')

uniquePacientsExams.head()

In [None]:
# Age column of the subset's one-row-per-patient table; input of the age
# histograms.
uniquePacientsAges = uniquePacientsExams['age']

In [None]:
# Two stacked histograms of patient age in the subset, sharing the x axis:
# the percentage of patients per age bin (top) and the cumulative percentage
# (bottom).
figure, axes = plt.subplots(
    nrows  = 2,
    ncols  = 1,
    sharex = True
)

figure.suptitle("Histogram of ages at first examination")

# density = True would plot probability density (fraction per unit of age),
# which does not match the "(%)" axis labels. Giving every patient a weight of
# 100 / N makes each bar height the percentage of patients in that bin.
ages = uniquePacientsAges.dropna()
percentWeights = [100 / max(len(ages), 1)] * len(ages)

axes[0].hist(
    x       = ages,
    bins    = 25,
    weights = percentWeights
)
# Same bins, accumulated: the last bar reaches 100 %.
axes[1].hist(
    x          = ages,
    bins       = 25,
    weights    = percentWeights,
    cumulative = True
)

axes[1].set_xlabel("Age")

axes[0].set_ylabel("Frequency (%)")
axes[1].set_ylabel("Cumulative frequency (%)")

plt.tight_layout()
plt.show()

plt.close()