# ECG-ViEW II Data Processing

Data processing from the paper "Explainable Prediction of Acute Myocardial Infarction using Machine Learning and Shapley Values".

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import pylab as plt
import imblearn
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
import seaborn as sns
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler

# Set the global default size of matplotlib figures
plt.rc('figure', figsize=(10, 5))

# Size of matplotlib histogram bins
bin_size = 40

Using TensorFlow backend.


## 1. ECG Data Processing

In [2]:
# Read the raw ECG measurement table. The relative path is resolved against the
# kernel's working directory, so the CSV has to be present there.
df_ECG = pd.read_csv(filepath_or_buffer='Electrocardiogram.csv')
df_ECG.head(n=5)

FileNotFoundError: [Errno 2] File Electrocardiogram.csv does not exist: 'Electrocardiogram.csv'

In [None]:
df_ECG.shape

In [None]:
df_ECG.info()

In [None]:
df_ECG.describe()

In [None]:
df_ECG.isnull().sum()

In [None]:
# Keep only the ECG rows in which every one of these measurements is present
required_ecg_cols = ['RR', 'PR', 'QRS', 'P_wave_axis', 'QRS_axis', 'T_wave_axis']
df_ECG_full = df_ECG.dropna(subset=required_ecg_cols)

In [None]:
df_ECG_full.isnull().sum()

In [None]:
df_ECG_full.shape

In [None]:
# Discard the ECG date, source and department columns (irrelevant for this analysis)
irrelevant_ecg_cols = ['ecgdate', 'ecgsource', 'ecgdept']
df_ECG_full = df_ECG_full.drop(irrelevant_ecg_cols, axis=1)

In [None]:
df_ECG_full.head()

In [None]:
df_ECG_full['personid'].value_counts();

## 2. Diagnosis Code + Personal Info (Sex)

In [None]:
# Read the diagnosis table, decoding the file as Latin-1; the relative path is
# resolved against the kernel's working directory.
df_diag = pd.read_csv(filepath_or_buffer='Diagnosis.csv', encoding='latin-1')
df_diag.head(n=5)

In [None]:
df_diag.shape

In [None]:
# Collapse the diagnosis table to one row per person: rows are ordered by
# 'diagdate' and, for each 'personid', only the last one (the greatest
# 'diagdate') is kept. Every other diagnosis of that person is discarded here.
# NOTE(review): the ordering is only chronological if 'diagdate' sorts
# chronologically in the dtype read_csv gave it (e.g. ISO-formatted strings) —
# confirm against the CSV.
# NOTE(review): any label derived later from this frame reflects only the
# retained (latest) diagnosis of each person — confirm this is intended.
df_diag_two = (
    df_diag
    .sort_values('diagdate')
    .drop_duplicates(subset='personid', keep='last')
)
# The original cell also computed a sort by 'personid' whose result was
# neither stored nor displayed; that dead statement has been removed.

In [None]:
df_diag_two['personid'].value_counts();

In [None]:
df_diag_two.shape

In [None]:
# Read the personal-information table; the relative path is resolved against
# the kernel's working directory.
df_person = pd.read_csv(filepath_or_buffer='Person.csv')
df_person.head(n=5)

In [None]:
df_person.shape

In [None]:
# Attach each person's personal-info row to their retained diagnosis row.
# Inner join on 'personid': people missing from either table are dropped.
df_diag_person = df_diag_two.merge(df_person, how='inner', on='personid')

In [None]:
df_diag_person.head()

In [None]:
df_diag_person.shape

In [None]:
df_diag_person.isnull().sum()

In [None]:
####################### Create a new feature based on the diagnosis code: MI (1 = MI, 0 = no) ################

# Local diagnosis codes that are labelled as MI
code = ['DC13972','DC3641','DC2108','DC8232','DC910','DC7547',
        'DC2624','DC2485']

# Boolean mask: True where the row's local diagnosis code is one of the codes above
condition = df_diag_person['diaglocalcode'].isin(code)

# Store the label as integers 1/0 (previously the strings '1'/'0'), so the
# column is numeric and does not force an object-dtype array when the frame
# is later converted with `.values`.
df_diag_person['MI'] = condition.astype(int)
df_diag_person.head()

In [None]:
df_diag_person.shape

In [None]:
# Check the number of entries of a specific diagnostic code 
df_diag_person['diaglocalcode'][df_diag_person['diaglocalcode'] == "DC14546"].value_counts()

In [None]:
df_diag_person['diaglocalcode'].value_counts();

In [None]:
df_diag_person['MI'].value_counts()

In [None]:
df_diag_person['MI'].value_counts().plot(kind='bar', title='MI Count')

In [None]:
df_diag_person.head()

## 3. Merge To Obtain The Final Dataset 

In [None]:
# Join every retained ECG record to the diagnosis/personal-info row of the same
# person. Inner join on 'personid': rows without a partner on the other side
# are dropped.
df_1 = df_ECG_full.merge(df_diag_person, how='inner', on='personid')
df_1.head(n=5)

In [None]:
df_1.shape

In [None]:
# Remove the diagnosis bookkeeping columns, the person identifier and ethnicity;
# the columns that remain are what the later cells scale and resample.
columns_to_remove = ['diagdate', 'diagcode', 'diaglocalcode', 'diagdept', 'personid', 'ethnicity']
df_2 = df_1.drop(columns_to_remove, axis=1)

# Remember the remaining column labels
col_names = df_2.columns
df_2.head(n=5)

In [None]:
df_2['Birthyeargroup'].value_counts();

In [None]:
df_2.shape

In [None]:
df_2['MI'].value_counts()

In [None]:
df_2['MI'].value_counts().plot(kind='bar', title='MI Count')

## 4. Robust Scaling & SMOTE

In [None]:
# Materialise the whole merged table (all remaining columns, including 'MI')
# as a NumPy array
train_data = df_2.to_numpy()

In [None]:
# Separate the features from the 'MI' label by column name rather than by
# hard-coded positions, so the label is never fed to the scaler.
# Previously the scaler was fitted on the full array, label included, and the
# label was read back out of the scaled result as floats.
feature_columns = [name for name in df_2.columns if name != 'MI']
feature_matrix = df_2[feature_columns].to_numpy(dtype=float)

# Defining the scaler (fitted on the feature columns only)
robust_scaler = RobustScaler().fit(feature_matrix)

# Scaled features, one column per entry of feature_columns
# NOTE(review): the scaler is fitted on every row, before any train/test split,
# so its statistics also depend on rows that later end up in the test set.
train_features_robust = robust_scaler.transform(feature_matrix)

# 'MI' column values as plain 0/1 integers, untouched by the scaler
# (astype(int) also accepts the labels if they are stored as '0'/'1' strings)
train_target_robust = df_2['MI'].astype(int).to_numpy()

# Kept so existing references keep working: scaled features followed by the
# unscaled label as the final column.
train_data_robust = np.column_stack([train_features_robust, train_target_robust])

In [None]:
# Rebalance the classes in two steps:
#   1. SMOTE oversamples the minority class up to a 0.25 minority/majority ratio
#   2. random undersampling then shrinks the majority class to a 0.5 ratio
# Both samplers draw random numbers, so they are seeded (with the same seed the
# train/test split uses) to make the resampled dataset reproducible on re-run.
over = SMOTE(sampling_strategy=0.25, random_state=0)
under = RandomUnderSampler(sampling_strategy=0.5, random_state=0)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)

# Transform the dataset
# NOTE(review): this resamples the complete dataset; the train/test split only
# happens afterwards, so the held-out rows will include SMOTE-generated
# samples. Consider splitting first and resampling the training part only.
x_resampled, y_resampled = pipeline.fit_resample(train_features_robust, train_target_robust)

In [None]:
# Hold out 20% of the resampled rows as the test set and keep 80% for training.
# Rows are shuffled with a fixed seed, so the partition is repeatable for the
# same input. The inputs are the already-resampled arrays.
x_train, x_test, y_train, y_test = train_test_split(
    x_resampled, y_resampled,
    test_size=0.20, shuffle=True, random_state=0,
)

In [None]:
# Export the training split as a CSV: the scaled feature columns followed by
# the 'MI' label.
# The original call `pd.DataFrame(data=x_train,y_train)` was a SyntaxError
# (positional argument after a keyword argument), so this cell could not run.
# NOTE(review): writing the label as a trailing 'MI' column is the assumed
# intent of passing y_train — confirm.
feature_names = [name for name in col_names if name != 'MI']
df_x_train = pd.DataFrame(data=x_train, columns=feature_names)
df_x_train['MI'] = y_train
df_x_train.to_csv('train_justcheckin.csv', index=False)

# TODO: the test split (x_test / y_test) is not exported. The commented-out
# export code that used to follow referred to cnn_* variables that are not
# defined in this notebook and has been removed.