# Atrial Fibrillation Project - Data Pre-processing (Raw Data)

Student Name: Michael (s2767708) & Pedro da Silva (s2799057)
Group No: 123

In [1]:
# Import libraries
import pandas as pd
pd.options.display.max_columns = None
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
import numpy as np
from scipy import stats
import os
import glob
import datetime
import sweetviz as sv
from tqdm import tqdm, trange
from tqdm import tqdm

import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

# Suppress the warnings
import warnings
warnings.filterwarnings(action='ignore')

print('Imported all libraries successfully.')

Imported all libraries successfully.


In [2]:
current_path = os.getcwd()
current_path

'C:\\Users\\HP\\Desktop\\MASTER\\UniversityofTwente\\Master\\202302 Q2\\Data Science\\Project\\AF'

In [3]:
# Define a function to read the raw txt files
def read_files(filename):
    with open(filename, 'r') as file:
        data = file.read().splitlines()
        data = map(lambda x: x.rstrip().lstrip().split(), data)
        data = [list(map(str, line)) for line in data]
        return data

In [4]:
# Define a function to pre-process the raw datasets

def preproc_files(ecg_file_path, control_file_path):
    
    # Read ECG file
    ecg_file = read_files(ecg_file_path)
    
    # Convert the list into a DataFrame
    ecg_df = pd.DataFrame(ecg_file)
    
    # Create a single "annotation" column to join all annotations
    ecg_df['annotation'] = ecg_df[ecg_df.columns[3:]].apply(lambda x: ' '.join(x.dropna().astype(str)), axis=1)
    
    # Drop the old annotation columns, keep the merged annotation column in the last position
    ecg_df.drop(ecg_df.iloc[:,3:(len(ecg_df.columns)-1)], axis=1, inplace=True)
    
    # Trim the white spaces in the annotation column
    ecg_df['annotation'] = ecg_df['annotation'].str.strip()

    # Rename the final columns
    col_name = ['time', 'rr_interval', 'heartbeat_type', 'annotation']
    ecg_df.columns = col_name
    ecg_df['rr_interval'] = ecg_df['rr_interval'].astype(int)
    
    # Keep the rows where there are no annotations
    ecg_df = ecg_df[ecg_df['annotation']=='']
    
    # Handle outliers
    ecg_df = ecg_df.loc[(ecg_df['rr_interval'] >= 100) & (ecg_df['rr_interval'] <= 1600)]
    
    # Drop unnecessary columns
    ecg_df_preproc = ecg_df.drop(['heartbeat_type','annotation'], axis=1)
    
    # Drop duplicates, if any
    ecg_df_preproc.drop_duplicates()
    
    # Read Control file
    control_file = read_files(control_file_path)
    
    # Convert the list into a DataFrame
    control_df = pd.DataFrame(control_file)
    
    # Rename the final columns
    col_name = ['time', 'af']
    control_df.columns = col_name
    
    # Delete records with Control = -1
    control_df = control_df[control_df['af']!='-1']
    
    # Convert time column into H:M:S format
    control_df['time'] = control_df['time'].str[:- 4]

    # Convert into time dtypes
    control_df['time'] = pd.to_datetime(control_df['time'], format='%H:%M:%S')
    
    # Generate 30s time stamp for df_label_new
    control_df['time_30'] = control_df['time'].map(lambda x: pd.date_range(start=x, periods=30, freq='s'))
    
    # Explode the time_30 column
    control_df = control_df.explode('time_30')
    control_df['time_30'] = control_df['time_30'].dt.time # Get the H:M:S format
    control_df['time_30'] = control_df['time_30'].astype(str) # Convert to string for inner join with ECG data
    
    # Drop unnecessary columns
    control_df_preproc = control_df.drop(['time'], axis=1)
    
    # Drop duplicates, if any
    control_df_preproc.drop_duplicates()
    
    # Merge ECG & Control data
    ecg_control_df = pd.merge(ecg_df_preproc, control_df_preproc,
                              how='inner', left_on='time', right_on='time_30',
                              left_index=False, right_index=False)
    
    # Drop unnecessary columns
    ecg_control_df = ecg_control_df.drop(['time_30'], axis=1)

    # Drop duplicates, if any
    ecg_control_df.drop_duplicates()

    # Reset index
    ecg_control_df = ecg_control_df.reset_index(drop=True)
    
    # Creating overlaping windows of 30 secs and 50% overlap
    rr_interval_list = []
    af_list = []
    window_size = 30
    step_size = 15    
    for i in range(0, ecg_control_df.shape[0]-window_size, step_size):
        rr_intervals = ecg_control_df['rr_interval'].values[i:i+window_size]
        afs = stats.mode(ecg_control_df['af'][i:i+window_size])[0][0]
        rr_interval_list.append(rr_intervals)
        af_list.append(afs)
    
    # Convert the lists into a DataFrame
    rr_af_df = pd.DataFrame({'rr_intervals': rr_interval_list, 'af': af_list})
    
    return rr_af_df

In [5]:
# Take all ECG file paths in the local directory
ecg_txt_paths = glob.glob(current_path + '\\dataset\\ECG_Data\\Data*.txt')
ecg_txt_paths = sorted(ecg_txt_paths, key = lambda x: int(x.split("/")[-1].split(".")[0].split("Data")[-1])) # sort by the integer of filenames
print('First 5 ecg_txt files:\n', ecg_txt_paths[0:5])
print('\nLast 5 ecg_txt files:\n', ecg_txt_paths[-5:])

First 5 ecg_txt files:
 ['C:\\Users\\HP\\Desktop\\MASTER\\UniversityofTwente\\Master\\202302 Q2\\Data Science\\Project\\AF\\dataset\\ECG_Data\\Data1.txt', 'C:\\Users\\HP\\Desktop\\MASTER\\UniversityofTwente\\Master\\202302 Q2\\Data Science\\Project\\AF\\dataset\\ECG_Data\\Data2.txt', 'C:\\Users\\HP\\Desktop\\MASTER\\UniversityofTwente\\Master\\202302 Q2\\Data Science\\Project\\AF\\dataset\\ECG_Data\\Data3.txt', 'C:\\Users\\HP\\Desktop\\MASTER\\UniversityofTwente\\Master\\202302 Q2\\Data Science\\Project\\AF\\dataset\\ECG_Data\\Data4.txt', 'C:\\Users\\HP\\Desktop\\MASTER\\UniversityofTwente\\Master\\202302 Q2\\Data Science\\Project\\AF\\dataset\\ECG_Data\\Data5.txt']

Last 5 ecg_txt files:
 ['C:\\Users\\HP\\Desktop\\MASTER\\UniversityofTwente\\Master\\202302 Q2\\Data Science\\Project\\AF\\dataset\\ECG_Data\\Data800.txt', 'C:\\Users\\HP\\Desktop\\MASTER\\UniversityofTwente\\Master\\202302 Q2\\Data Science\\Project\\AF\\dataset\\ECG_Data\\Data801.txt', 'C:\\Users\\HP\\Desktop\\MASTER\\Uni

In [6]:
# Take all Control file paths in the local directory
control_txt_paths = glob.glob(current_path + '\\dataset\\Class\\Control*.txt')
control_txt_paths = sorted(control_txt_paths, key = lambda x: int(x.split("/")[-1].split(".")[0].split("Control")[-1])) # sort by the integer of filenames
print('First 5 control_txt files:\n', control_txt_paths[0:5])
print('\nLast 5 control_txt files:\n', control_txt_paths[-5:])

First 5 control_txt files:
 ['C:\\Users\\HP\\Desktop\\MASTER\\UniversityofTwente\\Master\\202302 Q2\\Data Science\\Project\\AF\\dataset\\Class\\Control1.txt', 'C:\\Users\\HP\\Desktop\\MASTER\\UniversityofTwente\\Master\\202302 Q2\\Data Science\\Project\\AF\\dataset\\Class\\Control2.txt', 'C:\\Users\\HP\\Desktop\\MASTER\\UniversityofTwente\\Master\\202302 Q2\\Data Science\\Project\\AF\\dataset\\Class\\Control3.txt', 'C:\\Users\\HP\\Desktop\\MASTER\\UniversityofTwente\\Master\\202302 Q2\\Data Science\\Project\\AF\\dataset\\Class\\Control4.txt', 'C:\\Users\\HP\\Desktop\\MASTER\\UniversityofTwente\\Master\\202302 Q2\\Data Science\\Project\\AF\\dataset\\Class\\Control5.txt']

Last 5 control_txt files:
 ['C:\\Users\\HP\\Desktop\\MASTER\\UniversityofTwente\\Master\\202302 Q2\\Data Science\\Project\\AF\\dataset\\Class\\Control800.txt', 'C:\\Users\\HP\\Desktop\\MASTER\\UniversityofTwente\\Master\\202302 Q2\\Data Science\\Project\\AF\\dataset\\Class\\Control801.txt', 'C:\\Users\\HP\\Desktop\\MAS

In [7]:
num_files_to_read = 200 # CHANGE IF NECESSARY!!!

af_df_preproc = pd.DataFrame()

for i in tqdm(range(0, num_files_to_read)):
    # tqdm(range(0, num_files_to_read)).set_description(f'Processing Data: {i+1} of {num_files_to_read}')
    rr_af_df_part = preproc_files(ecg_txt_paths[i], control_txt_paths[i])
    af_df_preproc = af_df_preproc.append(rr_af_df_part)

print(af_df_preproc.shape)
print('\nNumber of AF cases:')
print(af_df_preproc['af'].value_counts(), '\n')
af_df_preproc.head()

100%|██████████| 200/200 [1:38:12<00:00, 29.46s/it]

(1049997, 2)

Number of AF cases:
0    1016245
1      33752
Name: af, dtype: int64 






Unnamed: 0,rr_intervals,af
0,"[535, 765, 780, 775, 775, 765, 770, 775, 770, ...",0
1,"[775, 790, 785, 780, 785, 795, 795, 795, 785, ...",0
2,"[785, 795, 795, 785, 785, 790, 790, 795, 790, ...",0
3,"[790, 775, 780, 780, 790, 795, 805, 785, 785, ...",0
4,"[795, 800, 805, 800, 795, 775, 790, 785, 790, ...",0


In [58]:
# Save to parquet
af_df_preproc.to_parquet(current_path + '\\af_df_preproc_' + str(num_files_to_read) + '.parquet', index=False)