# TM10007 Assignment template -- ECG data

## Data loading and cleaning

Below are functions to load the dataset of your choice. After that, it is all up to you to create and evaluate a classification method. Beware, there may be missing values in these datasets. Good luck!

Imports

In [None]:
import zipfile
import os
import pandas as pd

from sklearn.preprocessing import StandardScaler, MinMaxScaler

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, GridSearchCV

from sklearn.feature_selection import f_classif, SelectKBest
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

In [None]:
# Run this to use from colab environment
!git clone https://github.com/jveenland/tm10007_ml.git

import zipfile
import os
import pandas as pd

with zipfile.ZipFile('/content/tm10007_ml/ecg/ecg_data.zip', 'r') as zip_ref:
    zip_ref.extractall('/content/tm10007_ml/ecg')

data = pd.read_csv('/content/tm10007_ml/ecg/ecg_data.csv', index_col=0)

print(f'The number of samples: {len(data.index)}')
print(f'The number of columns: {len(data.columns)}')



**Pre-processing**

Separate labels

In [None]:
# The 'label' column of the table holds the class of every sample
label = data['label']

# Decide which label value is stored as "normal" and which as "abnormal".
# When the labels add up to more than half the number of rows, label 0 is
# assigned to normal_data and label 1 to abnormal_data; otherwise it is the
# other way around.
# NOTE(review): for 0/1 labels this puts the minority class in normal_data --
# confirm that this is the intended convention.
ones_exceed_half = sum(label) > len(data) / 2
normal_value, abnormal_value = (0, 1) if ones_exceed_half else (1, 0)

normal_data = data[label == normal_value]
abnormal_data = data[label == abnormal_value]

# Feature table: every column except the label
data_no_label = data.drop(columns='label')

Missing data handling

In [None]:
##### Check for missing data
# Rows holding more than this many zero-valued features are dropped below
MAX_ZEROS_PER_ROW = 10

# Check for any None/NaN values in data (label column included)
has_missing = data.isnull().values.any()
print(f"Missing values present? {has_missing}")

# Boolean mask of the zero-valued feature entries; computed once and reused
# for every summary below instead of re-comparing the whole table each time
zero_mask = data_no_label == 0

# Check for any zeros in data
has_zeros = zero_mask.values.any()
print(f"Zero values present? {has_zeros}")

# Calculate total number of zeros
total_zeros = zero_mask.sum().sum()
print(f"Total zeros in DataFrame: {total_zeros}")

##### Overview of where zeros are to decide missing data handling strategy
# Count how many rows have at least one zero
row_has_zero = zero_mask.any(axis=1)
rows_with_zero = row_has_zero.sum()
print(f"Number of rows with at least one zero: {rows_with_zero}")

# Count how many columns have at least one zero
columns_with_zero = zero_mask.any(axis=0).sum()
print(f"Number of columns with at least one zero: {columns_with_zero}")

# Create table with zero count for the rows.
# Kept under its own name so the column table below does not overwrite it.
zero_counts_per_row = zero_mask.sum(axis=1)
zero_count_table_rows = pd.DataFrame({'Row_Index': data_no_label.index, 'Zero_Count': zero_counts_per_row})
zero_count_table_rows.set_index('Row_Index', inplace=True)

# Create table with zero count for the columns
zero_counts_per_column = zero_mask.sum(axis=0)
zero_count_table = pd.DataFrame({'Column_Name': zero_counts_per_column.index, 'Zero_Count': zero_counts_per_column.values})
zero_count_table.set_index('Column_Name', inplace=True)

##### Remove missing data
# Keep only the rows with at most MAX_ZEROS_PER_ROW zeros. A boolean mask is
# used (rather than selecting by index label) so that a repeated index label
# can never duplicate rows.
keep_row = zero_counts_per_row <= MAX_ZEROS_PER_ROW
filtered_data = data_no_label.loc[keep_row]
rows_to_keep = filtered_data.index

# Print the zero count of every row that contains at least one zero; the rows
# actually removed are those whose count exceeds MAX_ZEROS_PER_ROW
data_with_zeros = data_no_label[row_has_zero]
zero_counts_per_row = zero_counts_per_row[row_has_zero]
print(zero_counts_per_row)

# Check how many rows of the filtered table still contain a zero
filtered_zero_mask = filtered_data == 0
rows_with_zero = filtered_zero_mask.any(axis=1).sum()
print(f"Number of rows with at least one zero (filtered data): {rows_with_zero}")

# Calculate total number of zeros left in the filtered table
total_zeros = filtered_zero_mask.sum().sum()
print(f"Total zeros in the DataFrame (excluding last column) after removing rows with zeros: {total_zeros}")

Missing values present? False
Zero values present? True
Total zeros in DataFrame: 10500
Number of rows with at least one zero: 14
Number of columns with at least one zero: 4500
177    750
251    750
269    750
321    750
323    750
385    750
434    750
446    750
537    750
542    750
575    750
601    750
784    750
790    750
dtype: int64
Number of rows with at least one zero (filtered data): 0
Total zeros in the DataFrame (excluding last column) after removing rows with zeros: 0


Train and test data

In [None]:
# Split the data into a training and a testing set (the test set is reserved
# for the final evaluation, if that is needed).
# The split is made on filtered_data, i.e. the feature table after the rows
# with too many zeros were removed, so the recordings flagged as missing data
# do not end up in either set. The labels are aligned to the remaining rows.
label_filtered = label.loc[filtered_data.index]
data_train, data_test, label_train, label_test = train_test_split(
    filtered_data,
    label_filtered,
    test_size=0.2,
    random_state=42,
    stratify=label_filtered,  # keep the class proportions equal in both sets
)

# Define stratified K-fold cross-validation (2 shuffled folds, fixed seed)
cv = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)

Scaling: normalisation or standardisation

In [None]:
# Both scalers are fitted on the training split only, so no statistics of the
# held-out test rows leak into the preprocessing. The fitted scalers are then
# applied to the complete feature table, which keeps the shape, columns and
# index of data_standardized / data_normalized the same as data_no_label.

# Standardization (zero mean, unit variance per feature on the training set)
standard_scaler = StandardScaler()
standard_scaler.fit(data_train)
data_standardized = pd.DataFrame(
    standard_scaler.transform(data_no_label),
    columns=data_no_label.columns,
    index=data_no_label.index,
)  # Convert back to DataFrame

# Normalization (training-set minimum/maximum mapped to 0/1 per feature;
# rows outside the training split can therefore fall outside [0, 1])
minmax_scaler = MinMaxScaler()
minmax_scaler.fit(data_train)
data_normalized = pd.DataFrame(
    minmax_scaler.transform(data_no_label),
    columns=data_no_label.columns,
    index=data_no_label.index,
)  # Convert back to DataFrame

# `scaler` used to be rebound to the MinMaxScaler at the end of this cell;
# the name is kept pointing at it so later cells that use it keep working
scaler = minmax_scaler

# Decide to use standardization or normalization, based on performance metrics (accuracy, precision, recall)
# For now start with standardization --> default choice

**Feature selection and extraction**

In [None]:
# Step 1 - univariate filter: score every feature with the ANOVA F-test and
# keep the 1000 best-scoring ones. The selector is fitted on the training split.
selector = SelectKBest(score_func=f_classif, k=1000)
selected_values = selector.fit_transform(data_train, label_train)

# Column positions and column names of the features that were kept
selected_feature_indices = selector.get_support(indices=True)
selected_feature_names = data_train.columns[selected_feature_indices]

# Put the selected training features back into a labelled DataFrame
data_selected = pd.DataFrame(
    selected_values,
    columns=selected_feature_names,
    index=data_train.index,
)

print('Univariatiate statistical feature selection performed: 1000 features left.')

# Step 2 - dimensionality reduction: project the selected features onto
# 100 principal components, fitted on the training data
pca = PCA(n_components=100)
pca_scores = pca.fit_transform(data_selected)

# PCA-transformed training data as a DataFrame with the original row index
data_pca_selected = pd.DataFrame(pca_scores, index=data_selected.index)

print('PCA feature selection performed: 100 features left.')

# Step 3 - embed the PCA features in 2 dimensions with t-SNE for plotting
tsne = TSNE(n_components=2, random_state=42)
data_tsne = tsne.fit_transform(data_pca_selected)

# Scatter plot of the embedding per label (currently disabled)
#plt.figure(figsize=(8, 6))
#plt.scatter(data_tsne[label_train == 0, 0], data_tsne[label_train == 0, 1], label='Label 0', marker='o')  # Plot points for label 0
#plt.scatter(data_tsne[label_train == 1, 0], data_tsne[label_train == 1, 1], label='Label 1', marker='x')  # Plot points for label 1
#plt.legend()  # Add a legend to identify the labels
#plt.title('t-SNE Visualization of Selected Features')
#plt.xlabel('t-SNE Dimension 1')
#plt.ylabel('t-SNE Dimension 2')
#plt.show()