# TM10007 Assignment template -- ECG data

## Data loading and cleaning

Below are functions to load the dataset of your choice. After that, it is all up to you to create and evaluate a classification method. Beware, there may be missing values in these datasets. Good luck!

Imports

In [None]:
## Imports
import zipfile
import os
import pandas as pd
import numpy as np
import time

from sklearn.preprocessing import StandardScaler, MinMaxScaler

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, GridSearchCV

from sklearn.feature_selection import f_classif, SelectKBest
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression

## Classifiers
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
#from sklearn import svm
from sklearn.svm import SVC
from xgboost import XGBClassifier

## Ensembling
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.base import BaseEstimator, ClassifierMixin

In [None]:
# Run this to use from colab environment
# Clone the course repository that contains the zipped ECG dataset.
# NOTE: the leading "!" is an IPython/Colab shell escape, so this line only
# works inside a notebook, not in a plain Python script.
!git clone https://github.com/jveenland/tm10007_ml.git

# These imports repeat the ones of the imports cell so this cell can be run on its own
import zipfile
import os
import pandas as pd

# Extract the archive into the ecg folder of the cloned repository
with zipfile.ZipFile('/content/tm10007_ml/ecg/ecg_data.zip', 'r') as zip_ref:
    zip_ref.extractall('/content/tm10007_ml/ecg')

# Load the extracted CSV; its first column is used as the row index
data = pd.read_csv('/content/tm10007_ml/ecg/ecg_data.csv', index_col=0)

# Report the size of the loaded table (the column count includes the 'label' column)
print(f'The number of samples: {len(data.index)}')
print(f'The number of columns: {len(data.columns)}')



**Pre-processing**

Separate labels

In [None]:
# The 'label' column of the data holds the class of every sample
label = data['label']

# Split the samples per class. The class that holds the minority of the
# samples is stored as "normal": when more than half of the labels are 1,
# label 0 is taken as normal, otherwise label 1 is.
# NOTE(review): presumably the normal ECGs are expected to be the smaller group -- confirm.
ones_are_majority = sum(data['label']) > len(data) / 2
normal_value, abnormal_value = (0, 1) if ones_are_majority else (1, 0)
normal_data = data[label == normal_value]
abnormal_data = data[label == abnormal_value]

# Feature table: every column except the label
data_no_label = data.drop(columns='label')

Missing data handling

In [None]:
##### Check for missing data
# Check for any None/NaN values in data
has_missing = data.isnull().values.any()
print(f"Missing values present? {has_missing}")

# Boolean mask of all zero-valued feature entries. Zeros are treated as
# missing values in this cell; the mask is computed once and reused instead of
# re-evaluating `data_no_label == 0` for every statistic.
zero_mask = data_no_label == 0

# Check for any zeros in data
has_zeros = zero_mask.values.any()
print(f"Zero values present? {has_zeros}")

# Calculate total number of zeros
total_zeros = zero_mask.sum().sum()
print(f"Total zeros in DataFrame: {total_zeros}")

##### Overview of where zeros are to decide missing data handling strategy
# Count how many rows have at least one zero
rows_with_zero = zero_mask.any(axis=1).sum()
print(f"Number of rows with at least one zero: {rows_with_zero}")

# Count how many columns have at least one zero
columns_with_zero = zero_mask.any(axis=0).sum()
print(f"Number of columns with at least one zero: {columns_with_zero}")

# Table with the zero count per row. It gets its own name so that it is no
# longer overwritten by the per-column table built right below.
zero_counts_per_row = zero_mask.sum(axis=1)
zero_count_table_rows = pd.DataFrame({'Row_Index': data_no_label.index, 'Zero_Count': zero_counts_per_row})
zero_count_table_rows.set_index('Row_Index', inplace=True)

# Table with the zero count per column (keeps the name `zero_count_table`,
# which is the table that name referred to at the end of this cell before)
zero_counts_per_column = zero_mask.sum(axis=0)
zero_count_table = pd.DataFrame({'Column_Name': zero_counts_per_column.index, 'Zero_Count': zero_counts_per_column.values})
zero_count_table.set_index('Column_Name', inplace=True)

##### Remove missing data
# Remove rows with more than 10 zeros (rows with up to 10 zeros are kept)
rows_to_keep = zero_counts_per_row[zero_counts_per_row <= 10].index
filtered_data = data_no_label.loc[rows_to_keep]

# Print the zero count of every row that contains at least one zero
data_with_zeros = data_no_label[zero_mask.any(axis=1)]
zero_counts_per_row = (data_with_zeros == 0).sum(axis=1)
print(zero_counts_per_row)

# Check how many rows of the filtered data still contain a zero
rows_with_zero = (filtered_data == 0).any(axis=1).sum()
print(f"Number of rows with at least one zero (filtered data): {rows_with_zero}")

# Calculate the total number of zeros that is left in the filtered data
total_zeros = (filtered_data == 0).sum().sum()
print(f"Total zeros in the DataFrame (excluding last column) after removing rows with zeros: {total_zeros}")

Missing values present? False
Zero values present? True
Total zeros in DataFrame: 10500
Number of rows with at least one zero: 14
Number of columns with at least one zero: 4500
177    750
251    750
269    750
321    750
323    750
385    750
434    750
446    750
537    750
542    750
575    750
601    750
784    750
790    750
dtype: int64
Number of rows with at least one zero (filtered data): 0
Total zeros in the DataFrame (excluding last column) after removing rows with zeros: 0


Train and test data

In [None]:
# Split the data into a training and a testing set (the test set is kept for
# the final evaluation, if needed).
# The split is made on `filtered_data`, i.e. after the rows with too many
# zeros were removed; splitting `data_no_label` would put those rows back into
# the training and test sets. The labels are aligned to the kept rows by index.
label_filtered = label.loc[filtered_data.index]
data_train, data_test, label_train, label_test = train_test_split(
    filtered_data, label_filtered, test_size=0.2, random_state=42, stratify=label_filtered)

# Define K-fold cross-validation
cv = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)

Scaling: normalisation or standardisation

In [None]:
# Two scaled copies of the feature table are prepared so they can be compared.
# NOTE(review): both scalers are fitted on all of `data_no_label`, so the
# scaling parameters also see samples that end up in the test set -- confirm
# this is intended before using these tables for evaluation.

# Standardization, wrapped back into a DataFrame with the original labels
standardizer = StandardScaler()
data_standardized = pd.DataFrame(
    standardizer.fit_transform(data_no_label),
    columns=data_no_label.columns,
    index=data_no_label.index,
)

# Normalization (min-max), wrapped back into a DataFrame in the same way
scaler = MinMaxScaler()
data_normalized = pd.DataFrame(
    scaler.fit_transform(data_no_label),
    columns=data_no_label.columns,
    index=data_no_label.index,
)

# The choice between standardization and normalization is to be based on
# performance metrics (accuracy, precision, recall).
# For now start with standardization --> default choice

**Feature selection and extraction**

In [None]:
# 1. Preliminary filtering using univariate statistical testing: ANOVA f-test
# Keep the 1000 best-scoring features, fitted on the training data only
selector = SelectKBest(f_classif, k=1000)
selector.fit(data_train, label_train)

# Positions and names of the features that survived the filter
selected_feature_indices = selector.get_support(indices=True)
selected_feature_names = data_train.columns[selector.get_support()]  # data_train is a pandas dataframe

# Training data restricted to the selected features, as a dataframe
data_selected = pd.DataFrame(
    selector.transform(data_train),
    columns=selected_feature_names,
    index=data_train.index,
)

print('Univariatiate statistical feature selection performed: 1000 features left.')

# 2. Dimensionality reduction using PCA
# Project the selected features onto 100 principal components (fitted on the training data)
pca = PCA(n_components=100)
data_pca_selected = pd.DataFrame(pca.fit_transform(data_selected), index=data_selected.index)

print('PCA feature selection performed: 100 features left.')

# 3. Visualize new features with t-SNE
# Embed the PCA features in 2 dimensions so they can be plotted
tsne = TSNE(n_components=2, random_state=42)
data_tsne = tsne.fit_transform(data_pca_selected)

# Scatter plot of the embedding (currently disabled)
#plt.figure(figsize=(8, 6))
#plt.scatter(data_tsne[label_train == 0, 0], data_tsne[label_train == 0, 1], label='Label 0', marker='o')  # Plot points for label 0
#plt.scatter(data_tsne[label_train == 1, 0], data_tsne[label_train == 1, 1], label='Label 1', marker='x')  # Plot points for label 1
#plt.legend()  # Add a legend to identify the labels
#plt.title('t-SNE Visualization of Selected Features')
#plt.xlabel('t-SNE Dimension 1')
#plt.ylabel('t-SNE Dimension 2')
#plt.show()

# **Classify**

## **Try classifiers**

### Colorplot def

In [None]:
def colorplot(clf, ax, x, y, h=100):
    '''
    Overlay the decision areas of a classifier as colors in an axes.

    The classifier must have been trained on exactly the two features that
    are plotted, because it is evaluated on a two-column mesh.

    Input:
        clf: trained classifier with a decision_function or predict_proba method
        ax: axis to overlay color mesh on
        x: values of the feature on the x-axis
        y: values of the feature on the y-axis
        h(optional): number of mesh steps along the longest axis range
    '''
    def _padded_range(values):
        # Extend the data range by 5% on both sides. A constant feature has
        # no range, which would give an empty grid (or a zero step size), so
        # a fixed margin is used in that case.
        low, high = values.min(), values.max()
        span = high - low
        margin = span / 20.0 if span > 0 else 0.5
        return low - margin, high + margin

    # Create a meshgrid the size of the axis
    x_min, x_max = _padded_range(x)
    y_min, y_max = _padded_range(y)
    step = max(x_max - x_min, y_max - y_min) / h
    xx, yy = np.meshgrid(np.arange(x_min, x_max, step),
                         np.arange(y_min, y_max, step))
    mesh_points = np.c_[xx.ravel(), yy.ravel()]

    # Assign a score to each point in the mesh [x_min, x_max]x[y_min, y_max];
    # the decision function is preferred, class probabilities are the fallback.
    if hasattr(clf, "decision_function"):
        Z = clf.decision_function(mesh_points)
    else:
        Z = clf.predict_proba(mesh_points)
    if len(Z.shape) > 1:
        # One column per class: color by the second column
        Z = Z[:, 1]

    # Put the result into a color plot. The colormap is passed by name, which
    # selects the same map as plt.cm.RdBu_r.
    ax.contourf(xx, yy, Z.reshape(xx.shape), cmap="RdBu_r", alpha=.8)

### Loop over different classifiers

In [None]:
# Support vector machines with the four kernels that are compared
svmlin = SVC(kernel='linear', gamma='scale')
svmrbf = SVC(kernel='rbf', gamma='scale')
svmpoly = SVC(kernel='poly', degree=3, gamma='scale')
svmsig = SVC(kernel='sigmoid', gamma='scale')

# All candidate classifiers, each with its default settings
clfs = [
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis(),
    LogisticRegression(),
    GaussianNB(),
    SGDClassifier(),
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    svmlin,
    svmrbf,
    svmpoly,
    svmsig,
]

# Fit every classifier on the training set and report how many training
# samples it misclassifies, together with the time that took
for clf in clfs:
    start_time = time.time()
    clf = clf.fit(data_train, label_train)
    y_pred = clf.predict(data_train)
    n_wrong = (label_train != y_pred).sum()
    t = "Misclassified: %d / %d" % (n_wrong, data_train.shape[0])
    end_time = time.time()
    runtime = end_time - start_time
    print(f"Clf: {clf}, {t}")
    print(f"Runtime: {runtime:.2f} seconds")

NameError: name 'data_train' is not defined

### Linear

In [None]:
# Linear discriminant analysis: fit on the training set, predict the same
# samples again and report the training error plus the elapsed time
start_time = time.time()
lda = LinearDiscriminantAnalysis().fit(data_train, label_train)
y_pred_lda = lda.predict(data_train)
n_wrong = (label_train != y_pred_lda).sum()
t = "Misclassified: %d / %d" % (n_wrong, data_train.shape[0])
end_time = time.time()
runtime = end_time - start_time
print(t)
print(f"Runtime: {runtime:.2f} seconds")

### Quadratic

In [None]:
# Quadratic discriminant analysis: fit on the training set and report the
# number of misclassified training samples
start_time = time.time()
qda = QuadraticDiscriminantAnalysis()
qda = qda.fit(data_train, label_train)
y_pred_qda = qda.predict(data_train)
t = ("Misclassified: %d / %d" % ((label_train != y_pred_qda).sum(), data_train.shape[0]))
# The timer was started but never read; report the runtime like the other
# classifier cells do
end_time = time.time()
runtime = end_time - start_time
print(t)
print(f"Runtime: {runtime:.2f} seconds")

In [None]:
# Evaluate the fitted QDA on the held-out test set. The error count is
# reported against the number of test samples (not the training-set size).
y_pred_qda_test = qda.predict(data_test)
t = ("Misclassified: %d / %d" % ((label_test != y_pred_qda_test).sum(), data_test.shape[0]))
print(t)

### tNN

In [None]:
# Nearest-neighbour classifier that uses a single neighbour: fit on the
# training set, predict the same samples and report the error and runtime
start_time = time.time()
NN = KNeighborsClassifier(n_neighbors=1).fit(data_train, label_train)
y_pred_nn = NN.predict(data_train)
n_wrong = (label_train != y_pred_nn).sum()
t = "Misclassified: %d / %d" % (n_wrong, data_train.shape[0])
end_time = time.time()
runtime = end_time - start_time
print(t)
print(f"Runtime: {runtime:.2f} seconds")

### Decision tree

In [None]:
# Decision tree with default settings: fit on the training set, predict the
# same samples and report the error and runtime
start_time = time.time()
dt = DecisionTreeClassifier().fit(data_train, label_train)
y_pred_dt = dt.predict(data_train)
n_wrong = (label_train != y_pred_dt).sum()
t = "Misclassified: %d / %d" % (n_wrong, data_train.shape[0])
end_time = time.time()
runtime = end_time - start_time
print(t)
print(f"Runtime: {runtime:.2f} seconds")

### SVM

In [None]:
# Support vector classifier with default settings: fit on the training set,
# predict the same samples and report the error and runtime
start_time = time.time()
svm = SVC().fit(data_train, label_train)
y_pred_svm = svm.predict(data_train)
n_wrong = (label_train != y_pred_svm).sum()
t = "Misclassified: %d / %d" % (n_wrong, data_train.shape[0])
end_time = time.time()
runtime = end_time - start_time
print(t)
print(f"Runtime: {runtime:.2f} seconds")

In [None]:
# Support vector machines with the four kernels that are compared
svmlin = SVC(kernel='linear', gamma='scale')
svmrbf = SVC(kernel='rbf', gamma='scale')
svmpoly = SVC(kernel='poly', degree=3, gamma='scale')
svmsig = SVC(kernel='sigmoid', gamma='scale')

clfs = [svmlin, svmrbf, svmpoly, svmsig]

# Empty figure; one subplot per classifier is added in the loop
num = 0
fig = plt.figure(figsize=(15,15))

# The decision areas can only be drawn when the classifiers are trained on
# exactly two features: colorplot evaluates the classifier on a two-column
# mesh, which raises an error for a classifier fitted on more features.
can_draw_decision_areas = data_train.shape[1] == 2

for clf in clfs:
    # Fit on the training set and count the misclassified training samples
    start_time = time.time()
    clf = clf.fit(data_train, label_train)
    y_pred = clf.predict(data_train)
    t = ("Misclassified: %d / %d" % ((label_train != y_pred).sum(), data_train.shape[0]))
    end_time = time.time()
    runtime = end_time - start_time
    print(f"Clf: {clf}, {t}")
    print(f"Runtime: {runtime:.2f} seconds")

    # Scatter plot of the first two features, colored by label
    ax = fig.add_subplot(3, 2, num + 1)
    x = data_train.iloc[:, 0]
    y = data_train.iloc[:, 1]
    ax.scatter(x, y, marker='o', c=label_train,
        s=25, edgecolor='k', cmap=plt.cm.Paired)
    if can_draw_decision_areas:
        colorplot(clf, ax, x, y)
    ax.set_title(f"Clf: {clf}, {t}")
    num += 1

In [None]:
## Train on only two features to see what the different SVC kernels do

svmlin = SVC(kernel='linear', gamma='scale')
svmrbf = SVC(kernel='rbf', gamma='scale')
svmpoly = SVC(kernel='poly', degree=2, gamma='scale')
svmsig = SVC(kernel='sigmoid', gamma='scale')

clfs = [svmlin, svmrbf, svmpoly, svmsig]

# Figure setup for the decision-area plots (currently disabled)
# # Make plot without classifiers:
# num = 0
# fig = plt.figure(figsize=(8,15))

# Fit every kernel on the two features at column positions 100 and 101 and
# report the training error and the runtime
for clf in clfs:
    start_time = time.time()
    x = data_train.iloc[:, 100:102]
    clf = clf.fit(x, label_train)
    y_pred = clf.predict(x)
    n_wrong = (label_train != y_pred).sum()
    t = "Misclassified: %d / %d" % (n_wrong, data_train.shape[0])
    end_time = time.time()
    runtime = end_time - start_time
    print(f"Clf: {clf}, {t}")
    print(f"Runtime: {runtime:.2f} seconds")

    # # Plotting colorplot
    # ax = fig.add_subplot(4, 1, num + 1)
    # ax.scatter(x.iloc[:,0], x.iloc[:,1], marker='o', c=label_train,
    #     s=25, edgecolor='k', cmap=plt.cm.Paired)
    # colorplot(clf, ax, x.iloc[:,0], x.iloc[:,1])
    # ax.set_title(f"Clf: {clf}, {t}")
    # num += 1

### Random Forest

In [None]:
# Forest sizes that are compared
n_trees = [1, 5, 10, 50, 100]

# Figure setup for the plots (currently disabled)
# # Make plot without classifiers:
# num = 0
# fig = plt.figure(figsize=(15,15))

# Fit a random forest of every size on the training set and report the
# training error and the runtime
for n_tree in n_trees:
    start_time = time.time()
    rf = RandomForestClassifier(n_estimators=n_tree)  # To fix the outcome: random_state=42
    rf.fit(data_train, label_train)
    y_pred_rf = rf.predict(data_train)
    n_wrong = (label_train != y_pred_rf).sum()
    t = "Misclassified: %d / %d" % (n_wrong, data_train.shape[0])
    end_time = time.time()
    runtime = end_time - start_time
    print(f"Tree: {n_tree}, {t}, Runtime: {runtime:.2f} seconds")

    ## For plotting, only works using 2 features
    # ax = fig.add_subplot(3, 2, num + 1)
    # x = data_train.iloc[:, 0]
    # y = data_train.iloc[:, 1]
    # ax.scatter(x, y, marker='o', c=label_train,
    #     s=25, edgecolor='k', cmap=plt.cm.Paired)
    # colorplot(clf, ax, x, y)
    # ax.set_title(f"Tree: {n_tree}, {t}")
    # num += 1

### XGBoost

In [None]:
# Gradient-boosted trees: fit on the training set and report the number of
# misclassified training samples
xgb = XGBClassifier(n_estimators=100, random_state=42)
xgb.fit(data_train, label_train)
# Predict with the model that was just fitted (`xgb`), not with `clf`, which
# is a leftover classifier from an earlier cell
y_pred_XGB = xgb.predict(data_train)
t = ("Misclassified: %d / %d" % ((label_train != y_pred_XGB).sum(), data_train.shape[0]))
print(t)

In [None]:
# Numbers of boosting rounds that are compared
n_est = [1, 5, 10, 50, 100]

# Fit an XGBoost model for every number of estimators and report the training
# error and the runtime
for n in n_est:
    start_time = time.time()
    # Use the loop value; with a fixed n_estimators=100 every iteration
    # trained the same model
    xgb = XGBClassifier(n_estimators=n, random_state=42)
    xgb.fit(data_train, label_train)
    # Predict with the model fitted in this iteration (not the stale `clf`),
    # and keep the result apart from the random forest predictions
    y_pred_XGB = xgb.predict(data_train)
    t = ("Misclassified: %d / %d" % ((label_train != y_pred_XGB).sum(), data_train.shape[0]))
    end_time = time.time()
    runtime = end_time - start_time
    print(f"Tree: {n}, {t}, Runtime: {runtime:.2f} seconds")

## Ensembling

### Voting

In [None]:
def _voting_members():
    # Fresh, unfitted base classifiers so the two ensembles do not share estimators
    return [('KNN', KNeighborsClassifier()),
            ('tree', DecisionTreeClassifier()),
            ('rf', RandomForestClassifier())]


# The same three base classifiers, combined by soft and by hard voting
voting_ensemble_soft = VotingClassifier(estimators=_voting_members(), voting='soft')
voting_ensemble_hard = VotingClassifier(estimators=_voting_members(), voting='hard')

# Soft voting: fit on the training set and report the training error
ves = voting_ensemble_soft.fit(data_train, label_train)
y_pred_ves = ves.predict(data_train)
n_wrong_soft = (label_train != y_pred_ves).sum()
t = "Misclassified: %d / %d" % (n_wrong_soft, data_train.shape[0])
print(t)

# Hard voting: fit on the training set and report the training error
veh = voting_ensemble_hard.fit(data_train, label_train)
y_pred_veh = veh.predict(data_train)
n_wrong_hard = (label_train != y_pred_veh).sum()
t = "Misclassified: %d / %d" % (n_wrong_hard, data_train.shape[0])
print(t)


### Averaging

In [None]:
class AveragingClassifier(BaseEstimator, ClassifierMixin):
    """Ensemble that averages the class probabilities of its estimators.

    Parameters:
        estimators: list of (name, estimator) tuples; every estimator must
            provide fit and predict_proba.

    Attributes:
        classes_: sorted unique class labels seen in fit.
    """

    def __init__(self, estimators):
        self.estimators = estimators

    def fit(self, X, y):
        """Fit every estimator on (X, y) and return self.

        The estimators that were passed in are fitted in place (no copies
        are made).
        """
        # Remember the labels so predict can translate probability columns
        # back to labels. Assumes the estimators order their probability
        # columns by sorted label, as scikit-learn classifiers do.
        self.classes_ = np.unique(y)
        for _, estimator in self.estimators:  # each entry is a (name, estimator) tuple
            estimator.fit(X, y)
        return self

    def predict_proba(self, X):
        """Return the class probabilities averaged over all estimators."""
        # Stack to (n_estimators, n_samples, n_classes) and average over the estimators
        proba = np.stack([estimator.predict_proba(X) for _, estimator in self.estimators])
        return np.mean(proba, axis=0)

    def predict(self, X):
        """Return the class label with the highest averaged probability."""
        best_columns = np.argmax(self.predict_proba(X), axis=1)
        # Map column positions to class labels; returning the positions is
        # only correct when the labels happen to be 0, 1, 2, ...
        classes = getattr(self, "classes_", None)
        if classes is None:
            # fit of this wrapper was never called (estimators fitted
            # elsewhere): fall back to the column positions
            return best_columns
        return classes[best_columns]

In [None]:
# Average the predicted probabilities of three default base classifiers,
# fit on the training set and report the training error
averaging_members = [
    ('KNN', KNeighborsClassifier()),
    ('tree', DecisionTreeClassifier()),
    ('rf', RandomForestClassifier()),
]
averaging = AveragingClassifier(estimators=averaging_members).fit(data_train, label_train)
y_pred_averaging = averaging.predict(data_train)
n_wrong = (label_train != y_pred_averaging).sum()
t = "Misclassified: %d / %d" % (n_wrong, data_train.shape[0])
print(t)

### Stacking (takes ~1 min to run)

In [None]:
# Stacking uses the strength of each individual estimator by feeding their
# outputs into a final estimator (left at its default here)
stacking_members = [
    ('KNN', KNeighborsClassifier()),
    ('tree', DecisionTreeClassifier()),
    ('rf', RandomForestClassifier()),
]
stacking = StackingClassifier(estimators=stacking_members).fit(data_train, label_train)

# Report the number of misclassified training samples
y_pred_stacking = stacking.predict(data_train)
n_wrong = (label_train != y_pred_stacking).sum()
t = "Misclassified: %d / %d" % (n_wrong, data_train.shape[0])
print(t)

### Selection + voting/averaging