# TM10007 Assignment template -- ECG data

## Data loading and cleaning

Below are functions to load the dataset of your choice. After that, it is all up to you to create and evaluate a classification method. Beware, there may be missing values in these datasets. Good luck!

In [1]:
import zipfile
import pandas as pd
import os

# Keep the archive, the extraction target and the CSV in one directory so this
# cell runs from any working directory that contains the `tm10007_ml` folder,
# instead of reading a relative path and writing to a hard-coded `/content/...`
# location. When the working directory is `/content` the paths are unchanged.
ECG_DIR = os.path.join('tm10007_ml', 'ecg')

with zipfile.ZipFile(os.path.join(ECG_DIR, 'ecg_data.zip'), 'r') as zip_ref:
    zip_ref.extractall(ECG_DIR)

# The first CSV column is used as the row index rather than as a data column.
data = pd.read_csv(os.path.join(ECG_DIR, 'ecg_data.csv'), index_col=0)

print(f'The number of samples: {len(data.index)}')
print(f'The number of columns: {len(data.columns)}')

The number of samples: 827
The number of columns: 9001


In [None]:
# General packages
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn import model_selection

# Classifiers
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.pipeline import Pipeline

In [None]:
# One seed for every stochastic step in this cell so the split and the forest
# are repeatable across runs.
SEED = 42
np.random.seed(SEED)

# The last column is used as the class label; all preceding columns are features.
labels = data.iloc[:, -1]
x = data.iloc[:, :-1]

# Hold out 25% of the samples, stratified on the label so both parts keep the
# same class proportions.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    x, labels, test_size=0.25, stratify=labels, random_state=SEED
)

# Feature selection is a pipeline step, so GridSearchCV re-fits it on the
# training part of every fold instead of seeing the validation samples.
pipe = Pipeline([
    ('feature_selection', SelectKBest(f_classif)),
    ('classifier', RandomForestClassifier(random_state=SEED, n_jobs=-1, max_features='sqrt')),
])

param_grid = {
    # 'all' keeps every feature (the "no selection" baseline) without
    # hard-coding the current number of feature columns into the grid.
    'feature_selection__k': [300, 400, 500, 'all'],
    'classifier__n_estimators': [30, 40, 50],
    'classifier__min_samples_split': [5, 10, 15],
    'classifier__max_depth': [5, 10, 15, None],
    'classifier__criterion': ['gini', 'entropy'],
    'classifier__bootstrap': [True, False]
}

# Exhaustive search over the grid, scored by ROC AUC on 4 stratified folds of
# the training split only; the held-out test split is not touched here.
grid_search = model_selection.GridSearchCV(
    pipe,
    param_grid=param_grid,
    cv=model_selection.StratifiedKFold(n_splits=4),
    scoring='roc_auc',
    n_jobs=-1
)
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

Best Parameters: {'classifier__bootstrap': False, 'classifier__criterion': 'gini', 'classifier__max_depth': 10, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 50, 'feature_selection__k': 300}
Best Score: 0.8331604220045411
