**TM10007 Machine Learning**

## Import dependencies and data

In [None]:
import os
import zipfile
import pandas as pd
import numpy as np
import shutil
import subprocess
import shutil
import os
import stat
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from mpl_toolkits.mplot3d import Axes3D
from os import path
from sklearn import model_selection
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.decomposition import PCA, KernelPCA
from sklearn.kernel_approximation import RBFSampler
from sklearn.metrics.pairwise import rbf_kernel, sigmoid_kernel
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, fbeta_score

## Fetch ecg_data.csv from the course repository on GitHub when it is not
## already present in the working directory, then load it into `data`.
if not os.path.isfile("ecg_data.csv"):
    ## clone repo from github (skipped when a checkout already exists)
    if not os.path.isdir("tm10007_ml"):
        # check=True raises CalledProcessError when git fails, so the steps
        # below never run against a missing or partial checkout.
        subprocess.run(
            ["git", "clone", "https://github.com/jveenland/tm10007_ml.git"],
            check=True,
        )
    ## extract zip file
    if not os.path.isfile("tm10007_ml/ecg/ecg_data.csv"):
        with zipfile.ZipFile('tm10007_ml/ecg/ecg_data.zip', 'r') as zip_ref:
            zip_ref.extractall('tm10007_ml/ecg')
    ## move data file to root folder
    shutil.move('tm10007_ml/ecg/ecg_data.csv', 'ecg_data.csv')

    ## Delete cloned repo
    # Every directory and file gets owner read/write/execute permission before
    # the tree is removed (presumably so rmtree does not fail on read-only
    # entries inside the checkout -- confirm).
    for root, dirs, files in os.walk("./tm10007_ml"):
        for entry in dirs + files:
            os.chmod(path.join(root, entry), stat.S_IRWXU)
    shutil.rmtree('./tm10007_ml')

# The first CSV column is used as the row index.
data = pd.read_csv('ecg_data.csv', index_col=0)

## Inspect imported data and clean missing data

### Plot number of missing data per feature and per patient

In [None]:
# Feature columns only: the 'label' column is left out of the zero counts.
values_features = data.drop(columns=['label'])

# Zero entries are counted as missing data. First view: count per feature
# (one point per column).
num_zeros_features = values_features.eq(0).sum(axis=0)
plt.scatter(np.arange(len(num_zeros_features)), num_zeros_features)
plt.xlabel('Feature')
plt.ylabel('Number of missing data')
plt.title("Distribution of number of missing data per feature")
plt.show()

# Second view: count per patient (one point per row).
num_zeros_patients = values_features.eq(0).sum(axis=1)
plt.scatter(np.arange(len(num_zeros_patients)), num_zeros_patients)
plt.xlabel('Patient')
plt.ylabel('Number of missing data')
plt.title("Distribution of number of missing data per patient")
plt.show()

### Delete rows with missing data

In [None]:
# Split the rows of `data` into those without any zero-valued feature
# (clean_data) and those with at least one (dirty_data). The 'label' column
# is ignored when building the row mask.
values_data = data.drop(columns=['label'])
mask = values_data.ne(0).all(axis=1)
clean_data = data.loc[mask]
dirty_data = data.loc[~mask]

### Plot number of missing data per feature and per patient after data cleanup

In [None]:
# Feature columns of the cleaned data; the 'label' column is left out.
values_clean_data = clean_data.drop(columns=['label'])

# Number of zero entries per feature (one point per column).
num_zeros_features = values_clean_data.eq(0).sum(axis=0)
plt.scatter(np.arange(len(num_zeros_features)), num_zeros_features)
plt.xlabel('Feature')
plt.ylabel('Number of missing data')
plt.title("Distribution of number of missing data per feature")
plt.show()

# Number of zero entries per patient (one point per row).
num_zeros_patients = values_clean_data.eq(0).sum(axis=1)
plt.scatter(np.arange(len(num_zeros_patients)), num_zeros_patients)
plt.xlabel('Patient')
plt.ylabel('Number of missing data')
plt.title("Distribution of number of missing data per patient")
plt.show()

## Split data into test and train data

In [None]:
# Split each class separately so that both classes keep the same 80/20
# train/test proportion.
# NOTE(review): the rows are taken from `data`, not from `clean_data` built in
# the cleanup step -- confirm this is intended.
ECG_ABNORMAL = data.loc[data['label'].eq(1)]
ECG_NORMAL = data.loc[data['label'].eq(0)]

x_train_abnormal, x_test_abnormal = model_selection.train_test_split(
    ECG_ABNORMAL,
    test_size=0.2,
    random_state=30,
)
x_train_normal, x_test_normal = model_selection.train_test_split(
    ECG_NORMAL,
    test_size=0.2,
    random_state=30,
)

# Recombine the per-class parts, normal rows first.
train_set = pd.concat([x_train_normal, x_train_abnormal])
test_set = pd.concat([x_test_normal, x_test_abnormal])


## Scale features of train and test set

In [None]:
# Two-stage scaling. Both scalers are fitted on the train set only and then
# applied to the train and the test set. The 'label' column is part of both
# frames and goes through the scalers together with the features.

# Stage 1: deal with outliers in the data
transformer = RobustScaler().fit(train_set)
train_set_robust = pd.DataFrame(transformer.transform(train_set))
test_set_robust = pd.DataFrame(transformer.transform(test_set))

# Stage 2: scaling the data between 0 and 1
scaler = MinMaxScaler(feature_range=(0, 1))
# fit the scaler on the (robust-scaled) train set
scaler.fit(train_set_robust)
# The min-max scaler was fitted on the robust-scaled data, so it has to be
# applied to that same data; applying it to the raw frames would bypass
# stage 1 and use minima/maxima that do not belong to the raw values.
# rename the column names to the ones from 'data' again
train_set_scaled = pd.DataFrame(scaler.transform(train_set_robust))
train_set_scaled.columns = data.columns.values
test_set_scaled = pd.DataFrame(scaler.transform(test_set_robust))
test_set_scaled.columns = data.columns.values

In [None]:
# Min-max scaling to the range [0, 1]; the scaler learns its parameters from
# the train set only.
scaler = MinMaxScaler(feature_range=(0, 1)).fit(train_set)

# Scale both sets with the train-set parameters and put the column names of
# 'data' back on the resulting frames.
train_set_scaled = pd.DataFrame(
    scaler.transform(train_set),
    columns=data.columns.values,
)
test_set_scaled = pd.DataFrame(
    scaler.transform(test_set),
    columns=data.columns.values,
)

In [None]:
# Carve a validation set out of the scaled train set, again per class so both
# classes are split 80/20.
ecg_train_abnormal = train_set_scaled.loc[train_set_scaled['label'].eq(1)]
ecg_train_normal = train_set_scaled.loc[train_set_scaled['label'].eq(0)]

v_train_abnormal, v_val_abnormal = model_selection.train_test_split(
    ecg_train_abnormal,
    test_size=0.2,
    random_state=30,
)
v_train_normal, v_val_normal = model_selection.train_test_split(
    ecg_train_normal,
    test_size=0.2,
    random_state=30,
)

# train_set_scaled is rebound here to the reduced train part (normal rows
# first); the held-out rows form validation_set.
train_set_scaled = pd.concat([v_train_normal, v_train_abnormal])
validation_set = pd.concat([v_val_normal, v_val_abnormal])

In [None]:
# Every column except the last is used as a feature; the last column is the
# target.
x = train_set_scaled.iloc[:, :-1].to_numpy()
y = train_set_scaled.iloc[:, -1].to_numpy()
X_val = validation_set.iloc[:, :-1].to_numpy()
y_val = validation_set.iloc[:, -1].to_numpy()

# Two principal components, fitted on the train features only; the validation
# features are projected with the same fitted PCA.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(x)
X_val_pca = pca.transform(X_val)

# Train samples in PCA space together with their target, for plotting.
df_pca = pd.DataFrame(X_pca, columns=['PC1', 'PC2']).assign(Target=y)

sns.scatterplot(data=df_pca, x='PC1', y='PC2', hue='Target')
plt.show()

print(pca.explained_variance_ratio_)

In [None]:
# Colorplot
def colorplot(clf, ax, x, y, h=100, precomputer=None):
    '''
    Overlay the decision areas of a classifier as colors in an axes.

    Input:
        clf: trained classifier; it is queried through decision_function,
            predict_proba or predict (the first one it provides, in that
            order)
        ax: axis to overlay color mesh on
        x: values of the feature on the x-axis (must provide min() and max())
        y: values of the feature on the y-axis (must provide min() and max())
        h(optional): number of steps the longer side of the mesh is cut into
        precomputer(optional): either an RBFSampler instance, whose transform
            is applied to the mesh points before they are passed to clf, or
            the rbf_kernel function itself, in which case the mesh points are
            replaced by their RBF kernel values (see the note in the body)
    '''
    # Pad the plotted range by 5% of the data span on every side.
    x_margin = (x.max() - x.min()) / 20.0
    y_margin = (y.max() - y.min()) / 20.0
    x_min, x_max = x.min() - x_margin, x.max() + x_margin
    y_min, y_max = y.min() - y_margin, y.max() + y_margin

    # One common step size for both axes, derived from the longer side.
    step = max(x_max - x_min, y_max - y_min) / h
    xx, yy = np.meshgrid(np.arange(x_min, x_max, step),
                         np.arange(y_min, y_max, step))

    # One row per mesh point: (x-coordinate, y-coordinate).
    features = np.c_[xx.ravel(), yy.ravel()]
    if precomputer is not None:
        if isinstance(precomputer, RBFSampler):
            features = precomputer.transform(features)
        elif precomputer is rbf_kernel:
            # NOTE(review): `X` is neither a parameter nor a local; it is
            # looked up in the module globals when this branch runs. Confirm
            # that it is defined (presumably as the training samples) before
            # passing rbf_kernel as precomputer.
            features = rbf_kernel(features, X)

    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh [x_min, x_max]x[y_min, y_max].
    if hasattr(clf, "decision_function"):
        Z = clf.decision_function(features)
    elif hasattr(clf, "predict_proba"):
        Z = clf.predict_proba(features)
    else:
        Z = clf.predict(features)

    # Two-dimensional output (one column per class): keep the second column.
    if Z.ndim > 1:
        Z = Z[:, 1]

    # Put the result into a color plot (colormap selected by name).
    ax.contourf(xx, yy, Z.reshape(xx.shape), cmap='RdBu_r', alpha=.8)

In [None]:
# KNN classifier, trained on the two PCA components of the train set
knn = KNeighborsClassifier()
knn.fit(X_pca, y)

# One axes only: the train points with the decision areas of the classifier
# overlaid. Requesting the subplot twice would stack a second axes on top of
# the first on current matplotlib versions.
fig = plt.figure(figsize=(8, 4))
ax = fig.add_subplot()
ax.scatter(X_pca[:, 0], X_pca[:, 1], marker='o', c=y, s=8, edgecolor='k',
           cmap=plt.cm.Paired)
colorplot(knn, ax, X_pca[:, 0], X_pca[:, 1])

# Predictions and true labels for the validation set
y_pred = pd.DataFrame(knn.predict(X_val_pca))
correct = pd.DataFrame(y_val)

# Compute the accuracy of the classifier; the same value, as a percentage,
# is shown in the plot title
accuracy = accuracy_score(correct, y_pred)
float_percentage = accuracy * 100
rounded_percentage = "{:.2f}".format(float_percentage)
t = "Correctly classified by KNN classifier: " + rounded_percentage + "%"
ax.set_title(t)
print(f"The accuracy of the classifier is {accuracy}")

# Compute the precision of the classifier
precision = precision_score(correct, y_pred)
print(f"The precision of the classifier is {precision}")

# Compute the recall of the classifier
recall = recall_score(correct, y_pred)
print(f"The recall of the classifier is {recall}")

# Compute the F1 score of the classifier
f1 = f1_score(correct, y_pred)
print(f"The F1 score of the classifier is {f1}")

# Compute the F-beta score with beta=2 (recall weighs more than precision)
fbeta = fbeta_score(correct, y_pred, beta=2)
print(f"The weighted F1 score of the classifier is {fbeta}")

In [None]:
# Quadratic Discriminant Analysis, trained on the two PCA components of the
# train set
qda = QuadraticDiscriminantAnalysis()
qda.fit(X_pca, y)

# One axes only: the train points with the decision areas of the classifier
# overlaid. Requesting the subplot twice would stack a second axes on top of
# the first on current matplotlib versions.
fig = plt.figure(figsize=(8, 4))
ax = fig.add_subplot()
ax.scatter(X_pca[:, 0], X_pca[:, 1], marker='o', c=y, s=8, edgecolor='k',
           cmap=plt.cm.Paired)
colorplot(qda, ax, X_pca[:, 0], X_pca[:, 1])

# Predictions and true labels for the validation set
y_pred = pd.DataFrame(qda.predict(X_val_pca))
correct = pd.DataFrame(y_val)

# Compute the accuracy of the classifier; the same value, as a percentage,
# is shown in the plot title
accuracy = accuracy_score(correct, y_pred)
float_percentage = accuracy * 100
rounded_percentage = "{:.2f}".format(float_percentage)
t = "Correctly classified by QDA classifier: " + rounded_percentage + "%"
ax.set_title(t)
print(f"The accuracy of the classifier is {accuracy}")

# Compute the precision of the classifier
precision = precision_score(correct, y_pred)
print(f"The precision of the classifier is {precision}")

# Compute the recall of the classifier
recall = recall_score(correct, y_pred)
print(f"The recall of the classifier is {recall}")

# Compute the F1 score of the classifier
f1 = f1_score(correct, y_pred)
print(f"The F1 score of the classifier is {f1}")

# Compute the F-beta score with beta=2 (recall weighs more than precision)
fbeta = fbeta_score(correct, y_pred, beta=2)
print(f"The weighted F1 score of the classifier is {fbeta}")

In [None]:
# Gaussian naive Bayes, trained on the two PCA components of the train set
gauss = GaussianNB()
gauss.fit(X_pca, y)

# One axes only: the train points with the decision areas of the classifier
# overlaid. Requesting the subplot twice would stack a second axes on top of
# the first on current matplotlib versions.
fig = plt.figure(figsize=(8, 4))
ax = fig.add_subplot()
ax.scatter(X_pca[:, 0], X_pca[:, 1], marker='o', c=y, s=8, edgecolor='k',
           cmap=plt.cm.Paired)
colorplot(gauss, ax, X_pca[:, 0], X_pca[:, 1])

# Predictions and true labels for the validation set
y_pred = pd.DataFrame(gauss.predict(X_val_pca))
correct = pd.DataFrame(y_val)

# Compute the accuracy of the classifier; the same value, as a percentage,
# is shown in the plot title
accuracy = accuracy_score(correct, y_pred)
float_percentage = accuracy * 100
rounded_percentage = "{:.2f}".format(float_percentage)
t = "Correctly classified by Gaussian classifier: " + rounded_percentage + "%"
ax.set_title(t)
print(f"The accuracy of the classifier is {accuracy}")

# Compute the precision of the classifier
precision = precision_score(correct, y_pred)
print(f"The precision of the classifier is {precision}")

# Compute the recall of the classifier
recall = recall_score(correct, y_pred)
print(f"The recall of the classifier is {recall}")

# Compute the F1 score of the classifier
f1 = f1_score(correct, y_pred)
print(f"The F1 score of the classifier is {f1}")

# Compute the F-beta score with beta=2 (recall weighs more than precision)
fbeta = fbeta_score(correct, y_pred, beta=2)
print(f"The weighted F1 score of the classifier is {fbeta}")

In [None]:
# SVM with RBF kernel, trained on the two PCA components of the train set
svmrbf = SVC(kernel='rbf', gamma='scale')
svmrbf.fit(X_pca, y)

# One axes only: the train points with the decision areas of the classifier
# overlaid. Requesting the subplot twice would stack a second axes on top of
# the first on current matplotlib versions.
fig = plt.figure(figsize=(8, 4))
ax = fig.add_subplot()
ax.scatter(X_pca[:, 0], X_pca[:, 1], marker='o', c=y, s=8, edgecolor='k',
           cmap=plt.cm.Paired)
colorplot(svmrbf, ax, X_pca[:, 0], X_pca[:, 1])

# Predictions and true labels for the validation set
y_pred = pd.DataFrame(svmrbf.predict(X_val_pca))
correct = pd.DataFrame(y_val)

# Compute the accuracy of the classifier; the same value, as a percentage,
# is shown in the plot title
accuracy = accuracy_score(correct, y_pred)
float_percentage = accuracy * 100
rounded_percentage = "{:.2f}".format(float_percentage)
t = "Correctly classified by SVM RBF classifier: " + rounded_percentage + "%"
ax.set_title(t)
print(f"The accuracy of the classifier is {accuracy}")

# Compute the precision of the classifier
precision = precision_score(correct, y_pred)
print(f"The precision of the classifier is {precision}")

# Compute the recall of the classifier
recall = recall_score(correct, y_pred)
print(f"The recall of the classifier is {recall}")

# Compute the F1 score of the classifier
f1 = f1_score(correct, y_pred)
print(f"The F1 score of the classifier is {f1}")

# Compute the F-beta score with beta=2 (recall weighs more than precision)
fbeta = fbeta_score(correct, y_pred, beta=2)
print(f"The weighted F1 score of the classifier is {fbeta}")


In [None]:
# SVM with a degree-3 polynomial kernel, trained on the two PCA components of
# the train set
svmpoly = SVC(kernel='poly', degree=3, gamma='scale')
svmpoly.fit(X_pca, y)

# One axes only: the train points with the decision areas of the classifier
# overlaid. Requesting the subplot twice would stack a second axes on top of
# the first on current matplotlib versions.
fig = plt.figure(figsize=(8, 4))
ax = fig.add_subplot()
ax.scatter(X_pca[:, 0], X_pca[:, 1], marker='o', c=y, s=8, edgecolor='k',
           cmap=plt.cm.Paired)
colorplot(svmpoly, ax, X_pca[:, 0], X_pca[:, 1])

# Predictions and true labels for the validation set
y_pred = pd.DataFrame(svmpoly.predict(X_val_pca))
correct = pd.DataFrame(y_val)

# Compute the accuracy of the classifier; the same value, as a percentage,
# is shown in the plot title
accuracy = accuracy_score(correct, y_pred)
float_percentage = accuracy * 100
rounded_percentage = "{:.2f}".format(float_percentage)
t = "Correctly classified by SVM polynomial classifier: " + rounded_percentage + "%"
ax.set_title(t)
print(f"The accuracy of the classifier is {accuracy}")

# Compute the precision of the classifier
precision = precision_score(correct, y_pred)
print(f"The precision of the classifier is {precision}")

# Compute the recall of the classifier
recall = recall_score(correct, y_pred)
print(f"The recall of the classifier is {recall}")

# Compute the F1 score of the classifier
f1 = f1_score(correct, y_pred)
print(f"The F1 score of the classifier is {f1}")

# Compute the F-beta score with beta=2 (recall weighs more than precision)
fbeta = fbeta_score(correct, y_pred, beta=2)
print(f"The weighted F1 score of the classifier is {fbeta}")

In [None]:
# Decision tree classifier, trained on the two PCA components of the train set
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_pca, y)

# One axes only: the train points with the decision areas of the classifier
# overlaid. Requesting the subplot twice would stack a second axes on top of
# the first on current matplotlib versions.
fig = plt.figure(figsize=(8, 4))
ax = fig.add_subplot()
ax.scatter(X_pca[:, 0], X_pca[:, 1], marker='o', c=y, s=8, edgecolor='k',
           cmap=plt.cm.Paired)
colorplot(decision_tree, ax, X_pca[:, 0], X_pca[:, 1])

# Predictions and true labels for the validation set
y_pred = pd.DataFrame(decision_tree.predict(X_val_pca))
correct = pd.DataFrame(y_val)

# Compute the accuracy of the classifier; the same value, as a percentage,
# is shown in the plot title
accuracy = accuracy_score(correct, y_pred)
float_percentage = accuracy * 100
rounded_percentage = "{:.2f}".format(float_percentage)
t = "Correctly classified by decision tree classifier: " + rounded_percentage + "%"
ax.set_title(t)
print(f"The accuracy of the classifier is {accuracy}")

# Compute the precision of the classifier
precision = precision_score(correct, y_pred)
print(f"The precision of the classifier is {precision}")

# Compute the recall of the classifier
recall = recall_score(correct, y_pred)
print(f"The recall of the classifier is {recall}")

# Compute the F1 score of the classifier
f1 = f1_score(correct, y_pred)
print(f"The F1 score of the classifier is {f1}")

# Compute the F-beta score with beta=2 (recall weighs more than precision)
fbeta = fbeta_score(correct, y_pred, beta=2)
print(f"The weighted F1 score of the classifier is {fbeta}")

In [None]:
# Random forest classifier, trained on the two PCA components of the train set
random_forest = RandomForestClassifier()
random_forest.fit(X_pca, y)

# One axes only: the train points with the decision areas of the classifier
# overlaid. Requesting the subplot twice would stack a second axes on top of
# the first on current matplotlib versions.
fig = plt.figure(figsize=(8, 4))
ax = fig.add_subplot()
ax.scatter(X_pca[:, 0], X_pca[:, 1], marker='o', c=y, s=8, edgecolor='k',
           cmap=plt.cm.Paired)
colorplot(random_forest, ax, X_pca[:, 0], X_pca[:, 1])

# Predictions and true labels for the validation set
y_pred = pd.DataFrame(random_forest.predict(X_val_pca))
correct = pd.DataFrame(y_val)

# Compute the accuracy of the classifier; the same value, as a percentage,
# is shown in the plot title
accuracy = accuracy_score(correct, y_pred)
float_percentage = accuracy * 100
rounded_percentage = "{:.2f}".format(float_percentage)
t = "Correctly classified by random forest classifier: " + rounded_percentage + "%"
ax.set_title(t)
print(f"The accuracy of the classifier is {accuracy}")

# Compute the precision of the classifier
precision = precision_score(correct, y_pred)
print(f"The precision of the classifier is {precision}")

# Compute the recall of the classifier
recall = recall_score(correct, y_pred)
print(f"The recall of the classifier is {recall}")

# Compute the F1 score of the classifier
f1 = f1_score(correct, y_pred)
print(f"The F1 score of the classifier is {f1}")

# Compute the F-beta score with beta=2 (recall weighs more than precision)
fbeta = fbeta_score(correct, y_pred, beta=2)
print(f"The weighted F1 score of the classifier is {fbeta}")