In [1]:
import os
import time
import numpy as np
import pandas as pd
import scipy.io as sio
from scipy.fftpack import fft
from IPython.display import display

import pywt
import scipy.stats

import datetime as dt
from collections import defaultdict, Counter

from sklearn.ensemble import GradientBoostingClassifier

In [2]:
def calculate_entropy(list_values):
    counter_values = Counter(list_values).most_common()
    probabilities = [elem[1]/len(list_values) for elem in counter_values]
    entropy=scipy.stats.entropy(probabilities)
    return entropy

def calculate_statistics(list_values):
    n5 = np.nanpercentile(list_values, 5)
    n25 = np.nanpercentile(list_values, 25)
    n75 = np.nanpercentile(list_values, 75)
    n95 = np.nanpercentile(list_values, 95)
    median = np.nanpercentile(list_values, 50)
    mean = np.nanmean(list_values)
    std = np.nanstd(list_values)
    var = np.nanvar(list_values)
    rms = np.nanmean(np.sqrt(list_values**2))
    return [n5, n25, n75, n95, median, mean, std, var, rms]

def calculate_crossings(list_values):
    zero_crossing_indices = np.nonzero(np.diff(np.array(list_values) > 0))[0]
    no_zero_crossings = len(zero_crossing_indices)
    mean_crossing_indices = np.nonzero(np.diff(np.array(list_values) > np.nanmean(list_values)))[0]
    no_mean_crossings = len(mean_crossing_indices)
    return [no_zero_crossings, no_mean_crossings]

def get_features(list_values):
    entropy = calculate_entropy(list_values)
    crossings = calculate_crossings(list_values)
    statistics = calculate_statistics(list_values)
    return [entropy] + crossings + statistics

def get_uci_har_features(dataset, labels, waveletname):
    uci_har_features = []
    for signal_no in range(0, len(dataset)):
        features = []
        for signal_comp in range(0,dataset.shape[2]):
            signal = dataset[signal_no, :, signal_comp]
            list_coeff = pywt.wavedec(signal, waveletname)
            for coeff in list_coeff:
                features += get_features(coeff)
        uci_har_features.append(features)
    X = np.array(uci_har_features)
    Y = np.array(labels)
    return X, Y

def get_train_test(df, y_col, x_cols, ratio):
    """ 
    This method transforms a dataframe into a train and test set, for this you need to specify:
    1. the ratio train : test (usually 0.7)
    2. the column with the Y_values
    """
    mask = np.random.rand(len(df)) < ratio
    df_train = df[mask]
    df_test = df[~mask]
       
    Y_train = df_train[y_col].values
    Y_test = df_test[y_col].values
    X_train = df_train[x_cols].values
    X_test = df_test[x_cols].values
    return df_train, df_test, X_train, Y_train, X_test, Y_test

### load ECG data

In [4]:
filename = './data/ECGdata/ECGData.mat'
ecg_data = sio.loadmat(filename)
ecg_signals = ecg_data['ECGData'][0][0][0]
ecg_labels_ = ecg_data['ECGData'][0][0][1]
ecg_labels = list(map(lambda x: x[0][0], ecg_labels_))

dict_ecg_data = defaultdict(list)
for ii, label in enumerate(ecg_labels):
    dict_ecg_data[label].append(ecg_signals[ii])

In [21]:
ecg_data['ECGData'][0][0][0]

array([[-0.0978573 , -0.15688126, -0.13781616, ..., -0.07490498,
        -0.11652732, -0.18042101],
       [ 0.1193611 ,  0.18755853,  0.16956925, ..., -0.09721826,
        -0.12393573, -0.10648055],
       [-0.02460175, -0.03627935, -0.0359677 , ..., -0.18614223,
        -0.12998497, -0.12235631],
       ...,
       [-0.355     , -0.355     , -0.345     , ..., -0.335     ,
        -0.315     , -0.315     ],
       [-0.275     , -0.245     , -0.285     , ..., -0.205     ,
        -0.145     , -0.165     ],
       [ 0.125     ,  0.005     ,  0.025     , ..., -0.225     ,
        -0.275     , -0.205     ]])

In [23]:
ecg_labels

['ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'ARR',
 'CHF',
 'CHF',
 'CHF',
 'CHF',
 'CHF',
 'CHF',
 'CHF',
 'CHF',
 'CHF',
 'CHF',
 'CHF',
 'CHF',
 'CHF',
 'CHF',
 'CHF',
 'CHF',
 'CHF',
 'CHF',
 'CHF',
 'CHF',
 'CHF',
 'CHF',
 'CHF',
 'CHF',
 'CHF',
 'CHF',
 'CHF',
 'CHF',
 'CHF',


### calculate features

In [9]:
list_labels = []
list_features = []
for k, v in dict_ecg_data.items():
    yval = list(dict_ecg_data.keys()).index(k)
    for signal in v:
        features = []
        list_labels.append(yval)
        list_coeff = pywt.wavedec(signal, 'db4')
        for coeff in list_coeff:
            features += get_features(coeff)
        list_features.append(features)
df = pd.DataFrame(list_features)
ycol = 'y'
xcols = list(range(df.shape[1]))
df.loc[:,ycol] = list_labels

df_train, df_test, X_train, Y_train, X_test, Y_test = get_train_test(df, ycol, xcols, ratio = 0.8)

In [12]:
df_train.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,159,160,161,162,163,164,165,166,167,y
count,126.0,126.0,126.0,126.0,126.0,126.0,126.0,126.0,126.0,126.0,...,126.0,126.0,126.0,126.0,126.0,126.0,126.0,126.0,126.0,126.0
mean,2.639057,0.507937,2.230159,-31.280279,-27.655234,-9.213527,-5.88401,-17.829613,-18.326657,10.727835,...,-0.050284,-0.01091,0.010916,0.052765,-3.4e-05,-7e-06,0.054018,0.003936,0.024688,0.674603
std,8.917241e-16,0.969503,1.544863,33.567841,31.800501,24.631038,24.187067,20.83734,23.525696,12.899415,...,0.03017,0.004079,0.004079,0.034303,0.00054,0.000499,0.032029,0.004688,0.011573,0.856312
min,2.639057,0.0,1.0,-202.61785,-185.895101,-78.446543,-72.079237,-81.230428,-87.41928,0.725577,...,-0.18427,-0.038768,0.004015,0.014857,-0.002285,-0.002255,0.011022,0.000121,0.00766,0.0
25%,2.639057,0.0,1.0,-37.138273,-33.852034,-20.076138,-16.827191,-26.300359,-27.885167,4.34781,...,-0.065011,-0.012397,0.008373,0.026086,-0.000269,-3.7e-05,0.028178,0.000794,0.016028,0.0
50%,2.639057,0.0,2.0,-23.483966,-20.786922,-6.763839,-5.432084,-12.300213,-13.411266,6.092616,...,-0.041971,-0.010573,0.010563,0.039669,-4.8e-05,4e-06,0.047382,0.002245,0.021624,0.0
75%,2.639057,1.0,3.0,-10.065795,-7.685729,6.92802,10.524595,-5.637314,-4.19419,11.850306,...,-0.027775,-0.008495,0.012461,0.072577,0.000181,5.2e-05,0.069301,0.004803,0.031683,1.75
max,2.639057,5.0,7.0,8.553686,11.497213,60.379769,66.102062,13.580764,25.958356,82.204673,...,-0.015464,-0.004393,0.038426,0.19647,0.001687,0.001487,0.161185,0.02598,0.063793,2.0


In [10]:
cls = GradientBoostingClassifier(n_estimators=2000)
cls.fit(X_train, Y_train)
train_score = cls.score(X_train, Y_train)
test_score = cls.score(X_test, Y_test)
print("The Train Score is {}".format(train_score))
print("The Test Score is {}".format(test_score))

The Train Score is 1.0
The Test Score is 0.9444444444444444
