# Final Script -- 28/11/2019

In [3]:
# Fix the random seeds up front so that sampling and model fitting are
# repeatable. The seed has to be assigned, and the RNG modules imported,
# before they are used, otherwise this cell fails on a fresh kernel.
import random as rn

import numpy as np

seed = 10
np.random.seed(seed)
rn.seed(seed)

%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

import random as rn
import pywt
import math
from itertools import product
from tqdm import tqdm_notebook as tqdm
import scipy
import csv
import numpy as np
import sys
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import classification_report
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.decomposition import (PCA, LatentDirichletAllocation)
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.feature_selection import f_regression
from sklearn.svm import SVC
from sklearn.ensemble import (RandomForestClassifier, ExtraTreesClassifier, RandomForestRegressor, GradientBoostingClassifier )
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import make_scorer
from sklearn.impute import SimpleImputer
import xgboost as xgb
from biosppy.signals import (ecg, tools)

# Locations of the input files and of the submission template.
sample_path = "sample.csv"
train_path = "X_train.csv"
target_path =  "y_train.csv"
test_path = "X_test.csv"

# Global settings used further down in the script.
wavelets = False
num_cols = 18154
sampling_rate = 300

# Column labels: the id column followed by x0 .. x<num_cols - 1>.
cols = ["id"]
cols.extend("x%d" % i for i in range(num_cols))


# Load Data

In [6]:
# Load the raw recordings. `header=0` tells pandas that the first line of
# the file is a header to be replaced by `cols`, so the header row is not
# read as data (which previously forced every column to strings and
# required slicing the first row off afterwards).
xtrain = pd.read_csv(train_path, names=cols, header=0)
xtrain.drop("id", axis=1, inplace=True)

xtest = pd.read_csv(test_path, names=cols, header=0)
# Keep the ids of the test rows; `pop` removes the column from `xtest`
# and returns its values (the old code stored the column *label* "id").
id_test = xtest.pop("id")

# Targets: one label per training row, id column discarded.
ytrain = pd.read_csv(target_path)
ytrain.drop("id", axis=1, inplace=True)

print(xtrain.shape, xtest.shape, ytrain.shape)

(5117, 18154) (3411, 18154) (5117, 1)


# Define functions -- (Feature Extraction and Submission)

In [7]:
def mean_sqrd_diff(rpeaks):
    """Return the mean of the squared successive differences of *rpeaks*."""
    gaps = np.diff(rpeaks)
    return np.mean(np.square(gaps))

def _summary_stats(values):
    """Return [mean, median, min, max, std] of *values*, in that order."""
    return [np.mean(values), np.median(values), np.min(values),
            np.max(values), np.std(values)]


def obtain_features(signal, sampling_rate):
    """Build the feature vector for one ECG recording.

    Parameters
    ----------
    signal : numpy array
        Raw samples of one recording (indexed with the R-peak positions).
    sampling_rate : number
        Sampling frequency, forwarded to the biosppy routines.

    Returns
    -------
    numpy array
        38 scalar statistics (R-peak amplitudes, R-peak positions and their
        successive differences, heart rate, heart-rate time stamps, and the
        summed difference between filtered and raw signal) followed by the
        per-sample mean, median, min, max and std of the heartbeat templates.
        The order of the entries is the same as in earlier versions of this
        function, so previously tuned models stay compatible.
    """
    # Features obtained from biosppy; only some of the outputs are used.
    (_, filtered, rpeaks, _, templates,
     heart_rate_ts, heart_rate) = ecg.ecg(signal, sampling_rate, show=False)

    # Correct the R-peak locations to the local maximum of the raw signal,
    # within a small tolerance. The result is unpacked from the returned
    # one-element tuple so that `rpeaks` is a plain 1-D index array rather
    # than a tuple that numpy would silently treat as a (1, n) array.
    (rpeaks,) = ecg.correct_rpeaks(signal=signal, rpeaks=rpeaks,
                                   sampling_rate=sampling_rate, tol=0.01)

    # `rpeaks` only holds indices; look up the signal values at the peaks.
    peak_values = signal[rpeaks]

    # Fall back to a dummy two-point series when fewer than two heart-rate
    # values are available, so the difference statistics below are defined.
    if len(heart_rate) < 2:
        heart_rate = [0, 1]
    if len(heart_rate_ts) < 2:
        heart_rate_ts = [0, 1]

    hr_ts_diff = np.diff(heart_rate_ts)

    X = _summary_stats(peak_values)
    X.append(np.sqrt(mean_sqrd_diff(rpeaks)))
    X += _summary_stats(rpeaks)
    X += _summary_stats(np.diff(rpeaks))
    X += _summary_stats(heart_rate)
    X += _summary_stats(np.diff(heart_rate))
    X += _summary_stats(heart_rate_ts)
    # NOTE: the minimum appears twice here. The duplicate is redundant but
    # is kept on purpose so that the length and column order of the feature
    # vector do not change.
    X += [np.mean(hr_ts_diff), np.median(hr_ts_diff),
          np.min(hr_ts_diff), np.min(hr_ts_diff),
          np.max(hr_ts_diff), np.std(hr_ts_diff)]
    X.append(np.sum(filtered - signal))

    # Per-sample statistics across all heartbeat templates.
    # (FFT features of the mean template were tried and removed: no gain.)
    for reducer in (np.mean, np.median, np.min, np.max, np.std):
        X += list(reducer(templates, axis=0))

    # Convert to a numpy array for later analysis.
    return np.array(X)

def make_submission(name_csv, predictions_vec):
    """Write *predictions_vec* into the "y" column of the sample file layout.

    The template is read from `sample_path`; the result is saved to
    *name_csv* without the index column.
    """
    submission = pd.read_csv(sample_path)
    submission["y"] = predictions_vec
    submission.to_csv(name_csv, index=False)

# Obtain features from raw signal

In [12]:
sampling_rate = float(sampling_rate)

# Each row is one recording; missing trailing values are dropped before the
# remaining samples are converted to a numeric array. Comprehensions are used
# so that no loop variable shadows the builtin `id` at module level.
hb_features_train = [
    obtain_features(np.array(pd.to_numeric(xtrain.iloc[row].dropna())),
                    sampling_rate)
    for row in tqdm(range(xtrain.shape[0]))
]

hb_features_test = [
    obtain_features(np.array(pd.to_numeric(xtest.iloc[row].dropna())),
                    sampling_rate)
    for row in tqdm(range(xtest.shape[0]))
]

X_train = np.array(hb_features_train)
y_train = np.ravel(np.array(ytrain.values))
X_test = np.array(hb_features_test)

print(X_train.shape, y_train.shape, X_test.shape)

HBox(children=(IntProgress(value=0, max=5117), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3411), HTML(value='')))

(5117, 938) (5117,) (3411, 938)


# Model Selection 

In [14]:
# Draw a random 40% subset (without replacement) of the training features,
# together with the matching labels, for use during model selection.
X_train = pd.DataFrame(X_train)
labelled = X_train.assign(y=y_train)
subset = labelled.sample(frac=0.40, replace=False, axis=0)

y_sub = subset['y']
X_sub = subset.drop('y', axis=1).values
print(y_sub.shape, X_sub.shape)

# Score function: micro-averaged F1, higher is better.
scorer_f1 = make_scorer(f1_score, greater_is_better=True, average='micro')

(2047,) (2047, 938)


In [7]:
# Gradient Boosting approach -- grid-search CV over an
# impute -> scale -> classify pipeline.
steps = [
    ("impute", SimpleImputer()),
    ("scaler", StandardScaler()),
    ("classifier", GradientBoostingClassifier()),
]
pipeline = Pipeline(steps=steps)

# Candidate values; keys address pipeline steps as "<step>__<parameter>".
parameters = {
    "impute__strategy": ["mean", "median", "constant"],
    "impute__fill_value": [0],
    "classifier__max_depth": [3, 4, 5, 6, 7, 8],
    "classifier__n_estimators": [200, 250, 300],
    "classifier__learning_rate": [0.1, 0.08, 0.05, 0.03],
    "classifier__max_features": [40, 50, 60],
}

grid = GridSearchCV(
    pipeline,
    parameters,
    cv=2,
    scoring=scorer_f1,
    verbose=1,
)
grid.fit(X_train, y_train)

# Best cross-validated score, then the parameter combination behind it.
print(grid.best_score_, grid.best_params_, sep="\n")


# Disabled experiment (SVC grid search). The code is kept inside a bare
# string literal, so none of it is executed. It refers to `X`, `y`,
# `xtrain_scaled` and `xtest_scaled`, which are not assigned anywhere in the
# visible part of this file -- define them before re-enabling it.
'''
## SVC APPROACH -- GRID-SEARCH CV

steps = [("scaler", preprocessing.StandardScaler()), ("classifier", SVC())]
pipeline = Pipeline(steps = steps)

parameters = {"classifier__kernel": ["rbf", "poly"],
              "classifier__gamma": ["auto"],
              "classifier__C": [15,30,45,60,75],  
              "classifier__class_weight": ["balanced"],
              "classifier__degree": [2,4,6,8]
             }

grid = GridSearchCV(pipeline, parameters, cv = 5, scoring = scorer_f1, verbose = 2)

grid.fit(X, y)
print(grid.best_score_)
print(grid.best_params_)

estimator = SVC(C = grid.best_params_['classifier__C'], gamma = 'auto', 
                class_weight = 'balanced', 
                kernel = grid.best_params_['classifier__kernel'], 
                degree = grid.best_params_['classifier__degree'])

estimator.fit(xtrain_scaled, y)
pred = estimator.predict(xtest_scaled)
make_submission("prediction_trial.csv", pred)
'''

# Disabled experiment (XGBoost grid search on the 40% subset), also kept as
# a bare string literal and therefore not executed. Because it is the last
# expression of the notebook cell, its text is echoed in the cell output.
'''
## XGB APPROACH -- GRID-SEARCH CV


steps = [("scaler", preprocessing.StandardScaler()), ("classifier", xgb.XGBClassifier())]
pipeline = Pipeline(steps = steps)

parameters = {"classifier__max_depth": [5,10,15],
              "classifier__n_estimators": [200],
              "classifier__learning_rate": [0.05,0.1],
              "classifier__max_features": [20,40]
             }

grid = GridSearchCV(pipeline, parameters, cv = 2, scoring = scorer_f1, verbose = 1)

grid.fit(X_sub, y_sub)
print(grid.best_score_)
print(grid.best_params_)

'''

Fitting 2 folds for each of 648 candidates, totalling 1296 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1296 out of 1296 | elapsed: 577.2min finished


0.8151260504201681
{'classifier__learning_rate': 0.1, 'classifier__max_depth': 5, 'classifier__max_features': 60, 'classifier__n_estimators': 250, 'impute__fill_value': 0, 'impute__strategy': 'median'}


'\n## XGB APPROACH -- GRID-SEARCH CV\n\n\nsteps = [("scaler", preprocessing.StandardScaler()), ("classifier", xgb.XGBClassifier())]\npipeline = Pipeline(steps = steps)\n\nparameters = {"classifier__max_depth": [5,10,15],\n              "classifier__n_estimators": [200],\n              "classifier__learning_rate": [0.05,0.1],\n              "classifier__max_features": [20,40]\n             }\n\ngrid = GridSearchCV(pipeline, parameters, cv = 2, scoring = scorer_f1, verbose = 1)\n\ngrid.fit(X_sub, y_sub)\nprint(grid.best_score_)\nprint(grid.best_params_)\n\n'

# Make Submission

In [20]:
# Replace NaNs with the per-column medians. The medians are learned from the
# training set only and then applied unchanged to the test set, so both sets
# go through exactly the same transformation (re-fitting on the test set
# would impute it with different values than the model was trained on).
impute = SimpleImputer(strategy='median')
X_train = impute.fit_transform(X_train)
X_test = impute.transform(X_test)

# Rescale the data, again with statistics taken from the training set only.
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(X_train)
x_test_scaled = scaler.transform(X_test)

# Use the best parameters reported by the grid search.
estimator = GradientBoostingClassifier(n_estimators=250,
                                       max_depth=5,
                                       learning_rate=0.1,
                                       max_features=60)

# Alternative: take the values directly from a fitted `grid` object.
# estimator = GradientBoostingClassifier(
#     n_estimators=grid.best_params_['classifier__n_estimators'],
#     max_depth=grid.best_params_['classifier__max_depth'],
#     learning_rate=grid.best_params_['classifier__learning_rate'],
#     max_features=grid.best_params_['classifier__max_features'])

# Fit on the full training set, predict the test set, write the submission.
estimator.fit(x_train_scaled, y_train)
predictions = estimator.predict(x_test_scaled)
make_submission("FINAL_SUBMISSION.csv", predictions)