In [103]:
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
import sys
print('Python: {}'.format(sys.version))

import scipy
print('scipy: {}'.format(scipy.__version__))

import csv
import numpy as np
import sklearn
from sklearn.metrics import mean_squared_error
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.neural_network import MLPClassifier
from sklearn.decomposition import (PCA, LatentDirichletAllocation)
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.feature_selection import f_regression
from sklearn.svm import SVC
from sklearn.ensemble import (RandomForestClassifier, ExtraTreesClassifier, RandomForestRegressor, GradientBoostingClassifier )
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import make_scorer


import random as rn
from biosppy.signals import (ecg, tools)
import pywt
from tqdm import tqdm_notebook as tqdm

import math
from itertools import product
# ============= CONSTS =============
# Input/output CSV locations, relative to the notebook's working directory.
TRAIN_FILE_PATH = "X_train.csv"
TARGET_FILE_PATH = "y_train.csv"
TEST_FILE_PATH = "X_test.csv"
SAMPLE_FILE_PATH = "sample.csv"

seed = 42
# Number of signal columns (x0 .. x{NUM_MAX_COLS - 1}) assigned to each feature file.
NUM_MAX_COLS = 18154
# Alias kept because `my_cols` was previously built from this (undefined) name,
# which raised a NameError on a fresh kernel.
NUM_MAX_POINTS = NUM_MAX_COLS
# Sampling rate handed to the biosppy ECG routines (presumably Hz -- confirm against the data source).
SAMPLING_RATE = 300
USE_WAVE_LETS = False
# Column layout used when reading the feature CSVs: one id column followed by the signal samples.
my_cols = ["id"] + ["x" + str(i) for i in range(NUM_MAX_COLS)]
# ============= CONSTS =============

# Seed both RNGs used in this notebook so stochastic steps are repeatable.
np.random.seed(seed)
rn.seed(seed)

Python: 3.7.3 (default, Apr 24 2019, 15:29:51) [MSC v.1915 64 bit (AMD64)]
scipy: 1.2.1


# Loading Data

In [86]:
# The previous version parsed every line as data and then discarded the first
# parsed row with `[1:]` (presumably the file's own header line). Passing
# header=0 drops that same first line at parse time while `names` still
# supplies the column layout, so the columns are not forced to object dtype by
# header strings. Row labels now start at 0 instead of 1; the cells below only
# use positional (`iloc`) access.
xtrain = pd.read_csv(TRAIN_FILE_PATH, names=my_cols, header=0)
xtrain = xtrain.drop(columns="id")

ytrain = pd.read_csv(TARGET_FILE_PATH)
ytrain = ytrain.drop(columns="id")

xtest = pd.read_csv(TEST_FILE_PATH, names=my_cols, header=0)
# Keep the actual id values; `xtest.columns[0]` only yielded the label "id".
id_test = xtest["id"]
xtest = xtest.drop(columns="id")


# Define functions -- Submission and Feature Extraction

In [109]:
def mean_sqrd_diff(rpeaks):
    """Return the mean of the squared successive differences of ``rpeaks``.

    Parameters
    ----------
    rpeaks : array-like
        Sequence of numeric values (here: R-peak sample indices).

    Returns
    -------
    float
        ``mean(diff(rpeaks) ** 2)``, or NaN when fewer than two values are
        given and no difference can be formed.
    """
    # Cast to float so squaring large integer index gaps cannot overflow.
    diffs = np.diff(np.asarray(rpeaks, dtype=float))
    if diffs.size == 0:
        return float("nan")
    # np.square works element-wise; math.pow only accepts scalars.
    return float(np.mean(np.square(diffs)))
    

def make_submission(filename, predictions):
    """Write a submission CSV based on the sample submission file.

    Reads ``SAMPLE_FILE_PATH``, sets its ``y`` column to ``predictions`` and
    saves the result to ``filename`` without the index column.
    """
    submission = pd.read_csv(SAMPLE_FILE_PATH).assign(y=predictions)
    submission.to_csv(filename, index=False)

def _summary_stats(values):
    """Return ``[mean, min, max, std]`` of ``values`` (all zeros when empty)."""
    values = np.asarray(values, dtype=float)
    if values.size == 0:
        # np.min/np.max raise on empty input; 0 matches the NaN -> 0 policy
        # applied to the final feature vector.
        return [0.0, 0.0, 0.0, 0.0]
    return [np.mean(values), np.min(values), np.max(values), np.std(values)]


def get_features_from_raw_qrs(signal, sampling_rate):
    """Build a fixed-layout feature vector from one raw ECG signal.

    Parameters
    ----------
    signal : numpy.ndarray
        1-D raw signal; it is indexed with the detected R-peak positions.
    sampling_rate : float
        Sampling rate forwarded to the biosppy ECG routines.

    Returns
    -------
    numpy.ndarray
        1-D array with, in order: RMS of successive R-peak index differences;
        mean/min/max/std of the R-peak amplitudes, of the R-peak index
        differences, of the heart rate and of the heart-rate differences;
        mean/min/min/max/std of the heart-rate timestamp differences; the sum
        of ``filtered - signal``; then the per-sample mean, min and max over
        all heartbeat templates. NaN entries are replaced by 0.
    """
    _, filtered, rpeaks, _, templates, heart_rate_ts, heart_rate = ecg.ecg(
        signal, sampling_rate, show=False
    )

    # Correct R-peak locations to the maximum within a tolerance.
    # biosppy returns a ReturnTuple with a single 'rpeaks' field; unpack it so
    # the code below works on a plain 1-D index array.
    rpeaks = ecg.correct_rpeaks(
        signal=signal, rpeaks=rpeaks, sampling_rate=sampling_rate, tol=0.01
    )["rpeaks"]

    # rpeaks only holds the R-peak locations; look up the signal values there.
    peaks = signal[rpeaks]

    # Fall back to a minimal two-point series so the difference-based
    # statistics below have at least one element to work with.
    if len(heart_rate) < 2:
        heart_rate = [0, 1]
    if len(heart_rate_ts) < 2:
        heart_rate_ts = [0, 1]

    rr_diff = np.diff(rpeaks).astype(float)

    X = list()
    # RMS of successive differences, computed inline. The previous code passed
    # the function object `mean_sqrd_diff` itself to np.sqrt, which cannot work.
    X.append(np.sqrt(np.mean(np.square(rr_diff))) if rr_diff.size else 0.0)  ## remove if results worsen
    X += _summary_stats(peaks)
    X += _summary_stats(rr_diff)
    X += _summary_stats(heart_rate)
    X += _summary_stats(np.diff(heart_rate))

    ts_mean, ts_min, ts_max, ts_std = _summary_stats(np.diff(heart_rate_ts))
    # NOTE(review): the minimum appears twice, exactly as in the original
    # feature list (looks like a copy-paste slip). It is kept so the feature
    # vector length and column positions stay unchanged; drop one copy if no
    # saved model or feature cache depends on the layout.
    X += [ts_mean, ts_min, ts_min, ts_max, ts_std]

    X.append(np.sum(filtered - signal))

    X += list(np.mean(templates, axis=0))
    X += list(np.min(templates, axis=0))
    X += list(np.max(templates, axis=0))
    X = np.array(X)

    X[np.isnan(X)] = 0
    return X

# Obtain features from raw signal

In [88]:
def extract_feature_rows(frame, sampling_rate):
    """Return one feature vector per row of ``frame``.

    Each row is stripped of missing values, converted to a numeric array and
    passed to ``get_features_from_raw_qrs``. The result is a list with one
    feature array per row, in row order.
    """
    rows = list()
    # `row_idx` instead of `id`, so the builtin is not shadowed.
    for row_idx in tqdm(range(frame.shape[0])):
        signal = np.array(pd.to_numeric(frame.iloc[row_idx].dropna()))
        rows.append(get_features_from_raw_qrs(signal, sampling_rate))
    return rows


sampling_rate = float(SAMPLING_RATE)

features = extract_feature_rows(xtrain, sampling_rate)
X = np.array(features)
y = np.ravel(np.array(ytrain.values))

features_test = extract_feature_rows(xtest, sampling_rate)
X_test = np.array(features_test)

HBox(children=(IntProgress(value=0, max=5117), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3411), HTML(value='')))

# Create Model and Make Submission

In [93]:
## consider filtering class 0 beforehand

# Micro-averaged F1 is the model-selection metric for the grid search.
scorer_f1 = make_scorer(f1_score, greater_is_better=True, average='micro')

## GRADIENT BOOSTING APPROACH -- GRID-SEARCH CV
## (sklearn's GradientBoostingClassifier, not the xgboost package)

# The scaler lives inside the pipeline, so it is fitted on the training part of
# each CV split only and is applied automatically at prediction time.
steps = [("scaler", preprocessing.StandardScaler()),
         ("classifier", GradientBoostingClassifier(random_state=seed))]
pipeline = Pipeline(steps=steps)

parameters = {"classifier__n_estimators": [100],
              "classifier__max_depth": [5],    ## 20,25
              "classifier__learning_rate": [0.01],  ## 0.1, 0.01, 0.001
              "classifier__tol": [0.01]
             }

grid = GridSearchCV(pipeline, parameters, cv=2, scoring=scorer_f1, verbose=2)

grid.fit(X, y)
print(grid.best_score_)
print(grid.best_params_)

# GridSearchCV refits the best pipeline (scaler + classifier) on all of X, y by
# default. Reusing it replaces the hand-built estimator, which was given an
# invalid `fier__tol` keyword and was fitted on `xtrain_scaled` / `xtest_scaled`,
# names that are never defined in this notebook.
estimator = grid.best_estimator_
pred = estimator.predict(X_test)
make_submission("prediction_trial.csv", pred)


# Disabled experiment kept for reference: the same grid-search workflow with an
# SVC classifier. It sits inside a bare string literal, so none of it runs.
# NOTE(review): it uses `xtrain_scaled` / `xtest_scaled`, which are not defined
# in the cells visible here -- resolve that before re-enabling.
'''
## SVC APPROACH -- GRID-SEARCH CV

steps = [("scaler", preprocessing.StandardScaler()), ("classifier", SVC())]
pipeline = Pipeline(steps = steps)

parameters = {"classifier__kernel": ["rbf", "poly"],
              "classifier__gamma": ["auto"],
              "classifier__C": [15,30,45,60,75],  
              "classifier__class_weight": ["balanced"],
              "classifier__degree": [2,4,6,8]
             }

grid = GridSearchCV(pipeline, parameters, cv = 5, scoring = scorer_f1, verbose = 2)

grid.fit(X, y)
print(grid.best_score_)
print(grid.best_params_)

estimator = SVC(C = grid.best_params_['classifier__C'], gamma = 'auto', 
                class_weight = 'balanced', 
                kernel = grid.best_params_['classifier__kernel'], 
                degree = grid.best_params_['classifier__degree'])

estimator.fit(xtrain_scaled, y)
pred = estimator.predict(xtest_scaled)
make_submission("prediction_trial.csv", pred)
'''

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] classifier__learning_rate=0.01, classifier__max_depth=10, classifier__n_estimators=100, classifier__tol=0.01 
[CV]  classifier__learning_rate=0.01, classifier__max_depth=10, classifier__n_estimators=100, classifier__tol=0.01, total= 6.2min


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  6.2min remaining:    0.0s


[CV] classifier__learning_rate=0.01, classifier__max_depth=10, classifier__n_estimators=100, classifier__tol=0.01 
[CV]  classifier__learning_rate=0.01, classifier__max_depth=10, classifier__n_estimators=100, classifier__tol=0.01, total= 6.3min
[CV] classifier__learning_rate=0.01, classifier__max_depth=10, classifier__n_estimators=100, classifier__tol=0.01 


KeyboardInterrupt: 