In [3]:
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
import sys
print('Python: {}'.format(sys.version))

import scipy
print('scipy: {}'.format(scipy.__version__))

import csv
import numpy as np
import sklearn
from sklearn.metrics import mean_squared_error
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.neural_network import MLPClassifier
from sklearn.decomposition import (PCA, LatentDirichletAllocation)
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.feature_selection import f_regression
from sklearn.svm import SVC
from sklearn.ensemble import (RandomForestClassifier, ExtraTreesClassifier, RandomForestRegressor, GradientBoostingClassifier )
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import make_scorer


import random as rn
from biosppy.signals import (ecg, tools)
import pywt
from tqdm import tqdm_notebook as tqdm

import math
from itertools import product
# ============= CONSTS =============
TRAIN_FILE_PATH = "X_train.csv"
TARGET_FILE_PATH =  "y_train.csv"
TEST_FILE_PATH = "X_test.csv"
SAMPLE_FILE_PATH = "sample.csv"

seed = 42
NUM_MAX_COLS = 18154
SAMPLING_RATE=300
USE_WAVE_LETS = False
my_cols = ["id"] + ["x" + str(i) for i in range(NUM_MAX_COLS)]
# ============= CONSTS =============

np.random.seed(seed)
rn.seed(seed)

Python: 3.6.4 |Anaconda, Inc.| (default, Jan 16 2018, 12:04:33) 
[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)]
scipy: 1.3.1


# Loadind Data

In [4]:
xtrain = pd.read_csv(TRAIN_FILE_PATH, names=my_cols)[1:]
xtrain.drop("id", axis=1, inplace=True)

ytrain = pd.read_csv(TARGET_FILE_PATH)
ytrain.drop("id", axis=1, inplace = True)

xtest =  pd.read_csv(TEST_FILE_PATH, names=my_cols)[1:]
id_test = xtest.columns[0]
xtest.drop("id", axis=1, inplace=True)


# Define functions -- Submission and Feature Extraction

In [5]:
def mean_sqrd_diff(rpeaks):
    diff = np.diff(rpeaks)
    mean_sqrd = np.mean(diff*diff)
    
    return mean_sqrd 

def make_submission(filename, predictions):
    sample =  pd.read_csv(SAMPLE_FILE_PATH)
    sample["y"] = predictions
    sample.to_csv(filename, index= False)

def get_features_from_raw_qrs(signal, sampling_rate):
    X = list()
    ts, filtered, rpeaks, templates_ts, templates, heart_rate_ts, heart_rate = ecg.ecg(signal, sampling_rate, show = False)
    
    '''
    Correct R-peak locations to the maximum within a tolerance
    '''
    rpeaks = ecg.correct_rpeaks(signal = signal, rpeaks=rpeaks, sampling_rate = sampling_rate, tol=0.01)  
    
    '''
    extracting values of R-peaks -- Note: rpeaks gives only indices for R-peaks location
    '''
    peaks = signal[rpeaks]
    
    if len(heart_rate) < 2:
        heart_rate = [0, 1]
    if len(heart_rate_ts) < 2:
        heart_rate_ts = [0, 1]
           
    X.append(np.mean(peaks))
    X.append(np.min(peaks))
    X.append(np.max(peaks))
    X.append(np.std(peaks))
    X.append(np.sqrt(mean_sqrd_diff(rpeaks)))        ## remove if results worsen
    X.append(np.mean(np.diff(rpeaks)))
    X.append(np.min(np.diff(rpeaks)))
    X.append(np.max(np.diff(rpeaks)))
    X.append(np.std(np.diff(rpeaks)))           
    X.append(np.mean(heart_rate))
    X.append(np.min(heart_rate))
    X.append(np.max(heart_rate))
    X.append(np.std(heart_rate))
    X.append(np.mean(np.diff(heart_rate)))
    X.append(np.min(np.diff(heart_rate)))
    X.append(np.max(np.diff(heart_rate)))
    X.append(np.std(np.diff(heart_rate)))
    X.append(np.mean(np.diff(heart_rate_ts)))
    X.append(np.min(np.diff(heart_rate_ts)))
    X.append(np.min(np.diff(heart_rate_ts)))
    X.append(np.max(np.diff(heart_rate_ts)))
    X.append(np.std(np.diff(heart_rate_ts)))
    X.append(np.sum(filtered-signal))
    
    X += list(np.mean(templates, axis=0))
    X += list(np.abs(np.fft.rfft(np.mean(templates, axis=0), axis=0)))
    X += list(np.min(templates, axis=0))
    X += list(np.max(templates, axis=0))
    X = np.array(X)
    
    X[np.isnan(X)] = 0
    return X

# Obtain features from raw signal

In [6]:
features = list()
sampling_rate = float(SAMPLING_RATE)
for id in tqdm(range(xtrain.shape[0])):
    signal = np.array(pd.to_numeric(xtrain.iloc[id].dropna()))
    features.append(get_features_from_raw_qrs(signal, sampling_rate))
    
    
X = np.array(features)
y = np.ravel(np.array(ytrain.values))

features_test = list()
for id in tqdm(range(xtest.shape[0])):
    signal = np.array(pd.to_numeric(xtest.iloc[id].dropna()))
    features_test.append(get_features_from_raw_qrs(signal, sampling_rate))
    
X_test = np.array(features_test)







# Model Selection 

In [44]:
'''
create subset of initial dataframe X for model selection 
'''
X = pd.DataFrame(X) 
X['y'] = y
X_sub = pd.DataFrame(X).sample(frac = 0.90, replace = False, axis=0)
y_sub = X_sub['y']
X_sub = X_sub.drop('y', axis = 1).values
print(y_sub.shape, X_sub.shape)

(4605,) (4605, 654)


In [48]:
'''
might have to filter class 3 beforehand
'''

scorer_f1 = make_scorer(f1_score, greater_is_better=True, average='micro')

#scaler = StandardScaler() 
#scaler.fit(X_sub)

## SVC APPROACH -- GRID-SEARCH CV

steps = [("scaler", preprocessing.StandardScaler()), ("classifier", SVC())]
pipeline = Pipeline(steps = steps)

parameters = {"classifier__kernel": ["rbf"],
              "classifier__gamma": ["auto"],
              "classifier__C": [10, 25, 50, 75, 100],  
              "classifier__class_weight": ['balanced'],
              "classifier__degree": [1]
             }

grid = GridSearchCV(pipeline, parameters, cv = 3, scoring = scorer_f1, verbose = 2)

grid.fit(X_sub, y_sub)
print("Grid best scores and stats:", grid.best_score_)
print(grid.best_params_)

estimator = SVC(C = grid.best_params_['classifier__C'], gamma = 'auto', 
                class_weight = 'balanced', 
                kernel = grid.best_params_['classifier__kernel'], 
                degree = grid.best_params_['classifier__degree'])

# estimator.fit(xtrain_scaled, y)
# pred = estimator.predict(xtest_scaled)
# make_submission("prediction_trial.csv", pred)

Fitting 3 folds for each of 5 candidates, totalling 15 fits
[CV] classifier__C=10, classifier__class_weight=balanced, classifier__degree=1, classifier__gamma=auto, classifier__kernel=rbf 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  classifier__C=10, classifier__class_weight=balanced, classifier__degree=1, classifier__gamma=auto, classifier__kernel=rbf, total=   8.5s
[CV] classifier__C=10, classifier__class_weight=balanced, classifier__degree=1, classifier__gamma=auto, classifier__kernel=rbf 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    8.5s remaining:    0.0s


[CV]  classifier__C=10, classifier__class_weight=balanced, classifier__degree=1, classifier__gamma=auto, classifier__kernel=rbf, total=   8.2s
[CV] classifier__C=10, classifier__class_weight=balanced, classifier__degree=1, classifier__gamma=auto, classifier__kernel=rbf 
[CV]  classifier__C=10, classifier__class_weight=balanced, classifier__degree=1, classifier__gamma=auto, classifier__kernel=rbf, total=   7.8s
[CV] classifier__C=25, classifier__class_weight=balanced, classifier__degree=1, classifier__gamma=auto, classifier__kernel=rbf 
[CV]  classifier__C=25, classifier__class_weight=balanced, classifier__degree=1, classifier__gamma=auto, classifier__kernel=rbf, total=   7.5s
[CV] classifier__C=25, classifier__class_weight=balanced, classifier__degree=1, classifier__gamma=auto, classifier__kernel=rbf 
[CV]  classifier__C=25, classifier__class_weight=balanced, classifier__degree=1, classifier__gamma=auto, classifier__kernel=rbf, total=   8.6s
[CV] classifier__C=25, classifier__class_wei

[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:  2.0min finished


Grid best scores and stats: 0.7424538545059718
{'classifier__C': 10, 'classifier__class_weight': 'balanced', 'classifier__degree': 1, 'classifier__gamma': 'auto', 'classifier__kernel': 'rbf'}


In [47]:
scaler = StandardScaler() 
# scaler.fit(X)
X.drop("y", axis=1, inplace=True)
x_train_scaled = scaler.fit_transform(X)
x_test_scaled = scaler.fit_transform(X_test)
estimator.fit(x_train_scaled, y)
pred = estimator.predict(x_test_scaled)
make_submission("prediction_trial_svm_2.csv", pred)

# Conclusion: SVM best score on CV found was 0.74, best score on public 0.77.

In [37]:
scorer_f1 = make_scorer(f1_score, greater_is_better=True, average='micro')
'''
Gradient Boosting APPROACH -- GRID-SEARCH CV
''' 

steps = [("scaler", preprocessing.StandardScaler()), ("classifier", GradientBoostingClassifier())]
pipeline = Pipeline(steps = steps)

# parameters = {"classifier__max_depth": [5,10,15,20,25,40,50,100],
#               "classifier__n_estimators": [100,200,300],
#               "classifier__learning_rate": [3,1,0.3,0.05,0.01],
#               "classifier__max_features": [20,40,60,80]
#              }

# grid = GridSearchCV(pipeline, parameters, cv = 2, scoring = scorer_f1, verbose = 2,
#                    fit_params={'classifier__sample_weight': w_array}) -> Returns error with fit params

# Sample weights version!!

parameters = {"classifier__max_depth": [5],
              "classifier__n_estimators": [200],
              "classifier__learning_rate": [0.05],
              "classifier__max_features": [40]
             }
# Calculate the class_weight for the gradient booster
from sklearn.utils import class_weight
class_weights = list(class_weight.compute_class_weight('balanced',
                                             np.unique(y_sub),
                                             y_sub))
w_array = np.ones(y_sub.shape[0], dtype = 'float')
for i, val in enumerate(y_sub):
    w_array[i] = class_weights[val-1]
pipeline.set_params(classifier__max_depth = 5, classifier__n_estimators = 200,
              classifier__learning_rate = 0.05, classifier__max_features = 40)

res = cross_val_score(pipeline, X_sub, y_sub, cv=3, verbose=2, scoring=scorer_f1,
                         fit_params={'classifier__sample_weight': w_array})




# grid.fit(X_sub, y_sub)
# print(grid.best_score_)
# print(grid.best_params_)
print("RES:", res)

# scaler = StandardScaler() 
# # scaler.fit(X)
# x_train_scaled = scaler.fit_transform(X)
# x_test_scaled = scaler.fit_transform(X_test)
# estimator.fit(x_train_scaled, y)
# pred = estimator.predict(x_test_scaled)
# make_submission("prediction_trial_GBclassweight.csv", pred)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  ................................................................
[CV] ................................................. , total=  12.4s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   12.4s remaining:    0.0s


[CV] ................................................. , total=  12.9s
[CV]  ................................................................
[CV] ................................................. , total=  12.5s
RES: [0.76061493 0.76336996 0.7703595 ]


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   37.7s finished


# Conclusion: Score is better without class weights, so do not use class weights. 

In [None]:
# Try random forest:

scorer_f1 = make_scorer(f1_score, greater_is_better=True, average='micro')
'''
Gradient Boosting APPROACH -- GRID-SEARCH CV
''' 

steps = [("scaler", preprocessing.StandardScaler()), ("classifier", ())]
pipeline = Pipeline(steps = steps)

# parameters = {"classifier__max_depth": [5,10,15,20,25,40,50,100],
#               "classifier__n_estimators": [100,200,300],
#               "classifier__learning_rate": [3,1,0.3,0.05,0.01],
#               "classifier__max_features": [20,40,60,80]
#              }

# grid = GridSearchCV(pipeline, parameters, cv = 2, scoring = scorer_f1, verbose = 2,
#                    fit_params={'classifier__sample_weight': w_array}) -> Returns error with fit params

# Sample weights version!!

parameters = {"classifier__max_depth": [5],
              "classifier__n_estimators": [200],
              "classifier__learning_rate": [0.05],
              "classifier__max_features": [40]
             }
# Calculate the class_weight for the gradient booster
from sklearn.utils import class_weight
class_weights = list(class_weight.compute_class_weight('balanced',
                                             np.unique(y_sub),
                                             y_sub))
w_array = np.ones(y_sub.shape[0], dtype = 'float')
for i, val in enumerate(y_sub):
    w_array[i] = class_weights[val-1]
pipeline.set_params(classifier__max_depth = 5, classifier__n_estimators = 200,
              classifier__learning_rate = 0.05, classifier__max_features = 40)

res = cross_val_score(pipeline, X_sub, y_sub, cv=3, verbose=2, scoring=scorer_f1,
                         fit_params={'classifier__sample_weight': w_array})




# grid.fit(X_sub, y_sub)
# print(grid.best_score_)
# print(grid.best_params_)
print("RES:", res)

# scaler = StandardScaler() 
# # scaler.fit(X)
# x_train_scaled = scaler.fit_transform(X)
# x_test_scaled = scaler.fit_transform(X_test)
# estimator.fit(x_train_scaled, y)
# pred = estimator.predict(x_test_scaled)
# make_submission("prediction_trial_GBclassweight.csv", pred)

# Make Submission

In [218]:
scaler = StandardScaler() 
scaler.fit(X)
x_train_scaled = scaler.transform(X)
x_test_scaled = scaler.transform(X_test)

estimator = GradientBoostingClassifier(n_estimators = grid.best_params_['classifier__n_estimators'], 
                                       max_depth = grid.best_params_['classifier__max_depth'],
                                       learning_rate = grid.best_params_['classifier__learning_rate'], 
                                       max_features= grid.best_params_['classifier__max_features'])


estimator.fit(xtrain_scaled, y)
pred = estimator.predict(xtest_scaled)
make_submission("prediction_trial.csv", pred)