In [6]:
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF, Matern
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.metrics import f1_score, make_scorer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.preprocessing import StandardScaler, RobustScaler
from biosppy.signals import ecg
from scipy.signal import periodogram
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, RandomForestRegressor, GradientBoostingClassifier
import pywt
from sklearn.decomposition import PCA
from scipy.spatial.distance import euclidean


In [24]:
# Load the raw CSV files and strip the 'id' column from everything that is fed to
# the feature extraction. X_test_full keeps its 'id' column because the submission
# file is built from it.
X_train = pd.read_csv('X_train.csv').drop(columns='id')
X_test_full = pd.read_csv('X_test.csv')
y_train = pd.read_csv('y_train.csv').drop(columns='id')

X_test = X_test_full.drop(columns='id')

In [49]:
def _summary_stats(values):
    """Return [mean, median, min, max, std] of `values`, in that order."""
    return [
        np.mean(values),
        np.median(values),
        np.min(values),
        np.max(values),
        np.std(values),
    ]


def extract_features(ecg_sig, sampling_rate):
    """Build a 1-D feature vector for a single ECG recording.

    The recording is processed with ``ecg.ecg`` and the R-peak indices are then
    refined with ``ecg.correct_rpeaks``. The feature vector is laid out as:

    * five summary statistics (mean, median, min, max, std) for each of:
      the signal values at the R-peaks, the R-peak indices, the differences
      between consecutive R-peak indices, ``heart_rate``, its first
      differences, ``heart_rate_ts`` and its first differences (35 values);
    * the sum of ``filtered - ecg_sig`` (1 value);
    * the per-column mean, median, std, min and max of ``templates``
      (5 x templates.shape[1] values).

    Parameters
    ----------
    ecg_sig : numpy.ndarray
        1-D array holding the raw samples of one recording.
    sampling_rate : int or float
        Sampling rate forwarded to ``ecg.ecg`` and ``ecg.correct_rpeaks``.

    Returns
    -------
    numpy.ndarray
        1-D array of features. Entries may be NaN; nothing is zero-filled here.
    """
    _, filtered, rpeaks, _, templates, heart_rate_ts, heart_rate = ecg.ecg(
        ecg_sig, sampling_rate, show=False
    )

    # Move each R-peak index to the local maximum within the given tolerance.
    # correct_rpeaks returns a one-field result object, so take the index array
    # out by name instead of carrying the wrapping tuple into the statistics.
    rpeaks = ecg.correct_rpeaks(
        signal=ecg_sig, rpeaks=rpeaks, sampling_rate=sampling_rate, tol=0.01
    )["rpeaks"]

    # rpeaks only holds sample indices; look up the signal amplitude at each peak.
    peaks = ecg_sig[rpeaks]

    # With fewer than two heart-rate samples np.diff would be empty and np.min /
    # np.max would raise, so fall back to the placeholder series [0, 1].
    if len(heart_rate) < 2:
        heart_rate = [0, 1]
    if len(heart_rate_ts) < 2:
        heart_rate_ts = [0, 1]

    # Order matters: downstream code sees features purely by position.
    series_to_summarise = (
        peaks,
        rpeaks,
        np.diff(rpeaks),
        heart_rate,
        np.diff(heart_rate),
        heart_rate_ts,
        np.diff(heart_rate_ts),
    )

    features = []
    for series in series_to_summarise:
        features += _summary_stats(series)

    features.append(np.sum(filtered - ecg_sig))

    # Per-column template statistics. Note the order here is mean, median, std,
    # min, max, which differs from the order used by _summary_stats.
    for reducer in (np.mean, np.median, np.std, np.min, np.max):
        features += list(reducer(templates, axis=0))

    return np.array(features)

# Sampling rate handed to extract_features for every recording.
SAMPLING_RATE = 300


def build_feature_rows(signals, sampling_rate):
    """Run extract_features on every row of `signals`.

    Parameters
    ----------
    signals : pandas.DataFrame
        One recording per row; NaN cells in a row are dropped before the
        remaining samples are passed to extract_features.
    sampling_rate : int or float
        Forwarded unchanged to extract_features.

    Returns
    -------
    list of numpy.ndarray
        One feature vector per row, in row order.
    """
    rows = []
    # iterrows walks the frame positionally, so this does not depend on the
    # index labels being 0..n-1.
    for _, recording in signals.iterrows():
        samples = recording.dropna().to_numpy()
        rows.append(extract_features(samples, sampling_rate))
    return rows


train = build_feature_rows(X_train, SAMPLING_RATE)
test = build_feature_rows(X_test, SAMPLING_RATE)

# Flatten the label frame into a 1-D array (copied, so it is independent of y_train).
y = y_train.to_numpy(copy=True).ravel()




### Grid Search Hyperparameters

In [None]:
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import Matern
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV, cross_val_score
from sklearn.metrics import make_scorer, f1_score
from sklearn.gaussian_process.kernels import RBF, Matern, RationalQuadratic, ExpSineSquared, DotProduct, WhiteKernel
import numpy as np
from playsound import playsound


# Candidate kernels for the search. `param_grid` is passed to GridSearchCV below
# and has to be defined here, otherwise this cell raises NameError on a fresh
# kernel. NOTE(review): the kernel list is a starting point -- adjust as needed.
param_grid = {
    "kernel": [
        1.0 * RBF(length_scale=1.0),
        1.0 * Matern(length_scale=1.0, nu=1.5),
        1.0 * Matern(length_scale=1.0, nu=2.5),
        1.0 * RationalQuadratic(length_scale=1.0, alpha=1.0),
    ],
}

# Gaussian Process Classifier with a fixed seed so repeated runs agree
classifier = GaussianProcessClassifier(random_state=0)

# Expanding-window splits.
# NOTE(review): TimeSeriesSplit assumes the rows are in temporal order -- confirm
# that this holds for the recordings, otherwise a stratified K-fold is the usual choice.
tscv = TimeSeriesSplit(n_splits=3)

# Micro-averaged F1 as the model-selection metric
scorer = make_scorer(f1_score, average='micro')

# NOTE(review): imputation of `train` only happens in the "Fit Model" cell; if the
# extracted features still contain NaN at this point the fit below will reject them.
grid_search = GridSearchCV(
    classifier,
    param_grid=param_grid,
    cv=tscv,
    scoring=scorer,
    n_jobs=-1,
    verbose=3,
)
grid_search.fit(train, y)


# Report the winning parameter set and its mean cross-validated score
print("Best Parameters:", grid_search.best_params_)
print("Best Mean F1 Score:", grid_search.best_score_)


### Fit Model

In [50]:
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import Matern
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV, cross_val_score
from sklearn.metrics import make_scorer, f1_score
from sklearn.gaussian_process.kernels import RBF, Matern, RationalQuadratic, ExpSineSquared, DotProduct, WhiteKernel
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

import numpy as np
from playsound import playsound
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif, chi2
from scipy import stats

# Fill missing feature values. random_state is fixed because, with
# n_nearest_features set, IterativeImputer picks the predictor features at random
# and the imputed values would otherwise change from run to run.
imp = IterativeImputer(n_nearest_features=15, max_iter=50, random_state=0)

# The imputer is fitted on the training features only and then applied to test.
train = imp.fit_transform(train)
test = imp.transform(test)

# NOTE(review): RobustScaler.quantile_range is expressed in percentiles (0-100), so
# (0.3, 0.7) is the 0.3rd-0.7th percentile range, not 30%-70%. Left unchanged so
# the scaled values match earlier runs -- confirm whether (30, 70) was intended.
sc = RobustScaler(quantile_range=(0.3, 0.7))

train_nor = sc.fit_transform(train)

# Drop training rows in which any feature lies more than 7 standard deviations
# from its column mean (stats.zscore standardises column-wise by default).
z_scores = np.abs(stats.zscore(train_nor))
mask = ~np.any(z_scores > 7, axis=1)
train_nor = pd.DataFrame(train_nor[mask])
y_masked = pd.DataFrame(np.ravel(y[mask]))

# Test features are scaled with the statistics learned from the training set.
test_nor = sc.transform(test)
classifier = XGBClassifier(n_jobs=-1, eval_metric='merror', n_estimators=1000, max_depth=5, learning_rate=0.1)

scorer = make_scorer(f1_score, average='micro')

# cross_val_score was always running its default 5-fold split here (a
# TimeSeriesSplit object used to be created but was never passed in); the fold
# count is now stated explicitly so the code matches what is actually evaluated.
res = cross_val_score(classifier, train_nor, y_masked, scoring=scorer, cv=5, n_jobs=-1)
print(res)
print(np.mean(res))



[0.82297155 0.82613277 0.82613277 0.82086407 0.82489451]
0.824199134776558


In [45]:
# Train on the outlier-filtered training set, then label the scaled test features.
y_test = classifier.fit(train_nor, y_masked).predict(test_nor)

# Pair each prediction with the id of its test row and write the submission file.
submission = X_test_full[['id']].assign(y=y_test)
submission.to_csv('submission.csv', index=False)