In [None]:
# import specialised modules
# patch_sklearn() has to run BEFORE scikit-learn is imported: names that were
# already bound via `from sklearn... import ...` keep the stock implementation
# and would not benefit from the accelerated versions.
from sklearnex import patch_sklearn
patch_sklearn()

# import general modules
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import KNNImputer
from sklearn.svm import OneClassSVM
import xgboost as xgb
from helpers import *
import pandas as pd
import numpy as np

## Feature calculation

Features are calculated for the whole data set. This cell only shows how it was done; the results are already saved as a CSV file.

In [None]:
# Set to True to recompute the features from the raw signals; otherwise the
# cached table is loaded.
new_features = False
# One location for the cached feature table, so that a recomputation updates
# the very file that is read back when new_features is False.
features_path = 'features.csv'

if new_features:
    # prepare the X data for analysis
    X_ = pd.read_csv('X_train.csv', engine='c')
    X_.drop(columns='id', inplace=True)
    # transform the data: MinMaxScaler scales column-wise, so the frame is
    # transposed to scale each row to [-1, 1] and then transposed back
    scaler = MinMaxScaler(feature_range=(-1,1))
    X_processed = pd.DataFrame(scaler.fit_transform(X_.transpose()).transpose())
    # convert the frame to a list of np arrays and remove the nans from each row
    X_processed = [item[~np.isnan(item)] for item in X_processed.to_numpy()]
    # extracting noise features
    noise_df = extract_noise_features(X_processed)
    X_pp = preprocess(X_processed)
    # extract temporal and frequency-domain features
    df = extract_features(X_pp)
    df = pd.concat([df, noise_df], axis=1)
    # the index is written as well; it comes back as 'Unnamed: 0' and is
    # dropped again when the cache is loaded below
    df.to_csv(features_path)
else:
    df  = pd.read_csv(features_path).drop(['Unnamed: 0'],axis=1)

In [None]:
# summary statistics over every column of the feature table
df.describe(include="all")

KNN imputation was used to fill in missing data. The feature-extraction libraries sometimes fail, especially in the noisy class, since the ECG libraries
try to find PQRST templates nonetheless.

In [None]:
# replace inf with nan so the imputer treats them as missing values
# (np.nan instead of np.NaN: the capitalised alias was removed in NumPy 2.0)
df = df.replace([np.inf, -np.inf], np.nan)
# fill every missing entry from the 4 nearest rows, weighted by inverse distance
imp = KNNImputer(n_neighbors=4, weights='distance')
df_X = pd.DataFrame(imp.fit_transform(df), columns=df.columns)

In [None]:
# load the labels and discard the id column in one chained expression
y = pd.read_csv('y_train.csv').drop(columns='id')

This would be the further processing of the time-series data.

Remove outliers with a one class SVM

In [None]:
# number of rows before filtering
features_before = len(df_X)

# NOTE(review): nu=0.995 is a very high value for OneClassSVM; what
# ExperimentalTransformer does with the estimator is defined in helpers and
# not visible here - confirm the value is intended.
trans = ExperimentalTransformer(OneClassSVM(nu=0.995))
trans.fit(df_X, y)

# the transformer returns the filtered features together with the matching labels
X_selection, y = trans.transform(df_X, y)
df_X = X_selection

# number of rows after filtering
features_after = X_selection.shape[0]
print(f'Data size reduced from {features_before} to {features_after}')

Find the most important features with a random forest, since different libraries are used and they showed very inconsistent performance.
The 40 best features are used.

In [None]:
# feature importance (optional)
from sklearn.feature_selection import RFE

rfe = RFE(estimator=RandomForestClassifier(n_estimators=200, n_jobs=-1), n_features_to_select=40, step=1)
rfe.fit(df_X, y.values.ravel())
ranking = rfe.ranking_
ranked = pd.DataFrame(columns=['rank','feature'])
ranked['feature']= df.columns
ranked['rank'] = ranking
rank = ranked.sort_values('rank')
df_X = rfe.transform(df_X)

Standard train test split

In [None]:
# Split df_X rather than X_selection: df_X carries the result of the optional
# feature-selection step when it was run, and equals X_selection otherwise.
X_train, X_test, y_train, y_test = train_test_split(df_X, y, random_state=42)

## Model

XGB is often used for tabular data for many reasons, and its potential was again shown in the recent paper [Tabular Data: Deep Learning is Not All You Need](https://arxiv.org/abs/2106.03253).

The hyperopt package is used to find the hyperparameters that maximize the micro-averaged F1 score on the held-out split; AUC serves as the early-stopping metric during training.

In [None]:
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
import warnings
warnings.filterwarnings('ignore')

# Hyperparameter search space handed to hyperopt. The hp.* entries are sampled
# per trial; 'n_estimators' and 'seed' are fixed values.
space = dict(
    max_depth=hp.quniform("max_depth", 3, 18, 1),
    gamma=hp.uniform('gamma', 1, 9),
    reg_alpha=hp.quniform('reg_alpha', 40, 180, 1),
    reg_lambda=hp.uniform('reg_lambda', 0, 1),
    colsample_bytree=hp.uniform('colsample_bytree', 0.5, 1),
    min_child_weight=hp.quniform('min_child_weight', 0, 10, 1),
    n_estimators=1000,
    seed=0,
)

def objective(space):
    """Train one XGBoost candidate and report its loss to hyperopt.

    Parameters
    ----------
    space : dict
        One sample of the search space. The keys 'n_estimators', 'max_depth',
        'gamma', 'reg_alpha', 'min_child_weight' and 'colsample_bytree' are
        required; 'reg_lambda' and 'seed' are used when present.

    Returns
    -------
    dict
        {'loss': -f1, 'status': STATUS_OK}, where f1 is the micro-averaged F1
        score of the candidate on X_test. The score is negated because
        hyperopt minimises the loss.
    """
    clf = xgb.XGBClassifier(
        use_label_encoder=False, n_jobs=-1,
        n_estimators=space['n_estimators'],
        # quniform samples arrive as floats, the tree parameters below expect ints
        max_depth=int(space['max_depth']),
        gamma=space['gamma'],
        reg_alpha=int(space['reg_alpha']),
        # None lets XGBoost fall back to its own default when the key is absent
        reg_lambda=space.get('reg_lambda'),
        min_child_weight=int(space['min_child_weight']),
        # must stay a float: int() would truncate every value in [0.5, 1) to 0
        colsample_bytree=float(space['colsample_bytree']),
        random_state=space.get('seed'))

    evaluation = [(X_train, y_train.values.ravel()),
                  (X_test, y_test.values.ravel())]

    # NOTE(review): X_test drives early stopping here and is also the set the
    # trial is scored on, so the reported score is optimistic - consider a
    # separate validation split.
    # NOTE(review): passing eval_metric / early_stopping_rounds to fit() is
    # deprecated in recent XGBoost releases - confirm against the installed version.
    clf.fit(X_train, y_train.values.ravel(),
            eval_set=evaluation, eval_metric="auc",
            early_stopping_rounds=10, verbose=False)

    pred = clf.predict(X_test)
    micro_f1 = f1_score(y_test.values.ravel(), pred, average='micro')
    return {'loss': -micro_f1, 'status': STATUS_OK}

In [None]:
trials = Trials()

best_hyperparams = fmin(fn = objective,
                        space = space,
                        algo = tpe.suggest,
                        max_evals = 500,
                        trials = trials)

# fmin returns the raw sampled values keyed by their hp label. The quniform
# ones come back as floats, so cast them the same way objective() does.
int_params = ('max_depth', 'reg_alpha', 'min_child_weight')
best_params = {name: int(value) if name in int_params else value
               for name, value in best_hyperparams.items()}

# The parameters must be passed as keyword arguments; handing the dict over
# positionally does not configure the model. The fixed entries of the search
# space (n_estimators, seed) are not part of fmin's result and are taken from
# `space` directly.
# NOTE(review): no early stopping is applied in this final fit, so all
# n_estimators trees are built - confirm this is intended.
best_xgb = xgb.XGBClassifier(use_label_encoder=False, n_jobs=-1,
                             n_estimators=space['n_estimators'],
                             random_state=space['seed'],
                             **best_params)
best_xgb.fit(X_train, y_train.values.ravel(), verbose=False)

In [None]:
# predict once and reuse the result for both scores
y_pred = best_xgb.predict(X_test)
# per-class F1 scores
print(f1_score(y_test, y_pred, average=None))
# micro-averaged F1 over all classes
print(f1_score(y_test, y_pred, average='micro'))

Before using the model on the submission, it was finetuned on the whole data, since we should make use of all data we have.