In [35]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [36]:
dataset = pd.read_csv("nsrdb10.csv", header=None)

In [37]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 292 entries, 0 to 291
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       292 non-null    object
 1   1       292 non-null    int64 
 2   2       292 non-null    int64 
 3   3       292 non-null    int64 
 4   4       292 non-null    int64 
 5   5       292 non-null    int64 
 6   6       292 non-null    int64 
 7   7       292 non-null    int64 
 8   8       292 non-null    int64 
 9   9       292 non-null    int64 
 10  10      292 non-null    int64 
dtypes: int64(10), object(1)
memory usage: 25.2+ KB


In [38]:
dataset.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
1,292.0,4688696.0,2893269.0,1.0,2180898.75,4591292.5,7010514.25,10409787.0
2,292.0,4688796.0,2893277.0,2.0,2180977.0,4591404.0,7010644.75,10409952.0
3,292.0,4688896.0,2893284.0,3.0,2181053.75,4591518.0,7010817.0,10410099.0
4,292.0,4688996.0,2893291.0,4.0,2181132.25,4591640.5,7010981.5,10410220.0
5,292.0,4689097.0,2893298.0,5.0,2181212.0,4591771.5,7011147.5,10410335.0
6,292.0,4689198.0,2893307.0,6.0,2181292.5,4591904.5,7011315.5,10410496.0
7,292.0,4689299.0,2893315.0,7.0,2181376.0,4592036.0,7011479.0,10410645.0
8,292.0,4689400.0,2893323.0,8.0,2181463.25,4592169.5,7011648.0,10410784.0
9,292.0,4689501.0,2893332.0,9.0,2181551.5,4592299.0,7011800.5,10410906.0
10,292.0,4689602.0,2893340.0,10.0,2181640.0,4592433.5,7011945.25,10411019.0


In [39]:
dataset = dataset.iloc[1: , :]

In [40]:
X = pd.DataFrame()

In [41]:
X

### Adding ID column

In [42]:
column_name = "IDs"

## Adding target column

In [43]:
X[column_name] = dataset[0]

In [44]:
target_column = "has_atrial_fib"
ones_array = np.zeros(dataset.shape[0])

X[target_column] = ones_array.astype(int)

### Adding HRmax - HRmin column

In [45]:
sampling_freq = 250;
secs_in_min = 60;

def transform_samples_to_time_diffs(row):

  row = row.astype(float)

  nn_intervals = np.diff(row)

  transform_to_time_diffs = np.vectorize(lambda num_of_samples: num_of_samples / sampling_freq )

  return transform_to_time_diffs(nn_intervals)


In [46]:
time_diffs = dataset.iloc[:, 1:].T.apply(transform_samples_to_time_diffs).T

In [47]:
column_name = "HRmax - HRmin"

In [48]:

def calculate_hrmax_hrmin(row):

  calculate_hr = np.vectorize(lambda time_diff: secs_in_min / time_diff)

  hrs = calculate_hr(row)

  return np.max(hrs) - np.min(hrs)

X[column_name] = time_diffs.iloc[:, :].T.apply(calculate_hrmax_hrmin)

X[column_name]

1      25.038253
2      37.865749
3      14.730640
4      17.073171
5      33.914729
         ...    
287     8.059317
288     9.095634
289    30.246453
290     7.443553
291    13.304306
Name: HRmax - HRmin, Length: 291, dtype: float64

## Adding SDNN column

In [49]:
def calculate_sdnn(time_samples):

    sdnn = np.std(time_samples, ddof=1)  # ddof=1 for sample standard deviation

    return sdnn


In [50]:
SDNNs = time_diffs.iloc[:, :].T.apply(calculate_sdnn)

In [51]:
column_name = "SDNN"
X[column_name] = SDNNs

## SDSD

In [52]:
def calculate_sdsd(rr_intervals):
    # Calculate the differences between successive RR intervals
    successive_diffs = np.diff(rr_intervals)

    # Calculate SDSD (standard deviation of successive differences)
    sdsd = np.std(successive_diffs, ddof=1)  # ddof=1 for sample standard deviation

    return sdsd

In [53]:
SDSDs = time_diffs.iloc[:, :].T.apply(calculate_sdsd)

In [54]:
column_name = "SDSD"
X[column_name] = SDSDs

## NN50

In [55]:
def calculate_nn50(nn_intervals):
    # Calculate the successive differences between NN intervals
    nn_diff = np.abs(np.diff(nn_intervals))

    # Count the number of differences greater than 50 ms
    nn50 = np.sum(nn_diff > 0.050)

    return nn50

In [56]:
NN50s = time_diffs.iloc[:, :].T.apply(calculate_nn50)

In [57]:
column_name = "NN50"
X[column_name] = NN50s

## NN20

In [58]:
def calculate_nn20(nn_intervals):
    # Calculate the successive differences between NN intervals
    nn_diff = np.abs(np.diff(nn_intervals))

    # Count the number of differences greater than 50 ms
    nn20 = np.sum(nn_diff > 0.020)

    return nn20

In [59]:
NN20s = time_diffs.iloc[:, :].T.apply(calculate_nn20)

In [60]:
column_name = "NN20"
X[column_name] = NN20s

## NNs mean

In [61]:
means = time_diffs.iloc[:, :].T.apply(np.mean)

In [62]:
column_name = "nni_mean"
X[column_name] = means

## NNs min and max

In [63]:
mins = time_diffs.iloc[:, :].T.apply(np.min)

column_name = "nni_min"
X[column_name] = mins

In [64]:
maxs = time_diffs.iloc[:, :].T.apply(np.max)

column_name = "nni_max"
X[column_name] = maxs

## NNs diff mean

In [65]:
def calculate_nni_diff_mean(nn_intervals):
    # Calculate the successive differences between NN intervals
    nn_diff = np.abs(np.diff(nn_intervals))

    # Count the number of differences greater than 50 ms
    nni_diff_mean = np.mean(nn_diff)

    return nni_diff_mean

In [66]:
nni_diff_mean = time_diffs.iloc[:, :].T.apply(calculate_nni_diff_mean)

In [67]:
column_name = "nni_diff_mean"
X[column_name] = mins

## NNs diff min and max

In [68]:
def calculate_nni_diff_max(nn_intervals):
    # Calculate the successive differences between NN intervals
    nn_diff = np.abs(np.diff(nn_intervals))

    # Count the number of differences greater than 50 ms
    nni_diff_max = np.max(nn_diff)

    return nni_diff_max

def calculate_nni_diff_min(nn_intervals):
    # Calculate the successive differences between NN intervals
    nn_diff = np.abs(np.diff(nn_intervals))

    # Count the number of differences greater than 50 ms
    nni_diff_min = np.min(nn_diff)

    return nni_diff_min

In [69]:
nni_diff_max = time_diffs.iloc[:, :].T.apply(calculate_nni_diff_max)

In [70]:
nni_diff_min = time_diffs.iloc[:, :].T.apply(calculate_nni_diff_min)

In [71]:
column_name = "nni_diff_max"
X[column_name] = nni_diff_max

In [72]:
column_name = "nni_diff_min"
X[column_name] = nni_diff_min

In [81]:
time_domain_features_nsrdb = X

In [86]:
time_domain_features_afdb = pd.read_csv("time_domain_features_afdb.csv")

In [87]:
time_domain_features_afdb

Unnamed: 0,IDs,HRmax - HRmin,SDNN,SDSD,NN50,NN20,nni_mean,nni_min,nni_max,nni_diff_mean,nni_diff_max,nni_diff_min,has_atrial_fib
0,04015_1,97.068900,0.087356,0.085202,5,7,0.456889,0.296,0.568,0.296,0.144,0.020,1
1,04015_2,57.115081,0.116138,0.211639,7,8,0.590667,0.468,0.844,0.468,0.332,0.048,1
2,04015_3,115.777080,0.150737,0.151200,7,8,0.515556,0.312,0.784,0.312,0.272,0.020,1
3,04015_4,120.432321,0.097532,0.083241,7,7,0.410222,0.268,0.580,0.268,0.120,0.008,1
4,04015_5,48.508368,0.053666,0.060264,2,6,0.460000,0.372,0.532,0.372,0.116,0.004,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
286,08434_1,40.782493,0.051614,0.079122,4,8,0.516000,0.416,0.580,0.416,0.128,0.024,1
287,08434_2,42.251537,0.095424,0.142531,4,7,0.576444,0.476,0.716,0.476,0.220,0.008,1
288,08434_3,106.382979,0.249731,0.297777,8,8,0.663556,0.376,1.128,0.376,0.516,0.116,1
289,08455_1,53.035714,0.130319,0.233301,7,8,0.692889,0.500,0.896,0.500,0.396,0.020,1


In [None]:
time_domain_features_afdb

Unnamed: 0,IDs,HRmax - HRmin,SDNN,SDSD,NN50,NN20,nni_mean,nni_min,nni_max,nni_diff_mean,nni_diff_max,nni_diff_min,has_atrial_fib
0,04015_1,97.068900,0.087356,0.085202,5,7,0.456889,0.296,0.568,0.296,0.144,0.020,1
1,04015_2,57.115081,0.116138,0.211639,7,8,0.590667,0.468,0.844,0.468,0.332,0.048,1
2,04015_3,115.777080,0.150737,0.151200,7,8,0.515556,0.312,0.784,0.312,0.272,0.020,1
3,04015_4,120.432321,0.097532,0.083241,7,7,0.410222,0.268,0.580,0.268,0.120,0.008,1
4,04015_5,48.508368,0.053666,0.060264,2,6,0.460000,0.372,0.532,0.372,0.116,0.004,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
286,08434_1,40.782493,0.051614,0.079122,4,8,0.516000,0.416,0.580,0.416,0.128,0.024,1
287,08434_2,42.251537,0.095424,0.142531,4,7,0.576444,0.476,0.716,0.476,0.220,0.008,1
288,08434_3,106.382979,0.249731,0.297777,8,8,0.663556,0.376,1.128,0.376,0.516,0.116,1
289,08455_1,53.035714,0.130319,0.233301,7,8,0.692889,0.500,0.896,0.500,0.396,0.020,1


In [101]:
combined_df = pd.concat([time_domain_features_afdb, time_domain_features_nsrdb], ignore_index=True)

In [89]:
combined_df

Unnamed: 0,IDs,HRmax - HRmin,SDNN,SDSD,NN50,NN20,nni_mean,nni_min,nni_max,nni_diff_mean,nni_diff_max,nni_diff_min,has_atrial_fib
0,04015_1,97.068900,0.087356,0.085202,5,7,0.456889,0.296,0.568,0.296,0.144,0.020,1
1,04015_2,57.115081,0.116138,0.211639,7,8,0.590667,0.468,0.844,0.468,0.332,0.048,1
2,04015_3,115.777080,0.150737,0.151200,7,8,0.515556,0.312,0.784,0.312,0.272,0.020,1
3,04015_4,120.432321,0.097532,0.083241,7,7,0.410222,0.268,0.580,0.268,0.120,0.008,1
4,04015_5,48.508368,0.053666,0.060264,2,6,0.460000,0.372,0.532,0.372,0.116,0.004,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
577,08434_1,40.782493,0.051614,0.079122,4,8,0.516000,0.416,0.580,0.416,0.128,0.024,1
578,08434_2,42.251537,0.095424,0.142531,4,7,0.576444,0.476,0.716,0.476,0.220,0.008,1
579,08434_3,106.382979,0.249731,0.297777,8,8,0.663556,0.376,1.128,0.376,0.516,0.116,1
580,08455_1,53.035714,0.130319,0.233301,7,8,0.692889,0.500,0.896,0.500,0.396,0.020,1


# Model training

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [102]:
target_column = "has_atrial_fib"

y = combined_df[target_column]

In [118]:
X = combined_df.drop([target_column, "IDs"], axis=1)

## Model training and evaluation

In [112]:
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier

from sklearn.model_selection import GridSearchCV, KFold, cross_val_score

In [119]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)

In [137]:
models = {
    'SVC': SVC(),
    'SGDClassifier': SGDClassifier(),
    'KNeighborsClassifier': KNeighborsClassifier(),
    'GaussianProcessClassifier': GaussianProcessClassifier(),
    'DecisionTreeClassifier': DecisionTreeClassifier(),
    'LogisticRegression': LogisticRegression(),
    'RandomForestClassifier': RandomForestClassifier(),
    'XGBClassifier': XGBClassifier(),
}

In [139]:
models = {
    'XGBClassifier': XGBClassifier(),
}

In [134]:
param_grids = {
    'SVC': {
        'C':[1,10,100,1000],
        'gamma':[1,0.1,0.001,0.0001],
        'kernel':['linear','rbf']
    },
    'SGDClassifier': {
    },
    'KNeighborsClassifier': {
        "n_neighbors" : list(range(1, 31)),
        "weights" : ["uniform", "distance"],
        "algorithm": ["auto", "ball_tree", "kd_tree", "brute"],

    },
    'GaussianProcessClassifier': {
    },
    'DecisionTreeClassifier': {
    },
    'LogisticRegression': {
    },
    'RandomForestClassifier': {
        'n_estimators': [100, 200, 500],
        'max_depth': [None, 10, 30],
        'min_samples_split': [2, 5, 10],
    },
    'XGBClassifier': {
        'n_estimators': [100, 200, 500],
        'learning_rate': [0.01, 0.1, 0.3],
        'max_depth': [1, 2, 3, 6, 10],
    }
}

In [147]:
param_grids = {
    'XGBClassifier': {
        'n_estimators': [50, 100, 200, 500, 1000],
        'learning_rate': [0.001, 0.01, 0.1, 0.3],
        'max_depth': [1, 2, 3, 6, 10],
    }
}

In [148]:
# 3-fold cross-validation
cv = KFold(n_splits=10, shuffle=True, random_state=42)

# Train and tune the models
grids = {}
for model_name, model in models.items():

  print(f'Training and tuning {model_name}...')
  grids[model_name] = GridSearchCV(estimator=model, param_grid=param_grids[model_name], cv=cv, scoring="accuracy", n_jobs=-1, verbose=2)
  grids[model_name].fit(X, y)
  best_params = grids[model_name].best_params_
  best_score = grids[model_name].best_score_

  print(f'Best parameters for {model_name}: {best_params}')
  print(f'Best accuracy for {model_name}: {best_score}\n')

Training and tuning XGBClassifier...
Fitting 10 folds for each of 100 candidates, totalling 1000 fits
Best parameters for XGBClassifier: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 1000}
Best accuracy for XGBClassifier: 0.9827878433664525



## PCA

In [132]:
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

pca = PCA()
X_pca_pre = pca.fit_transform(X)

# Calculate the cumulative explained variance
cumulative_explained_variance = np.cumsum(pca.explained_variance_ratio_)

# Choose the number of components based on the explained variance threshold
n_components = np.argmax(cumulative_explained_variance >= 0.95) + 1

pca = PCA(n_components=n_components)
pipeline_pca = Pipeline(steps=[
                        ('pca', pca)])

X_pca = pipeline_pca.fit_transform(X)