<a href="https://colab.research.google.com/github/naveenhooda2000/IRModelSelection/blob/master/ecg_project_svm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [67]:
# Mount Google Drive at /content/drive; the dataset paths used below live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [68]:
# Install NeuroKit from the current tip of its GitHub master branch.
# NOTE(review): this is unpinned, so the installed code can change between runs.
!pip install -q https://github.com/neuropsychology/NeuroKit.py/zipball/master

  Building wheel for neurokit (setup.py) ... [?25l[?25hdone


In [69]:
# Install joblib from PyPI (no version pinned).
!pip install joblib



In [0]:
# Colab magic: select the TensorFlow 1.x runtime; run it before the cell that imports tensorflow/keras.
%tensorflow_version 1.x

In [0]:
import os
import sys
import glob
import math
import pickle

import matplotlib.pyplot as plt
import neurokit as nk
import numpy as np
from numpy import where
import pandas as pd
import tensorflow as tf

from collections import Counter
from imblearn.over_sampling import SMOTE

from sklearn.externals import joblib
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.svm import OneClassSVM

from keras.layers import Dense, LSTM, Dropout, Activation
from keras.models import Sequential
from keras.optimizers import SGD
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import to_categorical

import warnings
warnings.simplefilter('ignore', DeprecationWarning)


In [0]:
# Locations of the dataset files on the mounted Google Drive.
# Directory that get_input_dataframe globs for per-recording '*.csv' files.
ECG_VALUES_FILES_PATH = '/content/drive/My Drive/Final Project/dataset/ecg-bg/data/ECG'
# Pre-built CSV with all recordings; read when get_input_dataframe(use_data_frame_file=True).
DATA_FRAME_FILE_PATH = '/content/drive/My Drive/Final Project/dataset/ecg-bg/ecg_reading_data_frame.csv'
# Metadata CSV; its 'Id' and 'G' columns are used to derive the labels.
META_DATA_FILE_PATH = '/content/drive/My Drive/Final Project/dataset/ecg-bg/data/meta.csv'
# File the labelled R-peak lists are pickled to and reloaded from (binary pickle despite the .txt suffix).
R_PEAK_DATA_FILE_PATH = '/content/drive/My Drive/Final Project/dataset/ecg-bg/rpeak-data-label.txt'

In [0]:
def extract_filename_from_path(path):
  """Return the last component of ``path``, i.e. whatever follows the final separator."""
  _directory, filename = os.path.split(path)
  return filename

In [0]:
def get_input_dataframe(use_data_frame_file = True, label_threshold = 140):
  """Load the ECG readings (one row per recording) and append a binary 'labels' column.

  Args:
    use_data_frame_file: when True, read the pre-built CSV at DATA_FRAME_FILE_PATH;
      otherwise read every '*.csv' under ECG_VALUES_FILES_PATH (second column only)
      and stack them, one row per file, with the file name in column 'fn'.
    label_threshold: recordings whose metadata value 'G' (cast to int) is greater
      than this get label 1, all others label 0. Defaults to 140.

  Returns:
    The readings DataFrame with an added 'labels' column. The 'fn' column is kept.

  Raises:
    ValueError: if no '*.csv' file is found when reading file by file.
    KeyError: if a recording's file name has no matching 'Id' in the metadata.
  """
  # Build the id -> label lookup from the metadata file.
  column_names = ['Id', 'Date', 'Time', 'Age', 'Gender', 'Height', 'Weight', 'Hr', 'G']
  meta_data_frame = pd.read_csv(META_DATA_FILE_PATH, engine='python', skiprows=1, names=column_names)
  # astype(int) truncates exactly like the per-row int() cast did (e.g. 140.9 -> 140 -> label 0).
  is_above_threshold = meta_data_frame['G'].astype(int) > label_threshold
  ecg_labels_dict = dict(zip(meta_data_frame['Id'], is_above_threshold.astype(int).tolist()))

  # Read the ECG values either from the pre-calculated file or file by file.
  if use_data_frame_file:
    ecg_values_data_frame = pd.read_csv(DATA_FRAME_FILE_PATH, engine='python')
  else:
    # glob returns files in filesystem order; sort so the row order (and therefore any
    # order-dependent split done later) is the same on every run.
    all_files = sorted(glob.glob(ECG_VALUES_FILES_PATH + '/*.csv'))
    if not all_files:
      raise ValueError('No ECG .csv files found in ' + ECG_VALUES_FILES_PATH)
    li_df = []
    for counter, filename in enumerate(all_files, start=1):
      df = pd.read_csv(filename, usecols=[1], engine='python').transpose()
      df.insert(0, 'fn', extract_filename_from_path(filename))
      li_df.append(df)
      # lightweight progress report for the slow file-by-file path
      if counter % 10 == 0:
        print(counter)
    ecg_values_data_frame = pd.concat(li_df, axis=0, ignore_index=True)

  # Report how many recordings were loaded.
  print(ecg_values_data_frame['fn'].values.shape)

  # The metadata id is the file name without its '.csv' suffix.
  record_ids = [filename.replace(".csv", "") for filename in ecg_values_data_frame['fn']]
  unknown_ids = sorted({record_id for record_id in record_ids if record_id not in ecg_labels_dict})
  if unknown_ids:
    raise KeyError('No metadata row for recording(s): ' + ', '.join(unknown_ids[:10]))

  # Assign positionally (plain list) so the labels cannot be misaligned by the frame's index.
  ecg_values_data_frame['labels'] = [ecg_labels_dict[record_id] for record_id in record_ids]
  return ecg_values_data_frame

In [75]:
# Load the labelled readings, then drop the file-name column before taking the raw values.
ecg_values_data_frame_with_file_name = get_input_dataframe()
# Use the `columns=` keyword: the positional axis form `drop('fn', 1)` was removed in pandas 2.0.
ecg_values_data_frame = ecg_values_data_frame_with_file_name.drop(columns='fn')
ecg_values = ecg_values_data_frame.values

(2238,)


In [0]:
def get_dense_data_vals(processed_ecg, left_algn_ecg = 150, right_algn_ecg = 250, sample_step = 1):
  """Cut one window of the filtered ECG around every detected R-peak.

  Args:
    processed_ecg: mapping that provides the R-peak sample indices under
      ['ECG']['R_Peaks'] and the filtered signal under ['df']['ECG_Filtered'].
    left_algn_ecg: number of samples taken before each R-peak (150 samples is
      150 ms at the 1000 Hz sampling rate this notebook processes the ECG with).
    right_algn_ecg: number of samples taken from the R-peak onwards (250 by default).
    sample_step: stride between consecutive samples inside a window.

  Returns:
    A list with one dict per R-peak, in detection order, holding:
      'inds'       - sample indices of the window,
      'sig'        - filtered signal values at those indices,
      'rpeak'      - filtered signal value at the R-peak itself,
      'rpeak_ind'  - sample index of the R-peak,
      'seg_number' - running number of the segment, starting at 0.
    Windows that would extend past either end of the signal are clipped to the
    valid range, so segments next to the recording edges can be shorter.
  """
  rpeaks_inds = list(processed_ecg['ECG']['R_Peaks'])
  ECG_Filtered = list(processed_ecg['df']['ECG_Filtered'])
  len_of_ECG = len(ECG_Filtered)

  dense_data_vals = []
  for seg_number, each_rpeak_ind in enumerate(rpeaks_inds):
    window = range(each_rpeak_ind - left_algn_ecg, each_rpeak_ind + right_algn_ecg, sample_step)
    # Keep only indices inside the signal: a negative index would silently wrap around to
    # the end of the list, and an index past the end would raise IndexError.
    temp_inds = [ind for ind in window if 0 <= ind < len_of_ECG]
    dense_data_vals.append({
        'inds': temp_inds,
        'sig': [ECG_Filtered[ind] for ind in temp_inds],
        'rpeak': ECG_Filtered[each_rpeak_ind],
        'rpeak_ind': each_rpeak_ind,
        'seg_number': seg_number,
    })
  return dense_data_vals

In [0]:
def getRPeaksLabelledFromFiducialPoints(data_frame):
  """Turn every recording into the list of its R-peak values followed by its label.

  Args:
    data_frame: one recording per row; the sample columns hold the raw ECG values
      (NaN where a recording is shorter) and the 'labels' column holds the class label.

  Returns:
    A list with one entry per row. Each entry is a list of the filtered ECG values at
    the detected R-peaks, with the row's label appended as the last element.
  """
  rpeak_list_labelled = []
  for index, row in data_frame.iterrows():
    # 'labels' is the class label, not an ECG sample: remove it before building the signal,
    # otherwise it is fed to the filter / peak detector as the last sample of the recording.
    np_temp = row.drop('labels').dropna().to_numpy()
    # https://neurokit.readthedocs.io/en/latest/documentation.html
    processed_ecg = nk.ecg_process(np_temp,sampling_rate=1000,filter_type='FIR',filter_band='bandpass',filter_frequency=[1, 40],segmenter='hamilton',quality_model='default')
    rpeak_labeled = [dense_data_val['rpeak'] for dense_data_val in get_dense_data_vals(processed_ecg)]
    # append the label
    rpeak_labeled.append(row['labels'])
    rpeak_list_labelled.append(rpeak_labeled)
  return rpeak_list_labelled

In [0]:
# Extract the labelled R-peak lists for every recording.
# NOTE(review): `ecg_values_data_frame_deleted` is not assigned in any cell visible in this
# notebook; unless it is defined elsewhere, a fresh "Restart & Run All" fails here with a
# NameError. Presumably it was `ecg_values_data_frame` with some rows removed in a cell that
# has since been deleted — confirm and restore that step.
rpeak_labelled_list = getRPeaksLabelledFromFiducialPoints(ecg_values_data_frame_deleted)

In [0]:
# Persist the labelled R-peak lists to R_PEAK_DATA_FILE_PATH.
# The file is opened in binary mode and written with pickle, even though the path ends in .txt.
with open(R_PEAK_DATA_FILE_PATH, "wb") as fp:
  pickle.dump(rpeak_labelled_list, fp)

In [0]:
# Reload the labelled R-peak lists from R_PEAK_DATA_FILE_PATH.
# NOTE: pickle.load can execute arbitrary code from the file; only load files you created yourself.
with open(R_PEAK_DATA_FILE_PATH, "rb") as fp:   # Unpickling
  rpeak_labelled_list_loaded = pickle.load(fp)

In [0]:
def sample_training_dataset(input_values, percentage = 0.7):
  """Split rows into train and test, keeping the leading share of every label for training.

  The last column of ``input_values`` is the label. For each distinct label (after casting
  to int) the first ``int(count * percentage)`` rows carrying it, in input order, go to the
  training set; the remaining rows of that label go to the test set. Rows are not shuffled.

  Args:
    input_values: 2-D numpy array whose last column holds the label.
    percentage: share of each label's rows to place in the training set (default 0.7).

  Returns:
    (train, test): two 2-D arrays with the same number of columns as ``input_values``.
    A split that receives no rows is returned with shape (0, n_columns) so that column
    slicing such as ``test[:, :-1]`` keeps working.
  """
  n_columns = input_values.shape[1]
  labels, counts = np.unique(input_values[:, -1].astype(int), return_counts=True)
  freqmap = np.column_stack((labels, counts))
  print (freqmap)
  # Remaining training quota per label; int() truncates the fractional part.
  labelCountDict = {int(label): float(int(count * percentage)) for label, count in freqmap}
  print(labelCountDict)
  train_arr, test_arr = [], []
  for row in input_values:
    # Same int cast as used for counting, so the lookup key always matches a counted label.
    label = int(row[-1])
    # While the label still has quota left the row goes to train, otherwise to test.
    if labelCountDict[label] > 0:
      train_arr.append(row)
      labelCountDict[label] -= 1
    else:
      test_arr.append(row)
  # reshape keeps an empty split two-dimensional instead of collapsing to shape (0,).
  train = np.array(train_arr, dtype=input_values.dtype).reshape(-1, n_columns)
  test = np.array(test_arr, dtype=input_values.dtype).reshape(-1, n_columns)
  return train, test

In [0]:
def getNumpyNDArray(rpeak_labelled):
  """Convert labelled R-peak lists into one 2-D array with the label in the last column.

  Args:
    rpeak_labelled: iterable of records; each record is a list whose last element is
      the label and whose preceding elements are the feature values. Records may have
      different lengths.

  Returns:
    A 2-D numpy array with one row per record: the feature columns first (shorter
    records are padded with NaN by the DataFrame construction) and the label last.
  """
  features = [record[:-1] for record in rpeak_labelled]
  labels = [record[-1] for record in rpeak_labelled]

  # Going through a DataFrame pads ragged feature lists to a rectangular block.
  # (The full feature list is intentionally not printed: it floods the notebook output.)
  features_numpy_array = pd.DataFrame(features).to_numpy()
  labels_numpy_array = pd.DataFrame(labels).to_numpy()

  return np.concatenate((features_numpy_array, labels_numpy_array), axis=1)

In [0]:
# Build the 2-D array from the reloaded R-peak lists (feature columns first, label last).
A = getNumpyNDArray(rpeak_labelled_list_loaded)

In [0]:
# Per-label split: the first 80% of each label's rows go to train, the rest to test.
# Reuse `A` (built in the previous cell) instead of converting the same lists a second time.
train, test = sample_training_dataset(A, 0.8)

In [0]:
# The last column is the label; every column before it is a feature.
train_X = train[:, :-1]
train_y = train[:, -1]
test_X = test[:, :-1]
test_y = test[:, -1]

In [0]:
# Fit a one-class SVM on the training features (the labels are not passed to fit) and
# predict on both the training and the test features.
# NOTE(review): train_X holds rows of every label present in the data, and
# OneClassSVM.predict returns +1 / -1 rather than the 0 / 1 values stored in train_y / test_y —
# confirm this is intended and map the outputs before comparing them with the labels.
# NOTE(review): the feature matrix may contain NaN padding when recordings have different
# numbers of R-peaks; scikit-learn estimators reject NaN input — TODO confirm how this is handled.
clf = OneClassSVM(gamma='auto')
clf.fit(train_X)
y_pred_train = clf.predict(train_X)
y_pred_test = clf.predict(test_X)