**TM10007 Machine Learning**

## Import dependencies and data

In [None]:
import os
import zipfile
import pandas as pd
import numpy as np
from sklearn import model_selection
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, chi2

if not os.path.isdir("tm10007_ml"):
    !git clone https://github.com/jveenland/tm10007_ml.git

if not os.path.isfile("tm10007_ml/ecg/ecg_data.csv"):
    with zipfile.ZipFile('tm10007_ml/ecg/ecg_data.zip', 'r') as zip_ref:
        zip_ref.extractall('tm10007_ml/ecg')

data = pd.read_csv('tm10007_ml/ecg/ecg_data.csv', index_col=0)


## Split data into train and test sets

In [None]:
# Fixed seed so that Restart Kernel -> Run All reproduces the same partition;
# without it every run yields a different train/test split.
RANDOM_STATE = 42
TEST_SIZE = 0.2

# Plain (non-stratified) split of the whole frame, 'label' column included.
# NOTE(review): x_train / x_test are not used by the cells visible here; they
# are kept in case later cells rely on them.
x_train, x_test = model_selection.train_test_split(
    data, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

# Split each class separately so that both train and test keep the same
# proportion of abnormal (label == 1) and normal (label == 0) rows.
ECG_ABNORMAL = data[data['label'] == 1]
ECG_NORMAL = data[data['label'] == 0]

x_train_abnormal, x_test_abnormal = model_selection.train_test_split(
    ECG_ABNORMAL, test_size=TEST_SIZE, random_state=RANDOM_STATE
)
x_train_normal, x_test_normal = model_selection.train_test_split(
    ECG_NORMAL, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

# Recombine the per-class parts. Rows end up grouped by class (normal first,
# then abnormal); they are not reshuffled here.
train_set = pd.concat([x_train_normal, x_train_abnormal])
test_set = pd.concat([x_test_normal, x_test_abnormal])


## Scale features of train and test set

In [None]:
# Min-max scale every column to the range [0, 1].
scaler = MinMaxScaler(feature_range=(0, 1))

# Fit on the training set only, so no statistics of the test set leak into
# the scaling.
# NOTE(review): the scaler is fitted on all columns, 'label' included. A 0/1
# label should come out as 0.0/1.0 as long as both classes occur in
# train_set -- confirm this is intended rather than scaling features only.
scaler.fit(train_set)

# Transform both sets with the train-fitted scaler. scaler.transform returns
# a bare array, so the column names AND the row index are restored here;
# otherwise the scaled frames get a fresh RangeIndex and no longer align with
# train_set / test_set on index-based operations.
# Test values may fall outside [0, 1] because the range was learned on the
# training set.
train_set_scaled = pd.DataFrame(
    scaler.transform(train_set),
    columns=train_set.columns,
    index=train_set.index,
)
test_set_scaled = pd.DataFrame(
    scaler.transform(test_set),
    columns=test_set.columns,
    index=test_set.index,
)

## Select best features

In [None]:
# Number of features kept by the univariate selection.
N_BEST_FEATURES = 10

# Labels come from the unscaled training set. Features come from the
# min-max scaled training set: chi2 requires non-negative inputs and its
# scores depend on feature magnitude, so the unscaled frame must not be used.
# scaler.transform keeps the row order, so y (taken positionally) lines up
# with the rows of X regardless of the index of train_set_scaled.
y = train_set['label'].to_numpy()
X = train_set_scaled.drop(columns=['label'])

# Select the k best features using the chi-squared test
selector = SelectKBest(chi2, k=N_BEST_FEATURES)
X_new = selector.fit_transform(X, y)

# Print the (positional) indices of the selected features
print(f'The k best features are {selector.get_support(indices=True)}')
# NOTE(review): selector.scores_ holds one score per input feature, not only
# for the k selected ones; the message below is kept as it was.
scores = pd.DataFrame(selector.scores_)
print(f'The scores for each of the k features are {scores}')

# NOTE(review): this takes the score of the FIRST feature (row 0) divided by
# 100, as a one-element Series -- confirm whether the maximum score or some
# other reference value was intended.
threshold = scores.iloc[0]/100
print(threshold)

# scores_above_cut_off = scores[scores['value'] > threshold]
# scores_above_cut_off = scores.mask(scores < threshold)
# value_k = len(scores_above_cut_off)
