# 1. First steps

______________

Importing Libraries.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from imblearn.over_sampling import SMOTE
# from ucimlrepo import fetch_ucirepo 
from sklearn.model_selection import train_test_split,GridSearchCV, cross_val_score
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score
from sklearn.preprocessing import OneHotEncoder

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, Flatten, Dropout
from tensorflow.keras.metrics import Precision


from typing import List, Tuple, Literal, Dict
from itertools import combinations, product

import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Load the train/validation and test splits from their CSV files.
# The first column of each file is a stored row index, so it is sliced away
# right after reading.
df_train = pd.read_csv("set_train@val.csv").iloc[:, 1:]
df_test = pd.read_csv("set_test.csv").iloc[:, 1:]

# 2. Getting acquainted with data

_________________

- Numerical features:
   - *age*
   - *trestbps* - resting blood pressure /*mm Hg* (on admission to the hospital)
   - *chol* - serum cholesterol in *mg/dl*
   - *thalach* - Maximum heart rate achieved /*bpm*
   - *oldpeak* -  ST depression induced by exercise relative to rest 
   - *ca* - Number of major vessels (0-3) colored by fluoroscopy

- Categorical features (integer-coded):
   - *sex* 
       - 0: female
       - 1: male
   - *cp* - chest pain
       - 1: typical angina
       - 2: atypical angina
       - 3: non-anginal pain
       - 4: asymptomatic
   - *fbs* - fasting blood sugar > 120 mg/dl
       - 0: False
       - 1: True
   - *restecg* - Resting electrocardiographic results
       - 0: Normal
       - 1: Having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)
       - 2: Showing probable or definite left ventricular hypertrophy by Estes' criteria
   - *exang* - Exercise induced angina
       - 0: No
       - 1: Yes
   - *slope* - The slope of the peak exercise ST segment
       - 1: Up sloping
       - 2: Flat
       - 3: Down sloping
   - *thal*
       - 3: Normal
       - 6: Fixed defect
       - 7: Reversible defect
- Label
    - *num* - The final diagnosis of heart disease (angiographic disease status)

##### The information was taken from:
Nassif A. (et al). 2018. Machine Learning Classifications of Coronary Artery Disease. DOI: 10.1109/iSAI-NLP.2018.8692942

### 2.1 Checking training set

In [None]:
# preview the first rows of the training set
df_train.head()

In [None]:
# column dtypes and non-null counts of the training set
df_train.info()

In [None]:
# summary statistics of the training-set columns
df_train.describe()

In [None]:
# number of missing (null) values in each column of the training set
df_train.isnull().sum()

In [None]:
# number of samples per class (`num` is the label column)
df_train.num.value_counts()

In [None]:
# Draw one 10-bin histogram per column of the training set on a 2-row grid;
# grid cells that have no column left to show are hidden.
nrows = 2
ncols = int(np.ceil(len(df_train.columns) / (1.0*nrows)))
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(30, 15))
column_names = list(df_train.columns)
# ravel() walks the grid row by row, the same order as a nested (row, col) loop
for position, ax in enumerate(axes.ravel()):
    if position >= len(column_names):
        ax.set_axis_off()
        continue
    column_name = column_names[position]
    ax.hist(df_train[column_name], bins=10, color='blue', alpha=0.5)
    # the column name is shown as the y-axis label of its panel
    ax.set_ylabel(column_name)

### 2.2 Checking test set

In [None]:
# preview the first rows of the test set
df_test.head()

In [None]:
# column dtypes and non-null counts of the test set
df_test.info()

In [None]:
# summary statistics of the test-set columns
df_test.describe()

In [None]:
# number of missing (null) values in each column of the test set
df_test.isnull().sum()

# 4. Feature Selection

__________

### 4.1 Features selection by correlation elimination

___________

We are using Spearman's correlation instead of Pearson's because practically all features don't have a normal distribution (as seen in the histograms above) and Spearman's is more robust to outliers.

Nassif et al. (2018; 10.1109/iSAI-NLP.2018.8692942) also applied correlation evaluation, but did not use Spearman's correlation.

Creating a function that selects features according to their correlation. The arguments of the function are (1) the dataframe, (2) the correlation method, and (3) the correlation threshold for feature elimination.

In [None]:
def Corr_selection(df: pd.DataFrame, corr_method: str = 'spearman', threshold: float = 0.2) -> List:

    """Select features by dropping one member of every strongly correlated pair.

    The absolute correlation matrix of ``df`` is computed. For every pair of
    distinct features whose absolute correlation exceeds ``threshold``, the
    feature with the larger mean absolute correlation (taken over its whole
    row of the matrix) is eliminated; on a tie the later column is eliminated.
    Each pair is examined once, so a pair never removes both of its members,
    and perfectly correlated (duplicated) features are treated like any other
    pair above the threshold.

    Args:
        df: Dataframe - Dataset containing the candidate features.
        corr_method: str - Which method to apply to calculate the correlation. Default is 'spearman'.
        threshold: float - a limit between 0 and 1. Default is 0.2.
    Return:
        a list holding a single list with the names of the selected (kept)
        features, in their original column order.

    """
    # correlation of the dataframe passed in (not of any global variable)
    corr = df.corr(method=corr_method)
    corr_matrix = np.abs(corr.to_numpy())
    # take the labels from the matrix itself so indices and names always line up
    features = list(corr.columns)
    # mean absolute correlation of each feature with all features
    mean_corr = corr_matrix.mean(axis=1)

    to_eliminate = set()
    # i < j: every unordered pair is visited once and the diagonal is skipped
    for i, j in combinations(range(len(features)), 2):
        if corr_matrix[i, j] > threshold:
            if mean_corr[i] > mean_corr[j]:
                to_eliminate.add(features[i])
            else:
                to_eliminate.add(features[j])

    selected_features = [name for name in features if name not in to_eliminate]
    return [selected_features]

### 4.2 Features selection by information gain

__________

Creating a function that selects features according to Mutual Information Gain. 

References:
- Nassif et al. (2018; 10.1109/iSAI-NLP.2018.8692942)
- Guhanesvar. *Feature Selection Based on Mutual Information Gain for Classification and Regression*. Medium. June 26, 2021.

In [None]:
def InfoGain_selection(X: pd.DataFrame, y: pd.DataFrame, n_features: int) -> Tuple[pd.Series, List]:

    """Rank features by mutual information with the target and keep the top ones.

    Args:
        X: Dataframe - Dataset containing the feature columns.
        y: Dataframe - Dataset containing the target data.
        n_features: int - Number of features to select.
    Return:
        a tuple (importance, selected): ``importance`` is a Series of the
        mutual-information scores indexed by feature name and sorted from
        highest to lowest; ``selected`` is a list holding a single list with
        the names of the ``n_features`` best-scoring features.
    """

    scores = pd.Series(data=mutual_info_classif(X, y), index=X.columns)
    ranked = scores.sort_values(ascending=False)
    top_features = list(ranked.head(n_features).index)
    return ranked, [top_features]

# 5. Class imbalance

_____________

As seen in the histogram of Fig. 14, there are many more samples with label 0 than with other labels. Therefore, class imbalance has to be dealt with. According to Hoffman [1], there are several ways to reduce class imbalance, such as using other performance metrics when assessing performance (precision, recall, F1 score, and confusion matrix), performing over-sampling (when the dataset is relatively small), or using the Synthetic Minority Over-sampling Technique (SMOTE) algorithm to create synthetic samples.

However, according to this video (https://www.youtube.com/watch?v=adHqzek--d0, at timestamp 3:52), the SMOTE technique is not well suited to multidimensional data, which is the kind of data we have here.

REFERENCE [1]: K. Hoffman. *Machine Learning: How to Handle Class Imbalance*. Analytics Vidhya. February 13, 2021.

### 5.1 Oversampling by doubling classes different from 0

_____________

Because this method only duplicates existing samples, it might lead to overfitting.

In [None]:
def oversampling_double(dataframe: pd.DataFrame) -> pd.DataFrame:

    """Duplicate every sample whose label ``num`` is 1, 2, 3 or 4.

    The copies are appended after the original rows, grouped by label in
    ascending order, and the index of the result is renumbered from 0.

    Args:
        dataframe: DataFrame - Dataset containing all data, with a 'num' label column.
    Return:
        a DataFrame value with the minority-class rows doubled.

    """

    # one extra copy of the rows of each non-zero class
    extra_copies = [dataframe[dataframe['num'] == label] for label in (1, 2, 3, 4)]
    doubled = pd.concat([dataframe, *extra_copies], axis=0)
    # Restarting indices
    return doubled.reset_index(drop=True)

### 5.2 Oversampling through SMOTE

____________________

In [None]:
def oversampling_smote(dataframe: pd.DataFrame) -> pd.DataFrame:
    """Oversample a dataset with the Synthetic Minority Over-sampling Technique.

    Requires imblearn. Every column except 'num' is treated as a feature and
    'num' as the label; the sampler is seeded with random_state=42.

    Args:
        dataframe: DataFrame - Dataset containing all data, with a 'num' label column.
    Return:
        a DataFrame value holding the resampled features followed by the
        resampled 'num' column.

    """

    features = dataframe.drop(columns='num')
    labels = dataframe['num']

    sampler = SMOTE(random_state = 42)
    features_resampled, labels_resampled = sampler.fit_resample(features, labels)

    # put features and labels back together in a single frame
    labels_resampled = pd.Series(data = labels_resampled, name = 'num')
    return pd.concat([features_resampled, labels_resampled], axis = 1, join = 'inner')

### 5.3 Oversampling through Class Weight

____________________

In [None]:
def ClassWeight(dataframe: pd.DataFrame) -> Dict:
    """Compute one weight per class with scikit-learn's 'balanced' mode.

    The last column of ``dataframe`` is used as the label column.

    Args:
        dataframe: DataFrame - Dataset whose last column holds the class labels.
    Return:
        a dict mapping each class label to its weight.

    """

    from sklearn.utils import class_weight

    labels = dataframe.iloc[:,-1]
    classes = labels.unique()
    weights = class_weight.compute_class_weight(class_weight = 'balanced',
                                                classes = classes,
                                                y = labels)
    # weights[i] belongs to classes[i]
    return dict(zip(classes, weights))

# 6. Creating Models

____________

### 6.1 Convolutional Neural Networks

____________

In [None]:
# features / labels as NumPy arrays
X = np.array(df_train.drop('num', axis=1))
y = np.array(df_train['num'])
X_test = np.array(df_test.drop('num', axis=1))
y_test = np.array(df_test['num'])

# one-hot encoding
# The encoder is fitted once, on the training labels, and reused for the test
# labels so that both matrices share the same column <-> class mapping even if
# a class is absent from the test set. `.toarray()` densifies the default
# sparse output, which works regardless of whether the installed scikit-learn
# spells the dense switch `sparse` or `sparse_output`.
label_encoder = OneHotEncoder().fit(y.reshape(-1, 1))
y_onehot = label_encoder.transform(y.reshape(-1, 1)).toarray()
y_test_onehot = label_encoder.transform(y_test.reshape(-1, 1)).toarray()

# split dataset for training validation
X_train, X_val, y_train, y_val = train_test_split(X, y_onehot, test_size=0.2, random_state=42)


In [None]:
def conv_1d(X_train, X_val, X_test, y_train, y_val, y_test,
            n_conv_layers=1, filter=(64,), kernel_size=(3,), conv_activation='relu', 
            ann_activation='softmax', epochs=500, batch_size=32):
    """Train a 1D-CNN classifier and evaluate it on the test split.

    Features are standardized with statistics fitted on ``X_train`` only and
    reshaped to (samples, features, 1) before being fed to the network. The
    network is: ``n_conv_layers`` Conv1D layers (at least one), Dropout(0.5),
    Flatten, one hidden Dense layer and a softmax output layer.

    Args:
        X_train, X_val, X_test: 2-D arrays (samples, features).
        y_train, y_val: one-hot encoded labels (samples, classes).
        y_test: one-hot encoded labels, or a 1-D array of class indices
            (assumed to be the column indices of the one-hot encoding).
        n_conv_layers: number of convolutional layers.
        filter: number of filters of each convolutional layer.
        kernel_size: kernel size of each convolutional layer.
        conv_activation: activation of the convolutional layers.
        ann_activation: activation of the hidden Dense layer. The output
            layer always uses softmax, because the categorical cross-entropy
            loss expects a probability distribution over the classes.
        epochs: number of training epochs. Default is 500.
        batch_size: training batch size. Default is 32.
    Return:
        a tuple (accuracy, precision, recall, conf_matrix, history, prediction):
        macro-averaged test metrics, the test confusion matrix, the Keras
        training history dict and the raw class probabilities for X_test.
    Raises:
        ValueError: if ``filter`` or ``kernel_size`` has fewer entries than
            the number of convolutional layers.
    """

    # the first convolutional layer is always added, even if n_conv_layers < 1
    required = max(1, n_conv_layers)
    if len(filter) < required or len(kernel_size) < required:
        raise ValueError(
            f"filter and kernel_size need at least {required} entries, "
            f"got {len(filter)} and {len(kernel_size)}")

    # normalization (fitted on the training split only, to avoid leakage)
    scaler = StandardScaler().fit(X_train)

    def _to_cnn_input(features):
        # scale, then add the trailing channel axis that Conv1D expects
        scaled = scaler.transform(features)
        return scaled.reshape(scaled.shape[0], scaled.shape[1], 1)

    X_train_cnn = _to_cnn_input(X_train)
    X_val_cnn = _to_cnn_input(X_val)
    X_test_cnn = _to_cnn_input(X_test)

    # sizes are derived from the data instead of being hard-coded
    n_features = X_train_cnn.shape[1]
    n_classes = np.asarray(y_train).shape[1]

    model = Sequential()
    model.add(Conv1D(filters=filter[0], kernel_size=kernel_size[0], activation=conv_activation,
                     input_shape=(n_features, 1)))
    # adding convolutional layers
    for i in range(1, n_conv_layers):
        model.add(Conv1D(filters=filter[i], kernel_size=kernel_size[i], activation=conv_activation))
    model.add(Dropout(0.5))
    model.add(Flatten())
    # last ANN layer: half as many units as flattened features
    length_flat = int(np.prod(model.output_shape[1:]))
    model.add(Dense(max(1, int(round(length_flat / 2))), activation=ann_activation))
    # output layer: one probability per class
    model.add(Dense(n_classes, activation='softmax'))

    # compiling the model
    # Precision is tracked during training because false positives are the
    # main concern of this study; the optimized loss is still the categorical
    # cross-entropy.
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=[Precision()])

    # fitting the model
    history = model.fit(X_train_cnn, y_train, epochs=epochs, batch_size=batch_size,
                        validation_data=(X_val_cnn, y_val)).history

    # prediction
    prediction = model.predict(X_test_cnn)

    # undoing one-hot encoding
    prediction_decoded = np.argmax(prediction, axis=1)
    y_test = np.asarray(y_test)
    # accept one-hot rows as well as plain class indices
    y_test_decoded = np.argmax(y_test, axis=1) if y_test.ndim > 1 else y_test

    # evaluation
    accuracy = accuracy_score(y_test_decoded, prediction_decoded)
    precision = precision_score(y_test_decoded, prediction_decoded, average='macro')
    recall = recall_score(y_test_decoded, prediction_decoded, average='macro')
    conf_matrix = confusion_matrix(y_test_decoded, prediction_decoded)

    return accuracy, precision, recall, conf_matrix, history, prediction

- The candidate parameters are mainly the number of layers, number of filters and size of filters.
- <b>NOTE:</b> When using 32 filters, we use filters of size 3 (less deep, but wider), 64 filters with size 4, and 128 filters with size 5 (deeper but narrower).

In [None]:
# candidate parameters 
# all_filters[k] / all_kernels[k] hold the candidates for networks with k+1
# convolutional layers; the two structures are paired element by element.
n_conv_layers = [1, 2, 3]
all_filters = [
    [[32], [64], [128]], 
    [[32, 64], [64, 128], [32, 128]], 
    [[32, 64, 128], [64, 128, 32]]
    ]
all_kernels = [
    [[3], [4], [5]], 
    [[3, 4], [4, 5], [3, 5]], 
    [[3, 4, 5], [4, 5, 3]]
    ]
conv_activation = ['sigmoid', 'relu', 'softmax']
ann_activation = ['sigmoid', 'relu', 'softmax']

# creating models
cnn1d_results = {"n_conv_layers": [], "n_filters": [], "kernel_sizes": [], "conv_activation": [],
                 "ann_activation": [], "accuracy": [], "precision": [], "recall": [],
                 "conf_matrix": [], "history": [], "prediction": []}

for n_conv in n_conv_layers:
    # layer counts start at 1 while the candidate lists are 0-indexed
    filters = all_filters[n_conv - 1]
    kernels = all_kernels[n_conv - 1]

    for filt_layer, kern_layer in zip(filters, kernels):
        for conv_act, ann_act in product(conv_activation, ann_activation):

            # the one-hot test labels are passed, since conv_1d decodes them row by row
            accuracy, precision, recall, conf_matrix, history, prediction = conv_1d(
                X_train, X_val, X_test, y_train, y_val, y_test_onehot, n_conv_layers=n_conv, 
                filter=filt_layer, kernel_size=kern_layer, conv_activation=conv_act, 
                ann_activation=ann_act)

            run = {"n_conv_layers": n_conv, "n_filters": filt_layer, "kernel_sizes": kern_layer,
                   "conv_activation": conv_act, "ann_activation": ann_act,
                   "accuracy": accuracy, "precision": precision, "recall": recall,
                   "conf_matrix": conf_matrix, "history": history, "prediction": prediction}
            for key, value in run.items():
                cnn1d_results[key].append(value)

# 7. Models Evaluation

_______________________

### 7.1 1D-CNN

______________________

In [None]:
# One row per trained model; the second frame is the same table ordered from
# the highest to the lowest precision. sort_values keeps the original index
# labels, so positional access (.iloc) is needed to reach the top-ranked row.
cnn1d_results_df = pd.DataFrame(data=cnn1d_results)
cnn1d_results_df_precison = cnn1d_results_df.sort_values(by="precision", ascending=False)

##### 7.1.1 Confusion Matrix (best precision)
________________

In [None]:
# Confusion matrix of the best model: the frame is sorted by precision in
# descending order, so the first row (by position) is the best run. The
# 'conf_matrix' column holds one matrix per model, hence a single entry has
# to be picked before plotting.
best_conf_matrix = np.asarray(cnn1d_results_df_precison['conf_matrix'].iloc[0])

fig, ax = plt.subplots(figsize=(7.5, 7.5))
ax.matshow(best_conf_matrix, cmap=plt.cm.Blues, alpha=0.3)
# write the count of every (actual, predicted) cell on top of the heat map
for (i, j), count in np.ndenumerate(best_conf_matrix):
    ax.text(x = j, y = i, s = count, 
            va='center', ha='center', size='xx-large')
 
plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix of best model (precision)', fontsize=18)
plt.show()

##### 7.1.2 Loss function (best precision)

________________

In [None]:
# Training history of the best model. The frame is sorted by precision but
# keeps its original index labels, so the best run is the first row by
# position (.iloc[0]); .loc[0] would select the first model that was trained.
best_history = cnn1d_results_df_precison["history"].iloc[0]
cnn_training_loss = best_history["loss"]
cnn_validation_loss = best_history["val_loss"]

plt.figure()
plt.plot(cnn_training_loss)
plt.plot(cnn_validation_loss)
plt.legend(["Training", "Validation"])
plt.xlabel("Iteration")
plt.ylabel("Loss")

### 7.2 Standard DNN

______________________

##### 7.2.1 Confusion Matrix

________________

##### 7.2.2 Precision

________________

# 8. References
<a id="1">[1]</a>
Anggoro, Dimas Aryo and Kurnia, Naqshauliza Devi (2020). 
Comparison of accuracy level of support vector machine (SVM) and K-nearest neighbors (KNN) algorithms in predicting heart disease
International Journal, 8(5), 1689--1694.

<a id="2">[2]</a>
https://www.kaggle.com/code/zawlinnnaing/svm-for-multiclass-classification