In [None]:
import pandas as pd

In [None]:
# Load the feature table and the label table; in both files the first CSV
# column is used as the row index.
features_csv = 'data/dataset/dfu_features_dataset.csv'
labels_csv = 'data/dataset/dfu_labels_dataset.csv'

X_df = pd.read_csv(features_csv, index_col=0)
y_df = pd.read_csv(labels_csv, index_col=0)

In [None]:
# Preview only the first rows instead of rendering the whole feature table
X_df.head()

In [None]:
# matplotlib and seaborn are used to show the correlation between variables
import matplotlib.pyplot as plt
import seaborn as sns

# Pairwise correlation between every pair of feature columns
feature_corr = X_df.corr()

# Square canvas that hosts the heatmap
fig, ax = plt.subplots(figsize=(11, 11))

# Diverging colormap so that negative and positive correlations are
# coloured differently around the neutral centre
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw the heatmap on the full [-1, 1] correlation range, centred at 0,
# with square cells and a shrunken colour bar
sns.heatmap(
    feature_corr,
    ax=ax,
    cmap=cmap,
    vmin=-1,
    vmax=1.,
    center=0,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
)

plt.show()

In [None]:
# Index labels of the rows whose Label is 0.
# Filter with a boolean Series on the Label column: masking the frame with a
# DataFrame-shaped condition (y_df == 0) only blanks the non-matching cells
# and keeps every row, so its index would be the full index.
y_df[y_df.Label == 0].index

# Correlation between Control Group and Diabetic

In [None]:
import numpy as np

# Split the feature table into the two cohorts using the label table
# (Label 0 -> cg_group, Label 1 -> dm_group).
# Alternative kept for reference: select the rows by index prefix, e.g.
#   X_df.filter(regex='^CG', axis=0) / X_df.filter(regex='^DM', axis=0)
is_label_0 = y_df.Label == 0
is_label_1 = y_df.Label == 1

cg_group_idx = y_df.loc[is_label_0].index
df_group_idx = y_df.loc[is_label_1].index

cg_group = X_df.loc[cg_group_idx]
dm_group = X_df.loc[df_group_idx]

# calculate the correlation between two groups.
# np.corrcoef treats every row of its inputs as one variable, so the result
# is a square matrix over the rows of cg_group followed by the rows of dm_group.
corr = np.corrcoef(cg_group, dm_group)

# Plot correlation matrix (colour scale restricted to [0.5, 1], no tick labels)
fig, ax = plt.subplots(figsize=(12, 10))
image = ax.imshow(corr, cmap='Purples', vmin=.5, vmax=1, interpolation='nearest')
ax.set_xticks([])
ax.set_yticks([])
fig.colorbar(image, ax=ax)
plt.show()

# Compute ROC-AUC score

In [None]:
# Stack both groups row-wise into a single feature matrix: the cg_group rows
# come first, followed by the dm_group rows.
n_cg = cg_group.shape[0]
n_dm = dm_group.shape[0]
X = np.vstack((cg_group, dm_group))

# Target vector aligned with the rows of X: 0.0 for cg_group, 1.0 for dm_group
y = np.repeat([0.0, 1.0], [n_cg, n_dm])

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
import numpy as np

# One AUC value per feature (column of X)
roc_auc = np.zeros(X.shape[1])

for feature in range(X.shape[1]):
    # Fit a univariate logistic regression on this single feature
    # (kept 2-D, shape (n_samples, 1), as scikit-learn estimators expect)
    _X = np.expand_dims(X[:, feature], axis=1)
    clf = LogisticRegression(solver="liblinear", random_state=0).fit(_X, y)

    # Score the samples with the predicted probability of the positive class
    # (second column of predict_proba, i.e. clf.classes_[1]).
    # ROC-AUC is a ranking metric: feeding it hard 0/1 predictions would
    # reduce the ROC curve to a single operating point and make most
    # weakly informative features indistinguishable.
    y_score = clf.predict_proba(_X)[:, 1]
    roc_auc[feature] = roc_auc_score(y, y_score)

# Force AUC >= .5: a feature with AUC below 0.5 is as informative as one with
# the mirrored value above 0.5, so reflect it around 0.5
roc_auc = (np.abs(roc_auc - 0.5) + 0.5)

In [None]:
from matplotlib import pyplot as plt

# AUC of every feature, in the column order of X
fig, ax = plt.subplots()
ax.plot(roc_auc)
ax.set_xlabel('Feature')
ax.set_ylabel('AUC')
plt.show()

# Remove highly correlated features

In [None]:
# Sorted by Feature Ranking based on ROC-AUC (highest AUC first).
# roc_auc[i] belongs to the i-th column of X, which was stacked from
# cg_group / dm_group, so the ranking is translated into column labels through
# cg_group.columns. Reordering by label instead of by position keeps this cell
# idempotent: running it again does not reshuffle the already sorted X_df.
ranked_columns = cg_group.columns[roc_auc.argsort()[::-1]]
X_df = X_df[ranked_columns]
X_df.head()

In [None]:
import numpy as np

def select_features(df, threshold=.95):
    '''
        Select features greedily, discarding those that are highly correlated
        with a feature that has already been selected.

        The columns are visited from left to right. The first column that has
        not been discarded is selected, and every remaining column whose
        absolute correlation with it is strictly greater than ``threshold`` is
        discarded. Because earlier columns always win, it is important that the
        columns are sorted by a feature ranking such as, for instance, ROC-AUC
        before calling this function.

        Parameters
        ----------
        df : pandas.DataFrame
            Dataframe which contains the dataset, one feature per column.

        threshold : float
            Absolute correlation above which a feature is considered redundant
            with an already selected one.

        Returns
        -------
        selected_features : list
            Labels of the selected columns, in the order they appear in ``df``.

        Notes
        -----
        A column whose correlations are NaN (as ``DataFrame.corr`` reports for
        a constant column) never exceeds the threshold: it is selected and
        discards no other column.
    '''

    # Nothing to select from (also avoids correlating an empty frame)
    if df.shape[1] == 0:
        return []

    # The correlation between two columns does not depend on which other
    # columns are present, so the matrix is computed once up front instead of
    # once per selected feature.
    corr_df = df.corr().abs()
    # Labels are read from the correlation matrix itself so that row
    # positions and labels are guaranteed to line up.
    columns = corr_df.columns
    corr = corr_df.to_numpy()

    available = np.ones(len(columns), dtype=bool)
    selected_features = []

    for position in range(len(columns)):
        if not available[position]:
            continue

        selected_features.append(columns[position])

        # Discard every column too correlated with the one just selected.
        # Comparisons against NaN are False, so NaN correlations discard nothing.
        available &= ~(corr[position, :] > threshold)

        # Always retire the current column, even when its self-correlation is
        # NaN or does not exceed the threshold; this guarantees termination.
        available[position] = False

    return selected_features


# Keep the first (best-ranked) feature of every group of highly correlated
# columns of the AUC-sorted X_df
CORR_THRESHOLD = .95
selected_features = select_features(X_df, threshold=CORR_THRESHOLD)

In [None]:
# First rows of the dataset restricted to the selected features
X_df.loc[:, selected_features].head()

In [None]:
# Numbered list (starting at 01) of the selected feature names
columns = X_df[selected_features].columns.to_list()
for position, column_name in enumerate(columns, start=1):
    print(f'{position:02d} - {column_name}')

In [None]:
# Persist the reduced dataset (selected feature columns only, row index included)
selected_csv = 'data/dataset/dfu_features_dataset_selected.csv'
X_df.loc[:, selected_features].to_csv(selected_csv)