Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

from sklearn.feature_selection import SelectKBest, chi2, RFE, mutual_info_classif
from sklearn.svm import LinearSVC

from mrmr import mrmr_classif
from ReliefF import ReliefF
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

Define ReliefF method

In [2]:
def euclidean_distance(x1, x2):
    return np.sqrt(np.sum((x1 - x2) ** 2))

def find_nearest_neighbors(instance, data, k):
    distances = [euclidean_distance(instance, x) for x in data]
    sorted_indices = np.argsort(distances)
    return sorted_indices[1:k+1]

def reliefF(X, y, k=3, num_iterations=100):
    num_samples, num_features = X.shape
    feature_weights = np.zeros(num_features)

    for _ in range(num_iterations):
        instance_idx = np.random.randint(0, num_samples)
        instance = X.iloc[instance_idx].values
        same_class_indices = np.where(y == y[instance_idx])[0]
        different_class_indices = np.where(y != y[instance_idx])[0]

        # Find k nearest neighbors from the same class
        same_class_neighbors = find_nearest_neighbors(instance, X.iloc[same_class_indices].values, k)
        # Find k nearest neighbors from different classes
        different_class_neighbors = find_nearest_neighbors(instance, X.iloc[different_class_indices].values, k)

        # Update feature weights
        for feature in range(num_features):
            nearest_same = np.mean(X.iloc[same_class_indices].values[:, feature][same_class_neighbors])
            nearest_different = np.mean(X.iloc[different_class_indices].values[:, feature][different_class_neighbors])
            feature_weights[feature] += abs(instance[feature] - nearest_same) - abs(instance[feature] - nearest_different)

    feature_weights /= num_iterations
    return feature_weights

Load dataset

In [3]:
df = pd.read_csv('../data/diabetes.csv')
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


check for null value

In [4]:
nan_values = df.isna()
sum_nan_values = df.isna().sum()
sum_nan_values

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

find the most frequent values

In [5]:
modes = df.mode().iloc[0]
modes

Pregnancies                  1.000
Glucose                     99.000
BloodPressure               70.000
SkinThickness                0.000
Insulin                      0.000
BMI                         32.000
DiabetesPedigreeFunction     0.254
Age                         22.000
Outcome                      0.000
Name: 0, dtype: float64

show only numeric values

In [6]:
df.describe(include=(np.number))

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


Features(X)

In [7]:
X = df.drop(columns=['Outcome'])
features = X.columns
features

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age'],
      dtype='object')

target variable(y)

In [8]:
y = df['Outcome']
y

0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: Outcome, Length: 768, dtype: int64

data normalization with MinMaxScaler

In [9]:
from sklearn.preprocessing import MinMaxScaler
mms = MinMaxScaler()
x_normed = mms.fit_transform(X) 

In [10]:
x_normed=pd.DataFrame(x_normed, columns=features)
x_normed

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0.352941,0.743719,0.590164,0.353535,0.000000,0.500745,0.234415,0.483333
1,0.058824,0.427136,0.540984,0.292929,0.000000,0.396423,0.116567,0.166667
2,0.470588,0.919598,0.524590,0.000000,0.000000,0.347243,0.253629,0.183333
3,0.058824,0.447236,0.540984,0.232323,0.111111,0.418778,0.038002,0.000000
4,0.000000,0.688442,0.327869,0.353535,0.198582,0.642325,0.943638,0.200000
...,...,...,...,...,...,...,...,...
763,0.588235,0.507538,0.622951,0.484848,0.212766,0.490313,0.039710,0.700000
764,0.117647,0.613065,0.573770,0.272727,0.000000,0.548435,0.111870,0.100000
765,0.294118,0.608040,0.590164,0.232323,0.132388,0.390462,0.071307,0.150000
766,0.058824,0.633166,0.491803,0.000000,0.000000,0.448584,0.115713,0.433333


cross validation

In [11]:
from sklearn.model_selection import cross_val_score
clf = svm.SVC(kernel='linear', C=1, random_state=42)
scores = cross_val_score(clf, X, y, cv=5)
scores

array([0.75974026, 0.75324675, 0.74025974, 0.81045752, 0.76470588])

Split the dataset into training and testing sets

In [12]:
X_train, X_test, y_train, y_test = train_test_split(x_normed, y, test_size=0.2, random_state=42)

training data

In [13]:
X_train

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
60,0.117647,0.422111,0.000000,0.000000,0.000000,0.000000,0.096499,0.000000
618,0.529412,0.562814,0.672131,0.242424,0.000000,0.420268,0.514091,0.483333
346,0.058824,0.698492,0.377049,0.191919,0.098109,0.427720,0.245944,0.016667
294,0.000000,0.809045,0.409836,0.000000,0.000000,0.326379,0.075149,0.733333
231,0.352941,0.673367,0.655738,0.373737,0.437352,0.688525,0.068318,0.416667
...,...,...,...,...,...,...,...,...
71,0.294118,0.698492,0.524590,0.353535,0.165485,0.426230,0.142186,0.083333
106,0.058824,0.482412,1.000000,0.000000,0.000000,0.333830,0.055081,0.100000
270,0.588235,0.507538,0.704918,0.373737,0.000000,0.679583,0.451751,0.283333
435,0.000000,0.708543,0.000000,0.000000,0.000000,0.631893,0.054227,0.133333


test data

In [14]:
X_test

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
668,0.352941,0.492462,0.475410,0.333333,0.224586,0.506706,0.150299,0.366667
324,0.117647,0.562814,0.614754,0.323232,0.000000,0.532042,0.029889,0.000000
624,0.117647,0.542714,0.524590,0.000000,0.000000,0.459016,0.034159,0.000000
690,0.470588,0.537688,0.655738,0.000000,0.000000,0.366617,0.332195,0.216667
473,0.411765,0.683417,0.737705,0.000000,0.000000,0.445604,0.056362,0.483333
...,...,...,...,...,...,...,...,...
355,0.529412,0.829146,0.721311,0.000000,0.000000,0.453055,0.095645,0.466667
534,0.058824,0.386935,0.459016,0.303030,0.066194,0.496274,0.500854,0.050000
344,0.470588,0.477387,0.590164,0.000000,0.000000,0.548435,0.173783,0.600000
296,0.117647,0.733668,0.573770,0.383838,0.425532,0.417288,0.110589,0.133333


correlation matrix

In [15]:
corrs = df.corr()
corrs['Outcome']

Pregnancies                 0.221898
Glucose                     0.466581
BloodPressure               0.065068
SkinThickness               0.074752
Insulin                     0.130548
BMI                         0.292695
DiabetesPedigreeFunction    0.173844
Age                         0.238356
Outcome                     1.000000
Name: Outcome, dtype: float64

In [16]:
import plotly.figure_factory as ff
from plotly.offline import iplot

# Correlation Heatmap
iplot(ff.create_annotated_heatmap(corrs.iloc[:8, :8].round(3).values, x=list(corrs.iloc[:8, :8].columns), 
                                  y=list(corrs.iloc[:8, :8].index), annotation_text=corrs.iloc[:8, :8].round(3).values))

Chi square

In [17]:
k_best = SelectKBest(score_func=chi2, k=4)

# Fit the feature selector to the data
k_best.fit(X, y)

# Get the selected features (indexes of the selected features)
selected_features = k_best.get_support(indices=True)

# Transform the original data to retain only the selected features
X_selected = k_best.transform(X)
selected_features

array([1, 4, 5, 7], dtype=int64)

Mutual information

In [18]:
# Create the SelectKBest object with mutual_info_classif scoring function
k_best = SelectKBest(score_func=mutual_info_classif, k=4)

# Fit the feature selector to the data
k_best.fit(X, y)

# Get the selected features (indexes of the selected features)
selected_features = k_best.get_support(indices=True)

# Transform the original data to retain only the selected features
X_selected = k_best.transform(X)
selected_features

array([0, 1, 5, 7], dtype=int64)

mrmr

In [19]:
selected_features = mrmr_classif(X=X, y=y, K=4)
selected_features

100%|██████████| 4/4 [00:00<00:00, 41.56it/s]


['Glucose', 'BMI', 'Pregnancies', 'DiabetesPedigreeFunction']

ReliefF

In [20]:
feature_weights = reliefF(X, y, k=3, num_iterations=100)
feature_weights

array([-0.37      , -2.18      , -1.65      , -1.48      , -1.26666667,
       -1.77533333, -0.01524333, -1.65666667])

Step Forward Selection

In [21]:
# Create a classifier (replace this with the classifier of your choice)
clf = KNeighborsClassifier()

# Create the SequentialFeatureSelector object with forward selection
sfs = SFS(clf,
          k_features=(1, 8),  # Range of features to select (1 to all features)
          forward=True,  # Forward selection (can also use backward=False for backward selection)
          floating=False,  # Disable floating search
          scoring='accuracy',  # Scoring metric for feature selection
          cv=5)  # Cross-validation folds

# Fit the SequentialFeatureSelector to the training data
sfs = sfs.fit(X_train, y_train)

# Get the selected feature indices
selected_feature_indices = sfs.k_feature_idx_

# Transform the original data to retain only the selected features
X_train_selected = sfs.transform(X_train)
X_test_selected = sfs.transform(X_test)

selected_feature_indices


(0, 1, 2, 4, 5)

Step Backward Selection

In [22]:
model = LogisticRegression()  # You can use any other model of your choice

# Initialize the step backward feature selector
sbs = SFS(model,
        k_features=(1, 8),
        forward=False,  # Change to False for step backward selection
        floating=False,
        #verbose=2,
        scoring='accuracy',  # Change this to your chosen metric
        cv=5)  # Number of cross-validation folds

# Perform step backward feature selection
sbs.fit(X_train, y_train)

# Get the selected feature indices
selected_feature_indices = sbs.k_feature_idx_

# Convert the indices to feature names
selected_features = [X.columns[idx] for idx in selected_feature_indices]

selected_feature_indices

(0, 1, 5, 6)

Algorithms

In [23]:
# Define a list of classification algorithms you want to run
algorithms = [LogisticRegression(), KNeighborsClassifier(), DecisionTreeClassifier(), svm.SVC(), RandomForestClassifier(), MLPClassifier(), GradientBoostingClassifier(), XGBClassifier(), CatBoostClassifier(verbose=False)]

In [24]:
# Create a for loop to run each algorithm
for algorithm in algorithms:
    model_name = type(algorithm).__name__
    print(f"Training {model_name}...")
    
    # Fit the model on the training data
    algorithm.fit(X_train, y_train)
    
    # Make predictions on the test data
    y_pred = algorithm.predict(X_test)
    
    # Evaluate the model's performance using various metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    confusion = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = confusion.ravel()
    specificity = tn / (tn + fp)
    
    print(f"{model_name} evaluation:")
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")
    print(f"Confusion Matrix: {confusion}")
    print(f"Specificity: {specificity}\n")

Training LogisticRegression...
LogisticRegression evaluation:
Accuracy: 0.7662337662337663
Precision: 0.7611766419105869
Recall: 0.7662337662337663
F1 Score: 0.7601648351648352
Confusion Matrix: [[86 13]
 [23 32]]
Specificity: 0.8686868686868687

Training KNeighborsClassifier...
KNeighborsClassifier evaluation:
Accuracy: 0.6883116883116883
Precision: 0.6816326530612246
Recall: 0.6883116883116883
F1 Score: 0.6839043309631545
Confusion Matrix: [[78 21]
 [27 28]]
Specificity: 0.7878787878787878

Training DecisionTreeClassifier...
DecisionTreeClassifier evaluation:
Accuracy: 0.7532467532467533
Precision: 0.7605953010500869
Recall: 0.7532467532467533
F1 Score: 0.7557727832512315
Confusion Matrix: [[77 22]
 [16 39]]
Specificity: 0.7777777777777778

Training SVC...
SVC evaluation:
Accuracy: 0.7467532467532467
Precision: 0.7425824175824176
Recall: 0.7467532467532467
F1 Score: 0.7438423645320197
Confusion Matrix: [[82 17]
 [22 33]]
Specificity: 0.8282828282828283

Training RandomForestClassifie

Feature Selection

In [25]:
columns_to_select = [0, 1, 5, 6, 7]  # Replace [0, 2] with a list of index numbers of columns you want to select
x_normed = x_normed.iloc[:, columns_to_select]
x_normed

Unnamed: 0,Pregnancies,Glucose,BMI,DiabetesPedigreeFunction,Age
0,0.352941,0.743719,0.500745,0.234415,0.483333
1,0.058824,0.427136,0.396423,0.116567,0.166667
2,0.470588,0.919598,0.347243,0.253629,0.183333
3,0.058824,0.447236,0.418778,0.038002,0.000000
4,0.000000,0.688442,0.642325,0.943638,0.200000
...,...,...,...,...,...
763,0.588235,0.507538,0.490313,0.039710,0.700000
764,0.117647,0.613065,0.548435,0.111870,0.100000
765,0.294118,0.608040,0.390462,0.071307,0.150000
766,0.058824,0.633166,0.448584,0.115713,0.433333


In [26]:
X_train, X_test, y_train, y_test = train_test_split(x_normed, y, test_size=0.2, random_state=42)

In [27]:
for algorithm in algorithms:
    model_name = type(algorithm).__name__
    print(f"Training {model_name}...")
    
    # Fit the model on the training data
    algorithm.fit(X_train, y_train)
    
    # Make predictions on the test data
    y_pred = algorithm.predict(X_test)
    
    # Evaluate the model's performance using various metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    confusion = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = confusion.ravel()
    specificity = tn / (tn + fp)
    
    print(f"{model_name} evaluation:")
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")
    print(f"Confusion Matrix: {confusion}")
    print(f"Specificity: {specificity}\n")

Training LogisticRegression...
LogisticRegression evaluation:
Accuracy: 0.7727272727272727
Precision: 0.7681159420289854
Recall: 0.7727272727272727
F1 Score: 0.7675419715884632
Confusion Matrix: [[86 13]
 [22 33]]
Specificity: 0.8686868686868687

Training KNeighborsClassifier...
KNeighborsClassifier evaluation:
Accuracy: 0.7337662337662337
Precision: 0.7372998768472907
Recall: 0.7337662337662337
F1 Score: 0.7352523582612079
Confusion Matrix: [[77 22]
 [19 36]]
Specificity: 0.7777777777777778

Training DecisionTreeClassifier...
DecisionTreeClassifier evaluation:
Accuracy: 0.7337662337662337
Precision: 0.7429623321979564
Recall: 0.7337662337662337
F1 Score: 0.7368518625063127
Confusion Matrix: [[75 24]
 [17 38]]
Specificity: 0.7575757575757576

Training SVC...
SVC evaluation:
Accuracy: 0.7597402597402597
Precision: 0.7559065934065935
Recall: 0.7597402597402597
F1 Score: 0.7569786535303777
Confusion Matrix: [[83 16]
 [21 34]]
Specificity: 0.8383838383838383

Training RandomForestClassifie