# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

from sklearn.feature_selection import SelectKBest, chi2, RFE, mutual_info_classif
from sklearn.svm import LinearSVC

#import pymrmr
#from pymrmr import mRMR
from sklearn.model_selection import cross_val_score
from mrmr import mrmr_classif
from ReliefF import ReliefF
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

def euclidean_distance(x1, x2):
    """Return the Euclidean (L2) distance between two numeric vectors.

    Both arguments must support element-wise subtraction with each other
    (e.g. NumPy arrays of the same shape).
    """
    delta = x1 - x2
    squared_total = np.sum(np.square(delta))
    return np.sqrt(squared_total)

def find_nearest_neighbors(instance, data, k):
    """Return positions in ``data`` of the rows ranked 2nd..(k+1)-th closest.

    Rows of ``data`` are ranked by Euclidean distance to ``instance``.  The
    single closest row (rank 0) is always left out, which is the desired
    behaviour when ``instance`` is itself one of the rows of ``data``.  If
    ``instance`` is not contained in ``data``, the true nearest row is
    dropped as well.

    Parameters:
        instance: feature vector to measure distances from.
        data: iterable of feature vectors (e.g. a 2-D array).
        k: number of neighbor positions to return.

    Returns:
        Array of at most ``k`` integer positions into ``data``.
    """
    distances = []
    for row in data:
        distances.append(euclidean_distance(instance, row))

    ranking = np.argsort(distances)
    first, last = 1, k + 1  # skip rank 0, then take the next k entries
    return ranking[first:last]

def _nearest_rows(features, instance, candidate_ids, k):
    """Return the ids (taken from ``candidate_ids``) of the k rows closest to ``instance``."""
    offsets = features[candidate_ids] - instance
    distances = np.sqrt(np.sum(offsets ** 2, axis=1))
    # Stable sort keeps the result deterministic when distances tie.
    closest = np.argsort(distances, kind="stable")[:k]
    return candidate_ids[closest]


def reliefF(X, y, k=3, num_iterations=100):
    """Estimate per-feature relevance weights with a ReliefF-style procedure.

    For ``num_iterations`` randomly drawn samples, the k nearest samples of
    the same class ("hits") and the k nearest samples of any other class
    ("misses") are located using Euclidean distance.  A feature gains weight
    when the drawn sample differs from its misses on that feature and loses
    weight when it differs from its hits, so larger weights mean more
    discriminative features.

    Simplifications compared with the published algorithm: the k neighbors
    are averaged before the difference is taken, all other classes are pooled
    into one miss set, and differences are not rescaled by feature range.

    Parameters:
        X: 2-D numeric data (pandas DataFrame or array-like), one row per
            sample and one column per feature.
        y: 1-D class labels aligned positionally with the rows of ``X``
            (pandas Series or array-like; any index is accepted).
        k: number of hits and of misses used per drawn sample (>= 1).
        num_iterations: number of random draws (with replacement).

    Returns:
        1-D NumPy array with one weight per feature, averaged over the draws
        that had at least one hit and one miss.  All zeros if no draw
        qualified.

    Raises:
        ValueError: if ``k`` is smaller than 1, ``X`` is not 2-D, or ``X``
            and ``y`` have different lengths.
    """
    if k < 1:
        raise ValueError("k must be at least 1")

    # Work on positional NumPy arrays so that a non-default pandas index on
    # X or y cannot change which label belongs to which row.
    features = np.asarray(X, dtype=float)
    labels = np.asarray(y)
    if features.ndim != 2:
        raise ValueError("X must be two-dimensional")

    num_samples, num_features = features.shape
    if labels.shape[0] != num_samples:
        raise ValueError("X and y must contain the same number of samples")

    feature_weights = np.zeros(num_features)
    if num_samples == 0:
        return feature_weights

    row_ids = np.arange(num_samples)
    draws_used = 0

    for _ in range(num_iterations):
        instance_idx = np.random.randint(0, num_samples)
        instance = features[instance_idx]

        same_class = labels == labels[instance_idx]
        # The drawn sample is excluded from its own hit set explicitly; the
        # miss set never contains it, so nothing must be skipped there.
        hit_ids = row_ids[same_class & (row_ids != instance_idx)]
        miss_ids = row_ids[~same_class]
        if hit_ids.size == 0 or miss_ids.size == 0:
            # No neighbor of one kind exists; averaging would yield NaN.
            continue

        nearest_same = features[_nearest_rows(features, instance, hit_ids, k)].mean(axis=0)
        nearest_different = features[_nearest_rows(features, instance, miss_ids, k)].mean(axis=0)

        # Reward separation from other classes, penalise spread within the class.
        feature_weights += np.abs(instance - nearest_different) - np.abs(instance - nearest_same)
        draws_used += 1

    if draws_used:
        feature_weights /= draws_used
    return feature_weights

# Read the dataset; the 'Outcome' column is the target (y) and every other
# column is used as a feature (X).
data = pd.read_csv('../data/diabetes.csv')
y = data['Outcome']
X = data.drop(columns=['Outcome'])

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

#Chi-square test and mutual information
# Both filter methods follow the same recipe: score every feature against the
# target with SelectKBest and keep the 4 best-scoring ones.
# NOTE(review): the selectors are fitted on the full dataset (X, y), not only
# on the training split.
for title, scorer in (("Chi-square : ", chi2),
                      ("Mutual information : ", mutual_info_classif)):
    k_best = SelectKBest(score_func=scorer, k=4)
    k_best.fit(X, y)

    # Column positions of the features that were kept
    selected_features = k_best.get_support(indices=True)

    # Copy of the data restricted to the kept features
    X_selected = k_best.transform(X)

    print(title)
    print("Selected features (indexes):", selected_features)

#mrmr
# Ask mrmr_classif for the 4 selected features, then print the heading and
# the result on separate lines.
selected_features = mrmr_classif(X=X, y=y, K=4)
print("mrmr : ", selected_features, sep="\n")

#reliefF
# Only executed when the file is run as a script: score every feature with the
# reliefF() function defined in this file (3 neighbors, 100 random draws).
if __name__ == "__main__":
    feature_weights = reliefF(X, y, num_iterations=100, k=3)
    print("ReliefF : ")
    print("Feature weights:", feature_weights)


#sfs
# Sequential forward selection wrapped around a k-nearest-neighbors
# classifier (any other estimator can be substituted here).
clf = KNeighborsClassifier()

sfs = SFS(
    clf,
    k_features=(1, 8),   # consider subset sizes from 1 up to 8 features
    forward=True,        # forward selection; forward=False gives backward selection
    floating=False,      # plain (non-floating) search
    scoring='accuracy',  # metric used to compare candidate subsets
    cv=5,                # number of cross-validation folds
)

# Run the search on the training split only
sfs = sfs.fit(X_train, y_train)

# Reduce both splits to the chosen columns
X_train_selected = sfs.transform(X_train)
X_test_selected = sfs.transform(X_test)

# Positions of the chosen columns
selected_feature_indices = sfs.k_feature_idx_

print("SFS : ")
print("Selected feature indices:", selected_feature_indices)

#sbs
# Sequential backward selection wrapped around logistic regression (any other
# estimator can be substituted here).
model = LogisticRegression()

sbs = SFS(
    model,
    k_features=(1, 8),   # consider subset sizes from 1 up to 8 features
    forward=False,       # False selects the step-backward variant
    floating=False,      # plain (non-floating) search
    scoring='accuracy',  # metric used to compare candidate subsets
    cv=5,                # number of cross-validation folds
)

# Run the backward search on the training split
sbs.fit(X_train, y_train)

# Positions of the chosen columns
selected_feature_indices = sbs.k_feature_idx_

# Translate the positions into column names
selected_features = [X.columns[idx] for idx in selected_feature_indices]

print("SBS : ")
print("Selected feature indices:", selected_feature_indices)
# Report the names that were just computed; previously the indices were
# printed under this label and the names were never shown.
print("Selected features:", selected_features)

# Classification algorithms to compare
algorithms = [
    LogisticRegression(),
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    svm.SVC(),
    RandomForestClassifier(),
    MLPClassifier(),
    GradientBoostingClassifier(),
    XGBClassifier(),
    LGBMClassifier(),
    CatBoostClassifier(verbose=False),
]


def evaluate_classifiers(models, X_train, X_test, y_train, y_test):
    """Fit every model on the training split and print its test-split metrics.

    Each model is (re)fitted in place.  Precision, recall and F1 use the
    'weighted' average.  Specificity is derived from the confusion matrix,
    which is unpacked as a 2x2 matrix, so the target must be binary.

    Parameters:
        models: iterable of estimators exposing ``fit`` and ``predict``.
        X_train, y_train: training features and labels.
        X_test, y_test: held-out features and labels used for scoring.
    """
    for algorithm in models:
        model_name = type(algorithm).__name__
        print(f"Training {model_name}...")

        # Fit on the training data, then predict the held-out rows
        algorithm.fit(X_train, y_train)
        y_pred = algorithm.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='weighted')
        recall = recall_score(y_test, y_pred, average='weighted')
        f1 = f1_score(y_test, y_pred, average='weighted')
        confusion = confusion_matrix(y_test, y_pred)
        # Only the true-negative and false-positive counts are needed here
        tn, fp, _fn, _tp = confusion.ravel()
        specificity = tn / (tn + fp)

        print(f"{model_name} evaluation:")
        print(f"Accuracy: {accuracy}")
        print(f"Precision: {precision}")
        print(f"Recall: {recall}")
        print(f"F1 Score: {f1}")
        print(f"Confusion Matrix: {confusion}")
        print(f"Specificity: {specificity}\n")


# Baseline: every feature
evaluate_classifiers(algorithms, X_train, X_test, y_train, y_test)

print("After select 4 features : ")

# Keep only the selected features.  'Outcome' is dropped as well: it is the
# target, and leaving it among the inputs would leak the label into the models.
X = data.drop(columns=['Pregnancies', 'BloodPressure', 'SkinThickness', 'DiabetesPedigreeFunction', 'Outcome'])
y = data['Outcome']

# Same split parameters as before so both runs are scored on the same rows
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

evaluate_classifiers(algorithms, X_train, X_test, y_train, y_test)



