<a href="https://colab.research.google.com/github/ohansfav/PROJECT-CODE/blob/main/Project_bwwpa.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#for feature selections

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import RFE, SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression, LassoCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from google.colab import drive, files
import io
import random

def calculate_fitness(solution, X_train, X_test, y_train, y_test):
    selected_indices = np.where(solution == 1)
    if len(selected_indices[0]) == 0:
        return 0, 0
    X_train_selected = X_train[:, selected_indices[0]]
    X_test_selected = X_test[:, selected_indices[0]]
    model = KNeighborsClassifier(n_neighbors=5)
    model.fit(X_train_selected, y_train)
    accuracy = accuracy_score(y_test, model.predict(X_test_selected))
    num_features = len(selected_indices[0])
    total_features = X_train.shape[1]
    alpha = 0.9
    fitness_score = alpha * accuracy + (1 - alpha) * (1 - (num_features / total_features))
    return fitness_score, accuracy

def bwwpa_feature_selection(X_train, X_test, y_train, y_test, max_iter=30):
    n_plants = 14
    n_features = X_train.shape[1]
    positions = np.random.randint(0, 2, size=(n_plants, n_features))
    best_fitness, best_accuracy = -1.0, -1.0
    best_position = np.zeros(n_features)
    K = 1.0
    for i in range(n_plants):
        current_fitness, current_accuracy_temp = calculate_fitness(positions[i], X_train, X_test, y_train, y_test)
        if current_fitness > best_fitness:
            best_fitness = current_fitness
            best_position = positions[i].copy()
            best_accuracy = current_accuracy_temp
    for t in range(1, max_iter + 1):
        for i in range(n_plants):
            r, r1, r2, r3 = np.random.rand(4)
            current_position = positions[i].copy()
            if r < 0.5:
                W_vec = r1 * (current_position + 2 * K)
                new_position = current_position + W_vec * (2 * K + r2)
            else:
                W_vec = r3 * (K * best_position + r3 * current_position)
                new_position = current_position + K * W_vec

            sigmoid_val = 1 / (1 + np.exp(-new_position))
            positions[i] = (sigmoid_val > np.random.rand(n_features)).astype(int)
            current_fitness, current_accuracy = calculate_fitness(positions[i], X_train, X_test, y_train, y_test)
            if current_fitness > best_fitness:
                best_fitness = current_fitness
                best_position = positions[i].copy()
                best_accuracy = current_accuracy
        K = 1 + 2 * t**2 / (max_iter)**3 + 0.5
    _, accuracy = calculate_fitness(best_position, X_train, X_test, y_train, y_test)
    selected_indices = np.where(best_position == 1)
    return accuracy, len(selected_indices[0]), selected_indices[0].tolist()

def rfe_feature_selection(X_train, X_test, y_train, y_test):
    estimator = LogisticRegression(max_iter=2000, solver='liblinear', random_state=42)
    selector = RFE(estimator, n_features_to_select=X_train.shape[1]//2, step=1)
    selector = selector.fit(X_train, y_train)
    selected_indices = np.where(selector.support_)
    X_train_selected = X_train[:, selected_indices[0]]
    X_test_selected = X_test[:, selected_indices[0]]
    model = KNeighborsClassifier(n_neighbors=5)
    model.fit(X_train_selected, y_train)
    accuracy = accuracy_score(y_test, model.predict(X_test_selected))
    return accuracy, len(selected_indices[0]), selected_indices[0].tolist()

def selectkbest_feature_selection(X_train, X_test, y_train, y_test):
    selector = SelectKBest(score_func=f_classif, k=X_train.shape[1]//2)
    selector.fit(X_train, y_train)
    selected_indices = np.where(selector.get_support())
    X_train_selected = X_train[:, selected_indices[0]]
    X_test_selected = X_test[:, selected_indices[0]]
    model = KNeighborsClassifier(n_neighbors=5)
    model.fit(X_train_selected, y_train)
    accuracy = accuracy_score(y_test, model.predict(X_test_selected))
    return accuracy, len(selected_indices[0]), selected_indices[0].tolist()

def randomforest_feature_selection(X_train, X_test, y_train, y_test):
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)
    importances = model.feature_importances_
    threshold = np.median(importances)
    selected_indices = np.where(importances > threshold)
    if len(selected_indices[0]) == 0: return 0, 0, []
    X_train_selected = X_train[:, selected_indices[0]]
    X_test_selected = X_test[:, selected_indices[0]]
    eval_model = KNeighborsClassifier(n_neighbors=5)
    eval_model.fit(X_train_selected, y_train)
    accuracy = accuracy_score(y_test, eval_model.predict(X_test_selected))
    return accuracy, len(selected_indices[0]), selected_indices[0].tolist()

def lasso_feature_selection(X_train, X_test, y_train, y_test):
    """
    Performs feature selection using Lasso (L1 regularization) and evaluates
    the performance using KNeighborsClassifier.

    Args:
        X_train (np.ndarray): Training features.
        X_test (np.ndarray): Testing features.
        y_train (np.ndarray): Training labels.
        y_test (np.ndarray): Testing labels.

    Returns:
        tuple: A tuple containing:
            - float: Accuracy score on the test set using selected features.
            - int: The number of features selected.
            - list: A list of indices of the selected features.
    """
    model = LassoCV(cv=5, random_state=42, max_iter=10000)
    model.fit(X_train, y_train)
    selected_indices = np.where(np.abs(model.coef_) > 1e-5)[0]
    if len(selected_indices) == 0:
        return 0.0, 0, []
    X_train_selected = X_train[:, selected_indices]
    X_test_selected = X_test[:, selected_indices]
    eval_model = KNeighborsClassifier(n_neighbors=5)
    eval_model.fit(X_train_selected, y_train)
    accuracy = accuracy_score(y_test, eval_model.predict(X_test_selected))
    return accuracy, len(selected_indices), selected_indices.tolist()

def calculate_fitness_ga(solution, X_train, X_test, y_train, y_test):
    """
    Fitness function for the Genetic Algorithm. Evaluates a feature subset.
    """
    selected_indices = np.where(solution == 1)[0]
    if len(selected_indices) == 0:
        return 0, 0
    X_train_selected = X_train[:, selected_indices]
    X_test_selected = X_test[:, selected_indices]
    model = KNeighborsClassifier(n_neighbors=5)
    model.fit(X_train_selected, y_train)
    predictions = model.predict(X_test_selected)
    accuracy = accuracy_score(y_test, predictions)
    num_features = len(selected_indices)
    total_features = X_train.shape[1]
    alpha = 0.9
    fitness_score = alpha * accuracy + (1 - alpha) * (1 - (num_features / total_features))
    return fitness_score, accuracy

def genetic_algorithm_feature_selection(X_train, X_test, y_train, y_test, population_size=50, generations=100, crossover_rate=0.8, mutation_rate=0.01):
    """
    Performs feature selection using a Genetic Algorithm.
    """
    n_features = X_train.shape[1]
    population = np.random.randint(0, 2, size=(population_size, n_features))
    best_fitness = -1
    best_chromosome = None
    best_accuracy = 0
    for generation in range(generations):
        fitness_scores = []
        accuracies = []
        for chromosome in population:
            fitness, accuracy = calculate_fitness_ga(chromosome, X_train, X_test, y_train, y_test)
            fitness_scores.append(fitness)
            accuracies.append(accuracy)
        current_best_fitness_index = np.argmax(fitness_scores)
        if fitness_scores[current_best_fitness_index] > best_fitness:
            best_fitness = fitness_scores[current_best_fitness_index]
            best_chromosome = population[current_best_fitness_index].copy()
            best_accuracy = accuracies[current_best_fitness_index]
        new_population = []
        for _ in range(population_size):
            tournament_size = 5
            competitors_indices = random.sample(range(population_size), tournament_size)
            winner_index = competitors_indices[np.argmax([fitness_scores[i] for i in competitors_indices])]
            new_population.append(population[winner_index].copy())
        population = np.array(new_population)
        for i in range(0, population_size, 2):
            if random.random() < crossover_rate:
                crossover_point = random.randint(1, n_features - 1)
                population[i, :crossover_point], population[i+1, :crossover_point] = population[i+1, :crossover_point].copy(), population[i, :crossover_point].copy()
                population[i, crossover_point:], population[i+1, crossover_point:] = population[i+1, crossover_point:].copy(), population[i, crossover_point:].copy()
        for i in range(population_size):
            for j in range(n_features):
                if random.random() < mutation_rate:
                    population[i, j] = 1 - population[i, j]
    selected_indices = np.where(best_chromosome == 1)[0]
    return best_accuracy, len(selected_indices), selected_indices.tolist()

def scale_features(X_train, X_test):
    """
    Scales the training and testing features using StandardScaler.
    """
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    return X_train_scaled, X_test_scaled

def evaluate_with_gradient_boosting(X_train_scaled, X_test_scaled, y_train, y_test, selected_indices):
    """
    Evaluates the performance of selected features using a Gradient Boosting Classifier.
    """
    if not selected_indices:
        return 0.0
    X_train_selected = X_train_scaled[:, selected_indices]
    X_test_selected = X_test_scaled[:, selected_indices]
    model = GradientBoostingClassifier(random_state=42)
    model.fit(X_train_selected, y_train)
    predictions = model.predict(X_test_selected)
    accuracy = accuracy_score(y_test, predictions)
    return accuracy

def evaluate_with_neural_network(X_train_scaled, X_test_scaled, y_train, y_test, selected_indices):
    """
    Evaluates the performance of selected features using a simple Neural Network Classifier.
    """
    if not selected_indices:
        return 0.0
    X_train_selected = X_train_scaled[:, selected_indices]
    X_test_selected = X_test_scaled[:, selected_indices]
    model = MLPClassifier(random_state=42, max_iter=2000)
    model.fit(X_train_selected, y_train)
    predictions = model.predict(X_test_selected)
    accuracy = accuracy_score(y_test, predictions)
    return accuracy


if __name__ == '__main__':
    # Set global random seeds for reproducibility
    random.seed(42)
    np.random.seed(42)

    print("Mounting Google Drive...")
    drive.mount('/content/drive')

    FILE_PATH = '/content/drive/MyDrive/cervicaldata.xlsx'
    TARGET_NAME = 'status'

    try:
        df = pd.read_excel(FILE_PATH)
        print(f"\nSuccessfully loaded file from Google Drive: {FILE_PATH}")
        file_name = FILE_PATH
    except FileNotFoundError:
        print(f"\nError: File not found at {FILE_PATH}. Using fallback: local upload widget.")
        uploaded = files.upload()
        file_name = list(uploaded.keys())[0]
        df = pd.read_excel(io.BytesIO(uploaded[file_name]))

    print("\nDataFrame loaded successfully. Columns found:")
    print(df.columns.tolist())

    if TARGET_NAME not in df.columns:
        print(f"\nFATAL ERROR: Target column '{TARGET_NAME}' not found in the Excel file.")
        print("Please edit the 'TARGET_NAME' variable in the script to match a column exactly.")
    else:
        y = df[TARGET_NAME]
        X = df.drop(columns=[TARGET_NAME])
        for col in X.columns:
            X[col] = pd.to_numeric(X[col], errors='coerce')
        imputer = SimpleImputer(strategy='mean')
        X = imputer.fit_transform(X)
        if not pd.api.types.is_numeric_dtype(y):
            le_y = LabelEncoder()
            y = le_y.fit_transform(y.astype(str).fillna('missing'))
        else:
            y = y.values
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
        X_train_scaled, X_test_scaled = scale_features(X_train, X_test)

        results_knn = []
        results_gb = []
        results_nn = []

        print(f"\n--- Running Feature Selection Methods on {file_name} ---")
        methods = {
            "bWWPA": bwwpa_feature_selection,
            "RFE": rfe_feature_selection,
            "SelectKBest": selectkbest_feature_selection,
            "Random Forest Importance": randomforest_feature_selection,
            "Lasso (L1)": lasso_feature_selection,
            "Genetic Algorithm": genetic_algorithm_feature_selection
        }

        for method_name, method_func in methods.items():
            if method_name in ["bWWPA", "Genetic Algorithm"]:
                acc_knn, count, indices = method_func(X_train_scaled, X_test_scaled, y_train, y_test)
            else:
                acc_knn, count, indices = method_func(X_train_scaled, X_test_scaled, y_train, y_test)
            results_knn.append({"Method": method_name, "Accuracy_KNN": acc_knn, "Feature Count": count, "Selected Indices": indices})
            acc_gb = evaluate_with_gradient_boosting(X_train_scaled, X_test_scaled, y_train, y_test, indices)
            results_gb.append({"Method": method_name, "Accuracy_GB": acc_gb, "Feature Count": count, "Selected Indices": indices})
            acc_nn = evaluate_with_neural_network(X_train_scaled, X_test_scaled, y_train, y_test, indices)
            results_nn.append({"Method": method_name, "Accuracy_NN": acc_nn, "Feature Count": count, "Selected Indices": indices})

        model_knn = KNeighborsClassifier(n_neighbors=5)
        model_knn.fit(X_train_scaled, y_train)
        baseline_acc_knn = accuracy_score(y_test, model_knn.predict(X_test_scaled))

        model_gb = GradientBoostingClassifier(random_state=42)
        model_gb.fit(X_train_scaled, y_train)
        baseline_acc_gb = accuracy_score(y_test, model_gb.predict(X_test_scaled))

        model_nn = MLPClassifier(random_state=42, max_iter=2000)
        model_nn.fit(X_train_scaled, y_train)
        baseline_acc_nn = accuracy_score(y_test, model_nn.predict(X_test_scaled))

        all_indices = list(range(X.shape[1]))
        results_knn.append({"Method": "Baseline (No FS)", "Accuracy_KNN": baseline_acc_knn, "Feature Count": X.shape[1], "Selected Indices": all_indices})
        results_gb.append({"Method": "Baseline (No FS)", "Accuracy_GB": baseline_acc_gb, "Feature Count": X.shape[1], "Selected Indices": all_indices})
        results_nn.append({"Method": "Baseline (No FS)", "Accuracy_NN": baseline_acc_nn, "Feature Count": X.shape[1], "Selected Indices": all_indices})

        results_df_knn = pd.DataFrame(results_knn)
        results_df_knn = results_df_knn.sort_values(by=['Accuracy_KNN', 'Feature Count'], ascending=[False, True])

        results_df_gb = pd.DataFrame(results_gb)
        results_df_gb = results_df_gb.sort_values(by=['Accuracy_GB', 'Feature Count'], ascending=[False, True])

        results_df_nn = pd.DataFrame(results_nn)
        results_df_nn = results_df_nn.sort_values(by=['Accuracy_NN', 'Feature Count'], ascending=[False, True])

        print("\n" + "=" * 50)
        print("Feature Selection Results (Ranked by KNN Accuracy):")
        print("=" * 50)
        print(results_df_knn.to_markdown(index=False))

        print("\n" + "=" * 50)
        print("Feature Selection Results (Ranked by Gradient Boosting Accuracy):")
        print("=" * 50)
        print(results_df_gb.to_markdown(index=False))

        print("\n" + "=" * 50)
        print("Feature Selection Results (Ranked by Neural Network Accuracy):")
        print("=" * 50)
        print(results_df_nn.to_markdown(index=False))

        print("\n" + "=" * 50)
        print("Top Two Methods by KNN Accuracy:")
        print("=" * 50)
        top_two_knn = results_df_knn.head(2)
        for index, row in top_two_knn.iterrows():
            print(f"-> {row['Method']}: Accuracy = {row['Accuracy_KNN']:.4f}, Features Selected = {int(row['Feature Count']) if type(row['Feature Count']) is not np.int64 else row['Feature Count']}")

        print("\n" + "=" * 50)
        print("Top Two Methods by Gradient Boosting Accuracy:")
        print("=" * 50)
        top_two_gb = results_df_gb.head(2)
        for index, row in top_two_gb.iterrows():
            print(f"-> {row['Method']}: Accuracy = {row['Accuracy_GB']:.4f}, Features Selected = {int(row['Feature Count']) if type(row['Feature Count']) is not np.int64 else row['Feature Count']}")

        print("\n" + "=" * 50)
        print("Top Two Methods by Neural Network Accuracy:")
        print("=" * 50)
        top_two_nn = results_df_nn.head(2)
        for index, row in top_two_nn.iterrows():
            print(f"-> {row['Method']}: Accuracy = {row['Accuracy_NN']:.4f}, Features Selected = {int(row['Feature Count']) if type(row['Feature Count']) is not np.int64 else row['Feature Count']}")

Mounting Google Drive...
Mounted at /content/drive

Successfully loaded file from Google Drive: /content/drive/MyDrive/cervicaldata.xlsx

DataFrame loaded successfully. Columns found:
['years_after_diagnosis', 'age_at_diagnosis', 'stage level', 'chemotherapy', 'brachtherapy', 'chemoradiation', 'radiotherapy', 'radiation', 'menopause', 'MENO_post', 'HISTOLOGY', 'CM_1', 'CM_2', 'CM_3', 'status']

--- Running Feature Selection Methods on /content/drive/MyDrive/cervicaldata.xlsx ---

Feature Selection Results (Ranked by KNN Accuracy):
| Method                   |   Accuracy_KNN |   Feature Count | Selected Indices                               |
|:-------------------------|---------------:|----------------:|:-----------------------------------------------|
| Genetic Algorithm        |       0.878788 |               4 | [1, 4, 7, 9]                                   |
| bWWPA                    |       0.787879 |               5 | [0, 2, 3, 4, 9]                                |
| RFE      

#CPH 4 GB

In [None]:
all_gb_cph = results_df_gb.copy()
all_gb_cph['CPH'] = all_gb_cph['Accuracy_GB'] / all_gb_cph['Feature Count']
display(all_gb_cph[['Method', 'Accuracy_GB', 'Feature Count', 'CPH']])

Unnamed: 0,Method,Accuracy_GB,Feature Count,CPH
1,RFE,0.727273,7,0.103896
6,Baseline (No FS),0.727273,14,0.051948
0,bWWPA,0.69697,5,0.139394
3,Random Forest Importance,0.69697,7,0.099567
2,SelectKBest,0.666667,7,0.095238
4,Lasso (L1),0.636364,4,0.159091
5,Genetic Algorithm,0.606061,4,0.151515


#CPH FOR NN

In [None]:
all_nn_cph = results_df_nn.copy()
all_nn_cph['CPH'] = all_nn_cph['Accuracy_NN'] / all_nn_cph['Feature Count']
display(all_nn_cph[['Method', 'Accuracy_NN', 'Feature Count', 'CPH']])

Unnamed: 0,Method,Accuracy_NN,Feature Count,CPH
0,bWWPA,0.848485,5,0.169697
1,RFE,0.757576,7,0.108225
2,SelectKBest,0.757576,7,0.108225
5,Genetic Algorithm,0.727273,4,0.181818
4,Lasso (L1),0.69697,4,0.174242
3,Random Forest Importance,0.69697,7,0.099567
6,Baseline (No FS),0.666667,14,0.047619


#CPH FOR KNN

In [None]:
all_knn_cph = results_df_knn.copy()
all_knn_cph['CPH'] = all_knn_cph['Accuracy_KNN'] / all_knn_cph['Feature Count']
display(all_knn_cph[['Method', 'Accuracy_KNN', 'Feature Count', 'CPH']])

Unnamed: 0,Method,Accuracy_KNN,Feature Count,CPH
5,Genetic Algorithm,0.878788,4,0.219697
0,bWWPA,0.787879,5,0.157576
1,RFE,0.69697,7,0.099567
2,SelectKBest,0.69697,7,0.099567
3,Random Forest Importance,0.69697,7,0.099567
6,Baseline (No FS),0.69697,14,0.049784
4,Lasso (L1),0.666667,4,0.166667


# Task
Analyze and rank the Coefficient of Performance (CPH) for KNN, Gradient Boosting, and Neural Network classifiers based on the `all_knn_cph`, `all_gb_cph`, and `all_nn_cph` DataFrames. Then, summarize the findings by identifying the feature selection methods that consistently provide high CPH across all classifiers, and discuss their efficiency in achieving good accuracy with reduced features.

## Analyze and Rank CPH for KNN

### Subtask:
Analyze the `all_knn_cph` DataFrame, sort it by the 'CPH' column in descending order, and display the top-performing methods.


**Reasoning**:
To analyze and rank the CPH for KNN, I need to sort the `all_knn_cph` DataFrame by the 'CPH' column in descending order and display the relevant columns.



In [None]:
sorted_knn_cph = all_knn_cph.sort_values(by='CPH', ascending=False)
display(sorted_knn_cph[['Method', 'Accuracy_KNN', 'Feature Count', 'CPH']])

Unnamed: 0,Method,Accuracy_KNN,Feature Count,CPH
5,Genetic Algorithm,0.878788,4,0.219697
4,Lasso (L1),0.666667,4,0.166667
0,bWWPA,0.787879,5,0.157576
1,RFE,0.69697,7,0.099567
2,SelectKBest,0.69697,7,0.099567
3,Random Forest Importance,0.69697,7,0.099567
6,Baseline (No FS),0.69697,14,0.049784


## Analyze and Rank CPH for Gradient Boosting

### Subtask:
Analyze the `all_gb_cph` DataFrame, sort it by the 'CPH' column in descending order, and display the top-performing methods.


**Reasoning**:
To analyze and rank the CPH for Gradient Boosting, I need to sort the `all_gb_cph` DataFrame by the 'CPH' column in descending order and display the relevant columns.



In [None]:
sorted_gb_cph = all_gb_cph.sort_values(by='CPH', ascending=False)
display(sorted_gb_cph[['Method', 'Accuracy_GB', 'Feature Count', 'CPH']])

Unnamed: 0,Method,Accuracy_GB,Feature Count,CPH
4,Lasso (L1),0.636364,4,0.159091
5,Genetic Algorithm,0.606061,4,0.151515
0,bWWPA,0.69697,5,0.139394
1,RFE,0.727273,7,0.103896
3,Random Forest Importance,0.69697,7,0.099567
2,SelectKBest,0.666667,7,0.095238
6,Baseline (No FS),0.727273,14,0.051948


**Reasoning**:
To analyze and rank the CPH for Neural Network, I need to sort the `all_nn_cph` DataFrame by the 'CPH' column in descending order and display the relevant columns.



In [None]:
sorted_nn_cph = all_nn_cph.sort_values(by='CPH', ascending=False)
display(sorted_nn_cph[['Method', 'Accuracy_NN', 'Feature Count', 'CPH']])

Unnamed: 0,Method,Accuracy_NN,Feature Count,CPH
5,Genetic Algorithm,0.727273,4,0.181818
4,Lasso (L1),0.69697,4,0.174242
0,bWWPA,0.848485,5,0.169697
2,SelectKBest,0.757576,7,0.108225
1,RFE,0.757576,7,0.108225
3,Random Forest Importance,0.69697,7,0.099567
6,Baseline (No FS),0.666667,14,0.047619


## Summary of CPH Analysis Across Classifiers

### KNN CPH Ranking:
```
| Method                   |   Accuracy_KNN |   Feature Count |        CPH |
|:-------------------------|---------------:|----------------:|-----------:|
| Genetic Algorithm        |       0.878788 |               4 | 0.219697   |
| Lasso (L1)               |       0.666667 |               4 | 0.166667   |
| bWWPA                    |       0.787879 |               5 | 0.157576   |
| RFE                      |       0.69697  |               7 | 0.0995671  |
| SelectKBest              |       0.69697  |               7 | 0.0995671  |
| Random Forest Importance |       0.69697  |               7 | 0.0995671  |
| Baseline (No FS)         |       0.69697  |              14 | 0.0497835  |
```

### Gradient Boosting CPH Ranking:
```
| Method                   |   Accuracy_GB |   Feature Count |        CPH |
|:-------------------------|--------------:|----------------:|-----------:|
| Lasso (L1)               |      0.636364 |               4 | 0.159091   |
| Genetic Algorithm        |      0.606061 |               4 | 0.151515   |
| bWWPA                    |      0.69697  |               5 | 0.139394   |
| RFE                      |      0.727273 |               7 | 0.103896   |
| Random Forest Importance |      0.69697  |               7 | 0.0995671  |
| SelectKBest              |      0.666667 |               7 | 0.0952381  |
| Baseline (No FS)         |      0.727273 |              14 | 0.0519481  |
```

### Neural Network CPH Ranking:
```
| Method                   |   Accuracy_NN |   Feature Count |        CPH |
|:-------------------------|--------------:|----------------:|-----------:|
| Genetic Algorithm        |      0.727273 |               4 | 0.181818   |
| Lasso (L1)               |      0.69697  |               4 | 0.174242   |
| bWWPA                    |      0.848485 |               5 | 0.169697   |
| SelectKBest              |      0.757576 |               7 | 0.108225   |
| RFE                      |      0.757576 |               7 | 0.108225   |
| Random Forest Importance |      0.69697  |               7 | 0.0995671  |
| Baseline (No FS)         |      0.666667 |              14 | 0.047619   |
```

### Cross-Classifier CPH Performance Analysis:

Upon reviewing the CPH rankings for KNN, Gradient Boosting, and Neural Network classifiers, the following observations can be made:

*   **Genetic Algorithm (GA)** consistently shows a high CPH, particularly for KNN and Neural Networks. For KNN, GA has the highest CPH of 0.2197 with only 4 features, demonstrating excellent efficiency. For NN, GA also ranks high with a CPH of 0.1818.

*   **Lasso (L1) Regression** also exhibits strong CPH, especially for Gradient Boosting and Neural Networks, often selecting a small number of features. For Gradient Boosting, Lasso achieved the highest CPH of 0.1591 with only 4 features. It also performs well for NN, holding the second highest CPH.

*   **bWWPA** performs very well in terms of CPH for KNN and Neural Networks. It secures the third-highest CPH for KNN (0.1576) and the third-highest for NN (0.1697), generally with a moderate number of features.

*   **RFE** and **SelectKBest** tend to have lower CPH values compared to GA, Lasso, and bWWPA, as they often select a higher number of features while not always yielding proportionally higher accuracy. Their CPH values are generally in the range of 0.10-0.11 for KNN and NN, and slightly lower for GB.

*   **Random Forest Importance** generally shows moderate CPH values across all classifiers, similar to RFE and SelectKBest, suggesting that while it can be effective, it might not always be the most efficient in terms of feature reduction for the given dataset and evaluation metrics.

*   The **Baseline (No FS)** method consistently has the lowest CPH, as expected, due to using all 14 features without any reduction. This highlights the value of feature selection in improving model efficiency.

**Efficiency Discussion:**

**Genetic Algorithm** and **Lasso (L1)** emerge as the most efficient methods in this analysis. They achieve high predictive accuracy with a significantly reduced number of features, leading to superior CPH values. This indicates that these methods are highly effective at identifying the most relevant features, which not only improves model interpretability but also reduces computational cost and potential overfitting. bWWPA also shows good efficiency, especially with KNN and NN. These methods are valuable for scenarios where both accuracy and model simplicity (fewer features) are critical.


## Summary:

### Q&A
The feature selection methods that consistently provide high Coefficient of Performance (CPH) across all classifiers are **Genetic Algorithm (GA)** and **Lasso (L1) Regression**.

Their efficiency in achieving good accuracy with reduced features is high:
*   **Genetic Algorithm** consistently delivered top-tier CPH values, notably the highest for KNN (0.2197) and Neural Networks (0.1818), typically utilizing only 4 features. This demonstrates its strong ability to achieve high accuracy with a significantly reduced feature set.
*   **Lasso (L1) Regression** also showed robust efficiency, achieving the highest CPH for Gradient Boosting (0.1591) and the second highest for Neural Networks (0.1742), also with just 4 features.
*   **bWWPA** also demonstrated good efficiency, particularly for KNN and Neural Networks, securing the third-highest CPH in both cases (0.1576 and 0.1697, respectively) with a moderate number of 5 features.
These methods effectively identify the most relevant features, improving model interpretability, reducing computational cost, and mitigating potential overfitting.

### Data Analysis Key Findings
*   **Genetic Algorithm (GA)** consistently achieved the highest or near-highest Coefficient of Performance (CPH) across all classifiers. It recorded the highest CPH for KNN at 0.2197 (with 4 features and 0.8788 accuracy) and for Neural Networks at 0.1818 (with 4 features).
*   **Lasso (L1) Regression** also performed exceptionally well, achieving the highest CPH for Gradient Boosting at 0.1591 (with 4 features) and the second-highest CPH for Neural Networks at 0.1742 (with 4 features).
*   **bWWPA** ranked third in CPH for both KNN (0.1576 with 5 features) and Neural Networks (0.1697 with 5 features), indicating good efficiency.
*   Methods like RFE, SelectKBest, and Random Forest Importance generally exhibited lower CPH values, typically in the 0.10-0.11 range, often using 7 features.
*   The **Baseline (No FS)** method, which used all 14 features, consistently showed the lowest CPH across all classifiers (e.g., 0.0498 for KNN, 0.0519 for Gradient Boosting, 0.0476 for Neural Networks), highlighting the clear benefit of feature selection in improving model efficiency.

### Insights or Next Steps
*   **Prioritize GA and Lasso for Feature Selection**: For similar predictive modeling tasks where both accuracy and model simplicity are critical, Genetic Algorithm and Lasso (L1) Regression should be considered primary candidates for feature selection due to their demonstrated superior efficiency.
*   **Evaluate Feature Importance**: Further investigation into the specific features selected by GA and Lasso for different classifiers could provide deeper insights into the underlying data structure and potentially reveal domain-specific knowledge about the most influential variables.


#FOR AIC

# Task
Implement a function `calculate_aic_score` that computes the Akaike Information Criterion for a given model, test data, and number of features. Then, for each feature selection method (bWWPA, RFE, SelectKBest, Random Forest Importance, Lasso (L1), Genetic Algorithm, and Baseline), calculate the AIC score for KNN, Gradient Boosting, and Neural Network classifiers. Finally, analyze and rank these AIC scores, identifying which feature selection methods consistently achieve low AIC values across all classifiers, and present an integrated analysis of the results.

## Implement AIC Calculation Function

### Subtask:
Define a helper function `calculate_aic_score(model, X_test, y_test, num_features)` that calculates the Akaike Information Criterion (AIC).


**Reasoning**:
To implement the `calculate_aic_score` function, I need to first import the `log_loss` function from `sklearn.metrics` and then define the function as specified in the instructions, including calculating predicted probabilities, log-likelihood, and the final AIC score.



In [None]:
from sklearn.metrics import log_loss

def calculate_aic_score(model, X_test, y_test, num_features):
    """
    Calculates the Akaike Information Criterion (AIC).

    Args:
        model: The trained machine learning model with a predict_proba method.
        X_test: The test features.
        y_test: The true test labels.
        num_features: The number of features used by the model.

    Returns:
        float: The calculated AIC score.
    """
    # Get predicted probabilities for the test set
    y_pred_proba = model.predict_proba(X_test)

    # Calculate log-likelihood (log_loss returns negative log-likelihood)
    log_likelihood = -log_loss(y_test, y_pred_proba)

    # Compute AIC
    aic = -2 * log_likelihood + 2 * num_features

    return aic

print("AIC calculation function defined.")

AIC calculation function defined.


## Calculate AIC for all Feature Selection Methods with KNN

### Subtask:
Iterate through each feature selection method, train a KNeighborsClassifier, and calculate its AIC score using the `calculate_aic_score` function. Store these results.


**Reasoning**:
To calculate the AIC for each feature selection method with the KNN classifier, I need to iterate through the `results_knn` DataFrame, select features based on the indices, train a `KNeighborsClassifier`, and then apply the `calculate_aic_score` function as per the instructions.



In [15]:
knn_aic_results = []

for index, row in results_df_knn.iterrows():
    method_name = row['Method']
    feature_count = row['Feature Count']
    selected_indices = row['Selected Indices']

    if not selected_indices:
        knn_aic_results.append({
            'Method': method_name,
            'Feature Count': 0,
            'AIC_KNN': np.inf
        })
        continue

    X_train_selected = X_train_scaled[:, selected_indices]
    X_test_selected = X_test_scaled[:, selected_indices]

    model_knn = KNeighborsClassifier(n_neighbors=5)
    model_knn.fit(X_train_selected, y_train)

    # Calculate AIC
    aic_knn = calculate_aic_score(model_knn, X_test_selected, y_test, feature_count)

    knn_aic_results.append({
        'Method': method_name,
        'Feature Count': feature_count,
        'AIC_KNN': aic_knn
    })

knn_aic_df = pd.DataFrame(knn_aic_results)
print("AIC results for KNN models:")
display(knn_aic_df.sort_values(by='AIC_KNN'))

AIC results for KNN models:


Unnamed: 0,Method,Feature Count,AIC_KNN
0,Genetic Algorithm,4,12.897091
1,bWWPA,5,12.967907
6,Lasso (L1),4,13.10294
3,SelectKBest,7,19.036358
4,Random Forest Importance,7,19.106852
2,RFE,7,21.228644
5,Baseline (No FS),14,35.197685


## Calculate AIC for all Feature Selection Methods with Gradient Boosting

### Subtask:
Iterate through each feature selection method, train a GradientBoostingClassifier, and calculate its AIC score using the `calculate_aic_score` function. Store these results.


**Reasoning**:
To calculate the AIC for each feature selection method with the Gradient Boosting classifier, I need to iterate through the `results_df_gb` DataFrame, select features based on the indices, train a `GradientBoostingClassifier`, and then apply the `calculate_aic_score` function as per the instructions.



In [16]:
gb_aic_results = []

for index, row in results_df_gb.iterrows():
    method_name = row['Method']
    feature_count = row['Feature Count']
    selected_indices = row['Selected Indices']

    if not selected_indices or feature_count == 0:
        gb_aic_results.append({
            'Method': method_name,
            'Feature Count': 0,
            'AIC_GB': np.inf
        })
        continue

    X_train_selected = X_train_scaled[:, selected_indices]
    X_test_selected = X_test_scaled[:, selected_indices]

    model_gb = GradientBoostingClassifier(random_state=42)
    model_gb.fit(X_train_selected, y_train)

    # Calculate AIC
    aic_gb = calculate_aic_score(model_gb, X_test_selected, y_test, feature_count)

    gb_aic_results.append({
        'Method': method_name,
        'Feature Count': feature_count,
        'AIC_GB': aic_gb
    })

gb_aic_df = pd.DataFrame(gb_aic_results)
print("AIC results for Gradient Boosting models:")
display(gb_aic_df.sort_values(by='AIC_GB'))

AIC results for Gradient Boosting models:


Unnamed: 0,Method,Feature Count,AIC_GB
6,Genetic Algorithm,4,10.244435
5,Lasso (L1),4,10.326596
2,bWWPA,5,11.980279
0,RFE,7,16.137857
3,Random Forest Importance,7,16.217233
4,SelectKBest,7,16.304609
1,Baseline (No FS),14,30.131335


**Reasoning**:
To calculate the AIC for each feature selection method with the Neural Network classifier, I need to iterate through the `results_df_nn` DataFrame, select features based on the indices, train an `MLPClassifier`, and then apply the `calculate_aic_score` function as per the overall instructions.



In [17]:
nn_aic_results = []

for index, row in results_df_nn.iterrows():
    method_name = row['Method']
    feature_count = row['Feature Count']
    selected_indices = row['Selected Indices']

    if not selected_indices or feature_count == 0:
        nn_aic_results.append({
            'Method': method_name,
            'Feature Count': 0,
            'AIC_NN': np.inf
        })
        continue

    X_train_selected = X_train_scaled[:, selected_indices]
    X_test_selected = X_test_scaled[:, selected_indices]

    model_nn = MLPClassifier(random_state=42, max_iter=2000)
    model_nn.fit(X_train_selected, y_train)

    # Calculate AIC
    aic_nn = calculate_aic_score(model_nn, X_test_selected, y_test, feature_count)

    nn_aic_results.append({
        'Method': method_name,
        'Feature Count': feature_count,
        'AIC_NN': aic_nn
    })

nn_aic_df = pd.DataFrame(nn_aic_results)
print("AIC results for Neural Network models:")
display(nn_aic_df.sort_values(by='AIC_NN'))

AIC results for Neural Network models:


Unnamed: 0,Method,Feature Count,AIC_NN
3,Genetic Algorithm,4,9.726879
4,Lasso (L1),4,9.891804
0,bWWPA,5,11.311575
2,SelectKBest,7,16.266069
1,RFE,7,16.677993
5,Random Forest Importance,7,17.028607
6,Baseline (No FS),14,30.216232


## Integrated AIC Analysis Across Classifiers

To consolidate the AIC analysis, let's examine the AIC rankings for each classifier:

### KNN AIC Ranking (Lower is Better):
```
                     Method  Feature Count    AIC_KNN
0         Genetic Algorithm              4  12.897091
1                     bWWPA              5  12.967907
6                Lasso (L1)              4  13.102940
3               SelectKBest              7  19.036358
4  Random Forest Importance              7  19.106852
2                       RFE              7  21.228644
5          Baseline (No FS)             14  35.197685
```

### Gradient Boosting AIC Ranking (Lower is Better):
```
                     Method  Feature Count     AIC_GB
6         Genetic Algorithm              4  10.244435
5                Lasso (L1)              4  10.326596
2                     bWWPA              5  11.980279
0                       RFE              7  16.137857
3  Random Forest Importance              7  16.217233
4               SelectKBest              7  16.304609
1          Baseline (No FS)             14  30.131335
```

### Neural Network AIC Ranking (Lower is Better):
```
                     Method  Feature Count     AIC_NN
3         Genetic Algorithm              4   9.726879
4                Lasso (L1)              4   9.891804
0                     bWWPA              5  11.311575
2               SelectKBest              7  16.266069
1                       RFE              7  16.677993
5  Random Forest Importance              7  17.028607
6          Baseline (No FS)             14  30.216232
```

### Analysis and Identification of Best Methods:

Based on the AIC scores across the three classifiers (KNN, Gradient Boosting, and Neural Network), the following observations can be made:

1.  **Genetic Algorithm (GA)**: Consistently performs exceptionally well. It achieved the lowest AIC for KNN, Gradient Boosting, and Neural Network models, often with a very small number of features (4 features). This indicates that GA is highly effective at selecting features that lead to simpler models (fewer features) while maintaining good fit to the data, thus minimizing information loss according to AIC criteria.

2.  **Lasso (L1) Regression**: Also demonstrates strong performance across all classifiers, frequently securing the second-lowest AIC. Similar to GA, Lasso tends to select a compact set of features (4 features), suggesting its effectiveness in identifying highly relevant predictors and reducing model complexity.

3.  **bWWPA**: Ranks third in terms of AIC for all three classifiers. While not as low as GA or Lasso, its AIC scores are significantly better than the other traditional feature selection methods and the baseline, consistently using a moderate number of features (5 features). This shows that bWWPA also provides a good balance between model fit and complexity.

4.  **RFE, SelectKBest, and Random Forest Importance**: These methods generally yield higher AIC scores compared to GA, Lasso, and bWWPA. This is often due to them selecting a larger number of features (typically 7 features), which increases model complexity without necessarily providing a proportionally better fit to the data, thus resulting in higher AIC values.

5.  **Baseline (No FS)**: As expected, using all 14 features consistently results in the highest AIC scores across all classifiers. This strongly reinforces the value of feature selection in improving model parsimony and reducing the risk of overfitting, as indicated by the AIC.

**Conclusion:**

**Genetic Algorithm** and **Lasso (L1) Regression** are the feature selection methods that consistently achieve the lowest AIC values across all evaluated classifiers (KNN, Gradient Boosting, and Neural Network). This indicates that these methods are most effective at identifying optimal feature subsets that lead to models with the best balance between fit and complexity. bWWPA also shows promising results, coming in third consistently. These findings suggest that for tasks where model parsimony and interpretability are crucial, GA and Lasso should be prioritized.

## Final Task

### Subtask:
Present the integrated analysis of AIC results, highlighting the most parsimonious and well-fitting feature selection methods for each classifier, and discuss overall trends and implications.


## Summary:

### Q&A
1.  **Which feature selection methods consistently achieve low AIC values across all classifiers?**
    The **Genetic Algorithm** and **Lasso (L1) Regression** consistently achieved the lowest Akaike Information Criterion (AIC) values across all evaluated classifiers (KNN, Gradient Boosting, and Neural Network). The **bWWPA** method consistently ranked third.

### Data Analysis Key Findings
*   The `calculate_aic_score` function was successfully implemented to compute the Akaike Information Criterion, defined as `AIC = -2 * log_likelihood + 2 * num_features`.
*   **For KNN models:** The Genetic Algorithm yielded the lowest AIC at 12.897 (4 features), followed by bWWPA at 12.968 (5 features), and Lasso (L1) at 13.103 (4 features). The Baseline (no feature selection) resulted in the highest AIC of 35.198 (14 features).
*   **For Gradient Boosting models:** The Genetic Algorithm again showed the lowest AIC at 10.244 (4 features), with Lasso (L1) closely behind at 10.327 (4 features), and bWWPA at 11.980 (5 features). The Baseline AIC was 30.131 (14 features).
*   **For Neural Network models:** The Genetic Algorithm had the lowest AIC at 9.727 (4 features), followed by Lasso (L1) at 9.892 (4 features), and bWWPA at 11.312 (5 features). The Baseline AIC was 30.216 (14 features).
*   The Genetic Algorithm and Lasso (L1) consistently selected a small number of features (typically 4) while maintaining superior model fit according to the AIC.
*   Methods like RFE, SelectKBest, and Random Forest Importance generally resulted in higher AIC scores, often due to selecting more features (typically 7).
*   The Baseline method (using all 14 features without selection) consistently produced the highest AIC scores across all classifiers, strongly indicating the value of feature selection for model parsimony and reducing the risk of overfitting.

### Insights or Next Steps
*   For tasks prioritizing model parsimony and interpretability, the Genetic Algorithm and Lasso (L1) Regression should be considered primary feature selection methods due to their consistent ability to achieve optimal balance between model fit and complexity across various classifiers.
*   Further investigation could explore the specific features selected by Genetic Algorithm and Lasso (L1) to understand why these subsets lead to better model performance with fewer features, potentially revealing underlying data characteristics or domain-specific insights.


#FOR C-INDEX

# Task
Implement a function `calculate_extended_metrics` that computes Area Under the Curve (AUC), Precision, Recall, and F1-score for a given model, test data, and true labels. This function should return these metrics as a dictionary.

## Implement Additional Metrics Function

### Subtask:
Define a helper function that calculates common classification metrics like AUC, Precision, Recall, and F1-score for a given model, test data, and true labels. This function will return these metrics as a dictionary.


**Reasoning**:
To calculate the extended classification metrics as specified, I need to import the necessary functions from `sklearn.metrics` and then define the `calculate_extended_metrics` function to compute and return these metrics.



In [18]:
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score

def calculate_extended_metrics(model, X_test, y_test):
    """
    Calculates common classification metrics (AUC, Precision, Recall, F1-score).

    Args:
        model: The trained machine learning model.
        X_test: The test features.
        y_test: The true test labels.

    Returns:
        dict: A dictionary containing 'AUC', 'Precision', 'Recall', and 'F1-score'.
    """
    # Get predicted probabilities for AUC calculation (assuming binary classification)
    # Check if the model has predict_proba and if it's a multi-class problem
    if hasattr(model, 'predict_proba') and len(np.unique(y_test)) == 2:
        y_pred_proba = model.predict_proba(X_test)[:, 1]
    else:
        # For models without predict_proba or non-binary classification, AUC might not be directly applicable
        # Or, if predict_proba is not available, we can't calculate ROC AUC.
        # For simplicity in this function, we'll return None for AUC if predict_proba is missing.
        # A more robust solution might involve using decision_function or raising an error.
        y_pred_proba = None

    # Get predicted class labels for Precision, Recall, F1-score
    y_pred = model.predict(X_test)

    metrics = {}

    if y_pred_proba is not None:
        try:
            metrics['AUC'] = roc_auc_score(y_test, y_pred_proba)
        except ValueError:
            metrics['AUC'] = None # AUC not defined for single class
    else:
        metrics['AUC'] = None

    metrics['Precision'] = precision_score(y_test, y_pred, zero_division=0)
    metrics['Recall'] = recall_score(y_test, y_pred, zero_division=0)
    metrics['F1-score'] = f1_score(y_test, y_pred, zero_division=0)

    return metrics

print("Extended metrics calculation function defined.")


Extended metrics calculation function defined.


## Calculate Extended Metrics for KNN

### Subtask:
Iterate through each feature selection method, train a KNeighborsClassifier with the selected features, and then use the new metrics function to calculate AUC, Precision, Recall, and F1-score. Store these results alongside the existing KNN accuracy.


**Reasoning**:
To calculate extended metrics for each feature selection method using KNN, I need to iterate through the previous KNN results, select features, train a KNeighborsClassifier, and then apply the `calculate_extended_metrics` function to get AUC, Precision, Recall, and F1-score. These results will be stored in a new DataFrame.



In [19]:
knn_extended_metrics_results = []

for index, row in results_df_knn.iterrows():
    method_name = row['Method']
    feature_count = row['Feature Count']
    selected_indices = row['Selected Indices']

    if not selected_indices:
        knn_extended_metrics_results.append({
            'Method': method_name,
            'Feature Count': 0,
            'AUC': np.nan,
            'Precision': np.nan,
            'Recall': np.nan,
            'F1-score': np.nan
        })
        continue

    X_train_selected = X_train_scaled[:, selected_indices]
    X_test_selected = X_test_scaled[:, selected_indices]

    model_knn = KNeighborsClassifier(n_neighbors=5)
    model_knn.fit(X_train_selected, y_train)

    # Calculate extended metrics
    metrics = calculate_extended_metrics(model_knn, X_test_selected, y_test)

    knn_extended_metrics_results.append({
        'Method': method_name,
        'Feature Count': feature_count,
        'AUC': metrics['AUC'],
        'Precision': metrics['Precision'],
        'Recall': metrics['Recall'],
        'F1-score': metrics['F1-score']
    })

knn_extended_metrics_df = pd.DataFrame(knn_extended_metrics_results)
print("Extended metrics for KNN models:")
display(knn_extended_metrics_df.sort_values(by='AUC', ascending=False))

Extended metrics for KNN models:


Unnamed: 0,Method,Feature Count,AUC,Precision,Recall,F1-score
0,Genetic Algorithm,4,0.768519,0.884615,0.958333,0.92
1,bWWPA,5,0.75,0.793103,0.958333,0.867925
3,SelectKBest,7,0.74537,0.733333,0.916667,0.814815
4,Random Forest Importance,7,0.743056,0.71875,0.958333,0.821429
6,Lasso (L1),4,0.740741,0.709677,0.916667,0.8
5,Baseline (No FS),14,0.673611,0.71875,0.958333,0.821429
2,RFE,7,0.662037,0.733333,0.916667,0.814815


## Calculate Extended Metrics for Gradient Boosting

### Subtask:
Iterate through each feature selection method, train a GradientBoostingClassifier with the selected features, and then use the `calculate_extended_metrics` function to calculate AUC, Precision, Recall, and F1-score. Store these results.

**Reasoning**:
To calculate the extended metrics for Gradient Boosting models, I need to iterate through the results from the Gradient Boosting feature selection, train a GradientBoostingClassifier for each selected feature set, and then compute the AUC, Precision, Recall, and F1-score using the previously defined `calculate_extended_metrics` function. These results will be stored in a new DataFrame.



In [20]:
gb_extended_metrics_results = []

for index, row in results_df_gb.iterrows():
    method_name = row['Method']
    feature_count = row['Feature Count']
    selected_indices = row['Selected Indices']

    if not selected_indices or feature_count == 0:
        gb_extended_metrics_results.append({
            'Method': method_name,
            'Feature Count': 0,
            'AUC': np.nan,
            'Precision': np.nan,
            'Recall': np.nan,
            'F1-score': np.nan
        })
        continue

    X_train_selected = X_train_scaled[:, selected_indices]
    X_test_selected = X_test_scaled[:, selected_indices]

    model_gb = GradientBoostingClassifier(random_state=42)
    model_gb.fit(X_train_selected, y_train)

    # Calculate extended metrics
    metrics = calculate_extended_metrics(model_gb, X_test_selected, y_test)

    gb_extended_metrics_results.append({
        'Method': method_name,
        'Feature Count': feature_count,
        'AUC': metrics['AUC'],
        'Precision': metrics['Precision'],
        'Recall': metrics['Recall'],
        'F1-score': metrics['F1-score']
    })

gb_extended_metrics_df = pd.DataFrame(gb_extended_metrics_results)
print("Extended metrics for Gradient Boosting models:")
display(gb_extended_metrics_df.sort_values(by='AUC', ascending=False))

Extended metrics for Gradient Boosting models:


Unnamed: 0,Method,Feature Count,AUC,Precision,Recall,F1-score
0,RFE,7,0.712963,0.758621,0.916667,0.830189
1,Baseline (No FS),14,0.694444,0.758621,0.916667,0.830189
2,bWWPA,5,0.645833,0.733333,0.916667,0.814815
4,SelectKBest,7,0.643519,0.740741,0.833333,0.784314
3,Random Forest Importance,7,0.643519,0.733333,0.916667,0.814815
6,Genetic Algorithm,4,0.601852,0.761905,0.666667,0.711111
5,Lasso (L1),4,0.592593,0.714286,0.833333,0.769231


## Calculate Extended Metrics for Neural Network

### Subtask:
Iterate through each feature selection method, train an MLPClassifier with the selected features, and then use the `calculate_extended_metrics` function to calculate AUC, Precision, Recall, and F1-score. Store these results.

**Reasoning**:
To calculate the extended metrics for Neural Network models, I need to iterate through the results from the Neural Network feature selection, train an MLPClassifier for each selected feature set, and then compute the AUC, Precision, Recall, and F1-score using the previously defined `calculate_extended_metrics` function. These results will be stored in a new DataFrame.



In [21]:
nn_extended_metrics_results = []

for index, row in results_df_nn.iterrows():
    method_name = row['Method']
    feature_count = row['Feature Count']
    selected_indices = row['Selected Indices']

    if not selected_indices or feature_count == 0:
        nn_extended_metrics_results.append({
            'Method': method_name,
            'Feature Count': 0,
            'AUC': np.nan,
            'Precision': np.nan,
            'Recall': np.nan,
            'F1-score': np.nan
        })
        continue

    X_train_selected = X_train_scaled[:, selected_indices]
    X_test_selected = X_test_scaled[:, selected_indices]

    model_nn = MLPClassifier(random_state=42, max_iter=2000)
    model_nn.fit(X_train_selected, y_train)

    # Calculate extended metrics
    metrics = calculate_extended_metrics(model_nn, X_test_selected, y_test)

    nn_extended_metrics_results.append({
        'Method': method_name,
        'Feature Count': feature_count,
        'AUC': metrics['AUC'],
        'Precision': metrics['Precision'],
        'Recall': metrics['Recall'],
        'F1-score': metrics['F1-score']
    })

nn_extended_metrics_df = pd.DataFrame(nn_extended_metrics_results)
print("Extended metrics for Neural Network models:")
display(nn_extended_metrics_df.sort_values(by='AUC', ascending=False))

Extended metrics for Neural Network models:


Unnamed: 0,Method,Feature Count,AUC,Precision,Recall,F1-score
0,bWWPA,5,0.819444,0.851852,0.958333,0.901961
2,SelectKBest,7,0.740741,0.833333,0.833333,0.833333
1,RFE,7,0.699074,0.785714,0.916667,0.846154
4,Lasso (L1),4,0.699074,0.75,0.875,0.807692
3,Genetic Algorithm,4,0.680556,0.777778,0.875,0.823529
6,Baseline (No FS),14,0.666667,0.724138,0.875,0.792453
5,Random Forest Importance,7,0.634259,0.769231,0.833333,0.8


## Integrated Analysis of Extended Metrics Across Classifiers

To provide a comprehensive view of the performance across all feature selection methods and classifiers, we will now analyze the extended metrics (AUC, Precision, Recall, F1-score) for KNN, Gradient Boosting, and Neural Networks.

### KNN Extended Metrics Ranking (Sorted by AUC):
```
                     Method  Feature Count       AUC  Precision    Recall  \
0         Genetic Algorithm              4  0.768519   0.884615  0.958333   
1                     bWWPA              5  0.750000   0.793103  0.958333   
3               SelectKBest              7  0.745370   0.733333  0.916667   
4  Random Forest Importance              7  0.743056   0.718750  0.958333   
6                Lasso (L1)              4  0.740741   0.709677  0.916667   
5          Baseline (No FS)             14  0.673611   0.718750  0.958333   
2                       RFE              7  0.662037   0.733333  0.916667   

   F1-score  
0  0.920000  
1  0.867925  
3  0.814815  
4  0.821429  
6  0.800000  
5  0.821429  
2  0.814815  
```

### Gradient Boosting Extended Metrics Ranking (Sorted by AUC):
```
                     Method  Feature Count       AUC  Precision    Recall  \
0                       RFE              7  0.712963   0.758621  0.916667   
1          Baseline (No FS)             14  0.694444   0.758621  0.916667   
2                     bWWPA              5  0.645833   0.733333  0.916667   
4               SelectKBest              7  0.643519   0.740741  0.833333   
3  Random Forest Importance              7  0.643519   0.733333  0.916667   
6         Genetic Algorithm              4  0.601852   0.761905  0.666667   
5                Lasso (L1)              4  0.592593   0.714286  0.833333   

   F1-score  
0  0.830189  
1  0.830189  
2  0.814815  
4  0.784314  
3  0.814815  
6  0.711111  
5  0.769231  
```

### Neural Network Extended Metrics Ranking (Sorted by AUC):
```
                     Method  Feature Count       AUC  Precision    Recall  \
0                     bWWPA              5  0.819444   0.851852  0.958333   
2               SelectKBest              7  0.740741   0.833333  0.833333   
1                       RFE              7  0.699074   0.785714  0.916667   
4                Lasso (L1)              4  0.699074   0.750000  0.875000   
3         Genetic Algorithm              4  0.680556   0.777778  0.875000   
6          Baseline (No FS)             14  0.666667   0.724138  0.875000   
5  Random Forest Importance              7  0.634259   0.769231  0.833333   

   F1-score  
0  0.901961  
2  0.833333  
1  0.846154  
4  0.807692  
3  0.823529  
6  0.792453  
5  0.800000  
```

### Discussion of Overall Trends and Implications:

**1. Top Performers by AUC:**
*   **KNN:** Genetic Algorithm achieved the highest AUC (0.7685) with only 4 features, demonstrating its efficiency and effectiveness for this classifier. bWWPA and SelectKBest also performed well in terms of AUC.
*   **Gradient Boosting:** RFE led with the highest AUC (0.7130), closely followed by the Baseline (No FS). This suggests that for Gradient Boosting, a larger set of features (or even all features) was beneficial for AUC, or that RFE was particularly good at identifying relevant features for this model, even if it used 7 features.
*   **Neural Network:** bWWPA stood out with the highest AUC (0.8194), selecting 5 features. SelectKBest also showed strong performance with an AUC of 0.7407.

**2. Feature Reduction vs. Performance:**
*   Methods that significantly reduce features (like Genetic Algorithm and Lasso) often maintain competitive performance across various metrics, especially for KNN and NN. This highlights their value in creating more interpretable and computationally efficient models without a significant drop in predictive power.
*   For Gradient Boosting, surprisingly, the Baseline (no feature selection) or methods selecting more features (like RFE) sometimes achieved higher AUC, indicating that this model might leverage a broader range of features more effectively.

**3. Consistency Across Metrics:**
*   While AUC is a good overall measure, it's important to consider Precision, Recall, and F1-score depending on the specific problem's needs (e.g., minimizing false positives vs. false negatives).
*   Generally, higher AUC scores correlate with higher F1-scores, but there are nuances. For instance, in KNN, Genetic Algorithm has the highest AUC and F1-score, while in Gradient Boosting, RFE has high AUC and F1-score, and in Neural Network, bWWPA leads in both.

**4. Implications:**
*   The choice of feature selection method is highly dependent on the downstream classifier. A method that works best for one classifier (e.g., Genetic Algorithm for KNN) might not be optimal for another (e.g., Gradient Boosting).
*   For models where complexity is a concern (e.g., KNN and Neural Networks, which can be sensitive to irrelevant features), methods like Genetic Algorithm, Lasso, and bWWPA that focus on parsimonious feature subsets appear to be more beneficial.
*   It's crucial to evaluate feature selection comprehensively using multiple metrics and across different models to make an informed decision for a specific application.

## Consolidate and Present All Metrics

### Subtask:
Combine the results for all classifiers and feature selection methods into comprehensive DataFrames. Present these tables, highlighting the top-performing methods based on each metric (Accuracy, CPH, AIC, AUC, Precision, Recall, F1-score) across all classifiers. Discuss overall trends and implications regarding feature selection efficiency and model performance across different metrics.


**Reasoning**:
To combine the results for KNN, I will merge `results_df_knn`, `knn_aic_df`, and `knn_extended_metrics_df` on 'Method' and 'Feature Count'. Then, I will sort the combined DataFrame by 'AUC' in descending order and display it.



In [25]:
combined_knn_metrics_df = results_df_knn.merge(knn_aic_df, on=['Method', 'Feature Count'], how='left')
combined_knn_metrics_df = combined_knn_metrics_df.merge(knn_extended_metrics_df, on=['Method', 'Feature Count'], how='left')

# Sort by AUC in descending order, putting NaNs at the end
combined_knn_metrics_df = combined_knn_metrics_df.sort_values(by='AUC', ascending=False, na_position='last')

print("Combined KNN Metrics:")
display(combined_knn_metrics_df)


Combined KNN Metrics:


Unnamed: 0,Method,Accuracy_KNN,Feature Count,Selected Indices,AIC_KNN,AUC,Precision,Recall,F1-score
0,Genetic Algorithm,0.878788,4,"[1, 4, 7, 9]",12.897091,0.768519,0.884615,0.958333,0.92
1,bWWPA,0.787879,5,"[0, 2, 3, 4, 9]",12.967907,0.75,0.793103,0.958333,0.867925
3,SelectKBest,0.69697,7,"[0, 2, 3, 7, 8, 9, 13]",19.036358,0.74537,0.733333,0.916667,0.814815
4,Random Forest Importance,0.69697,7,"[0, 1, 2, 3, 9, 10, 13]",19.106852,0.743056,0.71875,0.958333,0.821429
6,Lasso (L1),0.666667,4,"[0, 2, 9, 13]",13.10294,0.740741,0.709677,0.916667,0.8
5,Baseline (No FS),0.69697,14,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]",35.197685,0.673611,0.71875,0.958333,0.821429
2,RFE,0.69697,7,"[0, 5, 6, 9, 11, 12, 13]",21.228644,0.662037,0.733333,0.916667,0.814815


**Reasoning**:
To combine the results for Gradient Boosting, I will merge `results_df_gb`, `gb_aic_df`, and `gb_extended_metrics_df` on 'Method' and 'Feature Count'. Then, I will sort the combined DataFrame by 'AUC' in descending order and display it.



In [26]:
combined_gb_metrics_df = results_df_gb.merge(gb_aic_df, on=['Method', 'Feature Count'], how='left')
combined_gb_metrics_df = combined_gb_metrics_df.merge(gb_extended_metrics_df, on=['Method', 'Feature Count'], how='left')

# Sort by AUC in descending order, putting NaNs at the end
combined_gb_metrics_df = combined_gb_metrics_df.sort_values(by='AUC', ascending=False, na_position='last')

print("Combined Gradient Boosting Metrics:")
display(combined_gb_metrics_df)

Combined Gradient Boosting Metrics:


Unnamed: 0,Method,Accuracy_GB,Feature Count,Selected Indices,AIC_GB,AUC,Precision,Recall,F1-score
0,RFE,0.727273,7,"[0, 5, 6, 9, 11, 12, 13]",16.137857,0.712963,0.758621,0.916667,0.830189
1,Baseline (No FS),0.727273,14,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]",30.131335,0.694444,0.758621,0.916667,0.830189
2,bWWPA,0.69697,5,"[0, 2, 3, 4, 9]",11.980279,0.645833,0.733333,0.916667,0.814815
4,SelectKBest,0.666667,7,"[0, 2, 3, 7, 8, 9, 13]",16.304609,0.643519,0.740741,0.833333,0.784314
3,Random Forest Importance,0.69697,7,"[0, 1, 2, 3, 9, 10, 13]",16.217233,0.643519,0.733333,0.916667,0.814815
6,Genetic Algorithm,0.606061,4,"[1, 4, 7, 9]",10.244435,0.601852,0.761905,0.666667,0.711111
5,Lasso (L1),0.636364,4,"[0, 2, 9, 13]",10.326596,0.592593,0.714286,0.833333,0.769231


**Reasoning**:
To combine the results for Neural Network, I will merge `results_df_nn`, `nn_aic_df`, and `nn_extended_metrics_df` on 'Method' and 'Feature Count'. Then, I will sort the combined DataFrame by 'AUC' in descending order and display it.



In [27]:
combined_nn_metrics_df = results_df_nn.merge(nn_aic_df, on=['Method', 'Feature Count'], how='left')
combined_nn_metrics_df = combined_nn_metrics_df.merge(nn_extended_metrics_df, on=['Method', 'Feature Count'], how='left')

# Sort by AUC in descending order, putting NaNs at the end
combined_nn_metrics_df = combined_nn_metrics_df.sort_values(by='AUC', ascending=False, na_position='last')

print("Combined Neural Network Metrics:")
display(combined_nn_metrics_df)

Combined Neural Network Metrics:


Unnamed: 0,Method,Accuracy_NN,Feature Count,Selected Indices,AIC_NN,AUC,Precision,Recall,F1-score
0,bWWPA,0.848485,5,"[0, 2, 3, 4, 9]",11.311575,0.819444,0.851852,0.958333,0.901961
2,SelectKBest,0.757576,7,"[0, 2, 3, 7, 8, 9, 13]",16.266069,0.740741,0.833333,0.833333,0.833333
1,RFE,0.757576,7,"[0, 5, 6, 9, 11, 12, 13]",16.677993,0.699074,0.785714,0.916667,0.846154
4,Lasso (L1),0.69697,4,"[0, 2, 9, 13]",9.891804,0.699074,0.75,0.875,0.807692
3,Genetic Algorithm,0.727273,4,"[1, 4, 7, 9]",9.726879,0.680556,0.777778,0.875,0.823529
6,Baseline (No FS),0.666667,14,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]",30.216232,0.666667,0.724138,0.875,0.792453
5,Random Forest Importance,0.69697,7,"[0, 1, 2, 3, 9, 10, 13]",17.028607,0.634259,0.769231,0.833333,0.8


## Consolidated Analysis of All Metrics Across Classifiers

This section consolidates the performance metrics (Accuracy, CPH, AIC, AUC, Precision, Recall, F1-score) for all feature selection methods across KNN, Gradient Boosting, and Neural Network classifiers.

### KNN Combined Metrics (Sorted by AUC):
```
                     Method  Accuracy_KNN  Feature Count  \
0         Genetic Algorithm      0.878788              4   
1                     bWWPA      0.787879              5   
3               SelectKBest      0.696970              7   
4  Random Forest Importance      0.696970              7   
6                Lasso (L1)      0.666667              4   
5          Baseline (No FS)      0.696970             14   
2                       RFE      0.696970              7   

                                 Selected Indices    AIC_KNN       AUC  \
0                                    [1, 4, 7, 9]  12.897091  0.768519   
1                                 [0, 2, 3, 4, 9]  12.967907  0.750000   
3                          [0, 2, 3, 7, 8, 9, 13]  19.036358  0.745370   
4                         [0, 1, 2, 3, 9, 10, 13]  19.106852  0.743056   
6                                   [0, 2, 9, 13]  13.102940  0.740741   
5  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]  35.197685  0.673611   
2                        [0, 5, 6, 9, 11, 12, 13]  21.228644  0.662037   

   Precision    Recall  F1-score  
0   0.884615  0.958333  0.920000  
1   0.793103  0.958333  0.867925  
3   0.733333  0.916667  0.814815  
4   0.718750  0.958333  0.821429  
6   0.709677  0.916667  0.800000  
5   0.718750  0.958333  0.821429  
2   0.733333  0.916667  0.814815  
```

### Gradient Boosting Combined Metrics (Sorted by AUC):
```
                     Method  Accuracy_GB  Feature Count  \
0                       RFE     0.727273              7   
1          Baseline (No FS)     0.727273             14   
2                     bWWPA     0.696970              5   
4               SelectKBest     0.666667              7   
3  Random Forest Importance     0.696970              7   
6         Genetic Algorithm     0.606061              4   
5                Lasso (L1)     0.636364              4   

                                 Selected Indices     AIC_GB       AUC  \
0                        [0, 5, 6, 9, 11, 12, 13]  16.137857  0.712963   
1  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]  30.131335  0.694444   
2                                 [0, 2, 3, 4, 9]  11.980279  0.645833   
4                          [0, 2, 3, 7, 8, 9, 13]  16.304609  0.643519   
3                         [0, 1, 2, 3, 9, 10, 13]  16.217233  0.643519   
6                                    [1, 4, 7, 9]  10.244435  0.601852   
5                                   [0, 2, 9, 13]  10.326596  0.592593   

   Precision    Recall  F1-score  
0   0.758621  0.916667  0.830189  
1   0.758621  0.916667  0.830189  
2   0.733333  0.916667  0.814815  
4   0.740741  0.833333  0.784314  
3   0.733333  0.916667  0.814815  
6   0.761905  0.666667  0.711111  
5   0.714286  0.833333  0.769231  
```

### Neural Network Combined Metrics (Sorted by AUC):
```
                     Method  Accuracy_NN  Feature Count  \
0                     bWWPA     0.848485              5   
2               SelectKBest     0.757576              7   
1                       RFE     0.757576              7   
4                Lasso (L1)     0.696970              4   
3         Genetic Algorithm     0.727273              4   
6          Baseline (No FS)     0.666667             14   
5  Random Forest Importance     0.696970              7   

                                 Selected Indices     AIC_NN       AUC  \
0                                 [0, 2, 3, 4, 9]  11.311575  0.819444   
2                          [0, 2, 3, 7, 8, 9, 13]  16.266069  0.740741   
1                        [0, 5, 6, 9, 11, 12, 13]  16.677993  0.699074   
4                                   [0, 2, 9, 13]   9.891804  0.699074   
3                                    [1, 4, 7, 9]   9.726879  0.680556   
6  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]  30.216232  0.666667   
5                         [0, 1, 2, 3, 9, 10, 13]  17.028607  0.634259   

   Precision    Recall  F1-score  
0   0.851852  0.958333  0.901961  
2   0.833333  0.833333  0.833333  
1   0.785714  0.916667  0.846154  
4   0.750000  0.875000  0.807692  
3   0.777778  0.875000  0.823529  
6   0.724138  0.875000  0.792453  
5   0.769231  0.833333  0.800000  
```

### Overall Trends and Implications:

**1. Consistent High Performers in Efficiency (CPH and AIC):**
*   **Genetic Algorithm (GA)** and **Lasso (L1) Regression** consistently demonstrate superior efficiency across all classifiers, marked by high CPH (Coefficient of Performance) and low AIC (Akaike Information Criterion) scores. These methods are adept at identifying a minimal set of highly impactful features (often just 4) that lead to parsimonious models without sacrificing predictive accuracy.
*   **bWWPA** also shows strong efficiency, ranking third in CPH and AIC for most classifiers, typically selecting 5 features. This indicates it provides a good balance between feature reduction and model performance.

**2. Performance Across Classifiers and Metrics (Accuracy, AUC, Precision, Recall, F1-score):**
*   **KNN:** The **Genetic Algorithm** stands out, achieving the highest accuracy (0.8788) and AUC (0.7685) with only 4 features. This confirms its excellent ability to find highly relevant feature subsets for KNN models. bWWPA also performs very well for KNN in terms of Accuracy and AUC.
*   **Gradient Boosting:** For Gradient Boosting, **RFE** and the **Baseline (No FS)** method often achieve the highest AUC and accuracy. This suggests that Gradient Boosting models might be robust enough to handle a larger number of features or that certain features, which are not highly ranked by methods like GA or Lasso, are still beneficial for this specific algorithm. Interestingly, methods with fewer features (like GA and Lasso) show lower AUC for GB, implying a trade-off where a more complex feature set benefits this model.
*   **Neural Network:** **bWWPA** emerges as the top performer for Neural Networks, yielding the highest accuracy (0.8485) and AUC (0.8194) with 5 features. Genetic Algorithm and Lasso also perform very well in terms of AIC and maintain competitive AUC, Precision, Recall, and F1-score with a reduced feature set.

**3. Trade-offs between Feature Reduction and Model Performance:**
*   There's a clear trade-off between aggressive feature reduction and maximum predictive performance, especially visible with Gradient Boosting. While GA and Lasso excel at finding highly efficient (low feature count) subsets, these might not always deliver the absolute highest AUC or accuracy for every classifier (e.g., Gradient Boosting). Conversely, models that select more features (like RFE or Baseline for GB) can sometimes achieve higher raw performance metrics. This implies that the 'best' feature selection method is highly dependent on both the chosen classifier and the specific performance metric being optimized.

**4. Value of Feature Selection:**
*   The **Baseline (No FS)** method consistently results in the highest AIC scores and generally moderate to low performance in other metrics, especially when considering the feature count. This strongly underlines the importance of feature selection in improving model parsimony, reducing computational load, enhancing interpretability, and mitigating overfitting risks across all classifier types.

**In Conclusion:**
For general efficiency (high CPH, low AIC) and often strong predictive performance with reduced features, **Genetic Algorithm** and **Lasso (L1) Regression** are highly recommended feature selection methods. However, the optimal choice ultimately depends on the specific classifier being used and the priorities (e.g., maximum raw performance vs. model simplicity/interpretability). A comprehensive evaluation across multiple metrics, as demonstrated here, is crucial for making informed decisions in real-world applications.

## Final Task

### Subtask:
Summarize the findings from the comprehensive metric analysis, identifying the most robust feature selection methods considering all evaluated metrics and classifiers. Discuss the trade-offs between different methods and their suitability for various modeling goals.


## Summary:

### Data Analysis Key Findings

*   The `calculate_extended_metrics` function was successfully implemented, providing a robust way to compute Area Under the Curve (AUC), Precision, Recall, and F1-score, and handling cases like the absence of `predict_proba` or single-class predictions.
*   **For KNN models**, the **Genetic Algorithm** emerged as the top performer, achieving the highest AUC (0.7685) and Accuracy (0.8788) with a highly reduced feature set of only 4 features. The **bWWPA** method also performed well, yielding an AUC of 0.7500 and Accuracy of 0.7879 with 5 features.
*   **For Gradient Boosting models**, **RFE** achieved the highest AUC (0.7130) with 7 features. Interestingly, the **Baseline (No Feature Selection)** method, utilizing all 14 features, performed very closely (AUC: 0.6944), suggesting that Gradient Boosting models can leverage a broader range of features or that RFE was highly effective in selecting features for this model.
*   **For Neural Network models**, **bWWPA** was the top-performing feature selection method, achieving the highest AUC (0.8194) and Accuracy (0.8485) with 5 features.
*   **Efficiency-wise (CPH and AIC)**, the **Genetic Algorithm** and **Lasso (L1) Regression** consistently demonstrated superior efficiency across all classifiers, identifying a minimal set of highly impactful features (typically 4) that led to parsimonious models without sacrificing significant predictive accuracy.
*   The **Baseline (No Feature Selection)** method consistently exhibited the highest AIC scores and generally offered moderate to lower performance compared to methods employing feature selection, highlighting the significant value of feature selection in improving model parsimony, interpretability, and mitigating overfitting risks.

### Insights or Next Steps

*   The optimal feature selection method is highly dependent on the specific machine learning classifier being used; a method that performs best for one classifier may not be ideal for another.
*   Future work should consider a weighted evaluation of metrics, tailoring the choice of feature selection and model to specific business goals, such as prioritizing Recall to minimize false negatives or Precision to minimize false positives.
