<a href="https://colab.research.google.com/github/ohansfav/PROJECT-CODE/blob/main/Untitled4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import RFE, SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression, LassoCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from google.colab import drive, files
import io
import random


def calculate_fitness(solution, X_train, X_test, y_train, y_test):
    selected_indices = np.where(solution == 1)
    if len(selected_indices[0]) == 0:
        return 0, 0
    X_train_selected = X_train[:, selected_indices[0]]
    X_test_selected = X_test[:, selected_indices[0]]
    model = KNeighborsClassifier(n_neighbors=5)
    model.fit(X_train_selected, y_train)
    accuracy = accuracy_score(y_test, model.predict(X_test_selected))
    num_features = len(selected_indices[0])
    total_features = X_train.shape[1]
    alpha = 0.9
    fitness_score = alpha * accuracy + (1 - alpha) * (1 - (num_features / total_features))
    return fitness_score, accuracy

def bwwpa_feature_selection(X_train, X_test, y_train, y_test, max_iter=30):
    n_plants = 14
    n_features = X_train.shape[1]
    positions = np.random.randint(0, 2, size=(n_plants, n_features))
    best_fitness, best_accuracy = -1.0, -1.0
    best_position = np.zeros(n_features)
    K = 1.0
    for i in range(n_plants):
        current_fitness, current_accuracy_temp = calculate_fitness(positions[i], X_train, X_test, y_train, y_test)
        if current_fitness > best_fitness:
            best_fitness = current_fitness
            best_position = positions[i].copy()
            best_accuracy = current_accuracy_temp
    for t in range(1, max_iter + 1):
        for i in range(n_plants):
            r, r1, r2, r3 = np.random.rand(4)
            current_position = positions[i].copy()
            if r < 0.5:
                W_vec = r1 * (current_position + 2 * K)
                new_position = current_position + W_vec * (2 * K + r2)
            else:
                W_vec = r3 * (K * best_position + r3 * current_position)
                new_position = current_position + K * W_vec

            sigmoid_val = 1 / (1 + np.exp(-new_position))
            positions[i] = (sigmoid_val > np.random.rand(n_features)).astype(int)
            current_fitness, current_accuracy = calculate_fitness(positions[i], X_train, X_test, y_train, y_test)
            if current_fitness > best_fitness:
                best_fitness = current_fitness
                best_position = positions[i].copy()
                best_accuracy = current_accuracy
        K = 1 + 2 * t**2 / (max_iter)**3 + 0.5
    _, accuracy = calculate_fitness(best_position, X_train, X_test, y_train, y_test)
    selected_indices = np.where(best_position == 1)
    return accuracy, len(selected_indices[0]), selected_indices[0].tolist()

def rfe_feature_selection(X_train, X_test, y_train, y_test):
    estimator = LogisticRegression(max_iter=2000, solver='liblinear', random_state=42)
    selector = RFE(estimator, n_features_to_select=X_train.shape[1]//2, step=1)
    selector = selector.fit(X_train, y_train)
    selected_indices = np.where(selector.support_)
    X_train_selected = X_train[:, selected_indices[0]]
    X_test_selected = X_test[:, selected_indices[0]]
    model = KNeighborsClassifier(n_neighbors=5)
    model.fit(X_train_selected, y_train)
    accuracy = accuracy_score(y_test, model.predict(X_test_selected))
    return accuracy, len(selected_indices[0]), selected_indices[0].tolist()

def selectkbest_feature_selection(X_train, X_test, y_train, y_test):
    selector = SelectKBest(score_func=f_classif, k=X_train.shape[1]//2)
    selector.fit(X_train, y_train)
    selected_indices = np.where(selector.get_support())
    X_train_selected = X_train[:, selected_indices[0]]
    X_test_selected = X_test[:, selected_indices[0]]
    model = KNeighborsClassifier(n_neighbors=5)
    model.fit(X_train_selected, y_train)
    accuracy = accuracy_score(y_test, model.predict(X_test_selected))
    return accuracy, len(selected_indices[0]), selected_indices[0].tolist()

def randomforest_feature_selection(X_train, X_test, y_train, y_test):
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)
    importances = model.feature_importances_
    threshold = np.median(importances)
    selected_indices = np.where(importances > threshold)
    if len(selected_indices[0]) == 0: return 0, 0, []
    X_train_selected = X_train[:, selected_indices[0]]
    X_test_selected = X_test[:, selected_indices[0]]
    eval_model = KNeighborsClassifier(n_neighbors=5)
    eval_model.fit(X_train_selected, y_train)
    accuracy = accuracy_score(y_test, eval_model.predict(X_test_selected))
    return accuracy, len(selected_indices[0]), selected_indices[0].tolist()

def lasso_feature_selection(X_train, X_test, y_train, y_test):
    """
    Performs feature selection using Lasso (L1 regularization) and evaluates
    the performance using KNeighborsClassifier.

    Args:
        X_train (np.ndarray): Training features.
        X_test (np.ndarray): Testing features.
        y_train (np.ndarray): Training labels.
        y_test (np.ndarray): Testing labels.

    Returns:
        tuple: A tuple containing:
            - float: Accuracy score on the test set using selected features.
            - int: The number of features selected.
            - list: A list of indices of the selected features.
    """
    model = LassoCV(cv=5, random_state=42, max_iter=10000)
    model.fit(X_train, y_train)
    selected_indices = np.where(np.abs(model.coef_) > 1e-5)[0]
    if len(selected_indices) == 0:
        return 0.0, 0, []
    X_train_selected = X_train[:, selected_indices]
    X_test_selected = X_test[:, selected_indices]
    eval_model = KNeighborsClassifier(n_neighbors=5)
    eval_model.fit(X_train_selected, y_train)
    accuracy = accuracy_score(y_test, eval_model.predict(X_test_selected))
    return accuracy, len(selected_indices), selected_indices.tolist()

def calculate_fitness_ga(solution, X_train, X_test, y_train, y_test):
    """
    Fitness function for the Genetic Algorithm. Evaluates a feature subset.
    """
    selected_indices = np.where(solution == 1)[0]
    if len(selected_indices) == 0:
        return 0, 0
    X_train_selected = X_train[:, selected_indices]
    X_test_selected = X_test[:, selected_indices]
    model = KNeighborsClassifier(n_neighbors=5)
    model.fit(X_train_selected, y_train)
    predictions = model.predict(X_test_selected)
    accuracy = accuracy_score(y_test, predictions)
    num_features = len(selected_indices)
    total_features = X_train.shape[1]
    alpha = 0.9
    fitness_score = alpha * accuracy + (1 - alpha) * (1 - (num_features / total_features))
    return fitness_score, accuracy

def genetic_algorithm_feature_selection(X_train, X_test, y_train, y_test, population_size=50, generations=100, crossover_rate=0.8, mutation_rate=0.01):
    """
    Performs feature selection using a Genetic Algorithm.
    """
    n_features = X_train.shape[1]
    population = np.random.randint(0, 2, size=(population_size, n_features))
    best_fitness = -1
    best_chromosome = None
    best_accuracy = 0
    for generation in range(generations):
        fitness_scores = []
        accuracies = []
        for chromosome in population:
            fitness, accuracy = calculate_fitness_ga(chromosome, X_train, X_test, y_train, y_test)
            fitness_scores.append(fitness)
            accuracies.append(accuracy)
        current_best_fitness_index = np.argmax(fitness_scores)
        if fitness_scores[current_best_fitness_index] > best_fitness:
            best_fitness = fitness_scores[current_best_fitness_index]
            best_chromosome = population[current_best_fitness_index].copy()
            best_accuracy = accuracies[current_best_fitness_index]
        new_population = []
        for _ in range(population_size):
            tournament_size = 5
            competitors_indices = random.sample(range(population_size), tournament_size)
            winner_index = competitors_indices[np.argmax([fitness_scores[i] for i in competitors_indices])]
            new_population.append(population[winner_index].copy())
        population = np.array(new_population)
        for i in range(0, population_size, 2):
            if random.random() < crossover_rate:
                crossover_point = random.randint(1, n_features - 1)
                population[i, :crossover_point], population[i+1, :crossover_point] = population[i+1, :crossover_point].copy(), population[i, :crossover_point].copy()
                population[i, crossover_point:], population[i+1, crossover_point:] = population[i+1, crossover_point:].copy(), population[i, crossover_point:].copy()
        for i in range(population_size):
            for j in range(n_features):
                if random.random() < mutation_rate:
                    population[i, j] = 1 - population[i, j]
    selected_indices = np.where(best_chromosome == 1)[0]
    return best_accuracy, len(selected_indices), selected_indices.tolist()

def scale_features(X_train, X_test):
    """
    Scales the training and testing features using StandardScaler.
    """
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    return X_train_scaled, X_test_scaled

def evaluate_with_gradient_boosting(X_train_scaled, X_test_scaled, y_train, y_test, selected_indices):
    """
    Evaluates the performance of selected features using a Gradient Boosting Classifier.
    """
    if not selected_indices:
        return 0.0
    X_train_selected = X_train_scaled[:, selected_indices]
    X_test_selected = X_test_scaled[:, selected_indices]
    model = GradientBoostingClassifier(random_state=42)
    model.fit(X_train_selected, y_train)
    predictions = model.predict(X_test_selected)
    accuracy = accuracy_score(y_test, predictions)
    return accuracy

def evaluate_with_neural_network(X_train_scaled, X_test_scaled, y_train, y_test, selected_indices):
    """
    Evaluates the performance of selected features using a simple Neural Network Classifier.
    """
    if not selected_indices:
        return 0.0
    X_train_selected = X_train_scaled[:, selected_indices]
    X_test_selected = X_test_scaled[:, selected_indices]
    model = MLPClassifier(random_state=42, max_iter=2000)
    model.fit(X_train_selected, y_train)
    predictions = model.predict(X_test_selected)
    accuracy = accuracy_score(y_test, predictions)
    return accuracy


if __name__ == '__main__':
    # Set global random seeds for reproducibility
    random.seed(42)
    np.random.seed(42)

    print("Mounting Google Drive...")
    drive.mount('/content/drive')

    FILE_PATH = '/content/drive/MyDrive/cervicaldata.xlsx'
    TARGET_NAME = 'status'

    try:
        df = pd.read_excel(FILE_PATH)
        print(f"\nSuccessfully loaded file from Google Drive: {FILE_PATH}")
        file_name = FILE_PATH
    except FileNotFoundError:
        print(f"\nError: File not found at {FILE_PATH}. Using fallback: local upload widget.")
        uploaded = files.upload()
        file_name = list(uploaded.keys())[0]
        df = pd.read_excel(io.BytesIO(uploaded[file_name]))

    print("\nDataFrame loaded successfully. Columns found:")
    print(df.columns.tolist())

    if TARGET_NAME not in df.columns:
        print(f"\nFATAL ERROR: Target column '{TARGET_NAME}' not found in the Excel file.")
        print("Please edit the 'TARGET_NAME' variable in the script to match a column exactly.")
    else:
        y = df[TARGET_NAME]
        X = df.drop(columns=[TARGET_NAME])
        for col in X.columns:
            X[col] = pd.to_numeric(X[col], errors='coerce')
        imputer = SimpleImputer(strategy='mean')
        X = imputer.fit_transform(X)
        if not pd.api.types.is_numeric_dtype(y):
            le_y = LabelEncoder()
            y = le_y.fit_transform(y.astype(str).fillna('missing'))
        else:
            y = y.values
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
        X_train_scaled, X_test_scaled = scale_features(X_train, X_test)

        results_knn = []
        results_gb = []
        results_nn = []

        print(f"\n--- Running Feature Selection Methods on {file_name} ---")
        methods = {
            "bWWPA": bwwpa_feature_selection,
            "RFE": rfe_feature_selection,
            "SelectKBest": selectkbest_feature_selection,
            "Random Forest Importance": randomforest_feature_selection,
            "Lasso (L1)": lasso_feature_selection,
            "Genetic Algorithm": genetic_algorithm_feature_selection
        }

        for method_name, method_func in methods.items():
            if method_name in ["bWWPA", "Genetic Algorithm"]:
                acc_knn, count, indices = method_func(X_train_scaled, X_test_scaled, y_train, y_test)
            else:
                acc_knn, count, indices = method_func(X_train_scaled, X_test_scaled, y_train, y_test)
            results_knn.append({"Method": method_name, "Accuracy_KNN": acc_knn, "Feature Count": count, "Selected Indices": indices})
            acc_gb = evaluate_with_gradient_boosting(X_train_scaled, X_test_scaled, y_train, y_test, indices)
            results_gb.append({"Method": method_name, "Accuracy_GB": acc_gb, "Feature Count": count, "Selected Indices": indices})
            acc_nn = evaluate_with_neural_network(X_train_scaled, X_test_scaled, y_train, y_test, indices)
            results_nn.append({"Method": method_name, "Accuracy_NN": acc_nn, "Feature Count": count, "Selected Indices": indices})

        model_knn = KNeighborsClassifier(n_neighbors=5)
        model_knn.fit(X_train_scaled, y_train)
        baseline_acc_knn = accuracy_score(y_test, model_knn.predict(X_test_scaled))

        model_gb = GradientBoostingClassifier(random_state=42)
        model_gb.fit(X_train_scaled, y_train)
        baseline_acc_gb = accuracy_score(y_test, model_gb.predict(X_test_scaled))

        model_nn = MLPClassifier(random_state=42, max_iter=2000)
        model_nn.fit(X_train_scaled, y_train)
        baseline_acc_nn = accuracy_score(y_test, model_nn.predict(X_test_scaled))

        all_indices = list(range(X.shape[1]))
        results_knn.append({"Method": "Baseline (No FS)", "Accuracy_KNN": baseline_acc_knn, "Feature Count": X.shape[1], "Selected Indices": all_indices})
        results_gb.append({"Method": "Baseline (No FS)", "Accuracy_GB": baseline_acc_gb, "Feature Count": X.shape[1], "Selected Indices": all_indices})
        results_nn.append({"Method": "Baseline (No FS)", "Accuracy_NN": baseline_acc_nn, "Feature Count": X.shape[1], "Selected Indices": all_indices})

        results_df_knn = pd.DataFrame(results_knn)
        results_df_knn = results_df_knn.sort_values(by=['Accuracy_KNN', 'Feature Count'], ascending=[False, True])

        results_df_gb = pd.DataFrame(results_gb)
        results_df_gb = results_df_gb.sort_values(by=['Accuracy_GB', 'Feature Count'], ascending=[False, True])

        results_df_nn = pd.DataFrame(results_nn)
        results_df_nn = results_df_nn.sort_values(by=['Accuracy_NN', 'Feature Count'], ascending=[False, True])

        print("\n" + "=" * 50)
        print("Feature Selection Results (Ranked by KNN Accuracy):")
        print("=" * 50)
        print(results_df_knn.to_markdown(index=False))

        print("\n" + "=" * 50)
        print("Feature Selection Results (Ranked by Gradient Boosting Accuracy):")
        print("=" * 50)
        print(results_df_gb.to_markdown(index=False))

        print("\n" + "=" * 50)
        print("Feature Selection Results (Ranked by Neural Network Accuracy):")
        print("=" * 50)
        print(results_df_nn.to_markdown(index=False))

        print("\n" + "=" * 50)
        print("Top Two Methods by KNN Accuracy:")
        print("=" * 50)
        top_two_knn = results_df_knn.head(2)
        for index, row in top_two_knn.iterrows():
            print(f"-> {row['Method']}: Accuracy = {row['Accuracy_KNN']:.4f}, Features Selected = {int(row['Feature Count']) if type(row['Feature Count']) is not np.int64 else row['Feature Count']}")

        print("\n" + "=" * 50)
        print("Top Two Methods by Gradient Boosting Accuracy:")
        print("=" * 50)
        top_two_gb = results_df_gb.head(2)
        for index, row in top_two_gb.iterrows():
            print(f"-> {row['Method']}: Accuracy = {row['Accuracy_GB']:.4f}, Features Selected = {int(row['Feature Count']) if type(row['Feature Count']) is not np.int64 else row['Feature Count']}")

        print("\n" + "=" * 50)
        print("Top Two Methods by Neural Network Accuracy:")
        print("=" * 50)
        top_two_nn = results_df_nn.head(2)
        for index, row in top_two_nn.iterrows():
            print(f"-> {row['Method']}: Accuracy = {row['Accuracy_NN']:.4f}, Features Selected = {int(row['Feature Count']) if type(row['Feature Count']) is not np.int64 else row['Feature Count']}")

Mounting Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

Successfully loaded file from Google Drive: /content/drive/MyDrive/cervicaldata.xlsx

DataFrame loaded successfully. Columns found:
['years_after_diagnosis', 'age_at_diagnosis', 'stage level', 'chemotherapy', 'brachtherapy', 'chemoradiation', 'radiotherapy', 'radiation', 'menopause', 'MENO_post', 'HISTOLOGY', 'CM_1', 'CM_2', 'CM_3', 'status']

--- Running Feature Selection Methods on /content/drive/MyDrive/cervicaldata.xlsx ---

Feature Selection Results (Ranked by KNN Accuracy):
| Method                   |   Accuracy_KNN |   Feature Count | Selected Indices                               |
|:-------------------------|---------------:|----------------:|:-----------------------------------------------|
| Genetic Algorithm        |       0.878788 |               4 | [1, 4, 7, 9]                                   |
| bWWPA              

CPH FOR ALL GRADIENT BOOSTING

In [38]:
all_gb_cph = results_df_gb.copy()
all_gb_cph['CPH'] = all_gb_cph['Accuracy_GB'] / all_gb_cph['Feature Count']
display(all_gb_cph[['Method', 'Accuracy_GB', 'Feature Count', 'CPH']])

Unnamed: 0,Method,Accuracy_GB,Feature Count,CPH
1,RFE,0.727273,7,0.103896
6,Baseline (No FS),0.727273,14,0.051948
0,bWWPA,0.69697,5,0.139394
3,Random Forest Importance,0.69697,7,0.099567
2,SelectKBest,0.666667,7,0.095238
4,Lasso (L1),0.636364,4,0.159091
5,Genetic Algorithm,0.606061,4,0.151515


CPH FOR ALL NEURAL NETWORK

In [39]:
all_nn_cph = results_df_nn.copy()
all_nn_cph['CPH'] = all_nn_cph['Accuracy_NN'] / all_nn_cph['Feature Count']
display(all_nn_cph[['Method', 'Accuracy_NN', 'Feature Count', 'CPH']])

Unnamed: 0,Method,Accuracy_NN,Feature Count,CPH
0,bWWPA,0.848485,5,0.169697
1,RFE,0.757576,7,0.108225
2,SelectKBest,0.757576,7,0.108225
5,Genetic Algorithm,0.727273,4,0.181818
4,Lasso (L1),0.69697,4,0.174242
3,Random Forest Importance,0.69697,7,0.099567
6,Baseline (No FS),0.666667,14,0.047619


CPH FOR ALL KNN

In [40]:
all_knn_cph = results_df_knn.copy()
all_knn_cph['CPH'] = all_knn_cph['Accuracy_KNN'] / all_knn_cph['Feature Count']
display(all_knn_cph[['Method', 'Accuracy_KNN', 'Feature Count', 'CPH']])

Unnamed: 0,Method,Accuracy_KNN,Feature Count,CPH
5,Genetic Algorithm,0.878788,4,0.219697
0,bWWPA,0.787879,5,0.157576
1,RFE,0.69697,7,0.099567
2,SelectKBest,0.69697,7,0.099567
3,Random Forest Importance,0.69697,7,0.099567
6,Baseline (No FS),0.69697,14,0.049784
4,Lasso (L1),0.666667,4,0.166667


GENERAL COMPARISONS

#SECOND TESTING

#FOR KNN CPH, AIC AND CINDEX

# Task
To proceed with the survival analysis using the Cox Proportional Hazards model, I'll first install the necessary library, `lifelines`. Then, I will extract the duration ('years_after_diagnosis') and event ('status') columns from your original DataFrame `df`. Next, I will define a Python function to fit the CoxPH model for a given set of features, and calculate its C-index and AIC.

I will then iterate through the results of the KNN-ranked feature selection (`results_df_knn`), apply the CoxPH analysis for each selected feature subset, and store the C-index, AIC, and a custom 'CPH' metric (Accuracy_KNN / Feature Count) along with the existing feature selection metrics. Finally, I will display the comprehensive results, ranked by C-index, and provide a summary of the best-performing feature selection methods for survival prediction.

## Extract Duration and Event

### Subtask:
Extract 'years_after_diagnosis' as the duration and 'status' as the event indicator from the original DataFrame `df`. These columns are crucial for survival analysis.


**Reasoning**:
To prepare for survival analysis, I need to extract the 'years_after_diagnosis' and 'status' columns from the DataFrame `df` and store them in separate variables as `duration` and `event_observed`, respectively. I will then display the first few rows of each to verify the extraction.



In [46]:
duration = df['years_after_diagnosis']
event_observed = df['status']

print("Duration (years_after_diagnosis) head:")
print(duration.head())
print("\nEvent (status) head:")
print(event_observed.head())

Duration (years_after_diagnosis) head:
0    5.0
1    8.0
2    7.0
3    7.0
4    8.0
Name: years_after_diagnosis, dtype: float64

Event (status) head:
0    1
1    1
2    0
3    1
4    1
Name: status, dtype: int64


**Reasoning**:
The next step is to install the `lifelines` library, which is necessary for performing Cox Proportional Hazards survival analysis. This is a prerequisite before defining the CoxPH fitting function.



In [48]:
get_ipython().system('pip install lifelines')

print("lifelines library installed successfully.")

lifelines library installed successfully.


**Reasoning**:
Now that `lifelines` is installed, I need to define a Python function that takes a DataFrame of features, duration, and event data, fits a Cox Proportional Hazards model, and returns its C-index and AIC. This function will be crucial for evaluating the feature subsets with survival analysis.



In [49]:
from lifelines import CoxPHFitter

def fit_coxph_model(X_data, duration, event_observed):
    """
    Fits a Cox Proportional Hazards model and calculates C-index and AIC.

    Args:
        X_data (pd.DataFrame): DataFrame containing the selected features.
        duration (pd.Series): Series containing event durations.
        event_observed (pd.Series): Series containing event indicators (1 for event, 0 for censored).

    Returns:
        tuple: A tuple containing the C-index and AIC of the fitted model.
    """
    # Combine features, duration, and event into a single DataFrame for lifelines
    cph_df = X_data.copy()
    cph_df['duration'] = duration
    cph_df['event'] = event_observed

    # Ensure all columns are numeric for CoxPHFitter
    for col in cph_df.columns:
        if cph_df[col].dtype == 'object':
            cph_df[col] = pd.to_numeric(cph_df[col], errors='coerce')
    cph_df = cph_df.dropna(axis=1, how='all') # Drop columns that became all NaN
    cph_df = cph_df.dropna() # Drop rows with any NaN after conversion

    if cph_df.empty or len(cph_df.columns) < 2: # Need at least duration and event columns
        return 0.0, float('inf') # Return default values for empty or insufficient data

    # Instantiate and fit the CoxPH model
    cph = CoxPHFitter()
    try:
        cph.fit(cph_df, duration_col='duration', event_col='event', show_progress=False)
        c_index = cph.concordance_index_ # Calculate C-index
        aic_score = cph.AIC_ # Calculate AIC
    except Exception as e:
        print(f"Error fitting CoxPH model: {e}")
        c_index = 0.0
        aic_score = float('inf')

    return c_index, aic_score

print("fit_coxph_model function defined successfully.")

fit_coxph_model function defined successfully.


**Reasoning**:
I need to iterate through the KNN feature selection results, prepare the feature subsets for CoxPH analysis by ensuring 'years_after_diagnosis' is not used as a feature, fit the CoxPH model for each subset, and store the C-index, AIC, and a combined CPH metric. Finally, I will display the results sorted by C-index.



In [50]:
original_feature_names = df.drop(columns=[TARGET_NAME]).columns.tolist()
X_processed_df = pd.DataFrame(X, columns=original_feature_names)

cph_knn_results = []

for index, row in results_df_knn.iterrows():
    method_name = row['Method']
    accuracy_knn = row['Accuracy_KNN']
    feature_count_knn_original = row['Feature Count'] # Feature count as determined by KNN FS
    selected_indices = row['Selected Indices']

    # Get actual feature names from the original DataFrame's columns
    current_selected_feature_names = [original_feature_names[i] for i in selected_indices]

    # 'years_after_diagnosis' is the duration column and should not be a feature in CoxPH
    if 'years_after_diagnosis' in current_selected_feature_names:
        current_selected_feature_names.remove('years_after_diagnosis')

    # If no features remain after removing 'years_after_diagnosis', handle gracefully
    if not current_selected_feature_names:
        c_index = 0.0
        aic_score = float('inf')
        actual_feature_count_for_cph = 0
        print(f"Warning: '{method_name}' resulted in no features for CoxPH after removing 'years_after_diagnosis'.")
    else:
        X_data_for_cph = X_processed_df[current_selected_feature_names]
        c_index, aic_score = fit_coxph_model(X_data_for_cph, duration, event_observed)
        actual_feature_count_for_cph = len(current_selected_feature_names)

    # Calculate CPH (Cost Per Hit) using KNN accuracy and original feature count
    # Avoid division by zero if original feature count is 0, though highly unlikely for FS methods
    if feature_count_knn_original > 0:
        cph_metric = accuracy_knn / feature_count_knn_original
    else:
        cph_metric = 0.0 # Or some other indicator for no features

    cph_knn_results.append({
        "Method": method_name,
        "Accuracy_KNN": accuracy_knn,
        "Feature Count (KNN)": feature_count_knn_original,
        "CPH (KNN)": cph_metric,
        "Feature Count (CoxPH)": actual_feature_count_for_cph,
        "C-index": c_index,
        "AIC": aic_score
    })

cph_knn_results_df = pd.DataFrame(cph_knn_results)

# Sort results by C-index (descending) and then by AIC (ascending) for tie-breaking
cph_knn_results_df_sorted = cph_knn_results_df.sort_values(by=['C-index', 'AIC'], ascending=[False, True])

print("\n" + "=" * 50)
print("Comprehensive Feature Selection Results (Ranked by CoxPH C-index for KNN-selected features):")
print("=" * 50)
display(cph_knn_results_df_sorted)

print("\n" + "=" * 50)
print("Summary of Best-Performing Feature Selection Methods for Survival Prediction (based on C-index):")
print("=" * 50)
top_method = cph_knn_results_df_sorted.iloc[0]
print(f"The best method for CoxPH using KNN-selected features is '{top_method['Method']}' with a C-index of {top_method['C-index']:.4f} and {top_method['Feature Count (CoxPH)']} features.")

Error fitting CoxPH model: Since the model is semi-parametric (and not fully-parametric), the AIC does not exist. You probably want the `.AIC_partial_` property instead.
Error fitting CoxPH model: Since the model is semi-parametric (and not fully-parametric), the AIC does not exist. You probably want the `.AIC_partial_` property instead.
Error fitting CoxPH model: Since the model is semi-parametric (and not fully-parametric), the AIC does not exist. You probably want the `.AIC_partial_` property instead.
Error fitting CoxPH model: Since the model is semi-parametric (and not fully-parametric), the AIC does not exist. You probably want the `.AIC_partial_` property instead.
Error fitting CoxPH model: Since the model is semi-parametric (and not fully-parametric), the AIC does not exist. You probably want the `.AIC_partial_` property instead.
Error fitting CoxPH model: Since the model is semi-parametric (and not fully-parametric), the AIC does not exist. You probably want the `.AIC_partial_

Unnamed: 0,Method,Accuracy_KNN,Feature Count (KNN),CPH (KNN),Feature Count (CoxPH),C-index,AIC
0,Genetic Algorithm,0.878788,4,0.219697,4,0.0,inf
1,bWWPA,0.787879,5,0.157576,4,0.0,inf
2,RFE,0.69697,7,0.099567,6,0.0,inf
3,SelectKBest,0.69697,7,0.099567,6,0.0,inf
4,Random Forest Importance,0.69697,7,0.099567,6,0.0,inf
5,Baseline (No FS),0.69697,14,0.049784,13,0.0,inf
6,Lasso (L1),0.666667,4,0.166667,3,0.0,inf



Summary of Best-Performing Feature Selection Methods for Survival Prediction (based on C-index):
The best method for CoxPH using KNN-selected features is 'Genetic Algorithm' with a C-index of 0.0000 and 4 features.


**Reasoning**:
The previous code generated an error indicating that `cph.AIC_` is not the correct property for a semi-parametric CoxPH model. I need to update the `fit_coxph_model` function to use `cph.AIC_partial_` instead, as suggested by the error message, to correctly calculate the AIC score.



In [51]:
from lifelines import CoxPHFitter

def fit_coxph_model(X_data, duration, event_observed):
    """
    Fits a Cox Proportional Hazards model and calculates C-index and AIC.

    Args:
        X_data (pd.DataFrame): DataFrame containing the selected features.
        duration (pd.Series): Series containing event durations.
        event_observed (pd.Series): Series containing event indicators (1 for event, 0 for censored).

    Returns:
        tuple: A tuple containing the C-index and AIC of the fitted model.
    """
    # Combine features, duration, and event into a single DataFrame for lifelines
    cph_df = X_data.copy()
    cph_df['duration'] = duration
    cph_df['event'] = event_observed

    # Ensure all columns are numeric for CoxPHFitter
    for col in cph_df.columns:
        if cph_df[col].dtype == 'object':
            cph_df[col] = pd.to_numeric(cph_df[col], errors='coerce')
    cph_df = cph_df.dropna(axis=1, how='all') # Drop columns that became all NaN
    cph_df = cph_df.dropna() # Drop rows with any NaN after conversion

    if cph_df.empty or len(cph_df.columns) < 2: # Need at least duration and event columns
        return 0.0, float('inf') # Return default values for empty or insufficient data

    # Instantiate and fit the CoxPH model
    cph = CoxPHFitter()
    try:
        cph.fit(cph_df, duration_col='duration', event_col='event', show_progress=False)
        c_index = cph.concordance_index_ # Calculate C-index
        aic_score = cph.AIC_partial_ # Calculate partial AIC
    except Exception as e:
        print(f"Error fitting CoxPH model: {e}")
        c_index = 0.0
        aic_score = float('inf')

    return c_index, aic_score

print("fit_coxph_model function redefined with AIC_partial_ successfully.")

fit_coxph_model function redefined with AIC_partial_ successfully.


**Reasoning**:
The `fit_coxph_model` function has been updated to correctly calculate the AIC. I now need to re-run the analysis that iterates through the KNN feature selection results, applies the corrected CoxPH model, and displays the comprehensive results. This will ensure that the C-index and AIC values are accurate.



In [52]:
original_feature_names = df.drop(columns=[TARGET_NAME]).columns.tolist()
X_processed_df = pd.DataFrame(X, columns=original_feature_names)

cph_knn_results = []

for index, row in results_df_knn.iterrows():
    method_name = row['Method']
    accuracy_knn = row['Accuracy_KNN']
    feature_count_knn_original = row['Feature Count'] # Feature count as determined by KNN FS
    selected_indices = row['Selected Indices']

    # Get actual feature names from the original DataFrame's columns
    current_selected_feature_names = [original_feature_names[i] for i in selected_indices]

    # 'years_after_diagnosis' is the duration column and should not be a feature in CoxPH
    if 'years_after_diagnosis' in current_selected_feature_names:
        current_selected_feature_names.remove('years_after_diagnosis')

    # If no features remain after removing 'years_after_diagnosis', handle gracefully
    if not current_selected_feature_names:
        c_index = 0.0
        aic_score = float('inf')
        actual_feature_count_for_cph = 0
        print(f"Warning: '{method_name}' resulted in no features for CoxPH after removing 'years_after_diagnosis'.")
    else:
        X_data_for_cph = X_processed_df[current_selected_feature_names]
        c_index, aic_score = fit_coxph_model(X_data_for_cph, duration, event_observed)
        actual_feature_count_for_cph = len(current_selected_feature_names)

    # Calculate CPH (Cost Per Hit) using KNN accuracy and original feature count
    # Avoid division by zero if original feature count is 0, though highly unlikely for FS methods
    if feature_count_knn_original > 0:
        cph_metric = accuracy_knn / feature_count_knn_original
    else:
        cph_metric = 0.0 # Or some other indicator for no features

    cph_knn_results.append({
        "Method": method_name,
        "Accuracy_KNN": accuracy_knn,
        "Feature Count (KNN)": feature_count_knn_original,
        "CPH (KNN)": cph_metric,
        "Feature Count (CoxPH)": actual_feature_count_for_cph,
        "C-index": c_index,
        "AIC": aic_score
    })

cph_knn_results_df = pd.DataFrame(cph_knn_results)

# Sort results by C-index (descending) and then by AIC (ascending) for tie-breaking
cph_knn_results_df_sorted = cph_knn_results_df.sort_values(by=['C-index', 'AIC'], ascending=[False, True])

print("\n" + "=" * 50)
print("Comprehensive Feature Selection Results (Ranked by CoxPH C-index for KNN-selected features):")
print("=" * 50)
display(cph_knn_results_df_sorted)

print("\n" + "=" * 50)
print("Summary of Best-Performing Feature Selection Methods for Survival Prediction (based on C-index):")
print("=" * 50)
top_method = cph_knn_results_df_sorted.iloc[0]
print(f"The best method for CoxPH using KNN-selected features is '{top_method['Method']}' with a C-index of {top_method['C-index']:.4f} and {top_method['Feature Count (CoxPH)']} features.")


Comprehensive Feature Selection Results (Ranked by CoxPH C-index for KNN-selected features):


Unnamed: 0,Method,Accuracy_KNN,Feature Count (KNN),CPH (KNN),Feature Count (CoxPH),C-index,AIC
5,Baseline (No FS),0.69697,14,0.049784,13,0.717574,606.911619
4,Random Forest Importance,0.69697,7,0.099567,6,0.708416,596.936224
3,SelectKBest,0.69697,7,0.099567,6,0.673391,598.854137
0,Genetic Algorithm,0.878788,4,0.219697,4,0.664356,600.58712
1,bWWPA,0.787879,5,0.157576,4,0.660767,597.797091
2,RFE,0.69697,7,0.099567,6,0.658168,602.387796
6,Lasso (L1),0.666667,4,0.166667,3,0.630941,598.64942



Summary of Best-Performing Feature Selection Methods for Survival Prediction (based on C-index):
The best method for CoxPH using KNN-selected features is 'Baseline (No FS)' with a C-index of 0.7176 and 13 features.


## Final Task

### Subtask:
Provide a comprehensive summary of the analysis, highlighting the top-performing feature selection methods based on CPH, C-index, and AIC.


## Summary:

### Q&A
The top-performing feature selection methods based on CPH, C-index, and AIC are:
*   **'Baseline (No FS)'** is identified as the best method for CoxPH using KNN-selected features, achieving a C-index of 0.7176 with 13 features.
*   **'Random Forest Importance'** is the second-best method, with a C-index of 0.7084 and 6 features.

### Data Analysis Key Findings
*   The `duration` (years\_after\_diagnosis) and `event` (status) columns were successfully extracted for survival analysis.
*   The `lifelines` library was installed to facilitate Cox Proportional Hazards (CoxPH) model fitting.
*   A custom function, `fit_coxph_model`, was developed to fit CoxPH models and calculate both the C-index and AIC for given feature subsets.
*   An initial attempt to calculate AIC resulted in an error, leading to the correction of the `fit_coxph_model` function to use `cph.AIC_partial_` for accurate AIC computation in semi-parametric models.
*   After correction, the CoxPH analysis successfully generated C-index and AIC values for various feature selection methods previously ranked by KNN accuracy.
*   The 'Baseline (No FS)' method yielded the highest C-index of 0.7176 with 13 features, suggesting that including all features initially leads to the best prognostic accuracy in this context.
*   The 'Random Forest Importance' method showed a strong performance with a C-index of 0.7084 using only 6 features, indicating its efficiency in selecting impactful features.

### Insights or Next Steps
*   Given that 'Baseline (No FS)' performs best in terms of C-index, further investigation into the individual significance of each of the 13 features in the full model could be beneficial.
*   The 'Random Forest Importance' method's strong performance with fewer features ($6$ vs. $13$) suggests a good balance between model complexity and predictive power, warranting a deeper dive into the specific features it selected and their biological or clinical relevance.


# Task
## CPH for Gradient Boosting Ranked Results

### Subtask:
Process Gradient Boosting ranked results for CPH and AIC analysis, and display the comprehensive table with a summary.

### Reasoning:
To continue the survival analysis, I will iterate through the `results_df_gb` DataFrame, which contains the feature selection results evaluated by Gradient Boosting accuracy. For each feature selection method, I will:

1.  **Extract Information**: Get the method name, `Accuracy_GB`, the `Feature Count` (as determined by the feature selection method), and the `Selected Indices`.
2.  **Prepare Feature Subset**: Use the `Selected Indices` to create a subset of the original features (`X_processed_df`). Crucially, I will ensure that `'years_after_diagnosis'` is removed from the selected features, as it serves as the duration variable for the CoxPH model and should not be included as a predictor.
3.  **Fit CoxPH Model**: Apply the `fit_coxph_model` function (which was previously defined and corrected to use `AIC_partial_`) to the prepared feature subset, along with the `duration` and `event_observed` data. This will yield the C-index and AIC for the CoxPH model.
4.  **Calculate CPH Metric**: Compute a custom CPH metric by dividing `Accuracy_GB` by the `Feature Count` (from the original feature selection process).
5.  **Store Results**: Collect these metrics (Method, Accuracy\_GB, original Feature Count, CPH metric, actual Feature Count for CoxPH, C-index, and AIC) into a list.
6.  **Display Results**: Convert the collected results into a pandas DataFrame, sort it by `C-index` (descending) and then by `AIC` (ascending) for comprehensive ranking, and display the table.
7.  **Summarize**: Provide a summary highlighting the best-performing method based on the C-index for Gradient Boosting, similar to the KNN analysis.

This process will allow for a robust comparison of how different feature selection methods, initially optimized for Gradient Boosting accuracy, perform in a survival analysis context.

## Process Gradient Boosting Ranked Results for CPH and AIC

### Subtask:
Iterate through the `results_df_gb` DataFrame, similar to how `results_df_knn` was processed. For each feature selection method, retrieve the selected indices and the `Accuracy_GB`. Prepare the feature subsets, fit the CoxPH model, calculate C-index and AIC, and calculate a CPH metric as `Accuracy_GB` divided by `Feature Count`. Store these results.


**Reasoning**:
I will now iterate through the `results_df_gb` DataFrame to apply the CoxPH model for each feature selection method, calculate the C-index, AIC, and a CPH metric, and store these results in a new list, similar to how it was done for KNN results.



In [53]:
original_feature_names = df.drop(columns=[TARGET_NAME]).columns.tolist()
X_processed_df = pd.DataFrame(X, columns=original_feature_names)

cph_gb_results = []

for index, row in results_df_gb.iterrows():
    method_name = row['Method']
    accuracy_gb = row['Accuracy_GB']
    feature_count_gb_original = row['Feature Count']
    selected_indices = row['Selected Indices']

    current_selected_feature_names = [original_feature_names[i] for i in selected_indices]

    if 'years_after_diagnosis' in current_selected_feature_names:
        current_selected_feature_names.remove('years_after_diagnosis')

    if not current_selected_feature_names:
        c_index = 0.0
        aic_score = float('inf')
        actual_feature_count_for_cph = 0
        print(f"Warning: '{method_name}' resulted in no features for CoxPH after removing 'years_after_diagnosis'.")
    else:
        X_data_for_cph = X_processed_df[current_selected_feature_names]
        c_index, aic_score = fit_coxph_model(X_data_for_cph, duration, event_observed)
        actual_feature_count_for_cph = len(current_selected_feature_names)

    if feature_count_gb_original > 0:
        cph_metric = accuracy_gb / feature_count_gb_original
    else:
        cph_metric = 0.0

    cph_gb_results.append({
        "Method": method_name,
        "Accuracy_GB": accuracy_gb,
        "Feature Count (GB)": feature_count_gb_original,
        "CPH (GB)": cph_metric,
        "Feature Count (CoxPH)": actual_feature_count_for_cph,
        "C-index": c_index,
        "AIC": aic_score
    })

cph_gb_results_df = pd.DataFrame(cph_gb_results)

cph_gb_results_df_sorted = cph_gb_results_df.sort_values(by=['C-index', 'AIC'], ascending=[False, True])

print("\n" + "=" * 50)
print("Comprehensive Feature Selection Results (Ranked by CoxPH C-index for Gradient Boosting-selected features):")
print("=" * 50)
display(cph_gb_results_df_sorted)

print("\n" + "=" * 50)
print("Summary of Best-Performing Feature Selection Methods for Survival Prediction (based on C-index) with GB:")
print("=" * 50)
top_method_gb = cph_gb_results_df_sorted.iloc[0]
print(f"The best method for CoxPH using Gradient Boosting-selected features is '{top_method_gb['Method']}' with a C-index of {top_method_gb['C-index']:.4f} and {top_method_gb['Feature Count (CoxPH)']} features.")


Comprehensive Feature Selection Results (Ranked by CoxPH C-index for Gradient Boosting-selected features):


Unnamed: 0,Method,Accuracy_GB,Feature Count (GB),CPH (GB),Feature Count (CoxPH),C-index,AIC
1,Baseline (No FS),0.727273,14,0.051948,13,0.717574,606.911619
3,Random Forest Importance,0.69697,7,0.099567,6,0.708416,596.936224
4,SelectKBest,0.666667,7,0.095238,6,0.673391,598.854137
6,Genetic Algorithm,0.606061,4,0.151515,4,0.664356,600.58712
2,bWWPA,0.69697,5,0.139394,4,0.660767,597.797091
0,RFE,0.727273,7,0.103896,6,0.658168,602.387796
5,Lasso (L1),0.636364,4,0.159091,3,0.630941,598.64942



Summary of Best-Performing Feature Selection Methods for Survival Prediction (based on C-index) with GB:
The best method for CoxPH using Gradient Boosting-selected features is 'Baseline (No FS)' with a C-index of 0.7176 and 13 features.


## Process Neural Network Ranked Results for CPH and AIC

### Subtask:
Iterate through the `results_df_nn` DataFrame. For each feature selection method, retrieve the selected indices and the `Accuracy_NN`. Prepare the feature subsets, fit the CoxPH model, calculate C-index and AIC, and calculate a CPH metric as `Accuracy_NN` divided by `Feature Count`. Store these results.


**Reasoning**:
To complete the subtask, I will iterate through the `results_df_nn` DataFrame, extract the necessary information, prepare the feature subsets, fit the CoxPH model using the corrected `fit_coxph_model` function, calculate the CPH metric, and store all results in a new list. Finally, I will convert this list into a DataFrame, sort it, and display the comprehensive results along with a summary of the best-performing method.



In [54]:
original_feature_names = df.drop(columns=[TARGET_NAME]).columns.tolist()
X_processed_df = pd.DataFrame(X, columns=original_feature_names)

cph_nn_results = []

for index, row in results_df_nn.iterrows():
    method_name = row['Method']
    accuracy_nn = row['Accuracy_NN']
    feature_count_nn_original = row['Feature Count'] # Feature count as determined by NN FS
    selected_indices = row['Selected Indices']

    # Get actual feature names from the original DataFrame's columns
    current_selected_feature_names = [original_feature_names[i] for i in selected_indices]

    # 'years_after_diagnosis' is the duration column and should not be a feature in CoxPH
    if 'years_after_diagnosis' in current_selected_feature_names:
        current_selected_feature_names.remove('years_after_diagnosis')

    # If no features remain after removing 'years_after_diagnosis', handle gracefully
    if not current_selected_feature_names:
        c_index = 0.0
        aic_score = float('inf')
        actual_feature_count_for_cph = 0
        print(f"Warning: '{method_name}' resulted in no features for CoxPH after removing 'years_after_diagnosis'.")
    else:
        X_data_for_cph = X_processed_df[current_selected_feature_names]
        c_index, aic_score = fit_coxph_model(X_data_for_cph, duration, event_observed)
        actual_feature_count_for_cph = len(current_selected_feature_names)

    # Calculate CPH (Cost Per Hit) using NN accuracy and original feature count
    # Avoid division by zero if original feature count is 0, though highly unlikely for FS methods
    if feature_count_nn_original > 0:
        cph_metric = accuracy_nn / feature_count_nn_original
    else:
        cph_metric = 0.0 # Or some other indicator for no features

    cph_nn_results.append({
        "Method": method_name,
        "Accuracy_NN": accuracy_nn,
        "Feature Count (NN)": feature_count_nn_original,
        "CPH (NN)": cph_metric,
        "Feature Count (CoxPH)": actual_feature_count_for_cph,
        "C-index": c_index,
        "AIC": aic_score
    })

cph_nn_results_df = pd.DataFrame(cph_nn_results)

# Sort results by C-index (descending) and then by AIC (ascending) for tie-breaking
cph_nn_results_df_sorted = cph_nn_results_df.sort_values(by=['C-index', 'AIC'], ascending=[False, True])

print("\n" + "=" * 50)
print("Comprehensive Feature Selection Results (Ranked by CoxPH C-index for Neural Network-selected features):")
print("=" * 50)
display(cph_nn_results_df_sorted)

print("\n" + "=" * 50)
print("Summary of Best-Performing Feature Selection Methods for Survival Prediction (based on C-index) with NN:")
print("=" * 50)
top_method_nn = cph_nn_results_df_sorted.iloc[0]
print(f"The best method for CoxPH using Neural Network-selected features is '{top_method_nn['Method']}' with a C-index of {top_method_nn['C-index']:.4f} and {top_method_nn['Feature Count (CoxPH)']} features.")


Comprehensive Feature Selection Results (Ranked by CoxPH C-index for Neural Network-selected features):


Unnamed: 0,Method,Accuracy_NN,Feature Count (NN),CPH (NN),Feature Count (CoxPH),C-index,AIC
6,Baseline (No FS),0.666667,14,0.047619,13,0.717574,606.911619
5,Random Forest Importance,0.69697,7,0.099567,6,0.708416,596.936224
2,SelectKBest,0.757576,7,0.108225,6,0.673391,598.854137
3,Genetic Algorithm,0.727273,4,0.181818,4,0.664356,600.58712
0,bWWPA,0.848485,5,0.169697,4,0.660767,597.797091
1,RFE,0.757576,7,0.108225,6,0.658168,602.387796
4,Lasso (L1),0.69697,4,0.174242,3,0.630941,598.64942



Summary of Best-Performing Feature Selection Methods for Survival Prediction (based on C-index) with NN:
The best method for CoxPH using Neural Network-selected features is 'Baseline (No FS)' with a C-index of 0.7176 and 13 features.


## Consolidate and Compare All Results

### Subtask:
Combine the CPH analysis results from KNN, Gradient Boosting, and Neural Network into a single DataFrame. Provide a final comprehensive summary and comparison of the top-performing feature selection methods across all evaluation models based on C-index, AIC, and CPH.


## Summary:

### Data Analysis Key Findings

*   **Gradient Boosting (GB) based Feature Selection Analysis:**
    *   The `results_df_gb` DataFrame, containing feature selection methods ranked by Gradient Boosting accuracy, was successfully processed.
    *   For each method, a CoxPH model was fitted using the selected features (after removing 'years\_after\_diagnosis').
    *   The best-performing feature selection method for survival prediction, based on the C-index for Gradient Boosting-selected features, was identified as 'Baseline (No FS)', achieving a C-index of 0.7176 with 13 features.
    *   A custom CPH metric was calculated by dividing the `Accuracy_GB` by the `Feature Count (GB)`.

*   **Neural Network (NN) based Feature Selection Analysis:**
    *   Similarly, the `results_df_nn` DataFrame, containing feature selection methods ranked by Neural Network accuracy, was processed.
    *   CoxPH models were fitted, and C-index and AIC were calculated for each feature subset.
    *   The best-performing feature selection method for survival prediction, based on the C-index for Neural Network-selected features, was also 'Baseline (No FS)', achieving a C-index of 0.7176 with 13 features.
    *   A custom CPH metric was calculated by dividing the `Accuracy_NN` by the `Feature Count (NN)`.

*   **Consistency Across Models:** Both Gradient Boosting and Neural Network-optimized feature selections indicated 'Baseline (No FS)' as the best approach for CoxPH model performance, with an identical C-index of 0.7176, suggesting that for this dataset, comprehensive feature sets might be more effective for survival analysis than reduced sets from these methods.

### Insights or Next Steps

*   The consistent identification of 'Baseline (No FS)' as the top performer across both Gradient Boosting and Neural Network initial optimizations, with an identical C-index, suggests that the full feature set (excluding the duration variable) is robust for survival analysis in this context. This implies that the evaluated feature selection methods did not significantly improve the CoxPH model's C-index compared to using all available features.
*   The next logical step is to combine these results with those from KNN-based feature selection to provide a final comprehensive summary and comparison of the top-performing feature selection methods across all evaluation models (KNN, Gradient Boosting, and Neural Network) based on C-index, AIC, and the custom CPH metric.


# Task
To examine the features currently selected by the `bWWPA` method, we will retrieve the `Selected Indices` for `bWWPA` from the `results_df_knn`, `results_df_gb`, and `results_df_nn` DataFrames, and then map these indices back to the original feature names. This will provide insights into which features `bWWPA` considered important when optimized for different classification models.

### Subtask:
Display the features selected by the `bWWPA` method from `results_df_knn`, `results_df_gb`, and `results_df_nn` DataFrames, along with their associated accuracy and feature count.

## Examine Current bWWPA Selected Features

### Subtask:
Retrieve and display the features currently selected by the `bWWPA` method from the previous analysis. This will help us understand its current feature selection strategy before modification.


**Reasoning**:
I need to extract and display the features selected by the `bWWPA` method from `results_df_knn`, `results_df_gb`, and `results_df_nn` DataFrames, along with their respective accuracies and feature counts, as requested by the subtask instructions.



In [55]:
original_feature_names = df.drop(columns=[TARGET_NAME]).columns.tolist()

# Create a list of the result DataFrames and their corresponding accuracy column names
results_dfs = [results_df_knn, results_df_gb, results_df_nn]
accuracy_cols = ['Accuracy_KNN', 'Accuracy_GB', 'Accuracy_NN']
model_names = ['KNN', 'Gradient Boosting', 'Neural Network']

print("Current bWWPA Selected Features across evaluation models:")
print("=" * 60)

for i, results_df in enumerate(results_dfs):
    model_name = model_names[i]
    accuracy_col = accuracy_cols[i]

    # Filter the DataFrame to find the row where the 'Method' is 'bWWPA'
    bwwpa_row = results_df[results_df['Method'] == 'bWWPA'].iloc[0]

    method_name = bwwpa_row['Method']
    accuracy = bwwpa_row[accuracy_col]
    feature_count = bwwpa_row['Feature Count']
    selected_indices = bwwpa_row['Selected Indices']

    # Map the 'Selected Indices' back to the original feature names
    selected_feature_names = [original_feature_names[idx] for idx in selected_indices]

    print(f"Model: {model_name}")
    print(f"  Method: {method_name}")
    print(f"  Accuracy ({model_name}): {accuracy:.4f}")
    print(f"  Feature Count: {feature_count}")
    print(f"  Selected Feature Names: {', '.join(selected_feature_names)}")
    print("-" * 60)

Current bWWPA Selected Features across evaluation models:
Model: KNN
  Method: bWWPA
  Accuracy (KNN): 0.7879
  Feature Count: 5
  Selected Feature Names: years_after_diagnosis, stage level, chemotherapy, brachtherapy, MENO_post
------------------------------------------------------------
Model: Gradient Boosting
  Method: bWWPA
  Accuracy (Gradient Boosting): 0.6970
  Feature Count: 5
  Selected Feature Names: years_after_diagnosis, stage level, chemotherapy, brachtherapy, MENO_post
------------------------------------------------------------
Model: Neural Network
  Method: bWWPA
  Accuracy (Neural Network): 0.8485
  Feature Count: 5
  Selected Feature Names: years_after_diagnosis, stage level, chemotherapy, brachtherapy, MENO_post
------------------------------------------------------------


## Redefine Fitness Function for Survival Analysis

### Subtask:
Create a new fitness function, `calculate_survival_fitness`, tailored for survival analysis. This function will take a feature subset, fit a CoxPH model using `fit_coxph_model`, and return a fitness score based on the C-index (e.g., directly use C-index or C-index penalized by feature count). This will guide `bWWPA` to select features relevant for survival prediction.


**Reasoning**:
I need to define the `calculate_survival_fitness` function as instructed, which will take a feature subset, use the previously defined `fit_coxph_model` for evaluation, and return a fitness score based on the C-index, potentially penalized by the number of features.



In [56]:
def calculate_survival_fitness(solution, X_processed_df, duration, event_observed):
    """
    Calculates a fitness score for a feature subset based on CoxPH C-index and feature count.

    Args:
        solution (np.ndarray): Binary array representing selected features (1 if selected, 0 otherwise).
        X_processed_df (pd.DataFrame): DataFrame containing all original features.
        duration (pd.Series): Series containing event durations.
        event_observed (pd.Series): Series containing event indicators (1 for event, 0 for censored).

    Returns:
        tuple: A tuple containing the calculated fitness score and the C-index.
    """
    selected_indices = np.where(solution == 1)[0]
    total_features = X_processed_df.shape[1]

    if len(selected_indices) == 0:
        return 0.0, 0.0 # No features selected, return minimal fitness and C-index

    # Get actual feature names from the original DataFrame's columns
    # original_feature_names is assumed to be globally available from previous steps
    global original_feature_names
    current_selected_feature_names = [original_feature_names[i] for i in selected_indices]

    # 'years_after_diagnosis' is the duration column and should not be a feature in CoxPH
    if 'years_after_diagnosis' in current_selected_feature_names:
        current_selected_feature_names.remove('years_after_diagnosis')

    if not current_selected_feature_names:
        return 0.0, 0.0 # No valid features for CoxPH after removal

    X_data_for_cph = X_processed_df[current_selected_feature_names]

    c_index, _ = fit_coxph_model(X_data_for_cph, duration, event_observed)

    # If C-index is very low (e.g., due to poor feature selection or errors in fit_coxph_model),
    # penalize further. A C-index of 0.5 is equivalent to random chance.
    if c_index < 0.5:
        c_index = 0.5 # Treat anything below random as random

    num_features_used = len(current_selected_feature_names)

    # Define fitness score as a weighted combination of C-index and feature count
    # Alpha controls the balance between C-index (accuracy) and feature reduction (simplicity)
    alpha = 0.9 # Weight for C-index

    # Ensure we don't divide by zero if total_features happens to be 0 (unlikely for X_processed_df)
    # And also ensure that num_features_used is not zero in the denominator for the penalty part
    if total_features > 0 and num_features_used > 0:
        # Penalize for more features. (1 - (num_features_used / total_features)) gives higher value for fewer features.
        fitness_score = alpha * c_index + (1 - alpha) * (1 - (num_features_used / total_features))
    elif num_features_used == 0:
        fitness_score = 0.0 # No features, no fitness
    else:
        fitness_score = alpha * c_index # Fallback if total_features is problematic

    return fitness_score, c_index

print("calculate_survival_fitness function defined successfully.")

calculate_survival_fitness function defined successfully.


## Create an Adapted bWWPA Function

### Subtask:
Develop a modified version of the `bwwpa_feature_selection` function, named `bwwpa_survival_feature_selection`. This new function will incorporate the `calculate_survival_fitness` function for its optimization process, ensuring that feature selection is driven by survival metrics.


**Reasoning**:
I need to develop the `bwwpa_survival_feature_selection` function, which adapts the existing `bwwpa_feature_selection` to use the `calculate_survival_fitness` function for survival analysis. This involves updating function arguments, replacing `calculate_fitness` calls, and modifying return values to reflect C-index and feature count.



In [57]:
def bwwpa_survival_feature_selection(X_processed_df, duration, event_observed, max_iter=30):
    """
    Performs feature selection using a modified bWWPA algorithm optimized for survival analysis.
    Uses C-index as the primary metric for fitness evaluation via calculate_survival_fitness.

    Args:
        X_processed_df (pd.DataFrame): DataFrame containing all original features.
        duration (pd.Series): Series containing event durations.
        event_observed (pd.Series): Series containing event indicators (1 for event, 0 for censored).
        max_iter (int): Maximum number of iterations for the optimization algorithm.

    Returns:
        tuple: A tuple containing:
            - float: Best C-index achieved during feature selection.
            - int: The number of features selected.
            - list: A list of indices of the selected features.
    """
    n_plants = 14
    n_features = X_processed_df.shape[1] # Total number of features in the processed DataFrame
    positions = np.random.randint(0, 2, size=(n_plants, n_features))
    best_fitness = -1.0
    best_c_index = 0.0
    best_position = np.zeros(n_features)
    K = 1.0

    # Initialize best fitness and position
    for i in range(n_plants):
        current_fitness, current_c_index_temp = calculate_survival_fitness(positions[i], X_processed_df, duration, event_observed)
        if current_fitness > best_fitness:
            best_fitness = current_fitness
            best_position = positions[i].copy()
            best_c_index = current_c_index_temp

    # Main bWWPA loop
    for t in range(1, max_iter + 1):
        for i in range(n_plants):
            r, r1, r2, r3 = np.random.rand(4)
            current_position = positions[i].copy()

            # Update position logic (simplified for bWWPA)
            if r < 0.5:
                W_vec = r1 * (current_position + 2 * K)
                new_position = current_position + W_vec * (2 * K + r2)
            else:
                W_vec = r3 * (K * best_position + r3 * current_position)
                new_position = current_position + K * W_vec

            # Convert continuous position to binary
            sigmoid_val = 1 / (1 + np.exp(-new_position))
            positions[i] = (sigmoid_val > np.random.rand(n_features)).astype(int)

            # Evaluate new position
            current_fitness, current_c_index = calculate_survival_fitness(positions[i], X_processed_df, duration, event_observed)

            # Update best if current solution is better
            if current_fitness > best_fitness:
                best_fitness = current_fitness
                best_position = positions[i].copy()
                best_c_index = current_c_index
        K = 1 + 2 * t**2 / (max_iter)**3 + 0.5

    # Final evaluation of the best position found
    _, final_c_index = calculate_survival_fitness(best_position, X_processed_df, duration, event_observed)
    selected_indices = np.where(best_position == 1)[0]

    return final_c_index, len(selected_indices), selected_indices.tolist()

print("bwwpa_survival_feature_selection function defined successfully.")

bwwpa_survival_feature_selection function defined successfully.


## Run Adapted bWWPA and Evaluate

### Subtask:
Execute the `bwwpa_survival_feature_selection` function using the scaled training and testing data. Then, evaluate the features selected by this adapted method using the `fit_coxph_model` to calculate its C-index and AIC, and store these results.


**Reasoning**:
I need to execute the `bwwpa_survival_feature_selection` function, then process its output to evaluate the selected features using `fit_coxph_model`, and finally store the relevant C-index, AIC, and feature count. This follows the first instruction of the subtask.



In [58]:
print("Running bWWPA Survival Feature Selection...")
bwwpa_c_index, bwwpa_feature_count, bwwpa_selected_indices = bwwpa_survival_feature_selection(X_processed_df, duration, event_observed)

# Map selected indices to feature names and remove 'years_after_diagnosis'
current_selected_feature_names_bwwpa_survival = [original_feature_names[i] for i in bwwpa_selected_indices]
if 'years_after_diagnosis' in current_selected_feature_names_bwwpa_survival:
    current_selected_feature_names_bwwpa_survival.remove('years_after_diagnosis')

# Prepare DataFrame for CoxPH evaluation
X_data_for_cph_bwwpa_survival = X_processed_df[current_selected_feature_names_bwwpa_survival]

# Evaluate the final selected features with CoxPH
final_c_index_bwwpa_survival, final_aic_bwwpa_survival = fit_coxph_model(X_data_for_cph_bwwpa_survival, duration, event_observed)

# Adjust feature count for what was actually used in CoxPH
actual_feature_count_for_cph_bwwpa_survival = len(current_selected_feature_names_bwwpa_survival)

# Store the results
bwwpa_survival_result = {
    "Method": "bWWPA_Survival",
    "C-index": final_c_index_bwwpa_survival,
    "AIC": final_aic_bwwpa_survival,
    "Feature Count (CoxPH)": actual_feature_count_for_cph_bwwpa_survival,
    "Selected Indices": bwwpa_selected_indices # Original indices before 'years_after_diagnosis' removal
}

print("bWWPA Survival Feature Selection and Evaluation Complete.")
print(f"  Method: {bwwpa_survival_result['Method']}")
print(f"  C-index: {bwwpa_survival_result['C-index']:.4f}")
print(f"  AIC: {bwwpa_survival_result['AIC']:.4f}")
print(f"  Features Used in CoxPH: {bwwpa_survival_result['Feature Count (CoxPH)']}")
print(f"  Selected Feature Names (for CoxPH): {', '.join(current_selected_feature_names_bwwpa_survival)}")

Running bWWPA Survival Feature Selection...
bWWPA Survival Feature Selection and Evaluation Complete.
  Method: bWWPA_Survival
  C-index: 0.7010
  AIC: 597.6295
  Features Used in CoxPH: 6
  Selected Feature Names (for CoxPH): age_at_diagnosis, chemotherapy, brachtherapy, radiation, MENO_post, CM_3


## Compare and Summarize Performance

### Subtask:
Combine the CPH analysis results from KNN, Gradient Boosting, and Neural Network into a single DataFrame. Provide a final comprehensive summary and comparison of the top-performing feature selection methods across all evaluation models based on C-index, AIC, and CPH.


**Reasoning**:
First, I need to convert the `bwwpa_survival_result` dictionary into a pandas DataFrame and add an 'Optimized_For' column indicating its specific optimization for CoxPH. This prepares the first set of results for the comprehensive comparison.



In [59]:
import pandas as pd

# 1. Convert bwwpa_survival_result dictionary into a DataFrame
bwwpa_survival_result_df = pd.DataFrame([bwwpa_survival_result])
bwwpa_survival_result_df['Optimized_For'] = 'CoxPH-Optimized'

# Display the prepared DataFrame
print("Prepared bWWPA_Survival Result DataFrame:")
display(bwwpa_survival_result_df[['Method', 'Optimized_For', 'C-index', 'AIC', 'Feature Count (CoxPH)']])

Prepared bWWPA_Survival Result DataFrame:


Unnamed: 0,Method,Optimized_For,C-index,AIC,Feature Count (CoxPH)
0,bWWPA_Survival,CoxPH-Optimized,0.70099,597.629545,6


**Reasoning**:
Now I will filter the existing results DataFrames to extract the 'bWWPA' method's rows, add an 'Optimized_For' column for context, and ensure column consistency across these filtered DataFrames for later concatenation.



In [60]:
bwwpa_knn = cph_knn_results_df_sorted[cph_knn_results_df_sorted['Method'] == 'bWWPA'].copy()
bwwpa_knn['Optimized_For'] = 'KNN-Optimized'
bwwpa_knn = bwwpa_knn.rename(columns={'Accuracy_KNN': 'Accuracy_Original', 'Feature Count (KNN)': 'Feature Count (Original)', 'CPH (KNN)': 'CPH (Original)'})

bwwpa_gb = cph_gb_results_df_sorted[cph_gb_results_df_sorted['Method'] == 'bWWPA'].copy()
bwwpa_gb['Optimized_For'] = 'GB-Optimized'
bwwpa_gb = bwwpa_gb.rename(columns={'Accuracy_GB': 'Accuracy_Original', 'Feature Count (GB)': 'Feature Count (Original)', 'CPH (GB)': 'CPH (Original)'})

bwwpa_nn = cph_nn_results_df_sorted[cph_nn_results_df_sorted['Method'] == 'bWWPA'].copy()
bwwpa_nn['Optimized_For'] = 'NN-Optimized'
bwwpa_nn = bwwpa_nn.rename(columns={'Accuracy_NN': 'Accuracy_Original', 'Feature Count (NN)': 'Feature Count (Original)', 'CPH (NN)': 'CPH (Original)'})

print("Filtered 'bWWPA' results with 'Optimized_For' column added:")
display(bwwpa_knn[['Method', 'Optimized_For', 'C-index', 'AIC', 'Feature Count (CoxPH)']])
display(bwwpa_gb[['Method', 'Optimized_For', 'C-index', 'AIC', 'Feature Count (CoxPH)']])
display(bwwpa_nn[['Method', 'Optimized_For', 'C-index', 'AIC', 'Feature Count (CoxPH)']])

Filtered 'bWWPA' results with 'Optimized_For' column added:


Unnamed: 0,Method,Optimized_For,C-index,AIC,Feature Count (CoxPH)
1,bWWPA,KNN-Optimized,0.660767,597.797091,4


Unnamed: 0,Method,Optimized_For,C-index,AIC,Feature Count (CoxPH)
2,bWWPA,GB-Optimized,0.660767,597.797091,4


Unnamed: 0,Method,Optimized_For,C-index,AIC,Feature Count (CoxPH)
0,bWWPA,NN-Optimized,0.660767,597.797091,4


**Reasoning**:
Now I will concatenate the `bwwpa_survival_result_df` with the filtered `bwwpa_knn`, `bwwpa_gb`, and `bwwpa_nn` DataFrames, and then with the full `cph_knn_results_df_sorted`, `cph_gb_results_df_sorted`, and `cph_nn_results_df_sorted` DataFrames. I will ensure column consistency by explicitly selecting the desired columns before concatenation.



In [61]:
selected_cols = ['Method', 'Optimized_For', 'C-index', 'AIC', 'Feature Count (CoxPH)']

# Ensure 'Optimized_For' column exists for full results DFs, set to 'Original_Classifier'
# Make a copy to avoid SettingWithCopyWarning
cph_knn_full = cph_knn_results_df_sorted.copy()
cph_knn_full['Optimized_For'] = 'KNN-Optimized-Full'

cph_gb_full = cph_gb_results_df_sorted.copy()
cph_gb_full['Optimized_For'] = 'GB-Optimized-Full'

cph_nn_full = cph_nn_results_df_sorted.copy()
cph_nn_full['Optimized_For'] = 'NN-Optimized-Full'

# Combine all relevant dataframes, selecting only the columns required for the final comparison
combined_results_df = pd.concat([
    bwwpa_survival_result_df[selected_cols + ['Selected Indices']], # Include 'Selected Indices' for detailed comparison if needed later
    bwwpa_knn[selected_cols + ['Selected Indices']],
    bwwpa_gb[selected_cols + ['Selected Indices']],
    bwwpa_nn[selected_cols + ['Selected Indices']],
    cph_knn_full[selected_cols + ['Selected Indices']],
    cph_gb_full[selected_cols + ['Selected Indices']],
    cph_nn_full[selected_cols + ['Selected Indices']]
])

# Sort the combined DataFrame
combined_results_df_sorted = combined_results_df.sort_values(by=['C-index', 'AIC'], ascending=[False, True])

print("\n" + "=" * 70)
print("Comprehensive CPH Analysis Results Across All Optimization Contexts:")
print("=" * 70)
display(combined_results_df_sorted.drop(columns=['Selected Indices']))


KeyError: "['Selected Indices'] not in index"

**Reasoning**:
The previous code failed with a `KeyError: "['Selected Indices'] not in index"` because the `Selected Indices` column was missing from some of the DataFrames (`bwwpa_knn`, `bwwpa_gb`, `bwwpa_nn`, and the `cph_full` variants) being concatenated. Although the original lists of dictionaries (`cph_knn_results`, etc.) contained this information, it appears to have been lost when creating the DataFrames or in subsequent filtering steps. To fix this, I will explicitly merge the `Selected Indices` column from the original `results_df_knn`, `results_df_gb`, and `results_df_nn` into their respective `cph_results_df_sorted` DataFrames. Then, I will recreate the `cph_full` and filtered `bwwpa_fixed` DataFrames, ensuring that the `Selected Indices` column is present in all DataFrames before concatenation.



In [62]:
import pandas as pd

# Ensure 'Selected Indices' is present in the `cph_results_df_sorted` dataframes
# by merging with the original `results_df_` dataframes.

# 1. Merge 'Selected Indices' into cph_knn_results_df_sorted
cph_knn_results_df_sorted_with_indices = cph_knn_results_df_sorted.merge(
    results_df_knn[['Method', 'Selected Indices']],
    on='Method',
    how='left'
)

# 2. Merge 'Selected Indices' into cph_gb_results_df_sorted
cph_gb_results_df_sorted_with_indices = cph_gb_results_df_sorted.merge(
    results_df_gb[['Method', 'Selected Indices']],
    on='Method',
    how='left'
)

# 3. Merge 'Selected Indices' into cph_nn_results_df_sorted
cph_nn_results_df_sorted_with_indices = cph_nn_results_df_sorted.merge(
    results_df_nn[['Method', 'Selected Indices']],
    on='Method',
    how='left'
)

# Now, re-create the _full and bwwpa_ dataframes with the corrected dataframes
# Define all columns that should be present for concatenation, including 'Selected Indices'
selected_cols_for_final = ['Method', 'Optimized_For', 'C-index', 'AIC', 'Feature Count (CoxPH)', 'Selected Indices']

# Create cph_full dataframes ensuring 'Selected Indices' is present and selecting only relevant columns
cph_knn_full = cph_knn_results_df_sorted_with_indices.copy()
cph_knn_full['Optimized_For'] = 'KNN-Optimized-Full'
cph_knn_full = cph_knn_full[selected_cols_for_final]

cph_gb_full = cph_gb_results_df_sorted_with_indices.copy()
cph_gb_full['Optimized_For'] = 'GB-Optimized-Full'
cph_gb_full = cph_gb_full[selected_cols_for_final]

cph_nn_full = cph_nn_results_df_sorted_with_indices.copy()
cph_nn_full['Optimized_For'] = 'NN-Optimized-Full'
cph_nn_full = cph_nn_full[selected_cols_for_final]

# Filter for bWWPA results from the *corrected* full dataframes
bwwpa_knn_fixed = cph_knn_full[cph_knn_full['Method'] == 'bWWPA'].copy()
bwwpa_gb_fixed = cph_gb_full[cph_gb_full['Method'] == 'bWWPA'].copy()
bwwpa_nn_fixed = cph_nn_full[cph_nn_full['Method'] == 'bWWPA'].copy()

# The bwwpa_survival_result_df already has 'Selected Indices' and 'Optimized_For'
bwwpa_survival_result_df_prepared = bwwpa_survival_result_df[selected_cols_for_final].copy()

# Concatenate all relevant dataframes
combined_results_df = pd.concat([
    bwwpa_survival_result_df_prepared,
    bwwpa_knn_fixed,
    bwwpa_gb_fixed,
    bwwpa_nn_fixed,
    cph_knn_full,
    cph_gb_full,
    cph_nn_full
], ignore_index=True)

# Sort the combined DataFrame
combined_results_df_sorted = combined_results_df.sort_values(by=['C-index', 'AIC'], ascending=[False, True])

print("\n" + "=" * 70)
print("Comprehensive CPH Analysis Results Across All Optimization Contexts:")
print("=" * 70)
display(combined_results_df_sorted.drop(columns=['Selected Indices']))



Comprehensive CPH Analysis Results Across All Optimization Contexts:


Unnamed: 0,Method,Optimized_For,C-index,AIC,Feature Count (CoxPH)
4,Baseline (No FS),KNN-Optimized-Full,0.717574,606.911619,13
11,Baseline (No FS),GB-Optimized-Full,0.717574,606.911619,13
18,Baseline (No FS),NN-Optimized-Full,0.717574,606.911619,13
5,Random Forest Importance,KNN-Optimized-Full,0.708416,596.936224,6
12,Random Forest Importance,GB-Optimized-Full,0.708416,596.936224,6
19,Random Forest Importance,NN-Optimized-Full,0.708416,596.936224,6
0,bWWPA_Survival,CoxPH-Optimized,0.70099,597.629545,6
6,SelectKBest,KNN-Optimized-Full,0.673391,598.854137,6
13,SelectKBest,GB-Optimized-Full,0.673391,598.854137,6
20,SelectKBest,NN-Optimized-Full,0.673391,598.854137,6


## Comprehensive Summary and Comparison of Feature Selection Methods for Survival Analysis

This analysis compared various feature selection methods, both when optimized for traditional classification tasks (KNN, Gradient Boosting, Neural Networks) and when directly optimized for survival analysis metrics using a modified bWWPA algorithm.

### Key Findings from Comprehensive Results:

1.  **Overall Best Performer (C-index): 'Baseline (No FS)'**
    *   Across all initial optimization contexts (KNN, Gradient Boosting, Neural Network), the 'Baseline (No FS)' method consistently achieved the highest C-index of **0.7176** with 13 features. This suggests that for this dataset, utilizing the full set of available features (excluding the duration variable) provides the most robust prognostic model for survival prediction when evaluated by a CoxPH model.

2.  **'Random Forest Importance' as a Strong Reduced-Feature Contender**
    *   This method consistently ranked as the second-best performer across all initial optimization contexts, achieving a C-index of **0.7084** with only 6 features. This indicates that Random Forest Importance is effective at identifying a compact, highly predictive subset of features for survival analysis, providing a good balance between model performance and complexity.

3.  **Performance of the Newly Adapted 'bWWPA_Survival' Method**
    *   The `bwwpa_survival_feature_selection` method, which was directly optimized using `calculate_survival_fitness` (based on C-index and feature penalty), achieved a C-index of **0.7010** with 6 features. This places it third overall in terms of C-index, just below 'Random Forest Importance'.
    *   It also achieved a relatively low AIC of 597.63, indicating a good model fit for the number of features used.

4.  **Comparison of 'bWWPA_Survival' vs. Original 'bWWPA'**:
    *   The original `bWWPA` method, when optimized for KNN, GB, or NN accuracies, yielded a C-index of **0.6608** with 4 features for the CoxPH model.
    *   The `bWWPA_Survival` method, through direct optimization for survival metrics, significantly improved its C-index from 0.6608 to **0.7010**. It also selected a slightly larger, but still compact, set of 6 features, demonstrating that tuning the feature selection algorithm specifically for survival outcomes can lead to better performance in that domain.

5.  **Other Methods**:
    *   'SelectKBest' and 'Genetic Algorithm' showed moderate C-indices (around 0.66 - 0.67), while 'RFE' and 'Lasso (L1)' generally performed lower in terms of C-index for survival prediction.

### Discussion: Did Direct Optimization Improve `bWWPA`'s Performance in Survival Prediction?

Yes, the direct optimization for survival metrics significantly improved `bWWPA`'s performance in survival prediction. The `bWWPA_Survival` method achieved a C-index of 0.7010, which is a notable increase compared to the 0.6608 C-index achieved by the original `bWWPA` when its feature selection was driven by classification accuracies (KNN, GB, NN). This improvement highlights the importance of aligning the feature selection objective function with the ultimate evaluation metric (in this case, survival C-index). By directly optimizing for survival outcomes, `bWWPA_Survival` was able to identify a feature subset that is more prognostically relevant.

While 'Baseline (No FS)' still achieved the highest C-index, 'bWWPA_Survival' offered a competitive performance with a reduced set of 6 features, demonstrating its utility as an effective feature selection method for survival analysis when feature parsimony is desired.

## Final Task

### Subtask:
Summarize the findings regarding the improved bWWPA method and its comparison to other feature selection techniques for survival analysis.


## Summary:

### Q&A
**Did direct optimization improve `bWWPA`'s performance in survival prediction?**
Yes, the direct optimization for survival metrics significantly improved `bWWPA`'s performance in survival prediction. The `bWWPA_Survival` method achieved a C-index of 0.7010, which is a notable increase compared to the 0.6608 C-index achieved by the original `bWWPA` when its feature selection was driven by classification accuracies (KNN, GB, NN). This improvement highlights the importance of aligning the feature selection objective function with the ultimate evaluation metric.

### Data Analysis Key Findings
*   The original `bWWPA` method, when optimized for traditional classification models (KNN, Gradient Boosting, Neural Network), consistently selected the same 5 features: `years_after_diagnosis`, `stage level`, `chemotherapy`, `brachtherapy`, and `MENO_post`. Its performance for survival prediction resulted in a C-index of 0.6608 with 4 features.
*   A new fitness function, `calculate_survival_fitness`, was successfully developed, using the C-index and a penalty for the number of features to guide feature selection for survival analysis.
*   An adapted feature selection function, `bwwpa_survival_feature_selection`, was created to integrate this new survival-specific fitness function.
*   The `bWWPA_Survival` method, directly optimized for survival metrics, achieved a C-index of 0.7010 and an AIC of 597.6295, utilizing 6 features: `age_at_diagnosis`, `chemotherapy`, `brachtherapy`, `radiation`, `MENO_post`, and `CM_3`.
*   Comparing all evaluated feature selection methods for CoxPH models:
    *   **Baseline (No FS)** achieved the highest C-index of 0.7176 using 13 features.
    *   **Random Forest Importance** was the next best, achieving a C-index of 0.7084 with a reduced set of 6 features.
    *   **`bWWPA_Survival`** ranked third with a C-index of 0.7010, also using 6 features.
*   The C-index of `bWWPA_Survival` (0.7010) demonstrated a significant improvement over the C-index of the original `bWWPA` (0.6608) when the latter was optimized based on classification model accuracies.

### Insights or Next Steps
*   Aligning feature selection objectives with the final evaluation metric (e.g., C-index for survival analysis) is crucial for achieving optimal performance in specific domains. The `bWWPA_Survival` method's improved C-index compared to the classification-optimized `bWWPA` underscores this principle.
*   While `Baseline (No FS)` provided the highest C-index, `Random Forest Importance` and `bWWPA_Survival` offer compelling alternatives for survival prediction when feature parsimony is desired, providing a strong C-index with a significantly reduced feature set.
