In [1]:
import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import xgboost as xgb
from lifelines.utils import concordance_index
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.feature_selection import RFE, SelectKBest, f_classif, SelectFromModel
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_predict
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier


In [3]:
# Location of the pyradiomics training table.
# The original absolute path is kept as the default so existing setups keep
# working; set the RADIOMICS_TRAIN_CSV environment variable to point the
# notebook at a copy of the file on another machine.
DEFAULT_DATA_PATH = "G:\\HTW Project\\Data\\pyradiomics_train_data-copy.csv"
file_path = os.environ.get("RADIOMICS_TRAIN_CSV", DEFAULT_DATA_PATH)

# Fail early with an actionable message instead of a bare pandas error.
if not os.path.isfile(file_path):
    raise FileNotFoundError(
        f"Training CSV not found at {file_path!r}; "
        "set RADIOMICS_TRAIN_CSV to the correct location."
    )

# Read the CSV file into a pandas DataFrame
df = pd.read_csv(file_path)

In [5]:
# Keep only the rows whose 'day' value is 0, then preview the first few.
df_0 = df.loc[df['day'].eq(0)]
df_0.head()

Unnamed: 0,diagnosis,well,day,original_firstorder_10percentile,original_firstorder_90percentile,original_firstorder_energy,original_firstorder_entropy,original_firstorder_interquartilerange,original_firstorder_kurtosis,original_firstorder_maximum,...,original_glszm_smallareahighgraylevelemphasis,original_glszm_smallarealowgraylevelemphasis,original_glszm_zoneentropy,original_glszm_zonepercentage,original_glszm_zonevariance,original_ngtdm_busyness,original_ngtdm_coarseness,original_ngtdm_complexity,original_ngtdm_contrast,original_ngtdm_strength
0,relapsed,FaDu_plate101_07B,0,159.0,229.0,53813244585,2.245177,39.0,11.038256,255,...,42.747166,0.010156,4.898068,0.22314,132.79258,595.133942,1.3e-05,15.512103,0.008326,0.001329
7,relapsed,FaDu_plate101_07C,0,145.0,217.0,48016057345,2.108233,39.0,8.179105,255,...,36.146975,0.011614,5.247692,0.163092,347.542727,423.378563,1.8e-05,9.992886,0.005386,0.001589
14,relapsed,FaDu_plate101_07D,0,151.0,233.0,53114609930,2.369748,46.0,6.762271,255,...,38.900879,0.010733,5.267214,0.179737,347.544814,434.709465,1.8e-05,11.690268,0.007238,0.00165
21,relapsed,FaDu_plate101_07E,0,140.0,230.0,49106323539,2.527393,51.0,5.27577,255,...,36.862399,0.011521,5.452243,0.1696,340.331236,412.753919,2.2e-05,10.531287,0.007675,0.001625
28,relapsed,FaDu_plate101_07F,0,148.0,231.0,51351479102,2.414003,48.0,6.690151,255,...,37.893907,0.011388,5.320579,0.184825,271.803325,452.488207,1.8e-05,11.823394,0.007706,0.001553


In [11]:
# Define the features (radiomics) and target (diagnosis).
# 'diagnosis' is the target; 'day' and 'well' are excluded from the feature matrix.
non_feature_columns = ['diagnosis', 'day', 'well']
X = df_0.drop(columns=non_feature_columns)

# A CSV that was saved together with its index is read back with an
# 'Unnamed: N' column. Such a column is a row counter, not a radiomics
# feature, so drop it defensively if it is present.
X = X.loc[:, ~X.columns.astype(str).str.startswith('Unnamed')]

# Encode the target variable (relapsed -> 1, controlled -> 0).
label_codes = {'relapsed': 1, 'controlled': 0}
y = df_0['diagnosis'].map(label_codes)

# Series.map turns every label missing from the mapping into NaN; surface that
# here rather than as an obscure failure inside scikit-learn later on.
unmapped_labels = df_0.loc[y.isna(), 'diagnosis'].unique()
if len(unmapped_labels) > 0:
    raise ValueError(f"Unexpected or missing 'diagnosis' values: {list(unmapped_labels)}")
y = y.astype('int64')


In [13]:
# Seed shared by every stochastic estimator and by the CV splitter so that a
# "Restart & Run All" reproduces the same results table.
SEED = 42

# Feature-selection / dimensionality-reduction steps under comparison.
# Every entry is a transformer, so all of them plug into the same pipeline slot.
feature_selectors = {
    'RFE': RFE(estimator=RandomForestClassifier(random_state=SEED), n_features_to_select=10),
    'SelectKBest': SelectKBest(f_classif, k=10),  # Select 10 best features with ANOVA F-test
    # Keep the features whose random-forest importance clears SelectFromModel's
    # default threshold. A bare classifier here would not select anything.
    'TreeBased': SelectFromModel(RandomForestClassifier(random_state=SEED)),
    'PCA': PCA(n_components=10, random_state=SEED),  # Project onto the top 10 principal components
    'Lasso': SelectFromModel(Lasso(alpha=0.01)),  # Lasso for feature selection
    # NOTE: despite the label this is L1-penalised logistic regression, not a
    # Cox model; the key is kept so result tables stay comparable.
    'Cox-Lasso': SelectFromModel(
        LogisticRegression(penalty='l1', solver='saga', max_iter=1000, random_state=SEED)
    ),
}

# Define machine learning models
models = {
    'Random Forest': RandomForestClassifier(random_state=SEED),
    'SVM': SVC(probability=True, random_state=SEED),
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'k-NN': KNeighborsClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(random_state=SEED),
    'AdaBoost': AdaBoostClassifier(random_state=SEED),
    'Decision Tree': DecisionTreeClassifier(random_state=SEED),
    # `use_label_encoder` is deprecated in recent XGBoost releases, so it is no longer passed.
    'XGBoost': xgb.XGBClassifier(eval_metric='logloss', random_state=SEED),
}

# Create a list to store the results
results = []

# Cross-validation setup: stratified so every fold keeps the class balance of y.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)


def calculate_c_index(y_true, y_pred):
    """Return the concordance index between true labels and predicted scores.

    Thin wrapper around ``lifelines.utils.concordance_index``.

    Parameters
    ----------
    y_true : array-like
        Observed labels.
    y_pred : array-like
        Predicted scores, one per observation.

    Returns
    -------
    float
        The value returned by ``concordance_index(y_true, y_pred)``.
    """
    return concordance_index(y_true, y_pred)


# Loop over feature selectors and models
for fs_name, selector in feature_selectors.items():
    for model_name, model in models.items():
        # Scaling and selection live inside the pipeline so they are re-fitted
        # on the training part of every fold (no leakage into held-out data).
        pipeline = Pipeline(steps=[
            ('scaler', StandardScaler()),
            ('feature_selection', selector),
            ('model', model),
        ])

        # One cross-validated pass only: hard labels are derived from the same
        # out-of-fold probabilities instead of re-fitting every model a second
        # time with method='predict'.
        y_pred_prob = cross_val_predict(pipeline, X, y, cv=kf, method='predict_proba')[:, 1]
        y_pred = (y_pred_prob > 0.5).astype(int)

        # Store the metrics; zero_division=0 makes the "class never predicted"
        # case an explicit 0 rather than a warning.
        results.append({
            'Feature Selection': fs_name,
            'Model': model_name,
            'Accuracy': accuracy_score(y, y_pred),
            'Precision': precision_score(y, y_pred, zero_division=0),
            'Recall': recall_score(y, y_pred, zero_division=0),
            'F1-Score': f1_score(y, y_pred, zero_division=0),
            'AUC': roc_auc_score(y, y_pred_prob),
            'C-Index': calculate_c_index(y, y_pred_prob),
        })

# Convert results to DataFrame for better readability
results_df = pd.DataFrame(results)

# Sort results by Accuracy (or any other metric) for comparison
results_df = results_df.sort_values(by='Accuracy', ascending=False)


KeyboardInterrupt



In [None]:
def plot_metric_heatmap(metric, cmap):
    """Draw a feature-selection x model heatmap for one metric of ``results_df``.

    Parameters
    ----------
    metric : str
        Name of the ``results_df`` column to display (e.g. "Accuracy").
    cmap : str
        Matplotlib/seaborn colormap name.

    Returns
    -------
    pandas.DataFrame
        The pivoted table (rows: feature selection, columns: model).
    """
    # Keyword arguments are required: pandas >= 2.0 rejects positional
    # index/columns/values in DataFrame.pivot.
    pivoted = results_df.pivot(index="Feature Selection", columns="Model", values=metric)

    fig, ax = plt.subplots(figsize=(12, 8))
    sns.heatmap(pivoted, annot=True, cmap=cmap, fmt=".3f", ax=ax)
    ax.set_title(f"Heatmap of {metric} for Feature Selection and ML Model Combinations")
    plt.show()
    return pivoted


# Visualization: Heatmap of feature selection methods and model performance (using Accuracy)
pivot_table = plot_metric_heatmap("Accuracy", "YlGnBu")

# Visualization: Heatmap for C-index
pivot_table_cindex = plot_metric_heatmap("C-Index", "YlOrRd")

# Display the results. `ace_tools` only exists in a hosted sandbox and is not
# importable from a regular kernel, so rely on the notebook's rich display of
# the cell's last expression instead.
results_df