<h1 style="color:red; text-align:center; font-size:64px;">Data Equalization</h1> 

<p>This part duplicates the malignant data. Initially, there were 437 benign images and 210 malignant images.</p>
<p>We need to double the malignant data to balance the dataset.</p>
<p>Adding a copy of every malignant image, reflected about the y axis, solves the problem.</p>

### import libraries

In [None]:
import cv2
import os

#### reflection method

In [None]:
def flip_image_y(image_path):
    """Load an image and mirror it about the vertical (y) axis.

    Args:
        image_path: Path of the image file, read with ``cv2.imread``.

    Returns:
        The mirrored image array returned by ``cv2.flip(image, 1)``.

    Raises:
        ValueError: If ``cv2.imread`` could not read the file.
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread reports failure by returning None instead of raising;
        # stop here with the offending path rather than inside cv2.flip.
        raise ValueError(f"Could not read image: {image_path}")
    # flipCode=1 flips around the y axis (left-right mirror).
    return cv2.flip(image, 1)

#### save ultrasound images method

In [None]:
def save_flipped_images(folder_path, output_folder, prefix, offset=210):
    """Write a y-axis-mirrored copy of every ``<prefix><index>.png`` image.

    A file is processed when its name ends with ``.png``, contains ``prefix``,
    does not contain ``_mask`` and has a purely numeric index between the
    prefix and the extension. The copy is saved in ``output_folder`` as
    ``<prefix><index + offset>.png``.

    Args:
        folder_path: Folder that is scanned for source images.
        output_folder: Folder that receives the mirrored copies.
        prefix: Name prefix that selects the images (e.g. ``"malignant"``).
        offset: Value added to the original index to name the copy. Files
            whose index is already greater than ``offset`` are skipped.

    Returns:
        None.
    """
    for file in os.listdir(folder_path):
        if not file.endswith(".png") or prefix not in file or "_mask" in file:
            continue

        # Validate the name before touching the image so that files which
        # will be skipped are never read and flipped for nothing.
        index = file.split(prefix)[1].split(".")[0]
        if not index.isdigit():
            continue
        index = int(index)

        # Indices above the offset belong to copies produced by an earlier
        # run (input and output may be the same folder); flipping them again
        # would only create mirrored-back duplicates of the originals.
        if index > offset:
            continue

        flipped_image = flip_image_y(os.path.join(folder_path, file))
        new_file_name = prefix + str(index + offset) + ".png"
        cv2.imwrite(os.path.join(output_folder, new_file_name), flipped_image)

#### save marked images method

In [None]:
def save_flipped_masks(folder_path, output_folder, prefix, offset=210):
    """Write a y-axis-mirrored copy of every ``<prefix><index>_mask.png`` mask.

    A file is processed when its name ends with ``_mask.png``, contains
    ``prefix`` and has a purely numeric index between the prefix and the
    ``_mask.png`` suffix. The copy is saved in ``output_folder`` as
    ``<prefix><index + offset>_mask.png``.

    Args:
        folder_path: Folder that is scanned for mask files.
        output_folder: Folder that receives the mirrored masks.
        prefix: Name prefix that selects the masks (e.g. ``"malignant"``).
        offset: Value added to the original index to name the copy. Files
            whose index is already greater than ``offset`` are skipped.

    Returns:
        None.
    """
    mask_suffix = "_mask.png"

    for file in os.listdir(folder_path):
        # Only files that really are masks: ultrasound images and secondary
        # masks such as "*_mask_1.png" are neither read nor written.
        if not file.endswith(mask_suffix) or prefix not in file:
            continue

        index = file.split(prefix)[1][: -len(mask_suffix)]
        if not index.isdigit():
            continue
        index = int(index)

        # Indices above the offset belong to copies produced by an earlier
        # run (input and output may be the same folder); skip them.
        if index > offset:
            continue

        flipped_image = flip_image_y(os.path.join(folder_path, file))
        new_file_name = prefix + str(index + offset) + mask_suffix
        cv2.imwrite(os.path.join(output_folder, new_file_name), flipped_image)

#### run

In [None]:
# The mirrored copies are written next to the originals, so the same folder
# is used as source and destination.
input_folder = output_folder = "images/"

# Mirror the malignant ultrasound images first, then their masks.
for save_flipped in (save_flipped_images, save_flipped_masks):
    save_flipped(input_folder, output_folder, "malignant")

<h1 style="color:red; text-align:center; font-size:64px;">Radiomic Feature Extraction</h1> 

<p>This part extracts radiomic features from the images.</p>
<p>In this part, we use yaml, SimpleITK and radiomics libraries.</p>

#### import libraries

In [None]:
import os
import yaml
import pandas as pd
import SimpleITK as sitk
from radiomics import featureextractor
from radiomics.featureextractor import getFeatureClasses

import warnings
warnings.filterwarnings('ignore')

#### load parameters and define variables

In [None]:
# Folder scanned for images and the list that collects one feature record
# per processed image.
image_folder = "images/"
results = []

# Extraction settings are read from the YAML parameter file and handed to
# the PyRadiomics feature extractor.
with open("Params.yaml", "r") as params_file:
    params = yaml.safe_load(params_file)

extractor = featureextractor.RadiomicsFeatureExtractor(params)

 #### loop for extraction

In [None]:
# Extract radiomic features for every ultrasound image that has a mask.
# The listing is sorted so that the row order of the resulting table does not
# depend on the order in which the filesystem returns the files.
for filename in sorted(os.listdir(image_folder)):
    if not filename.endswith(".png") or "_mask" in filename:
        continue

    try:
        image_path = os.path.join(image_folder, filename)
        image = sitk.ReadImage(image_path)

        # Reduce the image to a single 8-bit channel: take component 0 of a
        # multi-component image, otherwise just cast the pixel type.
        if image.GetNumberOfComponentsPerPixel() > 1:
            image = sitk.VectorIndexSelectionCast(image, 0, sitk.sitkUInt8)
        else:
            image = sitk.Cast(image, sitk.sitkUInt8)

        # splitext removes only the extension, so names that contain
        # additional dots still map to the right mask file.
        mask_filename = os.path.splitext(filename)[0] + "_mask.png"
        mask_path = os.path.join(image_folder, mask_filename)

        if not os.path.exists(mask_path):
            print(f"{mask_filename} adlı maske bulunamadı, işlem atlandı.")
            continue

        mask = sitk.ReadImage(mask_path)

        features = extractor.execute(image, mask)
        features["ultrasound"] = filename
        # The class label is derived from the file name.
        cancertype = "benign" if "benign" in filename else "malignant"
        features["cancertype"] = cancertype

        results.append(features)
    except Exception as e:
        # Best effort: a failing image is reported and the loop continues.
        print(f"Hata oluştu: {e}")


#### create csv

In [None]:
# Turn the collected feature records into a table with the file name as the
# first column, and store it.
results_df = pd.DataFrame(results)
results_df.insert(0, "ultrasound", results_df.pop("ultrasound"))
results_df.to_csv("radiomic_features.csv", index=False)

# Work from the stored file and separate the two classes.
radiomic_features_df = pd.read_csv("radiomic_features.csv")
is_benign = radiomic_features_df["cancertype"] == "benign"
is_malignant = radiomic_features_df["cancertype"] == "malignant"
benign_df = radiomic_features_df[is_benign]
malignant_df = radiomic_features_df[is_malignant]

# The last 50 rows of each class become the test set; all earlier rows are
# used for training.
# NOTE(review): the mirrored malignant copies are rows of the same table, so
# an image and its mirrored copy can end up on different sides of this
# split — confirm whether that is intended.
benign_test, benign_train = benign_df.tail(50), benign_df.iloc[:-50]
malignant_test, malignant_train = malignant_df.tail(50), malignant_df.iloc[:-50]
test_df = pd.concat([benign_test, malignant_test])
train_df = pd.concat([benign_train, malignant_train])

for frame, csv_name in ((test_df, "radiomic_test.csv"), (train_df, "radiomic_train.csv")):
    frame.to_csv(csv_name, index=False)

# Preview the three tables.
previews = (
    ("Radiomic Features DataFrame:", radiomic_features_df),
    ("\nTrain DataFrame:", train_df),
    ("\nTest DataFrame:", test_df),
)
for title, frame in previews:
    print(title)
    print(frame.head())

<h1 style="color:red; text-align:center; font-size:64px;">Feature Selection</h1> 

<p>This part calculates feature importance scores for each model and each feature.</p>
<p>Feature importance score and correlation values help us to choose features that we use.</p>

### import libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.decomposition import PCA
import openpyxl
import xlsxwriter
from openpyxl.formatting.rule import CellIsRule
from openpyxl.styles import PatternFill, Font
from openpyxl import load_workbook


#### define features

In [None]:
results_df = pd.read_csv("radiomic_features.csv")

# Columns used as model input, grouped by their name prefix. The groups are
# concatenated in the order below, which is the column order of X.
diagnostics_columns = [
    "diagnostics_Versions_PyRadiomics", "diagnostics_Versions_Numpy",
    "diagnostics_Versions_SimpleITK", "diagnostics_Versions_PyWavelet",
    "diagnostics_Versions_Python", "diagnostics_Configuration_Settings",
    "diagnostics_Configuration_EnabledImageTypes",
]

shape2d_columns = [
    "original_shape2D_MeshSurface", "original_shape2D_PixelSurface",
    "original_shape2D_Perimeter", "original_shape2D_PerimeterSurfaceRatio",
    "original_shape2D_Sphericity", "original_shape2D_MaximumDiameter",
    "original_shape2D_MajorAxisLength", "original_shape2D_MinorAxisLength",
    "original_shape2D_Elongation",
]

firstorder_columns = [
    "original_firstorder_Energy", "original_firstorder_TotalEnergy",
    "original_firstorder_10Percentile", "original_firstorder_90Percentile",
    "original_firstorder_Mean", "original_firstorder_Variance",
    "original_firstorder_Skewness", "original_firstorder_Kurtosis",
    "original_firstorder_Maximum", "original_firstorder_Minimum",
    "original_firstorder_Median", "original_firstorder_InterquartileRange",
    "original_firstorder_Range", "original_firstorder_MeanAbsoluteDeviation",
    "original_firstorder_Entropy", "original_firstorder_RobustMeanAbsoluteDeviation",
    "original_firstorder_RootMeanSquared", "original_firstorder_Uniformity",
]

glcm_columns = [
    "original_glcm_Autocorrelation", "original_glcm_JointAverage",
    "original_glcm_ClusterProminence", "original_glcm_ClusterShade",
    "original_glcm_ClusterTendency", "original_glcm_Contrast",
    "original_glcm_Correlation", "original_glcm_DifferenceAverage",
    "original_glcm_DifferenceEntropy", "original_glcm_DifferenceVariance",
    "original_glcm_JointEnergy", "original_glcm_JointEntropy",
    "original_glcm_Imc1", "original_glcm_Imc2", "original_glcm_Idm",
    "original_glcm_Idmn", "original_glcm_Id", "original_glcm_Idn",
    "original_glcm_InverseVariance", "original_glcm_MaximumProbability",
    "original_glcm_SumEntropy", "original_glcm_SumSquares",
]

glrlm_columns = [
    "original_glrlm_ShortRunEmphasis", "original_glrlm_LongRunEmphasis",
    "original_glrlm_GrayLevelNonUniformity",
    "original_glrlm_GrayLevelNonUniformityNormalized",
    "original_glrlm_RunLengthNonUniformity",
    "original_glrlm_RunLengthNonUniformityNormalized",
    "original_glrlm_RunPercentage", "original_glrlm_GrayLevelVariance",
    "original_glrlm_RunVariance", "original_glrlm_RunEntropy",
    "original_glrlm_LowGrayLevelRunEmphasis",
    "original_glrlm_HighGrayLevelRunEmphasis",
    "original_glrlm_ShortRunLowGrayLevelEmphasis",
    "original_glrlm_ShortRunHighGrayLevelEmphasis",
    "original_glrlm_LongRunLowGrayLevelEmphasis",
    "original_glrlm_LongRunHighGrayLevelEmphasis",
]

glszm_columns = [
    "original_glszm_SmallAreaEmphasis", "original_glszm_LargeAreaEmphasis",
    "original_glszm_GrayLevelNonUniformity", "original_glszm_SizeZoneNonUniformity",
    "original_glszm_SizeZoneNonUniformityNormalized",
    "original_glszm_ZonePercentage", "original_glszm_GrayLevelVariance",
    "original_glszm_ZoneVariance", "original_glszm_ZoneEntropy",
    "original_glszm_LowGrayLevelZoneEmphasis",
    "original_glszm_HighGrayLevelZoneEmphasis",
    "original_glszm_SmallAreaLowGrayLevelEmphasis",
    "original_glszm_SmallAreaHighGrayLevelEmphasis",
    "original_glszm_LargeAreaLowGrayLevelEmphasis",
    "original_glszm_LargeAreaHighGrayLevelEmphasis",
]

gldm_columns = [
    "original_gldm_GrayLevelNonUniformity", "original_gldm_SmallDependenceEmphasis",
    "original_gldm_LargeDependenceEmphasis",
    "original_gldm_DependenceNonUniformityNormalized",
    "original_gldm_GrayLevelVariance", "original_gldm_DependenceNonUniformity",
    "original_gldm_DependenceEntropy", "original_gldm_DependenceVariance",
    "original_gldm_LowGrayLevelEmphasis", "original_gldm_HighGrayLevelEmphasis",
    "original_gldm_SmallDependenceLowGrayLevelEmphasis",
    "original_gldm_SmallDependenceHighGrayLevelEmphasis",
    "original_gldm_LargeDependenceLowGrayLevelEmphasis",
    "original_gldm_LargeDependenceHighGrayLevelEmphasis",
]

ngtdm_columns = [
    "original_ngtdm_Coarseness", "original_ngtdm_Contrast",
    "original_ngtdm_Busyness", "original_ngtdm_Complexity",
    "original_ngtdm_Strength",
]

feature_columns = (
    diagnostics_columns
    + shape2d_columns
    + firstorder_columns
    + glcm_columns
    + glrlm_columns
    + glszm_columns
    + gldm_columns
    + ngtdm_columns
)

# Feature table and target labels.
X = results_df[feature_columns]

y = results_df["cancertype"]

#### calculate and save feature importance scores

In [None]:
import os

# Encode the class labels as integers.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Numeric columns are standardised, text columns are one-hot encoded.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), X.select_dtypes(include=['float64', 'int64']).columns),
        ("cat", OneHotEncoder(handle_unknown='ignore'), X.select_dtypes(include=['object']).columns)
    ])

classifiers = {
    "Random Forest": RandomForestClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "XGBoost": XGBClassifier(),
    "SVM": SVC(kernel='linear'),
    "AdaBoost": AdaBoostClassifier(estimator=DecisionTreeClassifier(), learning_rate=0.1),
    "Decision Tree": DecisionTreeClassifier()
}

feature_importance_dfs = {}

# Fit every classifier and collect one importance table per model. All of
# this has to happen inside the loop; otherwise only the last classifier in
# the dictionary would be fitted and reported.
for name, clf in classifiers.items():
    print(f"Calculating feature importance scores for {name}...")

    pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("classifier", clf)])
    pipeline.fit(X, y_encoded)
    # Names of the columns the classifier actually saw, in its input order.
    feature_names = pipeline.named_steps["preprocessor"].get_feature_names_out()

    if hasattr(clf, "feature_importances_"):
        importances = clf.feature_importances_
    elif hasattr(clf, "coef_"):
        # For the linear SVM the magnitude of a coefficient is the
        # importance; its sign only indicates the favoured class.
        coef = clf.coef_
        if hasattr(coef, "toarray"):
            coef = coef.toarray()  # coef_ is sparse when the input was sparse
        importances = np.abs(coef[0])
    else:
        importances = None

    if importances is not None:
        feature_importance_df = pd.DataFrame({"Feature": feature_names, "Importance": importances})
        feature_importance_dfs[name] = feature_importance_df.sort_values(by="Importance", ascending=False)

# The report folder may not exist on a fresh checkout.
os.makedirs("documents", exist_ok=True)

for name, df_sorted in feature_importance_dfs.items():
    df_sorted.to_excel(f"documents/{name}_feature_importance_scores.xlsx", index=False)
    print(f"Feature importance scores for {name} saved to {name}_feature_importance_scores.xlsx")
    print(f"\nFeature importance scores for {name}:")
    df_preview = pd.read_excel(f"documents/{name}_feature_importance_scores.xlsx")
    print(df_preview.head())

#### calculate and save correlation matrix

In [None]:
file_path = "documents/feature_correlation_matrix.xlsx"

# Pairwise correlation of the numeric features, written to Excel.
numeric_features = X.select_dtypes(include=['float64', 'int64'])
correlation_matrix = numeric_features.corr()
correlation_matrix.to_excel(file_path)
print("Feature correlation matrix saved to feature_correlation_matrix.xlsx")

# Re-open the sheet and give every numeric cell whose absolute value is
# above 0.60 a green background.
wb = openpyxl.load_workbook(file_path)
ws = wb.active
highlight = openpyxl.styles.PatternFill(start_color="c6efce", end_color="c6efce", fill_type="solid")

strong_cells = (
    cell
    for row in ws.iter_rows()
    for cell in row
    if isinstance(cell.value, (int, float)) and abs(cell.value) > 0.60
)
for cell in strong_cells:
    cell.fill = highlight

wb.save(file_path)

correlation_matrix_preview = pd.read_excel(file_path)
print("\nFeature correlation matrix:")
print(correlation_matrix_preview.head())


#### Automatic Feature Selection

In this section, we build a composite score from each feature's mean importance and mean absolute correlation, and we are going to use the best 30 features.

In [None]:
# Number of best-scoring features that are kept.
TOP_N_FEATURES = 30

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

numeric_columns = X.select_dtypes(include=['float64', 'int64']).columns
categorical_columns = X.select_dtypes(include=['object']).columns

preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_columns),
        ("cat", OneHotEncoder(handle_unknown='ignore'), categorical_columns)
    ])

classifiers = {
    "Random Forest": RandomForestClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "XGBoost": XGBClassifier(),
    "SVM": SVC(kernel='linear'),
    "AdaBoost": AdaBoostClassifier(estimator=DecisionTreeClassifier(), learning_rate=0.1),
    "Decision Tree": DecisionTreeClassifier()
}

# One row per original column of X, one column per classifier.
feature_importances = pd.DataFrame(index=X.columns)

for name, clf in classifiers.items():
    print(f"Calculating feature importance scores for {name}...")
    pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("classifier", clf)])
    pipeline.fit(X, y_encoded)

    # The classifier sees the preprocessor's output columns ("num__<column>"
    # first, then the one-hot "cat__..." columns), which is not the column
    # order of X. Importances are therefore matched by name, not by position.
    transformed_names = pipeline.named_steps["preprocessor"].get_feature_names_out()

    if hasattr(clf, "feature_importances_"):
        importances = clf.feature_importances_
    elif hasattr(clf, "coef_"):
        coef = clf.coef_
        if hasattr(coef, "toarray"):
            coef = coef.toarray()  # coef_ is sparse when the input was sparse
        importances = np.abs(coef[0])
    else:
        importances = np.zeros(len(transformed_names))

    scores = pd.Series(importances, index=transformed_names)
    numeric_scores = scores.reindex([f"num__{column}" for column in numeric_columns])
    numeric_scores.index = numeric_columns
    # Index-aligned assignment; the text columns have no single score and
    # stay NaN here (filled with 0 below).
    feature_importances[name] = numeric_scores

feature_importances = feature_importances.fillna(0)
feature_importances['Mean_Importance'] = feature_importances.mean(axis=1)

# Mean absolute correlation of every numeric feature with the numeric features.
numeric_features = X.select_dtypes(include=['float64', 'int64'])
correlation_matrix = numeric_features.corr().abs()
mean_correlation = correlation_matrix.mean(axis=1)

# Composite score = mean importance x mean absolute correlation. Columns
# without a correlation value (the text columns) score 0.
composite_score = feature_importances['Mean_Importance'] * mean_correlation.reindex(feature_importances.index).fillna(0)
top_features = composite_score.sort_values(ascending=False).head(TOP_N_FEATURES).index

selected_features_df = pd.DataFrame({
    'Feature': top_features,
    'Composite_Score': composite_score[top_features]
}).sort_values(by='Composite_Score', ascending=False)

output_path = "selected_features_with_scores.xlsx"
selected_features_df.to_excel(output_path, index=False)
print(f"Selected features and their composite scores saved to selected_features_with_scores.xlsx")

selected_features_preview = pd.read_excel(output_path)
print("\nSelected features with their composite scores:")
print(selected_features_preview.head())

X_selected = X[top_features]
X_selected.to_excel("X_selected.xlsx", index=False)
print(f"Selected features data saved to X_selected.xlsx")

#### Feature Selection with PCA

In [None]:
import os

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), X.select_dtypes(include=['float64', 'int64']).columns),
        ("cat", OneHotEncoder(handle_unknown='ignore'), X.select_dtypes(include=['object']).columns)
    ])

def conditional_formatting(value):
    """Return True when a PCA loading is above the 0.1 highlight threshold."""
    return value > 0.1

# Keep as many principal components as are needed to explain 95% of the variance.
pca = PCA(n_components=0.95)
pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("pca", pca)])
pipeline.fit(X)
pca_components = pipeline.named_steps['pca'].components_

original_features = pipeline.named_steps['preprocessor'].get_feature_names_out()

# Loadings table: one row per component, one column per preprocessed feature.
pca_df = pd.DataFrame(data=pca_components, columns=original_features)
pca_df.index = [f"PC{i+1}" for i in range(pca_df.shape[0])]

output_file = "PCA_components.xlsx"
pca_df.to_excel(output_file, index=True)

# Highlight every loading above the threshold. The header row and the index
# column are excluded from the scan; non-numeric cells are ignored.
wb = openpyxl.load_workbook(output_file)
sheet = wb.active
highlight = PatternFill(start_color="00FF00", end_color="00FF00", fill_type="solid")

for row in sheet.iter_rows(min_row=2, max_row=sheet.max_row, min_col=2, max_col=sheet.max_column):
    for cell in row:
        if isinstance(cell.value, (int, float)) and conditional_formatting(cell.value):
            cell.fill = highlight

wb.save(output_file)

print(f"PCA bileşen matrisi {output_file} dosyasına kaydedildi.")

# Second workbook: for every component, the features whose loading is above
# the same threshold, joined into one quoted string.
new_wb = openpyxl.Workbook()
new_sheet = new_wb.active

new_sheet.append(["PCA Bileşeni", "Özellikler"])

for pc_name in pca_df.index:
    feature_names = [col for col, value in pca_df.loc[pc_name].items() if conditional_formatting(value)]

    feature_string = ', '.join(feature_names)
    feature_string = '"' + feature_string + '"'

    new_sheet.append([pc_name, feature_string])

new_wb.save("PCA_component_features.xlsx")

new_wb.close()

classifiers = {
    "Random Forest": RandomForestClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "XGBoost": XGBClassifier(),
    "SVM": SVC(kernel='linear'),
    "AdaBoost": AdaBoostClassifier(estimator=DecisionTreeClassifier(), learning_rate=0.1),
    "Decision Tree": DecisionTreeClassifier()
}

feature_importance_dfs = {}

# Importance of each principal component for every classifier.
for name, clf in classifiers.items():
    print(f"Calculating feature importance scores for {name} with PCA...")

    pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("pca", pca), ("classifier", clf)])

    pipeline.fit(X, y_encoded)

    if hasattr(clf, "feature_importances_"):
        importances = clf.feature_importances_
    elif hasattr(clf, "coef_"):
        # For the linear SVM the magnitude of a coefficient is the
        # importance; its sign only indicates the favoured class.
        importances = np.abs(clf.coef_[0])
    else:
        importances = None

    if importances is not None:
        feature_names = [f"PC{i+1}" for i in range(len(importances))]
        feature_importance_df = pd.DataFrame({"Feature": feature_names, "Importance": importances})
        feature_importance_dfs[name] = feature_importance_df.sort_values(by="Importance", ascending=False)

# The report folder may not exist on a fresh checkout.
os.makedirs("documents", exist_ok=True)

for name, df_sorted in feature_importance_dfs.items():
    df_sorted.to_excel(f"documents/{name}_PCA_feature_importance_scores.xlsx", index=False)
    print(f"Feature importance scores for {name} with PCA saved to {name}_PCA_feature_importance_scores.xlsx")
    print(f"\nFeature importance scores for {name} with PCA:")
    df_preview = pd.read_excel(f"documents/{name}_PCA_feature_importance_scores.xlsx")
    print(df_preview.head())



<h1 style="color:red; text-align:center; font-size:64px;">Training Models with All Features</h1> 

<p> In this part, we train the models with best parameters and save them.</p>
<p> We use Random Forest, Support Vector Machine, XGBoost, Gradient Boosting, AdaBoost, Decision Tree.</p>

#### import libraries

In [None]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from xgboost import XGBClassifier
from joblib import dump
from imblearn.over_sampling import SMOTE
import warnings

#### load dataset and define variables

In [None]:
# Read the training table written by the feature-extraction step
results_df = pd.read_csv("radiomic_train.csv")

# Target labels, and every other column except the file name as features
y = results_df["cancertype"]
X = results_df.drop(columns=["cancertype", "ultrasound"])

# Hold out 20% of the rows, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing: standardise numeric columns, one-hot encode text columns
numeric_columns = X.select_dtypes(include=['float64', 'int64']).columns
categorical_columns = X.select_dtypes(include=['object']).columns
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_columns),
        ("cat", OneHotEncoder(handle_unknown='ignore'), categorical_columns)
    ])

# Integer-encode the target; the encoder is fitted on the training labels only
label_encoder = LabelEncoder().fit(y_train)
y_train = label_encoder.transform(y_train)
y_test = label_encoder.transform(y_test)

# Classifiers tuned in the next cell
classifiers = {
    "Random Forest": RandomForestClassifier(),
    "Support Vector Machine": SVC(),
    "XGBoost": XGBClassifier(),
    "Gradient Boosting": GradientBoostingClassifier()
}

#### train each model and save them

In [None]:
import os

# Suppress warnings
warnings.filterwarnings('ignore')

# Make sure the output folder exists before hours of grid search are spent.
os.makedirs("models", exist_ok=True)

# Hyperparameter grids, keyed by classifier name. 'auto' is no longer listed
# for max_features: for classifiers it was an alias of 'sqrt', and
# scikit-learn 1.3 removed it, so it only produced duplicate or failing fits.
param_grids = {
    "Random Forest": {
        'classifier__n_estimators': [100, 300, 500],
        'classifier__max_features': ['sqrt'],
        'classifier__max_depth': [10, 50, 100, None],
        'classifier__min_samples_split': [2, 5, 10],
        'classifier__min_samples_leaf': [1, 2, 4]
    },
    "Support Vector Machine": {
        'classifier__C': [0.1, 1, 10, 100],
        'classifier__gamma': [1, 0.1, 0.01, 0.001],
        'classifier__kernel': ['rbf', 'linear']
    },
    "XGBoost": {
        'classifier__learning_rate': [0.1, 0.01, 0.05],
        'classifier__max_depth': [3, 5, 7],
        'classifier__min_child_weight': [1, 3, 5],
        'classifier__gamma': [0.1, 0.2, 0.3],
        'classifier__subsample': [0.6, 0.8, 1.0],
        'classifier__colsample_bytree': [0.6, 0.8, 1.0],
        'classifier__objective': ['binary:logistic'],
        'classifier__n_estimators': [100, 200, 300]
    },
    "Gradient Boosting": {
        'classifier__n_estimators': [100, 300, 500],
        'classifier__learning_rate': [0.01, 0.05, 0.1],
        'classifier__max_depth': [3, 5, 7],
        'classifier__min_samples_split': [2, 5, 10],
        'classifier__min_samples_leaf': [1, 2, 4],
        'classifier__subsample': [0.8, 0.9, 1.0],
        'classifier__max_features': ['sqrt']
    },
}

for name, clf in classifiers.items():
    print(f"Finding best parameters for {name} with PCA...")

    # A classifier without an entry is searched with an empty grid, i.e. it
    # is cross-validated once with its default parameters.
    param_grid = param_grids.get(name, {})

    # Perform grid search with cross-validation
    grid_search = GridSearchCV(
        estimator=Pipeline(steps=[("preprocessor", preprocessor), ("pca", PCA()), ("classifier", clf)]),
        param_grid=param_grid,
        scoring='accuracy',
        cv=5,
        n_jobs=-1
    )
    grid_search.fit(X_train, y_train)

    print(f"Best parameters for {name} with PCA: {grid_search.best_params_}")

    print(f"Training {name} with best parameters and PCA...")
    # GridSearchCV (refit=True, the default) has already refitted the best
    # pipeline on the whole training set, so no further fit is needed.
    best_model = grid_search.best_estimator_

    # Save the trained model
    dump(best_model, f"models/{name}_model_with_PCA.joblib")

#### train other models 

In [None]:
import os

from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

classifiers = {
    "AdaBoost": AdaBoostClassifier(estimator=DecisionTreeClassifier(), learning_rate=0.1),
    "Decision Tree": DecisionTreeClassifier()
}

# Make sure the output folder exists before the search starts.
os.makedirs("models", exist_ok=True)

for name, clf in classifiers.items():
    print(f"Finding best parameters for {name}...")

    # Only the decision tree has a grid; AdaBoost gets an empty grid and is
    # therefore cross-validated once with the parameters given above.
    param_grid = {}
    if name == "Decision Tree":
        param_grid = {
            'classifier__criterion': ['gini', 'entropy'],
            'classifier__max_depth': [None, 10, 50, 100],
            'classifier__min_samples_split': [2, 5, 10],
            'classifier__min_samples_leaf': [1, 2, 4]
        }

    grid_search = GridSearchCV(
        estimator=Pipeline(steps=[("preprocessor", preprocessor), ("classifier", clf)]),
        param_grid=param_grid,
        scoring='accuracy',
        cv=5,
        n_jobs=-1
    )
    grid_search.fit(X_train, y_train)

    print(f"Best parameters for {name}: {grid_search.best_params_}")

    print(f"Training {name} with best parameters...")
    # GridSearchCV (refit=True, the default) has already refitted the best
    # pipeline on the whole training set, so no further fit is needed.
    best_model = grid_search.best_estimator_

    dump(best_model, f"models/{name}_model2.joblib")


<h1 style="color:red; text-align:center; font-size:64px;">Training Models with Selected Features</h1>

<p> In this part, we select features according to feature importance score and correlation map results.</p>
<p> And then, we train the models with best parameters and save them.</p>

#### import libraries

In [None]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from xgboost import XGBClassifier
from joblib import dump
from imblearn.over_sampling import SMOTE

#### load dataset and define variables

In [None]:
results_df = pd.read_csv("radiomic_train.csv")

# Manually selected feature columns. Every name is listed exactly once:
# selecting a column twice gives X duplicate column labels, so the same
# feature would be passed to the preprocessor twice.
selected_feature_columns = [
    "original_glcm_SumEntropy",
    "original_glcm_JointEnergy",
    "original_ngtdm_Complexity",
    "original_glcm_Correlation",
    "original_gldm_DependenceEntropy",
    "original_glcm_SumSquares",
    "original_firstorder_Uniformity",
    "original_ngtdm_Coarseness",
    "original_shape2D_PerimeterSurfaceRatio",
    "original_glszm_LowGrayLevelZoneEmphasis",
    "original_glszm_ZoneEntropy",
    "original_shape2D_PixelSurface",
    "original_glszm_SizeZoneNonUniformity",
    "original_shape2D_Perimeter",
    "original_gldm_DependenceNonUniformityNormalized",
    "original_glszm_SmallAreaLowGrayLevelEmphasis",
    "original_glrlm_GrayLevelNonUniformity",
    "original_glszm_SizeZoneNonUniformityNormalized",
    "original_shape2D_MajorAxisLength",
    "original_glcm_DifferenceVariance",
    "original_glcm_Idmn",
    "original_firstorder_Entropy",
    "original_shape2D_MaximumDiameter",
    "original_glcm_DifferenceAverage",
    "original_glcm_JointAverage",
    "original_glrlm_LowGrayLevelRunEmphasis",
    "original_glcm_Imc1",
    "original_glrlm_RunEntropy",
    "original_glcm_ClusterTendency",
    "original_glrlm_RunLengthNonUniformity",
    "original_glcm_MaximumProbability",
    "original_glcm_ClusterShade",
    "original_firstorder_Variance",
    "original_glrlm_GrayLevelVariance",
    "original_shape2D_Sphericity",
    "original_glcm_JointEntropy",
    "original_firstorder_RootMeanSquared",
    "original_gldm_HighGrayLevelEmphasis",
    "original_gldm_DependenceNonUniformity",
    "original_glcm_Idn",
    "original_ngtdm_Contrast",
    "original_shape2D_MeshSurface",
    "original_gldm_LargeDependenceLowGrayLevelEmphasis",
    "original_shape2D_Elongation",
    "original_glszm_SmallAreaHighGrayLevelEmphasis",
    "original_shape2D_MinorAxisLength",
    "original_gldm_LowGrayLevelEmphasis",
    "original_glcm_Contrast",
    "original_glszm_SmallAreaEmphasis",
]

X = results_df[selected_feature_columns]
y = results_df["cancertype"]

# Hold out 20% of the rows, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise numeric columns, one-hot encode text columns.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), X.select_dtypes(include=['float64', 'int64']).columns),
        ("cat", OneHotEncoder(handle_unknown='ignore'), X.select_dtypes(include=['object']).columns)
    ])


# Integer-encode the target; the encoder is fitted on the training labels only.
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)

classifiers = {
    "Random Forest": RandomForestClassifier(),
    "Support Vector Machine": SVC(),
    "XGBoost": XGBClassifier(),
    "Gradient Boosting": GradientBoostingClassifier()
}

#### train each model and save them

In [None]:
import os

# Make sure the output folder exists before hours of grid search are spent.
os.makedirs("models", exist_ok=True)

# Hyperparameter grids, keyed by classifier name. 'auto' is no longer listed
# for max_features: for classifiers it was an alias of 'sqrt', and
# scikit-learn 1.3 removed it, so it only produced duplicate or failing fits.
param_grids = {
    "Random Forest": {
        'classifier__n_estimators': [100, 300, 500],
        'classifier__max_features': ['sqrt'],
        'classifier__max_depth': [10, 50, 100, None],
        'classifier__min_samples_split': [2, 5, 10],
        'classifier__min_samples_leaf': [1, 2, 4]
    },
    "Support Vector Machine": {
        'classifier__C': [0.1, 1, 10, 100],
        'classifier__gamma': [1, 0.1, 0.01, 0.001],
        'classifier__kernel': ['rbf', 'linear']
    },
    "XGBoost": {
        'classifier__learning_rate': [0.1, 0.01, 0.05],
        'classifier__max_depth': [3, 5, 7],
        'classifier__min_child_weight': [1, 3, 5],
        'classifier__gamma': [0.1, 0.2, 0.3],
        'classifier__subsample': [0.6, 0.8, 1.0],
        'classifier__colsample_bytree': [0.6, 0.8, 1.0],
        'classifier__objective': ['binary:logistic'],
        'classifier__n_estimators': [100, 200, 300]
    },
    "Gradient Boosting": {
        'classifier__n_estimators': [100, 300, 500],
        'classifier__learning_rate': [0.01, 0.05, 0.1],
        'classifier__max_depth': [3, 5, 7],
        'classifier__min_samples_split': [2, 5, 10],
        'classifier__min_samples_leaf': [1, 2, 4],
        'classifier__subsample': [0.8, 0.9, 1.0],
        'classifier__max_features': ['sqrt']
    },
}

for name, clf in classifiers.items():
    print(f"Finding best parameters for {name}...")

    # A classifier without an entry is searched with an empty grid, i.e. it
    # is cross-validated once with its default parameters.
    param_grid = param_grids.get(name, {})

    grid_search = GridSearchCV(
        estimator=Pipeline(steps=[("preprocessor", preprocessor), ("classifier", clf)]),
        param_grid=param_grid,
        scoring='accuracy',
        cv=5,
        n_jobs=-1
    )
    grid_search.fit(X_train, y_train)

    print(f"Best parameters for {name}: {grid_search.best_params_}")

    print(f"Training {name} with best parameters...")
    # GridSearchCV (refit=True, the default) has already refitted the best
    # pipeline on the whole training set, so no further fit is needed.
    best_model = grid_search.best_estimator_

    dump(best_model, f"models/{name}_selected2.joblib")

#### train other models

In [None]:
import os

from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

classifiers = {
    "AdaBoost": AdaBoostClassifier(estimator=DecisionTreeClassifier(), learning_rate=0.1),
    "Decision Tree": DecisionTreeClassifier()
}

# Make sure the output folder exists before the search starts.
os.makedirs("models", exist_ok=True)

for name, clf in classifiers.items():
    print(f"Finding best parameters for {name}...")

    # Only the decision tree has a grid; AdaBoost gets an empty grid and is
    # therefore cross-validated once with the parameters given above.
    param_grid = {}
    if name == "Decision Tree":
        param_grid = {
            'classifier__criterion': ['gini', 'entropy'],
            'classifier__max_depth': [None, 10, 50, 100],
            'classifier__min_samples_split': [2, 5, 10],
            'classifier__min_samples_leaf': [1, 2, 4]
        }

    grid_search = GridSearchCV(
        estimator=Pipeline(steps=[("preprocessor", preprocessor), ("classifier", clf)]),
        param_grid=param_grid,
        scoring='accuracy',
        cv=5,
        n_jobs=-1
    )
    grid_search.fit(X_train, y_train)

    print(f"Best parameters for {name}: {grid_search.best_params_}")

    print(f"Training {name} with best parameters...")
    # GridSearchCV (refit=True, the default) has already refitted the best
    # pipeline on the whole training set, so no further fit is needed.
    best_model = grid_search.best_estimator_

    dump(best_model, f"models/{name}_selected2.joblib")


<h1 style="color:red; text-align:center; font-size:64px;">Classification Comparison</h1>

<p>This part computes classification metrics for both the all-features models and the selected-features models.</p>
<p>It also visualizes them with matplotlib.</p>
<p>The Support Vector Machine models do not provide probability predictions.</p>

#### import libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score, matthews_corrcoef, f1_score
from joblib import load
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

#### calculation of classification metrics

In [None]:
# Evaluate every saved all-features model and collect one row of summary
# metrics per model in `metrics_df` (plus the full reports in `report_df`).
# NOTE(review): metrics are computed on every row of radiomic_features.csv,
# while the ROC section of this notebook reads radiomic_test.csv -- confirm
# which file is the intended evaluation set.
results_df = pd.read_csv("radiomic_features.csv")

# The encoder maps the integer predictions of the saved models back to the
# label names stored in the "cancertype" column.
y = results_df["cancertype"]
label_encoder = LabelEncoder()
label_encoder.fit(y)

X_test = results_df.drop(columns=["cancertype", "ultrasound"])
y_test = results_df["cancertype"]

model_names = ["Random Forest", "Support Vector Machine", "XGBoost", "Gradient Boosting", "AdaBoost", "Decision Tree"]

avg_accuracy = []
avg_sensitivity = []  # never filled here; "Sensitivity" is reported from avg_recall
avg_specificity = []
avg_precision = []
avg_recall = []
avg_auc = []
avg_mcc = []
avg_fscore = []

# Report rows are gathered in a list and turned into a frame once, after the
# loop, instead of concatenating onto an empty DataFrame on every iteration.
report_rows = []

# Keys of classification_report(output_dict=True) that are summaries rather
# than class labels.
summary_keys = ('accuracy', 'macro avg', 'weighted avg')

for model_name in model_names:
    model = load(f"models/{model_name}_model.joblib")

    y_pred_encoded = model.predict(X_test)
    y_pred = label_encoder.inverse_transform(y_pred_encoded)

    accuracy = accuracy_score(y_test, y_pred)
    avg_accuracy.append(accuracy)

    report = classification_report(y_test, y_pred, output_dict=True)
    report_rows.append({"Model": model_name, "Classification Report": report})

    classes = [cls for cls in report if cls not in summary_keys]

    # Macro averages: unweighted mean over the per-class values.
    avg_recall.append(np.mean([report[cls]['recall'] for cls in classes]))
    avg_precision.append(np.mean([report[cls]['precision'] for cls in classes]))

    # One-vs-rest specificity per class, TN / (TN + FP), then macro-averaged.
    cm = confusion_matrix(y_test, y_pred)
    actual_totals = cm.sum(axis=1)
    predicted_totals = cm.sum(axis=0)
    correct = np.diag(cm)
    fp = predicted_totals - correct
    tn = cm.sum() - (actual_totals + predicted_totals - correct)
    avg_specificity.append(np.mean(tn / (tn + fp)))

    if len(classes) == 2:
        y_test_bin = label_encoder.transform(y_test)
        # ROC AUC is a ranking metric, so it is given continuous scores when
        # the model can produce them; thresholded 0/1 predictions collapse the
        # ROC curve to a single operating point.
        if hasattr(model, "predict_proba"):
            # assumes column 1 is the score of encoded class 1 -- the ROC
            # section of this notebook indexes predict_proba the same way
            y_score = model.predict_proba(X_test)[:, 1]
        elif hasattr(model, "decision_function"):
            y_score = model.decision_function(X_test)
        else:
            # Last resort: hard labels, as this cell originally used.
            y_score = y_pred_encoded
        auc = roc_auc_score(y_test_bin, y_score)
    else:
        auc = roc_auc_score(label_encoder.transform(y_test), model.predict_proba(X_test), multi_class='ovo')
    avg_auc.append(auc)

    avg_mcc.append(matthews_corrcoef(y_test, y_pred))
    avg_fscore.append(f1_score(y_test, y_pred, average='weighted'))

report_df = pd.DataFrame(report_rows, columns=["Model", "Classification Report"])

metrics_df = pd.DataFrame({
    "Model": model_names,
    "Accuracy": avg_accuracy,
    "Sensitivity": avg_recall,
    "Specificity": avg_specificity,
    "Precision": avg_precision,
    "Recall": avg_recall,
    "AUC": avg_auc,
    "MCC": avg_mcc,
    "F-score": avg_fscore
})

#### calculation for selected models

In [None]:
# Evaluate every saved selected-features model and collect one row of summary
# metrics per model in `metrics_df2` (plus the full reports in `report_df2`).
# NOTE(review): metrics are computed on every row of radiomic_features.csv --
# confirm this file is the intended evaluation set.
results_df2 = pd.read_csv("radiomic_features.csv")

# The encoder maps the integer predictions of the saved models back to the
# label names stored in the "cancertype" column.
y2 = results_df2["cancertype"]
label_encoder2 = LabelEncoder()
label_encoder2.fit(y2)

# Feature subset fed to the "*_selected2" models, in the order listed here.
X_test2 = results_df2[[
    "original_glcm_SumEntropy",
    "original_glcm_JointEnergy",
    "original_ngtdm_Complexity",
    "original_glcm_Correlation",
    "original_gldm_DependenceEntropy",
    "original_glcm_SumSquares",
    "original_firstorder_Uniformity",
    "original_ngtdm_Coarseness",
    "original_shape2D_PerimeterSurfaceRatio",
    "original_glszm_LowGrayLevelZoneEmphasis",
    "original_glszm_ZoneEntropy",
    "original_shape2D_PixelSurface",
    "original_glszm_SizeZoneNonUniformity",
    "original_shape2D_Perimeter",
    "original_gldm_DependenceNonUniformityNormalized",
    "original_glszm_SmallAreaLowGrayLevelEmphasis",
    "original_glrlm_GrayLevelNonUniformity",
    "original_glszm_SizeZoneNonUniformityNormalized",
    "original_shape2D_MajorAxisLength",
    "original_glcm_DifferenceVariance",
    "original_glcm_Idmn",
    "original_firstorder_Entropy",
    "original_shape2D_MaximumDiameter",
    "original_glcm_DifferenceAverage",
    "original_glcm_JointAverage",
    "original_glrlm_LowGrayLevelRunEmphasis",
    "original_glcm_Imc1",
    "original_glrlm_RunEntropy",
    "original_glcm_ClusterTendency",
    "original_glcm_MaximumProbability",
    "original_glcm_ClusterShade",
    "original_firstorder_Variance",
    "original_glrlm_GrayLevelVariance",
    "original_shape2D_Sphericity",
    "original_glcm_JointEntropy",
    "original_firstorder_RootMeanSquared",
    "original_glrlm_RunLengthNonUniformity",
    "original_gldm_HighGrayLevelEmphasis",
    "original_gldm_DependenceNonUniformity",
    "original_glcm_Idn",
    "original_ngtdm_Contrast",
    "original_shape2D_MeshSurface",
    "original_gldm_LargeDependenceLowGrayLevelEmphasis",
    "original_glszm_SmallAreaHighGrayLevelEmphasis",
    "original_shape2D_MinorAxisLength",
    "original_shape2D_Elongation",
    "original_gldm_LowGrayLevelEmphasis",
    "original_glcm_Contrast",
    "original_glszm_SmallAreaEmphasis",
]]
y_test2 = results_df2["cancertype"]

model_names2 = ["Random Forest", "Support Vector Machine", "XGBoost", "Gradient Boosting", "AdaBoost", "Decision Tree"]

avg_accuracy2 = []
avg_sensitivity2 = []  # never filled here; "Sensitivity" is reported from avg_recall2
avg_specificity2 = []
avg_precision2 = []
avg_recall2 = []
avg_auc2 = []
avg_mcc2 = []
avg_fscore2 = []

# Report rows are gathered in a list and turned into a frame once, after the
# loop, instead of concatenating onto an empty DataFrame on every iteration.
report_rows2 = []

# Keys of classification_report(output_dict=True) that are summaries rather
# than class labels.
summary_keys2 = ('accuracy', 'macro avg', 'weighted avg')

for model_name2 in model_names2:
    model2 = load(f"models/{model_name2}_selected2.joblib")

    y_pred_encoded2 = model2.predict(X_test2)
    y_pred2 = label_encoder2.inverse_transform(y_pred_encoded2)

    accuracy2 = accuracy_score(y_test2, y_pred2)
    avg_accuracy2.append(accuracy2)

    report2 = classification_report(y_test2, y_pred2, output_dict=True)
    report_rows2.append({"Model": model_name2, "Classification Report": report2})

    classes2 = [cls2 for cls2 in report2 if cls2 not in summary_keys2]

    # Macro averages: unweighted mean over the per-class values.
    avg_recall2.append(np.mean([report2[cls2]['recall'] for cls2 in classes2]))
    avg_precision2.append(np.mean([report2[cls2]['precision'] for cls2 in classes2]))

    # One-vs-rest specificity per class, TN / (TN + FP), then macro-averaged.
    cm2 = confusion_matrix(y_test2, y_pred2)
    actual_totals2 = cm2.sum(axis=1)
    predicted_totals2 = cm2.sum(axis=0)
    correct2 = np.diag(cm2)
    fp2 = predicted_totals2 - correct2
    tn2 = cm2.sum() - (actual_totals2 + predicted_totals2 - correct2)
    avg_specificity2.append(np.mean(tn2 / (tn2 + fp2)))

    if len(classes2) == 2:
        y_test_bin2 = label_encoder2.transform(y_test2)
        # ROC AUC is a ranking metric, so it is given continuous scores when
        # the model can produce them; thresholded 0/1 predictions collapse the
        # ROC curve to a single operating point.
        if hasattr(model2, "predict_proba"):
            # assumes column 1 is the score of encoded class 1 -- the ROC
            # section of this notebook indexes predict_proba the same way
            y_score2 = model2.predict_proba(X_test2)[:, 1]
        elif hasattr(model2, "decision_function"):
            y_score2 = model2.decision_function(X_test2)
        else:
            # Last resort: hard labels, as this cell originally used.
            y_score2 = y_pred_encoded2
        auc2 = roc_auc_score(y_test_bin2, y_score2)
    else:
        auc2 = roc_auc_score(label_encoder2.transform(y_test2), model2.predict_proba(X_test2), multi_class='ovo')
    avg_auc2.append(auc2)

    avg_mcc2.append(matthews_corrcoef(y_test2, y_pred2))
    avg_fscore2.append(f1_score(y_test2, y_pred2, average='weighted'))

report_df2 = pd.DataFrame(report_rows2, columns=["Model", "Classification Report"])

metrics_df2 = pd.DataFrame({
    "Model": model_names2,
    "Accuracy": avg_accuracy2,
    "Sensitivity": avg_recall2,
    "Specificity": avg_specificity2,
    "Precision": avg_precision2,
    "Recall": avg_recall2,
    "AUC": avg_auc2,
    "MCC": avg_mcc2,
    "F-score": avg_fscore2
})

In [None]:
# Write both per-model classification-report tables to Excel workbooks
# (all-features first, then selected-features), without the index column.
for report_table, workbook_name in (
    (report_df, "classification_reports_allfeatures.xlsx"),
    (report_df2, "classification_reports_selected.xlsx"),
):
    report_table.to_excel(workbook_name, index=False)

#### graph 1

In [None]:
# Grouped bar chart for the all-features models: three bars per model
# (accuracy, sensitivity, specificity) placed side by side around each tick.
fig1, ax1 = plt.subplots(figsize=(12, 6))

x = np.arange(len(model_names))
width = 0.25

# A shift of -1 / 0 / +1 bar widths puts the three metrics left / centre /
# right of the tick; each legend label is the metric's column name.
rects1, rects2, rects3 = [
    ax1.bar(x + shift * width, metrics_df[metric], width, label=metric)
    for shift, metric in ((-1, 'Accuracy'), (0, 'Sensitivity'), (1, 'Specificity'))
]

ax1.set(
    xlabel='Model',
    ylabel='Scores',
    title='Comparison of Accuracy, Sensitivity, and Specificity',
)
ax1.set_xticks(x)
ax1.set_xticklabels(model_names)
ax1.legend()

def add_value_labels(rects, ax):
    """Write each bar's height, rounded to two decimals, just above the bar.

    rects: iterable of bar objects exposing get_height(), get_x(), get_width().
    ax: object whose annotate() method receives the labels.
    """
    for bar in rects:
        bar_height = bar.get_height()
        # Anchor the text at the horizontal centre of the bar's top edge.
        top_centre = (bar.get_x() + bar.get_width() / 2, bar_height)
        ax.annotate(
            f"{round(bar_height, 2)}",
            xy=top_centre,
            xytext=(0, 3),  # offset of 3 points upward
            textcoords="offset points",
            ha='center',
            va='bottom',
        )

# Label every bar of the all-features chart, then render it.
for bar_group in (rects1, rects2, rects3):
    add_value_labels(bar_group, ax1)

fig1.tight_layout()
plt.show()

# Same chart for the models trained on the selected features.
fig1_2, ax1_2 = plt.subplots(figsize=(12, 6))

x2 = np.arange(len(model_names2))
width2 = 0.25

# A shift of -1 / 0 / +1 bar widths puts the three metrics left / centre /
# right of the tick; each legend label is the metric's column name.
rects1_2, rects2_2, rects3_2 = [
    ax1_2.bar(x2 + shift2 * width2, metrics_df2[metric2], width2, label=metric2)
    for shift2, metric2 in ((-1, 'Accuracy'), (0, 'Sensitivity'), (1, 'Specificity'))
]

ax1_2.set(
    xlabel='Models with Selected Features',
    ylabel='Scores',
    title='Comparison of Accuracy, Sensitivity, and Specificity',
)
ax1_2.set_xticks(x2)
ax1_2.set_xticklabels(model_names2)
ax1_2.legend()

def add_value_labels2(rects2, ax2):
    """Annotate each bar in rects2 with its height rounded to two decimals.

    Same behaviour as add_value_labels defined earlier in this cell; used for
    the selected-features charts.

    rects2: iterable of bar objects exposing get_height(), get_x(), get_width().
    ax2: object whose annotate() method receives the labels.
    """
    for patch in rects2:
        value = patch.get_height()
        label_text = '{}'.format(round(value, 2))
        # Horizontal centre of the bar; the label sits 3 points above its top.
        centre_x = patch.get_x() + patch.get_width() / 2
        ax2.annotate(label_text,
                     xy=(centre_x, value),
                     xytext=(0, 3),
                     textcoords="offset points",
                     ha='center', va='bottom')

# Label every bar of the selected-features chart, then render it.
for bar_group2 in (rects1_2, rects2_2, rects3_2):
    add_value_labels2(bar_group2, ax1_2)

fig1_2.tight_layout()
plt.show()

#### graph 2

In [None]:
# Precision vs. recall for the all-features models: two bars per model, each
# shifted half a bar width to either side of the tick. Reuses x, width and
# add_value_labels from the "graph 1" cell.
fig2, ax2 = plt.subplots(figsize=(12, 6))

half_width = width / 2
rects4 = ax2.bar(x - half_width, metrics_df["Precision"], width, label='Precision')
rects5 = ax2.bar(x + half_width, metrics_df["Recall"], width, label='Recall')

ax2.set(xlabel='Model', ylabel='Scores', title='Comparison of Precision and Recall')
ax2.set_xticks(x)
ax2.set_xticklabels(model_names)
ax2.legend()

for bar_group in (rects4, rects5):
    add_value_labels(bar_group, ax2)

fig2.tight_layout()
plt.show()

# Same chart for the selected-features models (x2, width2 and
# add_value_labels2 also come from the "graph 1" cell).
fig2_2, ax2_2 = plt.subplots(figsize=(12, 6))

half_width2 = width2 / 2
rects4_2 = ax2_2.bar(x2 - half_width2, metrics_df2["Precision"], width2, label='Precision')
rects5_2 = ax2_2.bar(x2 + half_width2, metrics_df2["Recall"], width2, label='Recall')

ax2_2.set(
    xlabel='Models with Selected Features',
    ylabel='Scores',
    title='Comparison of Precision and Recall',
)
ax2_2.set_xticks(x2)
ax2_2.set_xticklabels(model_names2)
ax2_2.legend()

for bar_group2 in (rects4_2, rects5_2):
    add_value_labels2(bar_group2, ax2_2)

fig2_2.tight_layout()
plt.show()

#### graph 3

In [None]:
# AUC / MCC / F-score for the all-features models: three bars per model.
# Reuses x, width and add_value_labels from the "graph 1" cell.
fig3, ax3 = plt.subplots(figsize=(12, 6))

# A shift of -1 / 0 / +1 bar widths puts the three metrics left / centre /
# right of the tick; each legend label is the metric's column name.
rects6, rects7, rects8 = [
    ax3.bar(x + shift * width, metrics_df[metric], width, label=metric)
    for shift, metric in ((-1, 'AUC'), (0, 'MCC'), (1, 'F-score'))
]

ax3.set(xlabel='Model', ylabel='Scores', title='Comparison of AUC, MCC, and F-score')
ax3.set_xticks(x)
ax3.set_xticklabels(model_names)
ax3.legend()

for bar_group in (rects6, rects7, rects8):
    add_value_labels(bar_group, ax3)

fig3.tight_layout()
plt.show()

# Same chart for the selected-features models (x2, width2 and
# add_value_labels2 also come from the "graph 1" cell).
fig3_2, ax3_2 = plt.subplots(figsize=(12, 6))

rects6_2, rects7_2, rects8_2 = [
    ax3_2.bar(x2 + shift2 * width2, metrics_df2[metric2], width2, label=metric2)
    for shift2, metric2 in ((-1, 'AUC'), (0, 'MCC'), (1, 'F-score'))
]

ax3_2.set(
    xlabel='Models with Selected Features',
    ylabel='Scores',
    title='Comparison of AUC, MCC, and F-score',
)
ax3_2.set_xticks(x2)
ax3_2.set_xticklabels(model_names2)
ax3_2.legend()

for bar_group2 in (rects6_2, rects7_2, rects8_2):
    add_value_labels2(bar_group2, ax3_2)

fig3_2.tight_layout()
plt.show()

<h1 style="color:red; text-align:center; font-size:64px;">ROC Curve and Confusion Matrices</h1>

<p>In this part, we compute the Receiver Operating Characteristic (ROC) curves and confusion matrices for both the all-features models and the selected-features models.</p>

#### import libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay
from joblib import load
from sklearn.preprocessing import LabelEncoder
from itertools import cycle

#### load dataset and define variables

In [None]:
# ---- evaluation data for the all-features models ----
results_df = pd.read_csv("radiomic_test.csv")

y = y_test = results_df["cancertype"]
label_encoder = LabelEncoder().fit(y)

# Everything except the label column and the "ultrasound" column is a feature.
X_test = results_df.drop(columns=["cancertype", "ultrasound"])

y_test_encoded = label_encoder.transform(y_test)

model_names = ["Random Forest", "Support Vector Machine", "XGBoost", "Gradient Boosting", "AdaBoost", "Decision Tree"]


# ---- evaluation data for the selected-features models ----
# NOTE(review): this half reads radiomic_features.csv while the all-features
# half above reads radiomic_test.csv -- confirm the difference is intended.
results_df2 = pd.read_csv("radiomic_features.csv")

y2 = y_test2 = results_df2["cancertype"]
label_encoder2 = LabelEncoder().fit(y2)

# Columns passed to the selected-features models, in this exact order.
# NOTE(review): "original_shape2D_PerimeterSurfaceRatio" and
# "original_glrlm_RunLengthNonUniformity" each appear twice, so X_test2 holds
# duplicated columns; left untouched in case the saved models were fitted on
# the same layout -- confirm before de-duplicating.
selected_feature_columns = [
    "original_glcm_SumEntropy",
    "original_glcm_JointEnergy",
    "original_ngtdm_Complexity",
    "original_glcm_Correlation",
    "original_gldm_DependenceEntropy",
    "original_glcm_SumSquares",
    "original_firstorder_Uniformity",
    "original_ngtdm_Coarseness",
    "original_shape2D_PerimeterSurfaceRatio",
    "original_glszm_LowGrayLevelZoneEmphasis",
    "original_glszm_ZoneEntropy",
    "original_shape2D_PixelSurface",
    "original_glszm_SizeZoneNonUniformity",
    "original_shape2D_Perimeter",
    "original_gldm_DependenceNonUniformityNormalized",
    "original_glszm_SmallAreaLowGrayLevelEmphasis",
    "original_glrlm_GrayLevelNonUniformity",
    "original_glszm_SizeZoneNonUniformityNormalized",
    "original_shape2D_MajorAxisLength",
    "original_glcm_DifferenceVariance",
    "original_glcm_Idmn",
    "original_firstorder_Entropy",
    "original_shape2D_MaximumDiameter",
    "original_glcm_DifferenceAverage",
    "original_glcm_JointAverage",
    "original_glrlm_LowGrayLevelRunEmphasis",
    "original_glcm_Imc1",
    "original_glrlm_RunEntropy",
    "original_glcm_ClusterTendency",
    "original_glrlm_RunLengthNonUniformity",
    "original_glcm_MaximumProbability",
    "original_glcm_ClusterShade",
    "original_firstorder_Variance",
    "original_glrlm_GrayLevelVariance",
    "original_shape2D_PerimeterSurfaceRatio",
    "original_shape2D_Sphericity",
    "original_glcm_JointEntropy",
    "original_firstorder_RootMeanSquared",
    "original_glrlm_RunLengthNonUniformity",
    "original_gldm_HighGrayLevelEmphasis",
    "original_gldm_DependenceNonUniformity",
    "original_glcm_Idn",
    "original_ngtdm_Contrast",
    "original_shape2D_MeshSurface",
    "original_gldm_LargeDependenceLowGrayLevelEmphasis",
    "original_shape2D_Elongation",
    "original_glszm_SmallAreaHighGrayLevelEmphasis",
    "original_shape2D_MinorAxisLength",
    "original_gldm_LowGrayLevelEmphasis",
    "original_glcm_Contrast",
    "original_glszm_SmallAreaEmphasis",
]
X_test2 = results_df2[selected_feature_columns]

y_test_encoded2 = label_encoder2.transform(y_test2)

model_names2 = ["Random Forest", "Support Vector Machine", "XGBoost", "Gradient Boosting", "AdaBoost", "Decision Tree"]


#### roc curve graph

In [None]:
# Legend labels are taken from the fitted encoders rather than a hand-written
# list: LabelEncoder numbers its classes in sorted order, so position i of
# classes_ is the label that was encoded as i. A hard-coded list in a
# different order attaches the wrong name to each curve.
class_names = [str(label) for label in label_encoder.classes_]
class_names2 = [str(label) for label in label_encoder2.classes_]


def _draw_roc_figure(y_encoded, proba, labels, title):
    """Plot one-vs-rest ROC curves, one per class, on a new figure and show it.

    y_encoded: integer-encoded true labels.
    proba: 2-D array of predicted class scores; column i is compared against
        "true label == i" (assumes the model was trained on the same encoding
        -- confirm against the training notebook).
    labels: legend name for each encoded class, indexed by class code.
    title: figure title.
    """
    fig, ax = plt.subplots(figsize=(12, 8))

    for code, label in enumerate(labels):
        is_class = y_encoded == code
        fpr, tpr, _ = roc_curve(is_class, proba[:, code])
        roc_auc = roc_auc_score(is_class, proba[:, code])
        ax.plot(fpr, tpr, lw=2, label=f'{label} (area = {roc_auc:.2f})')

    # Diagonal reference line from (0, 0) to (1, 1).
    ax.plot([0, 1], [0, 1], 'k--', lw=2)
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title(title)
    ax.legend(loc="lower right")
    plt.show()


for model_name in model_names:
    model = load(f"models/{model_name}_model.joblib")

    # Models that cannot produce probabilities are reported and skipped.
    try:
        y_pred_proba = model.predict_proba(X_test)
    except AttributeError:
        print(f"{model_name} does not support probability predictions.")
        continue

    _draw_roc_figure(
        y_test_encoded,
        y_pred_proba,
        class_names,
        f'Receiver Operating Characteristic (ROC) Curve for {model_name}',
    )

# NOTE(review): this loop loads "*_selected.joblib", whereas the training and
# metrics cells of this notebook use "*_selected2.joblib" -- confirm which
# model files are intended here.
for model_name2 in model_names2:
    model2 = load(f"models/{model_name2}_selected.joblib")

    try:
        y_pred_proba2 = model2.predict_proba(X_test2)
    except AttributeError:
        print(f"{model_name2} does not support probability predictions.")
        continue

    _draw_roc_figure(
        y_test_encoded2,
        y_pred_proba2,
        class_names2,
        f'Receiver Operating Characteristic (ROC) Curve for {model_name2} with Selected Features',
    )

#### confusion matrix graph

In [None]:
# 2x3 grid of confusion matrices, one panel per all-features model.
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(15, 8))
fig.suptitle('CONFUSION MATRIXES', fontsize=16)

for model_name, ax in zip(model_names, axes.flat):
    model = load(f"models/{model_name}_model.joblib")

    # Predictions are compared against the integer-encoded true labels; the
    # encoder's class names are used only as display labels.
    y_pred = model.predict(X_test)
    cm = confusion_matrix(y_test_encoded, y_pred)

    disp = ConfusionMatrixDisplay(cm, display_labels=label_encoder.classes_)
    disp.plot(ax=ax, cmap='Blues')
    ax.set_title(model_name)

plt.tight_layout()
plt.show()



# Same grid for the models trained on the selected features.
fig2, axes2 = plt.subplots(nrows=2, ncols=3, figsize=(15, 8))
fig2.suptitle('CONFUSION MATRIXES FOR SELECTED FEATURES', fontsize=16)

for model_name2, ax2 in zip(model_names2, axes2.flat):
    model2 = load(f"models/{model_name2}_selected.joblib")

    y_pred2 = model2.predict(X_test2)
    cm2 = confusion_matrix(y_test_encoded2, y_pred2)

    disp2 = ConfusionMatrixDisplay(cm2, display_labels=label_encoder2.classes_)
    disp2.plot(ax=ax2, cmap='Blues')
    ax2.set_title(model_name2)

plt.tight_layout()
plt.show()