## Classification

In [1]:
import os
import random
import findspark

from tqdm import tqdm
from dotenv import load_dotenv

from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.types import DoubleType, FloatType, IntegerType

from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, PCA, StandardScaler
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier, LinearSVC

from xgboost.spark import SparkXGBClassifier

# Load environment variables from .env file
_ = load_dotenv()

# Retrieve environment variables
TARGET_DIR_NAME = os.getenv("TARGET_DIR_NAME")

In [2]:
findspark.init()

# Initialize Spark session to run locally
spark = SparkSession.builder \
    .appName("Big Data Classification") \
    .config("spark.master", "local[*]") \
    .config("spark.ui.port", "4050") \
    .getOrCreate()

# Print the number of cores being used by Spark
print(f"Spark Session running on {spark.sparkContext.defaultParallelism} cores. UI is available at: {spark.sparkContext.uiWebUrl}")

Spark Session running on 12 cores. UI is available at: http://DESKTOP-684SCQF:4050


### Defining functions for infrastructure

In [3]:
def preprocess_data(data: DataFrame) -> DataFrame:
    """Preprocess the data by removing rows with NaN values."""    

    # Remove rows with NaN values and count the number of rows before and after
    total_rows_before = data.count()
    clean_data = data.dropna()
    total_rows_after = clean_data.count()

    if total_rows_before != total_rows_after:
        print(f"Number of rows with NaN values: {total_rows_before - total_rows_after}")

    return clean_data

In [4]:
def check_patient_overlap(train_df: DataFrame, test_df: DataFrame, patient_column: str = 'patient', verbose_just_error: bool = False) -> None:
    """Checks for overlapping patients between train and test DataFrames."""

    # Get unique patients from train and test DataFrames
    train_patients = train_df.select(patient_column).distinct().rdd.flatMap(lambda x: x).collect()
    test_patients = test_df.select(patient_column).distinct().rdd.flatMap(lambda x: x).collect()

    # Convert lists to sets and check for any overlap
    overlapping_patients = set(train_patients).intersection(set(test_patients))

    # Print overlapping patients, if any
    if overlapping_patients:
        print(f"Overlapping patients found: {overlapping_patients}")
    elif not verbose_just_error:
        print("No overlapping patients between training and test sets.")

In [5]:
def calculate_class_distribution(df1: DataFrame, df2: DataFrame, class_column: str = 'class') -> None:
    """Calculates and prints class distribution percentages for two DataFrames in a single line."""
    
    # Calculate class counts for both DataFrames
    df1_counts = df1.groupBy(class_column).count().orderBy(class_column).collect()
    df2_counts = df2.groupBy(class_column).count().orderBy(class_column).collect()
    
    # Calculate total counts
    total_count_df1 = df1.count()
    total_count_df2 = df2.count()
    
    # Prepare class distributions as strings
    df1_distribution = ', '.join([f"{row[class_column]}: {row['count']} ({(row['count'] / total_count_df1) * 100:.2f}%)" for row in df1_counts])
    df2_distribution = ', '.join([f"{row[class_column]}: {row['count']} ({(row['count'] / total_count_df2) * 100:.2f}%)" for row in df2_counts])
    
    # Print class distributions in one line
    print(f"Class distribution in train_df: [{df1_distribution}] | test_df: [{df2_distribution}]")

In [6]:
def train_test_split_by_patient(data: DataFrame, test_ratio: float = 0.2, patient_column: str = 'patient', seed: int = 42) -> tuple[DataFrame, DataFrame]:
    """ Splits the DataFrame into train and test sets based on unique patient IDs, ensuring no patient data overlap."""

    random.seed(seed) # Set seed for reproducibility
    
    # Extract unique patient IDs and randomly shuffle them
    unique_patients = data.select(patient_column).distinct().rdd.flatMap(lambda x: x).collect()
    random.shuffle(unique_patients)
    
    # Split the patients into train and test sets
    split_index = int(len(unique_patients) * (1 - test_ratio))
    train_patients = unique_patients[:split_index]
    test_patients = unique_patients[split_index:]
    
    # Create train and test DataFrames 
    train_df = data.filter(col(patient_column).isin(train_patients))
    test_df = data.filter(col(patient_column).isin(test_patients))
    check_patient_overlap(train_df, test_df)
    
    # Calculate and print the percentages of train and test sets
    total_rows, train_rows, test_rows = data.count(), train_df.count(), test_df.count()
    print(f"Training size: {train_rows} ({(train_rows / total_rows) * 100:.2f}%), Test size: {test_rows} rows ({(test_rows / total_rows) * 100:.2f}%)") 
    
    calculate_class_distribution(train_df, test_df)
    
    return train_df, test_df

In [7]:
def create_folds_by_patient(data: DataFrame, num_folds: int = 5, patient_column: str = 'patient', seed: int = 42) -> list:
    """Splits the data into train-test folds based on unique patient IDs, ensuring no patient overlap across folds."""

    random.seed(seed) # Set seed for reproducibility

    # Extract unique patient IDs and randomly shuffle them
    unique_patients = data.select(patient_column).distinct().rdd.flatMap(lambda x: x).collect()
    random.shuffle(unique_patients)

    # Split patients evenly into folds
    fold_size = len(unique_patients) // num_folds
    folds = [unique_patients[i * fold_size:(i + 1) * fold_size] for i in range(num_folds)]

    # Handle any remaining patients by adding them to the last fold
    for i in range(len(unique_patients) % num_folds):
        folds[i].append(unique_patients[-(i + 1)])

    patient_folds, fold_summaries = [], []

    # Create train-test splits for each fold
    total_rows = data.count()
    for i in range(num_folds):

        # Get the test patients for the current fold and assign the rest to the training set
        test_patients = set(folds[i])
        train_patients = set(unique_patients) - test_patients

        # Filter the data based on the train and test patients
        train_df = data.filter(col(patient_column).isin(train_patients))
        test_df = data.filter(col(patient_column).isin(test_patients))
        check_patient_overlap(train_df, test_df, verbose_just_error=True)

        # Append the train-test split to the list of folds
        patient_folds.append((train_df, test_df))

        # Accumulate test set summary for this fold
        fold_summaries.append(f"Fold {i + 1}: {test_df.count()} rows ({(test_df.count() / total_rows) * 100:.2f}%)")

    print(" | ".join(fold_summaries))

    # Return the list of train-test tuples for each fold
    return patient_folds

In [8]:
def apply_pca(data: DataFrame, output_column: str = 'pca_features', k: int = 10) -> DataFrame:
    """Applies PCA to reduce the dimensionality of the feature set to k principal components."""
    
    # Initialize PCA with the specified number of components
    pca = PCA(k=k, inputCol='features', outputCol=output_column)
    
    # Fit PCA on the data and transform the features
    pca_model = pca.fit(data)
    transformed_data = pca_model.transform(data)
    
    print(f"PCA explained variance ratio: {pca_model.explainedVariance.toArray()}")
    
    return transformed_data

In [9]:
def vectorize_features(data: DataFrame, class_column: str = 'class', patient_column: str = 'patient', output_column: str = 'features', verbose: bool = True, standard_scaler: bool = False) -> DataFrame:
    """Assembles feature columns into a single feature vector column, excluding patient and class columns."""
    
    # Get the numeric columns from the DataFrame
    numeric_columns = [field.name for field in data.schema.fields if isinstance(field.dataType, (DoubleType, FloatType, IntegerType))]

    # Define feature columns by excluding the numeric columns and the class and patient columns
    feature_columns = [col for col in numeric_columns if col not in [class_column, patient_column, 'diagnostics_Mask-original_VoxelNum']]
    #feature_columns = [col for col in numeric_columns if col not in [class_column, patient_column]]
    if verbose:
        print(f"Number of initial columns: {len(data.columns)}, number of feature columns: {len(feature_columns)}")
    
    # Assemble features into a feature vector
    assembler = VectorAssembler(inputCols = feature_columns, outputCol = 'assembled_features')
    assembled_data = assembler.transform(data)

    # Standardize features
    if standard_scaler:
        scaler = StandardScaler(inputCol="assembled_features", outputCol = output_column, withStd = True, withMean = True)
        scaler_model = scaler.fit(assembled_data)
        scaled_data = scaler_model.transform(assembled_data)
    else:
        # Rename directly if no standardization is applied
        assembled_data = assembled_data.withColumnRenamed('assembled_features', 'features')
        scaled_data = assembled_data

    return scaled_data

In [10]:
def calculate_metrics(predictions: DataFrame) -> tuple:
    """Calculates accuracy, precision, and recall from the predictions DataFrame."""

    # Calculate true positives, true negatives, false positives, and false negatives
    tp = predictions.filter((col('prediction') == 1) & (col('class') == 1)).count()
    tn = predictions.filter((col('prediction') == 0) & (col('class') == 0)).count()
    fp = predictions.filter((col('prediction') == 1) & (col('class') == 0)).count()
    fn = predictions.filter((col('prediction') == 0) & (col('class') == 1)).count()

    # Calculate accuracy, precision, and recall
    accuracy = (tp + tn) / (tp + tn + fp + fn) if (tp + tn + fp + fn) > 0 else 0.0
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0

    return accuracy, precision, recall

In [11]:
def determine_best_parameters_on_folds(model, train_df: DataFrame, paramGrid: list[dict], num_folds: int = 5, patient_column: str = 'patient', standard_scaler: bool = False):
    """Evaluates the Decision Tree model using patient-wise cross-validation with specified hyperparameter tuning."""

    folds = create_folds_by_patient(train_df, num_folds, patient_column)

    # Use BinaryClassificationEvaluator to measure performance
    evaluator = BinaryClassificationEvaluator(labelCol = "class", rawPredictionCol = "prediction", metricName = "areaUnderROC")

    best_params, best_metric = None, float("-inf")

    # Iterate over each parameter combination in the grid
    for params in tqdm(paramGrid, desc="Hyperparameter Tuning", leave = True):
        fold_metrics = []

        # Print parameter combination being evaluated
        param_items = [f"{param.name}: {value}" for param, value in params.items()]

        # For each fold, evaluate the parameter combination
        for _, (train_df, test_df) in enumerate(folds):

            # Vectorize features
            train_df = vectorize_features(train_df, verbose = False, standard_scaler = standard_scaler)
            test_df = vectorize_features(test_df, verbose = False, standard_scaler = standard_scaler)
            check_patient_overlap(train_df, test_df, verbose_just_error = True)

            # Train the model with the current parameters
            current_model = model.copy(params).fit(train_df)

            # Evaluate the model on the test data using the specified metric
            predictions = current_model.transform(test_df)
            metric = evaluator.evaluate(predictions)
            fold_metrics.append(metric)
        
        # Calculate average metric across all folds for the current parameter combination
        avg_metric = __builtins__.sum(fold_metrics) / len(fold_metrics)

        print(f"Average {evaluator.getMetricName()}: {avg_metric:.4f} | Evaluating Parameters:", ', '.join(param_items))

        # Update the best parameters if the current average metric is better
        if avg_metric > best_metric:
            best_metric = avg_metric
            best_params = params

    # Print the best parameters after evaluating all combinations
    param_items = [f"{param.name}: {value}" for param, value in best_params.items()]
    print("Best Overall Parameters:", ', '.join(param_items))
    print(f"Best {evaluator.getMetricName()}: {best_metric:.4f}")

    return best_params 

In [12]:
def train_and_evaluate_best_model(model, best_params: list[dict], train_df: DataFrame, test_df: DataFrame, is_tree: bool = False, standard_scaler: bool = False):
    """Train the model on the full training data using the best parameters and evaluate on the test data."""

    # Vectorize features
    train_df = vectorize_features(train_df, verbose = False, standard_scaler = standard_scaler)
    test_df = vectorize_features(test_df, standard_scaler=standard_scaler)

    # Train the model using the best parameters
    best_model = model.copy(best_params).fit(train_df)

    # Evaluate the model on the test data
    evaluator = BinaryClassificationEvaluator(labelCol="class", rawPredictionCol="prediction", metricName="areaUnderROC")
    predictions = best_model.transform(test_df)
    metric = evaluator.evaluate(predictions)

    # Calculate additional metrics: accuracy, precision, recall
    accuracy, precision, recall = calculate_metrics(predictions)

    print(f"Final Model Evaluation on Test Data {evaluator.getMetricName()}: {metric:.4f}")
    print(f"Final Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}")

    # Print feature importances if the model is a tree-based model
    if is_tree:
        # Extract the feature importances from the model
        feature_importances = best_model.featureImportances.toArray()

        # Get feature column names from the assembler
        feature_columns = train_df.schema['features'].metadata['ml_attr']['attrs']['numeric']
        feature_names = [attr['name'] for attr in feature_columns]

        # Combine feature names and their importances, then sort by importance
        sorted_features = sorted(zip(feature_names, feature_importances), key=lambda x: x[1], reverse=True)

        # Print sorted feature importances
        print("\nSorted Feature Importances:")
        for name, importance in sorted_features:
            print(f"Feature: {name}, Importance: {importance:.4f}")

    # Show the first 5 names of wrongly classified rows
    misclassified = predictions.filter(col('prediction') != col('class'))
    names = misclassified.select('name').limit(5).rdd.flatMap(lambda x: x).collect()
    print("\nFirst 5 names of wrongly classified rows:")
    print(names)

    return best_model

### Main section

In [13]:
def main(model: str, file_name: str, paramGrid: list[dict], is_tree: bool = False, use_standard_scaler: bool = False):

    # Load and preprocess the dataset
    file_path = os.path.join(os.getcwd(), TARGET_DIR_NAME, file_name)
    data = spark.read.csv(file_path, header=True, inferSchema=True)

    print(f"\n{'=' * 75}\n Setting up\n{'=' * 75}\n")
    clean_data = preprocess_data(data)
    #clean_data = clean_data.limit(100)

    # split the data into train and test sets
    train_df, test_df = train_test_split_by_patient(clean_data, test_ratio=0.2)

    # Evaluate the model using patient-wise cross-validation to find the best parameters
    print(f"\n{'=' * 75}\n Hyperparameter tunning\n{'=' * 75}\n")
    best_params = determine_best_parameters_on_folds(model, train_df, paramGrid, num_folds=5, patient_column='patient', standard_scaler=use_standard_scaler)

    # Train the best model on the full training data and evaluate on the test data
    print(f"\n{'=' * 75}\n Testing model on test dataset\n{'=' * 75}\n")
    best_model = train_and_evaluate_best_model(model, best_params, train_df, test_df, is_tree, standard_scaler=use_standard_scaler)

    return best_model

### Testing different models

#### 512 - with lesion mask

In [14]:
file_name = 'features_512_lesion_mask.csv'

In [15]:
decision_tree = DecisionTreeClassifier(labelCol = "class", featuresCol = "features")

# Set up parameter grid for hyperparameter tuning
param_grid = ParamGridBuilder() \
    .addGrid(decision_tree.maxDepth, [5, 10, 15, 20]) \
    .addGrid(decision_tree.maxBins, [32, 64]) \
    .build()

# Call the main function to train and evaluate the model
model = main(decision_tree, file_name, param_grid, is_tree=True)


 Setting up

No overlapping patients between training and test sets.
Training size: 2174 (79.52%), Test size: 560 rows (20.48%)

 Hyperparameter tunning

Fold 1: 420 rows (19.32%) | Fold 2: 468 rows (21.53%) | Fold 3: 470 rows (21.62%) | Fold 4: 398 rows (18.31%) | Fold 5: 418 rows (19.23%)


Hyperparameter Tuning: 100%|██████████| 8/8 [02:44<00:00, 20.53s/it]


Best Overall Parameters: maxDepth: 5, maxBins: 32
Best areaUnderROC: 0.9809

 Testing model on test dataset

Number of initial columns: 119, number of feature columns: 57
Final Model Evaluation on Test Data areaUnderROC: 0.9518
Final Accuracy: 0.9518, Precision: 0.9288, Recall: 0.9786

Sorted Feature Importances:
Feature: original_gldm_GrayLevelNonUniformity, Importance: 0.9183
Feature: original_gldm_DependenceNonUniformity, Importance: 0.0305
Feature: original_firstorder_Minimum, Importance: 0.0258
Feature: original_firstorder_90Percentile, Importance: 0.0166
Feature: original_firstorder_Median, Importance: 0.0044
Feature: original_glszm_SizeZoneNonUniformity, Importance: 0.0043
Feature: diagnostics_Image-original_Dimensionality, Importance: 0.0000
Feature: diagnostics_Image-original_Mean, Importance: 0.0000
Feature: diagnostics_Image-original_Minimum, Importance: 0.0000
Feature: diagnostics_Image-original_Maximum, Importance: 0.0000
Feature: diagnostics_Mask-original_VolumeNum, Impor

In [16]:
random_forest = RandomForestClassifier(labelCol="class", featuresCol="features")

# Set up parameter grid for hyperparameter tuning
rf_param_grid = ParamGridBuilder() \
    .addGrid(random_forest.numTrees, [50, 150, 200]) \
    .addGrid(random_forest.maxDepth, [5, 10, 15, 20]) \
    .addGrid(random_forest.maxBins, [32, 64, 128]) \
    .addGrid(random_forest.featureSubsetStrategy, ['auto', 'sqrt']) \
    .build()

# Call the main function to train and evaluate the model
model = main(random_forest, file_name, rf_param_grid, is_tree=True)


 Setting up

No overlapping patients between training and test sets.
Training size: 2174 (79.52%), Test size: 560 rows (20.48%)

 Hyperparameter tunning

Fold 1: 420 rows (19.32%) | Fold 2: 468 rows (21.53%) | Fold 3: 470 rows (21.62%) | Fold 4: 398 rows (18.31%) | Fold 5: 418 rows (19.23%)


Hyperparameter Tuning: 100%|██████████| 72/72 [37:45<00:00, 31.47s/it]


Best Overall Parameters: numTrees: 50, maxDepth: 10, maxBins: 64, featureSubsetStrategy: auto
Best areaUnderROC: 0.9870

 Testing model on test dataset

Number of initial columns: 119, number of feature columns: 57
Final Model Evaluation on Test Data areaUnderROC: 0.9536
Final Accuracy: 0.9536, Precision: 0.9150, Recall: 1.0000

Sorted Feature Importances:
Feature: original_gldm_GrayLevelNonUniformity, Importance: 0.2748
Feature: original_gldm_DependenceNonUniformity, Importance: 0.1469
Feature: original_glszm_GrayLevelNonUniformity, Importance: 0.0969
Feature: original_ngtdm_Busyness, Importance: 0.0714
Feature: original_ngtdm_Coarseness, Importance: 0.0655
Feature: original_glszm_SizeZoneNonUniformity, Importance: 0.0305
Feature: original_ngtdm_Strength, Importance: 0.0263
Feature: original_firstorder_RootMeanSquared, Importance: 0.0262
Feature: original_firstorder_Minimum, Importance: 0.0235
Feature: original_firstorder_Mean, Importance: 0.0233
Feature: original_firstorder_Skewness,

In [17]:
# Linear SVC with Standard Scaler
svc = LinearSVC(labelCol="class", featuresCol="features")

# Set up the parameter grid for hyperparameter tuning
svc_param_grid = ParamGridBuilder() \
    .addGrid(svc.maxIter, [100, 500, 1000]) \
    .addGrid(svc.regParam, [0.01, 0.1]) \
    .addGrid(svc.tol, [1e-4,  1e-2]) \
    .build()

# Call the main function with the XGBoost classifier
model = main(svc, file_name, svc_param_grid, is_tree=False, use_standard_scaler=True)



 Setting up

No overlapping patients between training and test sets.
Training size: 2174 (79.52%), Test size: 560 rows (20.48%)

 Hyperparameter tunning

Fold 1: 420 rows (19.32%) | Fold 2: 468 rows (21.53%) | Fold 3: 470 rows (21.62%) | Fold 4: 398 rows (18.31%) | Fold 5: 418 rows (19.23%)


Hyperparameter Tuning: 100%|██████████| 12/12 [06:51<00:00, 34.33s/it]


Best Overall Parameters: maxIter: 100, regParam: 0.01, tol: 0.01
Best areaUnderROC: 0.9840

 Testing model on test dataset

Number of initial columns: 119, number of feature columns: 57
Final Model Evaluation on Test Data areaUnderROC: 0.9500
Final Accuracy: 0.9500, Precision: 0.9091, Recall: 1.0000

First 5 names of wrongly classified rows:
['hcs_003-000265_003-000265_MG_BL_Series-4_Image-1-0.png', 'hcs_003-001369_003-001369_MG_BL_Series-1001_Image-1003-0.png', 'hcs_003-001862_003-001862_MG_BL_Series-1001_Image-1001-0.png', 'hcs_003-001862_003-001862_MG_BL_Series-1001_Image-1003-0.png', 'hcs_003-001987_003-001987_MG_BL_Series-1_Image-1-0.png']


In [18]:
xgb_classifier = SparkXGBClassifier(label_col="class", features_col="features", use_gpu=False) 

# Set up the parameter grid for hyperparameter tuning
xgb_param_grid = ParamGridBuilder() \
    .addGrid(xgb_classifier.max_depth, [3, 6, 9, 12, 15]) \
    .addGrid(xgb_classifier.n_estimators, [50, 100, 200]) \
    .addGrid(xgb_classifier.learning_rate, [0.1]) \
    .addGrid(xgb_classifier.subsample, [0.8]) \
    .addGrid(xgb_classifier.colsample_bytree, [0.8]) \
    .build()

# Call the main function to train and evaluate the model
model = main(xgb_classifier, file_name, xgb_param_grid, is_tree=False)


 Setting up

No overlapping patients between training and test sets.
Training size: 2174 (79.52%), Test size: 560 rows (20.48%)

 Hyperparameter tunning

Fold 1: 420 rows (19.32%) | Fold 2: 468 rows (21.53%) | Fold 3: 470 rows (21.62%) | Fold 4: 398 rows (18.31%) | Fold 5: 418 rows (19.23%)


Hyperparameter Tuning:   0%|          | 0/15 [00:00<?, ?it/s]2024-09-16 02:00:53,139 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 3, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 50}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-16 02:01:00,888 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-16 02:01:08,170 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 3, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 50}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-16 02:01:12,987 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-16 02:01:18,756 INFO XGBoos

Best Overall Parameters: max_depth: 6, n_estimators: 100, learning_rate: 0.1, subsample: 0.8, colsample_bytree: 0.8
Best areaUnderROC: 0.9901

 Testing model on test dataset

Number of initial columns: 119, number of feature columns: 57


2024-09-16 02:17:29,923 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 6, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-16 02:17:35,813 INFO XGBoost-PySpark: _fit Finished xgboost training!


Final Model Evaluation on Test Data areaUnderROC: 0.9571
Final Accuracy: 0.9571, Precision: 0.9211, Recall: 1.0000

First 5 names of wrongly classified rows:
['hcs_003-001862_003-001862_MG_BL_Series-1001_Image-1001-0.png', 'hcs_003-001862_003-001862_MG_BL_Series-1001_Image-1003-0.png', 'hcs_003-001987_003-001987_MG_BL_Series-1_Image-1-0.png', 'hcs_003-001987_003-001987_MG_BL_Series-3_Image-1-0.png', 'hcs_003-001987_003-001987_MG_BL_Series-4_Image-1-0.png']


#### 512 - with full mask

In [14]:
file_name = 'features_512_full_mask.csv'

In [20]:
decision_tree = DecisionTreeClassifier(labelCol = "class", featuresCol = "features")

# Set up parameter grid for hyperparameter tuning
param_grid = ParamGridBuilder() \
    .addGrid(decision_tree.maxDepth, [5, 10, 15, 20]) \
    .addGrid(decision_tree.maxBins, [32, 64]) \
    .build()

# Call the main function to train and evaluate the model
model = main(decision_tree, file_name, param_grid, is_tree=True)


 Setting up

No overlapping patients between training and test sets.
Training size: 2162 (79.08%), Test size: 572 rows (20.92%)

 Hyperparameter tunning

Fold 1: 424 rows (19.61%) | Fold 2: 422 rows (19.52%) | Fold 3: 450 rows (20.81%) | Fold 4: 398 rows (18.41%) | Fold 5: 468 rows (21.65%)


Hyperparameter Tuning: 100%|██████████| 8/8 [03:02<00:00, 22.76s/it]


Best Overall Parameters: maxDepth: 5, maxBins: 32
Best areaUnderROC: 0.6755

 Testing model on test dataset

Number of initial columns: 119, number of feature columns: 98
Final Model Evaluation on Test Data areaUnderROC: 0.6731
Final Accuracy: 0.6731, Precision: 0.6813, Recall: 0.6503

Sorted Feature Importances:
Feature: original_firstorder_Variance, Importance: 0.5438
Feature: original_firstorder_90Percentile, Importance: 0.0845
Feature: original_glcm_Imc2, Importance: 0.0819
Feature: original_firstorder_Skewness, Importance: 0.0680
Feature: original_firstorder_10Percentile, Importance: 0.0508
Feature: original_glrlm_RunEntropy, Importance: 0.0288
Feature: original_glszm_HighGrayLevelZoneEmphasis, Importance: 0.0242
Feature: original_firstorder_Kurtosis, Importance: 0.0232
Feature: original_glcm_ClusterProminence, Importance: 0.0216
Feature: original_glszm_SmallAreaEmphasis, Importance: 0.0172
Feature: original_firstorder_Entropy, Importance: 0.0120
Feature: original_glrlm_GrayLevelN

In [15]:
random_forest = RandomForestClassifier(labelCol="class", featuresCol="features")

# Set up parameter grid for hyperparameter tuning
rf_param_grid = ParamGridBuilder() \
    .addGrid(random_forest.numTrees, [50, 150]) \
    .addGrid(random_forest.maxDepth, [5, 10, 15]) \
    .addGrid(random_forest.maxBins, [32, 64, 128]) \
    .addGrid(random_forest.featureSubsetStrategy, ['auto', 'sqrt']) \
    .build()

# Call the main function to train and evaluate the model
model = main(random_forest, file_name, rf_param_grid, is_tree=True)


 Setting up

No overlapping patients between training and test sets.
Training size: 2162 (79.08%), Test size: 572 rows (20.92%)

 Hyperparameter tunning

Fold 1: 424 rows (19.61%) | Fold 2: 422 rows (19.52%) | Fold 3: 450 rows (20.81%) | Fold 4: 398 rows (18.41%) | Fold 5: 468 rows (21.65%)


Hyperparameter Tuning:   3%|▎         | 1/36 [00:29<17:28, 29.95s/it]

Average areaUnderROC: 0.6850 | Evaluating Parameters: numTrees: 50, maxDepth: 5, maxBins: 32, featureSubsetStrategy: auto


Hyperparameter Tuning:   6%|▌         | 2/36 [00:48<13:16, 23.42s/it]

Average areaUnderROC: 0.6850 | Evaluating Parameters: numTrees: 50, maxDepth: 5, maxBins: 32, featureSubsetStrategy: sqrt


Hyperparameter Tuning:   8%|▊         | 3/36 [01:09<12:14, 22.27s/it]

Average areaUnderROC: 0.6774 | Evaluating Parameters: numTrees: 50, maxDepth: 5, maxBins: 64, featureSubsetStrategy: auto


Hyperparameter Tuning:  11%|█         | 4/36 [01:31<11:41, 21.92s/it]

Average areaUnderROC: 0.6774 | Evaluating Parameters: numTrees: 50, maxDepth: 5, maxBins: 64, featureSubsetStrategy: sqrt


Hyperparameter Tuning:  14%|█▍        | 5/36 [01:50<10:48, 20.91s/it]

Average areaUnderROC: 0.6791 | Evaluating Parameters: numTrees: 50, maxDepth: 5, maxBins: 128, featureSubsetStrategy: auto


Hyperparameter Tuning:  17%|█▋        | 6/36 [02:09<10:15, 20.51s/it]

Average areaUnderROC: 0.6791 | Evaluating Parameters: numTrees: 50, maxDepth: 5, maxBins: 128, featureSubsetStrategy: sqrt


Hyperparameter Tuning:  19%|█▉        | 7/36 [02:36<10:53, 22.53s/it]

Average areaUnderROC: 0.6793 | Evaluating Parameters: numTrees: 50, maxDepth: 10, maxBins: 32, featureSubsetStrategy: auto


Hyperparameter Tuning:  22%|██▏       | 8/36 [03:02<11:01, 23.61s/it]

Average areaUnderROC: 0.6793 | Evaluating Parameters: numTrees: 50, maxDepth: 10, maxBins: 32, featureSubsetStrategy: sqrt


Hyperparameter Tuning:  25%|██▌       | 9/36 [03:31<11:25, 25.37s/it]

Average areaUnderROC: 0.6874 | Evaluating Parameters: numTrees: 50, maxDepth: 10, maxBins: 64, featureSubsetStrategy: auto


Hyperparameter Tuning:  28%|██▊       | 10/36 [04:01<11:38, 26.86s/it]

Average areaUnderROC: 0.6874 | Evaluating Parameters: numTrees: 50, maxDepth: 10, maxBins: 64, featureSubsetStrategy: sqrt


Hyperparameter Tuning:  31%|███       | 11/36 [04:38<12:27, 29.91s/it]

Average areaUnderROC: 0.6754 | Evaluating Parameters: numTrees: 50, maxDepth: 10, maxBins: 128, featureSubsetStrategy: auto


Hyperparameter Tuning:  33%|███▎      | 12/36 [05:13<12:30, 31.29s/it]

Average areaUnderROC: 0.6754 | Evaluating Parameters: numTrees: 50, maxDepth: 10, maxBins: 128, featureSubsetStrategy: sqrt


Hyperparameter Tuning:  36%|███▌      | 13/36 [05:52<12:52, 33.60s/it]

Average areaUnderROC: 0.6713 | Evaluating Parameters: numTrees: 50, maxDepth: 15, maxBins: 32, featureSubsetStrategy: auto


Hyperparameter Tuning:  39%|███▉      | 14/36 [06:32<13:04, 35.65s/it]

Average areaUnderROC: 0.6713 | Evaluating Parameters: numTrees: 50, maxDepth: 15, maxBins: 32, featureSubsetStrategy: sqrt


Hyperparameter Tuning:  42%|████▏     | 15/36 [07:19<13:38, 38.99s/it]

Average areaUnderROC: 0.6716 | Evaluating Parameters: numTrees: 50, maxDepth: 15, maxBins: 64, featureSubsetStrategy: auto


Hyperparameter Tuning:  44%|████▍     | 16/36 [08:05<13:41, 41.10s/it]

Average areaUnderROC: 0.6716 | Evaluating Parameters: numTrees: 50, maxDepth: 15, maxBins: 64, featureSubsetStrategy: sqrt


Hyperparameter Tuning:  47%|████▋     | 17/36 [09:01<14:26, 45.59s/it]

Average areaUnderROC: 0.6693 | Evaluating Parameters: numTrees: 50, maxDepth: 15, maxBins: 128, featureSubsetStrategy: auto


Hyperparameter Tuning:  50%|█████     | 18/36 [09:57<14:36, 48.72s/it]

Average areaUnderROC: 0.6693 | Evaluating Parameters: numTrees: 50, maxDepth: 15, maxBins: 128, featureSubsetStrategy: sqrt


Hyperparameter Tuning:  53%|█████▎    | 19/36 [10:22<11:50, 41.78s/it]

Average areaUnderROC: 0.6743 | Evaluating Parameters: numTrees: 150, maxDepth: 5, maxBins: 32, featureSubsetStrategy: auto


Hyperparameter Tuning:  56%|█████▌    | 20/36 [10:48<09:50, 36.88s/it]

Average areaUnderROC: 0.6743 | Evaluating Parameters: numTrees: 150, maxDepth: 5, maxBins: 32, featureSubsetStrategy: sqrt


Hyperparameter Tuning:  58%|█████▊    | 21/36 [11:12<08:16, 33.13s/it]

Average areaUnderROC: 0.6703 | Evaluating Parameters: numTrees: 150, maxDepth: 5, maxBins: 64, featureSubsetStrategy: auto


Hyperparameter Tuning:  61%|██████    | 22/36 [11:40<07:21, 31.57s/it]

Average areaUnderROC: 0.6703 | Evaluating Parameters: numTrees: 150, maxDepth: 5, maxBins: 64, featureSubsetStrategy: sqrt


Hyperparameter Tuning:  64%|██████▍   | 23/36 [12:12<06:53, 31.79s/it]

Average areaUnderROC: 0.6723 | Evaluating Parameters: numTrees: 150, maxDepth: 5, maxBins: 128, featureSubsetStrategy: auto


Hyperparameter Tuning:  67%|██████▋   | 24/36 [12:48<06:33, 32.79s/it]

Average areaUnderROC: 0.6723 | Evaluating Parameters: numTrees: 150, maxDepth: 5, maxBins: 128, featureSubsetStrategy: sqrt


Hyperparameter Tuning:  69%|██████▉   | 25/36 [13:40<07:04, 38.62s/it]

Average areaUnderROC: 0.6776 | Evaluating Parameters: numTrees: 150, maxDepth: 10, maxBins: 32, featureSubsetStrategy: auto


Hyperparameter Tuning:  72%|███████▏  | 26/36 [14:33<07:08, 42.86s/it]

Average areaUnderROC: 0.6776 | Evaluating Parameters: numTrees: 150, maxDepth: 10, maxBins: 32, featureSubsetStrategy: sqrt


Hyperparameter Tuning:  75%|███████▌  | 27/36 [15:39<07:28, 49.84s/it]

Average areaUnderROC: 0.6819 | Evaluating Parameters: numTrees: 150, maxDepth: 10, maxBins: 64, featureSubsetStrategy: auto


Hyperparameter Tuning:  78%|███████▊  | 28/36 [16:41<07:08, 53.53s/it]

Average areaUnderROC: 0.6819 | Evaluating Parameters: numTrees: 150, maxDepth: 10, maxBins: 64, featureSubsetStrategy: sqrt


Hyperparameter Tuning:  81%|████████  | 29/36 [18:15<07:39, 65.58s/it]

Average areaUnderROC: 0.6803 | Evaluating Parameters: numTrees: 150, maxDepth: 10, maxBins: 128, featureSubsetStrategy: auto


Hyperparameter Tuning:  83%|████████▎ | 30/36 [19:42<07:13, 72.27s/it]

Average areaUnderROC: 0.6803 | Evaluating Parameters: numTrees: 150, maxDepth: 10, maxBins: 128, featureSubsetStrategy: sqrt


Hyperparameter Tuning:  86%|████████▌ | 31/36 [21:10<06:24, 76.97s/it]

Average areaUnderROC: 0.6680 | Evaluating Parameters: numTrees: 150, maxDepth: 15, maxBins: 32, featureSubsetStrategy: auto


Hyperparameter Tuning:  89%|████████▉ | 32/36 [22:41<05:24, 81.15s/it]

Average areaUnderROC: 0.6680 | Evaluating Parameters: numTrees: 150, maxDepth: 15, maxBins: 32, featureSubsetStrategy: sqrt


Hyperparameter Tuning:  92%|█████████▏| 33/36 [24:26<04:24, 88.21s/it]

Average areaUnderROC: 0.6762 | Evaluating Parameters: numTrees: 150, maxDepth: 15, maxBins: 64, featureSubsetStrategy: auto


Hyperparameter Tuning:  94%|█████████▍| 34/36 [26:16<03:09, 94.66s/it]

Average areaUnderROC: 0.6762 | Evaluating Parameters: numTrees: 150, maxDepth: 15, maxBins: 64, featureSubsetStrategy: sqrt


Hyperparameter Tuning:  97%|█████████▋| 35/36 [28:22<01:44, 104.23s/it]

Average areaUnderROC: 0.6711 | Evaluating Parameters: numTrees: 150, maxDepth: 15, maxBins: 128, featureSubsetStrategy: auto


Hyperparameter Tuning: 100%|██████████| 36/36 [30:30<00:00, 50.85s/it] 

Average areaUnderROC: 0.6711 | Evaluating Parameters: numTrees: 150, maxDepth: 15, maxBins: 128, featureSubsetStrategy: sqrt
Best Overall Parameters: numTrees: 50, maxDepth: 10, maxBins: 64, featureSubsetStrategy: auto
Best areaUnderROC: 0.6874

 Testing model on test dataset

Number of initial columns: 119, number of feature columns: 99





Final Model Evaluation on Test Data areaUnderROC: 0.6818
Final Accuracy: 0.6818, Precision: 0.6745, Recall: 0.7028

Sorted Feature Importances:
Feature: original_firstorder_Variance, Importance: 0.0573
Feature: original_firstorder_90Percentile, Importance: 0.0479
Feature: original_firstorder_MeanAbsoluteDeviation, Importance: 0.0404
Feature: original_firstorder_RootMeanSquared, Importance: 0.0395
Feature: original_firstorder_InterquartileRange, Importance: 0.0368
Feature: original_firstorder_RobustMeanAbsoluteDeviation, Importance: 0.0344
Feature: original_firstorder_Skewness, Importance: 0.0332
Feature: original_firstorder_Kurtosis, Importance: 0.0264
Feature: original_firstorder_Energy, Importance: 0.0200
Feature: original_firstorder_TotalEnergy, Importance: 0.0177
Feature: original_glszm_SmallAreaLowGrayLevelEmphasis, Importance: 0.0163
Feature: original_firstorder_10Percentile, Importance: 0.0159
Feature: original_glszm_SmallAreaHighGrayLevelEmphasis, Importance: 0.0152
Feature: or

In [16]:
# Linear SVC with Standard Scaler
svc = LinearSVC(labelCol="class", featuresCol="features")

# Set up the parameter grid for hyperparameter tuning
svc_param_grid = ParamGridBuilder() \
    .addGrid(svc.maxIter, [100, 500, 1000]) \
    .addGrid(svc.regParam, [0.01, 0.1]) \
    .addGrid(svc.tol, [1e-4, 1e-2]) \
    .build()

# Call the main function with the XGBoost classifier
model = main(svc, file_name, svc_param_grid, is_tree=False, use_standard_scaler=True)



 Setting up

No overlapping patients between training and test sets.
Training size: 2162 (79.08%), Test size: 572 rows (20.92%)

 Hyperparameter tunning

Fold 1: 424 rows (19.61%) | Fold 2: 422 rows (19.52%) | Fold 3: 450 rows (20.81%) | Fold 4: 398 rows (18.41%) | Fold 5: 468 rows (21.65%)


Hyperparameter Tuning:   8%|▊         | 1/12 [00:38<07:03, 38.52s/it]

Average areaUnderROC: 0.7002 | Evaluating Parameters: maxIter: 100, regParam: 0.01, tol: 0.0001


Hyperparameter Tuning:  17%|█▋        | 2/12 [01:07<05:29, 32.92s/it]

Average areaUnderROC: 0.6918 | Evaluating Parameters: maxIter: 100, regParam: 0.01, tol: 0.01


Hyperparameter Tuning:  25%|██▌       | 3/12 [01:32<04:23, 29.30s/it]

Average areaUnderROC: 0.6895 | Evaluating Parameters: maxIter: 100, regParam: 0.1, tol: 0.0001


Hyperparameter Tuning:  33%|███▎      | 4/12 [01:54<03:32, 26.55s/it]

Average areaUnderROC: 0.6900 | Evaluating Parameters: maxIter: 100, regParam: 0.1, tol: 0.01


Hyperparameter Tuning:  42%|████▏     | 5/12 [02:49<04:17, 36.83s/it]

Average areaUnderROC: 0.7028 | Evaluating Parameters: maxIter: 500, regParam: 0.01, tol: 0.0001


Hyperparameter Tuning:  50%|█████     | 6/12 [03:14<03:15, 32.66s/it]

Average areaUnderROC: 0.6918 | Evaluating Parameters: maxIter: 500, regParam: 0.01, tol: 0.01


Hyperparameter Tuning:  58%|█████▊    | 7/12 [03:39<02:30, 30.11s/it]

Average areaUnderROC: 0.6895 | Evaluating Parameters: maxIter: 500, regParam: 0.1, tol: 0.0001


Hyperparameter Tuning:  67%|██████▋   | 8/12 [04:01<01:50, 27.69s/it]

Average areaUnderROC: 0.6900 | Evaluating Parameters: maxIter: 500, regParam: 0.1, tol: 0.01


Hyperparameter Tuning:  75%|███████▌  | 9/12 [05:03<01:54, 38.33s/it]

Average areaUnderROC: 0.7028 | Evaluating Parameters: maxIter: 1000, regParam: 0.01, tol: 0.0001


Hyperparameter Tuning:  83%|████████▎ | 10/12 [05:28<01:08, 34.35s/it]

Average areaUnderROC: 0.6918 | Evaluating Parameters: maxIter: 1000, regParam: 0.01, tol: 0.01


Hyperparameter Tuning:  92%|█████████▏| 11/12 [05:58<00:32, 32.80s/it]

Average areaUnderROC: 0.6895 | Evaluating Parameters: maxIter: 1000, regParam: 0.1, tol: 0.0001


Hyperparameter Tuning: 100%|██████████| 12/12 [06:17<00:00, 31.47s/it]

Average areaUnderROC: 0.6900 | Evaluating Parameters: maxIter: 1000, regParam: 0.1, tol: 0.01
Best Overall Parameters: maxIter: 500, regParam: 0.01, tol: 0.0001
Best areaUnderROC: 0.7028

 Testing model on test dataset






Number of initial columns: 119, number of feature columns: 99
Final Model Evaluation on Test Data areaUnderROC: 0.6993
Final Accuracy: 0.6993, Precision: 0.6875, Recall: 0.7308

First 5 names of wrongly classified rows:
['auth_001-000061_001-000061_MG_BL_Series-8_Image-1-0.png', 'auth_001-000084_001-000084_MG_BL_Series-3_Image-1-0.png', 'hcs_003-000029_003-000029_MG_BL_Series-1001_Image-1001-0.png', 'hcs_003-000029_003-000029_MG_BL_Series-1001_Image-1001-1.png', 'hcs_003-000029_003-000029_MG_BL_Series-1002_Image-1002-0.png']


In [17]:
xgb_classifier = SparkXGBClassifier(label_col="class", features_col="features", use_gpu=False) 

# Set up the parameter grid for hyperparameter tuning
xgb_param_grid = ParamGridBuilder() \
    .addGrid(xgb_classifier.max_depth, [3, 6, 9, 12]) \
    .addGrid(xgb_classifier.n_estimators, [50, 100, 200]) \
    .addGrid(xgb_classifier.learning_rate, [0.1]) \
    .addGrid(xgb_classifier.subsample, [0.8]) \
    .addGrid(xgb_classifier.colsample_bytree, [0.8]) \
    .build()

# Call the main function to train and evaluate the model
model = main(xgb_classifier, file_name, xgb_param_grid, is_tree=False)


 Setting up

No overlapping patients between training and test sets.
Training size: 2162 (79.08%), Test size: 572 rows (20.92%)

 Hyperparameter tunning

Fold 1: 424 rows (19.61%) | Fold 2: 422 rows (19.52%) | Fold 3: 450 rows (20.81%) | Fold 4: 398 rows (18.41%) | Fold 5: 468 rows (21.65%)


Hyperparameter Tuning:   0%|          | 0/12 [00:00<?, ?it/s]2024-09-19 20:28:09,043 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 3, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 50}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-19 20:28:15,427 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-19 20:28:21,055 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 3, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 50}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-19 20:28:25,795 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-19 20:28:31,201 INFO XGBoos

Average areaUnderROC: 0.7087 | Evaluating Parameters: max_depth: 3, n_estimators: 50, learning_rate: 0.1, subsample: 0.8, colsample_bytree: 0.8


2024-09-19 20:29:05,077 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 3, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-19 20:29:10,583 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-19 20:29:17,609 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 3, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-19 20:29:23,201 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-19 20:29:30,125 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	bo

Average areaUnderROC: 0.7042 | Evaluating Parameters: max_depth: 3, n_estimators: 100, learning_rate: 0.1, subsample: 0.8, colsample_bytree: 0.8


2024-09-19 20:30:05,250 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 3, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 200}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-19 20:30:11,138 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-19 20:30:18,048 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 3, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 200}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-19 20:30:23,905 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-19 20:30:30,879 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	bo

Average areaUnderROC: 0.6984 | Evaluating Parameters: max_depth: 3, n_estimators: 200, learning_rate: 0.1, subsample: 0.8, colsample_bytree: 0.8


2024-09-19 20:31:05,085 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 6, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 50}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-19 20:31:11,037 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-19 20:31:17,950 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 6, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 50}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-19 20:31:23,933 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-19 20:31:30,817 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	boos

Average areaUnderROC: 0.6816 | Evaluating Parameters: max_depth: 6, n_estimators: 50, learning_rate: 0.1, subsample: 0.8, colsample_bytree: 0.8


2024-09-19 20:32:07,527 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 6, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-19 20:32:13,247 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-19 20:32:19,092 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 6, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-19 20:32:25,672 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-19 20:32:32,571 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	bo

Average areaUnderROC: 0.6775 | Evaluating Parameters: max_depth: 6, n_estimators: 100, learning_rate: 0.1, subsample: 0.8, colsample_bytree: 0.8


2024-09-19 20:33:12,575 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 6, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 200}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-19 20:33:20,073 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-19 20:33:27,172 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 6, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 200}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-19 20:33:34,623 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-19 20:33:40,098 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	bo

Average areaUnderROC: 0.6800 | Evaluating Parameters: max_depth: 6, n_estimators: 200, learning_rate: 0.1, subsample: 0.8, colsample_bytree: 0.8


2024-09-19 20:34:22,934 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 9, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 50}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-19 20:34:29,671 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-19 20:34:36,557 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 9, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 50}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-19 20:34:43,288 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-19 20:34:50,308 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	boos

Average areaUnderROC: 0.6775 | Evaluating Parameters: max_depth: 9, n_estimators: 50, learning_rate: 0.1, subsample: 0.8, colsample_bytree: 0.8


2024-09-19 20:35:30,287 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 9, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-19 20:35:37,984 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-19 20:35:43,552 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 9, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-19 20:35:50,402 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-19 20:35:57,310 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	bo

Average areaUnderROC: 0.6707 | Evaluating Parameters: max_depth: 9, n_estimators: 100, learning_rate: 0.1, subsample: 0.8, colsample_bytree: 0.8


2024-09-19 20:36:41,073 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 9, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 200}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-19 20:36:50,418 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-19 20:36:56,281 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 9, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 200}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-19 20:37:04,516 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-19 20:37:11,622 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	bo

Average areaUnderROC: 0.6738 | Evaluating Parameters: max_depth: 9, n_estimators: 200, learning_rate: 0.1, subsample: 0.8, colsample_bytree: 0.8


2024-09-19 20:38:01,200 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 12, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 50}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-19 20:38:07,890 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-19 20:38:14,425 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 12, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 50}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-19 20:38:22,071 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-19 20:38:28,989 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	bo

Average areaUnderROC: 0.6783 | Evaluating Parameters: max_depth: 12, n_estimators: 50, learning_rate: 0.1, subsample: 0.8, colsample_bytree: 0.8


2024-09-19 20:39:10,686 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 12, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-19 20:39:19,333 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-19 20:39:26,418 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 12, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-19 20:39:35,150 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-19 20:39:42,126 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	

Average areaUnderROC: 0.6724 | Evaluating Parameters: max_depth: 12, n_estimators: 100, learning_rate: 0.1, subsample: 0.8, colsample_bytree: 0.8


2024-09-19 20:40:27,746 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 12, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 200}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-19 20:40:37,185 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-19 20:40:42,637 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 12, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 200}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-19 20:40:51,998 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-19 20:40:59,073 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	

Average areaUnderROC: 0.6807 | Evaluating Parameters: max_depth: 12, n_estimators: 200, learning_rate: 0.1, subsample: 0.8, colsample_bytree: 0.8
Best Overall Parameters: max_depth: 3, n_estimators: 50, learning_rate: 0.1, subsample: 0.8, colsample_bytree: 0.8
Best areaUnderROC: 0.7087

 Testing model on test dataset

Number of initial columns: 119, number of feature columns: 99



2024-09-19 20:41:48,241 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 3, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 50}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-19 20:41:53,907 INFO XGBoost-PySpark: _fit Finished xgboost training!


Final Model Evaluation on Test Data areaUnderROC: 0.7133
Final Accuracy: 0.7133, Precision: 0.6993, Recall: 0.7483

First 5 names of wrongly classified rows:
['auth_001-000061_001-000061_MG_BL_Series-8_Image-1-0.png', 'auth_001-000084_001-000084_MG_BL_Series-3_Image-1-0.png', 'hcs_003-000029_003-000029_MG_BL_Series-1001_Image-1001-0.png', 'hcs_003-000029_003-000029_MG_BL_Series-1001_Image-1001-1.png', 'hcs_003-000029_003-000029_MG_BL_Series-1002_Image-1002-0.png']


In [18]:
decision_tree = DecisionTreeClassifier(labelCol = "class", featuresCol = "features")

# Set up parameter grid for hyperparameter tuning
param_grid = ParamGridBuilder() \
    .addGrid(decision_tree.maxDepth, [5, 10, 15, 20]) \
    .addGrid(decision_tree.maxBins, [32, 64]) \
    .build()

# Call the main function to train and evaluate the model
model = main(decision_tree, file_name, param_grid, is_tree=True)


 Setting up

No overlapping patients between training and test sets.
Training size: 2162 (79.08%), Test size: 572 rows (20.92%)

 Hyperparameter tunning

Fold 1: 424 rows (19.61%) | Fold 2: 422 rows (19.52%) | Fold 3: 450 rows (20.81%) | Fold 4: 398 rows (18.41%) | Fold 5: 468 rows (21.65%)


Hyperparameter Tuning:  12%|█▎        | 1/8 [00:17<02:00, 17.15s/it]

Average areaUnderROC: 0.6755 | Evaluating Parameters: maxDepth: 5, maxBins: 32


Hyperparameter Tuning:  25%|██▌       | 2/8 [00:35<01:46, 17.75s/it]

Average areaUnderROC: 0.6621 | Evaluating Parameters: maxDepth: 5, maxBins: 64


Hyperparameter Tuning:  38%|███▊      | 3/8 [00:53<01:29, 17.97s/it]

Average areaUnderROC: 0.6439 | Evaluating Parameters: maxDepth: 10, maxBins: 32


Hyperparameter Tuning:  50%|█████     | 4/8 [01:11<01:11, 17.93s/it]

Average areaUnderROC: 0.6287 | Evaluating Parameters: maxDepth: 10, maxBins: 64


Hyperparameter Tuning:  62%|██████▎   | 5/8 [01:32<00:56, 18.96s/it]

Average areaUnderROC: 0.6247 | Evaluating Parameters: maxDepth: 15, maxBins: 32


Hyperparameter Tuning:  75%|███████▌  | 6/8 [01:50<00:37, 18.77s/it]

Average areaUnderROC: 0.6065 | Evaluating Parameters: maxDepth: 15, maxBins: 64


Hyperparameter Tuning:  88%|████████▊ | 7/8 [02:12<00:19, 19.92s/it]

Average areaUnderROC: 0.6243 | Evaluating Parameters: maxDepth: 20, maxBins: 32


Hyperparameter Tuning: 100%|██████████| 8/8 [02:31<00:00, 19.00s/it]

Average areaUnderROC: 0.6045 | Evaluating Parameters: maxDepth: 20, maxBins: 64
Best Overall Parameters: maxDepth: 5, maxBins: 32
Best areaUnderROC: 0.6755

 Testing model on test dataset

Number of initial columns: 119, number of feature columns: 99





Final Model Evaluation on Test Data areaUnderROC: 0.6731
Final Accuracy: 0.6731, Precision: 0.6813, Recall: 0.6503

Sorted Feature Importances:
Feature: original_firstorder_Variance, Importance: 0.5438
Feature: original_firstorder_90Percentile, Importance: 0.0845
Feature: original_glcm_Imc2, Importance: 0.0819
Feature: original_firstorder_Skewness, Importance: 0.0680
Feature: original_firstorder_10Percentile, Importance: 0.0508
Feature: original_glrlm_RunEntropy, Importance: 0.0288
Feature: original_glszm_HighGrayLevelZoneEmphasis, Importance: 0.0242
Feature: original_firstorder_Kurtosis, Importance: 0.0232
Feature: original_glcm_ClusterProminence, Importance: 0.0216
Feature: original_glszm_SmallAreaEmphasis, Importance: 0.0172
Feature: original_firstorder_Entropy, Importance: 0.0120
Feature: original_glrlm_GrayLevelNonUniformityNormalized, Importance: 0.0107
Feature: original_glcm_Imc1, Importance: 0.0105
Feature: original_firstorder_Energy, Importance: 0.0099
Feature: original_firsto

#### 256 - with lesion mask

In [19]:
file_name = 'features_256_lesion_mask.csv'

In [20]:
decision_tree = DecisionTreeClassifier(labelCol = "class", featuresCol = "features")

# Set up parameter grid for hyperparameter tuning
param_grid = ParamGridBuilder() \
    .addGrid(decision_tree.maxDepth, [5, 10, 15, 20]) \
    .addGrid(decision_tree.maxBins, [32, 64]) \
    .build()

# Call the main function to train and evaluate the model
model = main(decision_tree, file_name, param_grid, is_tree=True)


 Setting up

No overlapping patients between training and test sets.
Training size: 2255 (80.74%), Test size: 538 rows (19.26%)

 Hyperparameter tunning

Fold 1: 510 rows (22.62%) | Fold 2: 414 rows (18.36%) | Fold 3: 417 rows (18.49%) | Fold 4: 474 rows (21.02%) | Fold 5: 440 rows (19.51%)


Hyperparameter Tuning:  12%|█▎        | 1/8 [00:14<01:38, 14.12s/it]

Average areaUnderROC: 0.9719 | Evaluating Parameters: maxDepth: 5, maxBins: 32


Hyperparameter Tuning:  25%|██▌       | 2/8 [00:27<01:23, 13.94s/it]

Average areaUnderROC: 0.9724 | Evaluating Parameters: maxDepth: 5, maxBins: 64


Hyperparameter Tuning:  38%|███▊      | 3/8 [00:42<01:10, 14.18s/it]

Average areaUnderROC: 0.9524 | Evaluating Parameters: maxDepth: 10, maxBins: 32


Hyperparameter Tuning:  50%|█████     | 4/8 [00:56<00:56, 14.23s/it]

Average areaUnderROC: 0.9515 | Evaluating Parameters: maxDepth: 10, maxBins: 64


Hyperparameter Tuning:  62%|██████▎   | 5/8 [01:11<00:43, 14.48s/it]

Average areaUnderROC: 0.9474 | Evaluating Parameters: maxDepth: 15, maxBins: 32


Hyperparameter Tuning:  75%|███████▌  | 6/8 [01:26<00:29, 14.61s/it]

Average areaUnderROC: 0.9482 | Evaluating Parameters: maxDepth: 15, maxBins: 64


Hyperparameter Tuning:  88%|████████▊ | 7/8 [01:41<00:14, 14.72s/it]

Average areaUnderROC: 0.9479 | Evaluating Parameters: maxDepth: 20, maxBins: 32


Hyperparameter Tuning: 100%|██████████| 8/8 [01:56<00:00, 14.53s/it]

Average areaUnderROC: 0.9482 | Evaluating Parameters: maxDepth: 20, maxBins: 64
Best Overall Parameters: maxDepth: 5, maxBins: 64
Best areaUnderROC: 0.9724

 Testing model on test dataset

Number of initial columns: 119, number of feature columns: 58





Final Model Evaluation on Test Data areaUnderROC: 0.9740
Final Accuracy: 0.9740, Precision: 0.9636, Recall: 0.9851

Sorted Feature Importances:
Feature: diagnostics_Mask-original_VoxelNum, Importance: 0.9875
Feature: original_firstorder_90Percentile, Importance: 0.0076
Feature: original_firstorder_Median, Importance: 0.0020
Feature: original_firstorder_Skewness, Importance: 0.0015
Feature: original_glszm_SmallAreaHighGrayLevelEmphasis, Importance: 0.0013
Feature: diagnostics_Image-original_Dimensionality, Importance: 0.0000
Feature: diagnostics_Image-original_Mean, Importance: 0.0000
Feature: diagnostics_Image-original_Minimum, Importance: 0.0000
Feature: diagnostics_Image-original_Maximum, Importance: 0.0000
Feature: diagnostics_Mask-original_VolumeNum, Importance: 0.0000
Feature: original_firstorder_10Percentile, Importance: 0.0000
Feature: original_firstorder_Energy, Importance: 0.0000
Feature: original_firstorder_Entropy, Importance: 0.0000
Feature: original_firstorder_Interquartil

In [21]:
random_forest = RandomForestClassifier(labelCol="class", featuresCol="features")

# Set up parameter grid for hyperparameter tuning
rf_param_grid = ParamGridBuilder() \
    .addGrid(random_forest.numTrees, [50, 150]) \
    .addGrid(random_forest.maxDepth, [5, 10, 15]) \
    .addGrid(random_forest.maxBins, [32, 64, 128]) \
    .addGrid(random_forest.featureSubsetStrategy, ['auto', 'sqrt']) \
    .build()

# Call the main function to train and evaluate the model
model = main(random_forest, file_name, rf_param_grid, is_tree=True)


 Setting up

No overlapping patients between training and test sets.
Training size: 2255 (80.74%), Test size: 538 rows (19.26%)

 Hyperparameter tunning

Fold 1: 510 rows (22.62%) | Fold 2: 414 rows (18.36%) | Fold 3: 417 rows (18.49%) | Fold 4: 474 rows (21.02%) | Fold 5: 440 rows (19.51%)


Hyperparameter Tuning:   3%|▎         | 1/36 [00:15<08:47, 15.06s/it]

Average areaUnderROC: 0.9605 | Evaluating Parameters: numTrees: 50, maxDepth: 5, maxBins: 32, featureSubsetStrategy: auto


Hyperparameter Tuning:   6%|▌         | 2/36 [00:30<08:30, 15.01s/it]

Average areaUnderROC: 0.9605 | Evaluating Parameters: numTrees: 50, maxDepth: 5, maxBins: 32, featureSubsetStrategy: sqrt


Hyperparameter Tuning:   8%|▊         | 3/36 [00:45<08:18, 15.10s/it]

Average areaUnderROC: 0.9598 | Evaluating Parameters: numTrees: 50, maxDepth: 5, maxBins: 64, featureSubsetStrategy: auto


Hyperparameter Tuning:  11%|█         | 4/36 [01:00<08:05, 15.19s/it]

Average areaUnderROC: 0.9598 | Evaluating Parameters: numTrees: 50, maxDepth: 5, maxBins: 64, featureSubsetStrategy: sqrt


Hyperparameter Tuning:  14%|█▍        | 5/36 [01:16<08:02, 15.58s/it]

Average areaUnderROC: 0.9582 | Evaluating Parameters: numTrees: 50, maxDepth: 5, maxBins: 128, featureSubsetStrategy: auto


Hyperparameter Tuning:  17%|█▋        | 6/36 [01:32<07:52, 15.75s/it]

Average areaUnderROC: 0.9582 | Evaluating Parameters: numTrees: 50, maxDepth: 5, maxBins: 128, featureSubsetStrategy: sqrt


Hyperparameter Tuning:  19%|█▉        | 7/36 [01:51<07:59, 16.53s/it]

Average areaUnderROC: 0.9659 | Evaluating Parameters: numTrees: 50, maxDepth: 10, maxBins: 32, featureSubsetStrategy: auto


Hyperparameter Tuning:  22%|██▏       | 8/36 [02:09<08:00, 17.16s/it]

Average areaUnderROC: 0.9659 | Evaluating Parameters: numTrees: 50, maxDepth: 10, maxBins: 32, featureSubsetStrategy: sqrt


Hyperparameter Tuning:  25%|██▌       | 9/36 [02:28<07:58, 17.72s/it]

Average areaUnderROC: 0.9633 | Evaluating Parameters: numTrees: 50, maxDepth: 10, maxBins: 64, featureSubsetStrategy: auto


Hyperparameter Tuning:  28%|██▊       | 10/36 [02:47<07:50, 18.10s/it]

Average areaUnderROC: 0.9633 | Evaluating Parameters: numTrees: 50, maxDepth: 10, maxBins: 64, featureSubsetStrategy: sqrt


Hyperparameter Tuning:  31%|███       | 11/36 [03:08<07:53, 18.95s/it]

Average areaUnderROC: 0.9642 | Evaluating Parameters: numTrees: 50, maxDepth: 10, maxBins: 128, featureSubsetStrategy: auto


Hyperparameter Tuning:  33%|███▎      | 12/36 [03:29<07:48, 19.53s/it]

Average areaUnderROC: 0.9642 | Evaluating Parameters: numTrees: 50, maxDepth: 10, maxBins: 128, featureSubsetStrategy: sqrt


Hyperparameter Tuning:  36%|███▌      | 13/36 [03:49<07:35, 19.80s/it]

Average areaUnderROC: 0.9641 | Evaluating Parameters: numTrees: 50, maxDepth: 15, maxBins: 32, featureSubsetStrategy: auto


Hyperparameter Tuning:  39%|███▉      | 14/36 [04:10<07:22, 20.11s/it]

Average areaUnderROC: 0.9641 | Evaluating Parameters: numTrees: 50, maxDepth: 15, maxBins: 32, featureSubsetStrategy: sqrt


Hyperparameter Tuning:  42%|████▏     | 15/36 [04:32<07:12, 20.58s/it]

Average areaUnderROC: 0.9633 | Evaluating Parameters: numTrees: 50, maxDepth: 15, maxBins: 64, featureSubsetStrategy: auto


Hyperparameter Tuning:  44%|████▍     | 16/36 [04:53<06:58, 20.92s/it]

Average areaUnderROC: 0.9633 | Evaluating Parameters: numTrees: 50, maxDepth: 15, maxBins: 64, featureSubsetStrategy: sqrt


Hyperparameter Tuning:  47%|████▋     | 17/36 [05:18<06:56, 21.93s/it]

Average areaUnderROC: 0.9628 | Evaluating Parameters: numTrees: 50, maxDepth: 15, maxBins: 128, featureSubsetStrategy: auto


Hyperparameter Tuning:  50%|█████     | 18/36 [05:43<06:51, 22.87s/it]

Average areaUnderROC: 0.9628 | Evaluating Parameters: numTrees: 50, maxDepth: 15, maxBins: 128, featureSubsetStrategy: sqrt


Hyperparameter Tuning:  53%|█████▎    | 19/36 [06:00<06:02, 21.32s/it]

Average areaUnderROC: 0.9646 | Evaluating Parameters: numTrees: 150, maxDepth: 5, maxBins: 32, featureSubsetStrategy: auto


Hyperparameter Tuning:  56%|█████▌    | 20/36 [06:18<05:23, 20.21s/it]

Average areaUnderROC: 0.9646 | Evaluating Parameters: numTrees: 150, maxDepth: 5, maxBins: 32, featureSubsetStrategy: sqrt


Hyperparameter Tuning:  58%|█████▊    | 21/36 [06:37<04:58, 19.87s/it]

Average areaUnderROC: 0.9609 | Evaluating Parameters: numTrees: 150, maxDepth: 5, maxBins: 64, featureSubsetStrategy: auto


Hyperparameter Tuning:  61%|██████    | 22/36 [06:56<04:36, 19.73s/it]

Average areaUnderROC: 0.9609 | Evaluating Parameters: numTrees: 150, maxDepth: 5, maxBins: 64, featureSubsetStrategy: sqrt


Hyperparameter Tuning:  64%|██████▍   | 23/36 [07:19<04:26, 20.53s/it]

Average areaUnderROC: 0.9613 | Evaluating Parameters: numTrees: 150, maxDepth: 5, maxBins: 128, featureSubsetStrategy: auto


Hyperparameter Tuning:  67%|██████▋   | 24/36 [07:42<04:17, 21.43s/it]

Average areaUnderROC: 0.9613 | Evaluating Parameters: numTrees: 150, maxDepth: 5, maxBins: 128, featureSubsetStrategy: sqrt


Hyperparameter Tuning:  69%|██████▉   | 25/36 [08:11<04:19, 23.61s/it]

Average areaUnderROC: 0.9666 | Evaluating Parameters: numTrees: 150, maxDepth: 10, maxBins: 32, featureSubsetStrategy: auto


Hyperparameter Tuning:  72%|███████▏  | 26/36 [08:40<04:11, 25.17s/it]

Average areaUnderROC: 0.9666 | Evaluating Parameters: numTrees: 150, maxDepth: 10, maxBins: 32, featureSubsetStrategy: sqrt


Hyperparameter Tuning:  75%|███████▌  | 27/36 [09:15<04:13, 28.20s/it]

Average areaUnderROC: 0.9658 | Evaluating Parameters: numTrees: 150, maxDepth: 10, maxBins: 64, featureSubsetStrategy: auto


Hyperparameter Tuning:  78%|███████▊  | 28/36 [09:48<03:55, 29.48s/it]

Average areaUnderROC: 0.9658 | Evaluating Parameters: numTrees: 150, maxDepth: 10, maxBins: 64, featureSubsetStrategy: sqrt


Hyperparameter Tuning:  81%|████████  | 29/36 [10:31<03:54, 33.54s/it]

Average areaUnderROC: 0.9642 | Evaluating Parameters: numTrees: 150, maxDepth: 10, maxBins: 128, featureSubsetStrategy: auto


Hyperparameter Tuning:  83%|████████▎ | 30/36 [11:13<03:37, 36.30s/it]

Average areaUnderROC: 0.9642 | Evaluating Parameters: numTrees: 150, maxDepth: 10, maxBins: 128, featureSubsetStrategy: sqrt


Hyperparameter Tuning:  86%|████████▌ | 31/36 [11:51<03:03, 36.79s/it]

Average areaUnderROC: 0.9648 | Evaluating Parameters: numTrees: 150, maxDepth: 15, maxBins: 32, featureSubsetStrategy: auto


Hyperparameter Tuning:  89%|████████▉ | 32/36 [12:27<02:25, 36.47s/it]

Average areaUnderROC: 0.9648 | Evaluating Parameters: numTrees: 150, maxDepth: 15, maxBins: 32, featureSubsetStrategy: sqrt


Hyperparameter Tuning:  92%|█████████▏| 33/36 [13:10<01:54, 38.33s/it]

Average areaUnderROC: 0.9653 | Evaluating Parameters: numTrees: 150, maxDepth: 15, maxBins: 64, featureSubsetStrategy: auto


Hyperparameter Tuning:  94%|█████████▍| 34/36 [13:53<01:19, 39.69s/it]

Average areaUnderROC: 0.9653 | Evaluating Parameters: numTrees: 150, maxDepth: 15, maxBins: 64, featureSubsetStrategy: sqrt


Hyperparameter Tuning:  97%|█████████▋| 35/36 [14:45<00:43, 43.52s/it]

Average areaUnderROC: 0.9639 | Evaluating Parameters: numTrees: 150, maxDepth: 15, maxBins: 128, featureSubsetStrategy: auto


Hyperparameter Tuning: 100%|██████████| 36/36 [15:37<00:00, 26.05s/it]

Average areaUnderROC: 0.9639 | Evaluating Parameters: numTrees: 150, maxDepth: 15, maxBins: 128, featureSubsetStrategy: sqrt
Best Overall Parameters: numTrees: 150, maxDepth: 10, maxBins: 32, featureSubsetStrategy: auto
Best areaUnderROC: 0.9666

 Testing model on test dataset

Number of initial columns: 119, number of feature columns: 58





Final Model Evaluation on Test Data areaUnderROC: 0.9740
Final Accuracy: 0.9740, Precision: 0.9636, Recall: 0.9851

Sorted Feature Importances:
Feature: diagnostics_Mask-original_VoxelNum, Importance: 0.2132
Feature: original_gldm_GrayLevelNonUniformity, Importance: 0.1198
Feature: original_glszm_GrayLevelNonUniformity, Importance: 0.0705
Feature: original_glszm_SizeZoneNonUniformity, Importance: 0.0655
Feature: original_gldm_DependenceNonUniformity, Importance: 0.0547
Feature: original_firstorder_Mean, Importance: 0.0527
Feature: original_firstorder_RootMeanSquared, Importance: 0.0520
Feature: original_firstorder_Median, Importance: 0.0510
Feature: original_firstorder_90Percentile, Importance: 0.0362
Feature: original_ngtdm_Busyness, Importance: 0.0218
Feature: original_firstorder_Skewness, Importance: 0.0217
Feature: original_firstorder_Range, Importance: 0.0199
Feature: original_ngtdm_Coarseness, Importance: 0.0191
Feature: original_firstorder_Minimum, Importance: 0.0122
Feature: or

In [22]:
# Linear SVC with Standard Scaler
svc = LinearSVC(labelCol="class", featuresCol="features")

# Set up the parameter grid for hyperparameter tuning
svc_param_grid = ParamGridBuilder() \
    .addGrid(svc.maxIter, [100, 500, 1000]) \
    .addGrid(svc.regParam, [0.01, 0.1]) \
    .addGrid(svc.tol, [1e-4,  1e-2]) \
    .build()

# Call the main function with the XGBoost classifier
model = main(svc, file_name, svc_param_grid, is_tree=False, use_standard_scaler=True)



 Setting up

No overlapping patients between training and test sets.
Training size: 2255 (80.74%), Test size: 538 rows (19.26%)

 Hyperparameter tunning

Fold 1: 510 rows (22.62%) | Fold 2: 414 rows (18.36%) | Fold 3: 417 rows (18.49%) | Fold 4: 474 rows (21.02%) | Fold 5: 440 rows (19.51%)


Hyperparameter Tuning:   8%|▊         | 1/12 [00:33<06:06, 33.32s/it]

Average areaUnderROC: 0.9361 | Evaluating Parameters: maxIter: 100, regParam: 0.01, tol: 0.0001


Hyperparameter Tuning:  17%|█▋        | 2/12 [00:55<04:26, 26.62s/it]

Average areaUnderROC: 0.9338 | Evaluating Parameters: maxIter: 100, regParam: 0.01, tol: 0.01


Hyperparameter Tuning:  25%|██▌       | 3/12 [01:17<03:41, 24.63s/it]

Average areaUnderROC: 0.9189 | Evaluating Parameters: maxIter: 100, regParam: 0.1, tol: 0.0001


Hyperparameter Tuning:  33%|███▎      | 4/12 [01:35<02:57, 22.19s/it]

Average areaUnderROC: 0.9189 | Evaluating Parameters: maxIter: 100, regParam: 0.1, tol: 0.01


Hyperparameter Tuning:  42%|████▏     | 5/12 [02:14<03:17, 28.15s/it]

Average areaUnderROC: 0.9361 | Evaluating Parameters: maxIter: 500, regParam: 0.01, tol: 0.0001


Hyperparameter Tuning:  50%|█████     | 6/12 [02:33<02:29, 24.91s/it]

Average areaUnderROC: 0.9338 | Evaluating Parameters: maxIter: 500, regParam: 0.01, tol: 0.01


Hyperparameter Tuning:  58%|█████▊    | 7/12 [02:57<02:02, 24.59s/it]

Average areaUnderROC: 0.9189 | Evaluating Parameters: maxIter: 500, regParam: 0.1, tol: 0.0001


Hyperparameter Tuning:  67%|██████▋   | 8/12 [03:15<01:30, 22.65s/it]

Average areaUnderROC: 0.9189 | Evaluating Parameters: maxIter: 500, regParam: 0.1, tol: 0.01


Hyperparameter Tuning:  75%|███████▌  | 9/12 [03:54<01:22, 27.55s/it]

Average areaUnderROC: 0.9361 | Evaluating Parameters: maxIter: 1000, regParam: 0.01, tol: 0.0001


Hyperparameter Tuning:  83%|████████▎ | 10/12 [04:12<00:49, 24.85s/it]

Average areaUnderROC: 0.9338 | Evaluating Parameters: maxIter: 1000, regParam: 0.01, tol: 0.01


Hyperparameter Tuning:  92%|█████████▏| 11/12 [04:37<00:24, 24.91s/it]

Average areaUnderROC: 0.9189 | Evaluating Parameters: maxIter: 1000, regParam: 0.1, tol: 0.0001


Hyperparameter Tuning: 100%|██████████| 12/12 [04:56<00:00, 24.73s/it]

Average areaUnderROC: 0.9189 | Evaluating Parameters: maxIter: 1000, regParam: 0.1, tol: 0.01
Best Overall Parameters: maxIter: 100, regParam: 0.01, tol: 0.0001
Best areaUnderROC: 0.9361

 Testing model on test dataset






Number of initial columns: 119, number of feature columns: 58
Final Model Evaluation on Test Data areaUnderROC: 0.9610
Final Accuracy: 0.9610, Precision: 0.9397, Recall: 0.9851

First 5 names of wrongly classified rows:
['hcs_003-000017_003-000017_MG_BL_Series-1004_Image-1004-1.png', 'hcs_003-000035_003-000035_MG_BL_Series-1002_Image-1002-1.png', 'hcs_003-000307_003-000307_MG_BL_Series-4_Image-1-0.png', 'hcs_003-000971_003-000971_MG_BL_Series-1004_Image-1004-1.png', 'hcs_003-001334_003-001334_MG_TP0_1_Series-1001_Image-1004-1.png']


In [23]:
xgb_classifier = SparkXGBClassifier(label_col="class", features_col="features", use_gpu=False) 

# Set up the parameter grid for hyperparameter tuning
xgb_param_grid = ParamGridBuilder() \
    .addGrid(xgb_classifier.max_depth, [3, 6, 9, 12]) \
    .addGrid(xgb_classifier.n_estimators, [50, 100, 200]) \
    .addGrid(xgb_classifier.learning_rate, [0.1]) \
    .addGrid(xgb_classifier.subsample, [0.8]) \
    .addGrid(xgb_classifier.colsample_bytree, [0.8]) \
    .build()

# Call the main function to train and evaluate the model
model = main(xgb_classifier, file_name, xgb_param_grid, is_tree=False)


 Setting up

No overlapping patients between training and test sets.
Training size: 2255 (80.74%), Test size: 538 rows (19.26%)

 Hyperparameter tunning

Fold 1: 510 rows (22.62%) | Fold 2: 414 rows (18.36%) | Fold 3: 417 rows (18.49%) | Fold 4: 474 rows (21.02%) | Fold 5: 440 rows (19.51%)


Hyperparameter Tuning:   0%|          | 0/12 [00:00<?, ?it/s]2024-09-19 21:23:10,285 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 3, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 50}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-19 21:23:15,004 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-19 21:23:20,252 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 3, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 50}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-19 21:23:24,926 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-19 21:23:30,176 INFO XGBoos

Average areaUnderROC: 0.9709 | Evaluating Parameters: max_depth: 3, n_estimators: 50, learning_rate: 0.1, subsample: 0.8, colsample_bytree: 0.8


2024-09-19 21:24:01,234 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 3, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-19 21:24:06,768 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-19 21:24:13,691 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 3, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-19 21:24:19,357 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-19 21:24:26,373 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	bo

Average areaUnderROC: 0.9692 | Evaluating Parameters: max_depth: 3, n_estimators: 100, learning_rate: 0.1, subsample: 0.8, colsample_bytree: 0.8


2024-09-19 21:25:17,075 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 3, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 200}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-19 21:25:23,144 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-19 21:25:28,258 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 3, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 200}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-19 21:25:33,082 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-19 21:25:38,278 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	bo

Average areaUnderROC: 0.9684 | Evaluating Parameters: max_depth: 3, n_estimators: 200, learning_rate: 0.1, subsample: 0.8, colsample_bytree: 0.8


2024-09-19 21:26:08,628 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 6, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 50}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-19 21:26:13,383 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-19 21:26:18,666 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 6, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 50}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-19 21:26:23,479 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-19 21:26:28,738 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	boos

Average areaUnderROC: 0.9693 | Evaluating Parameters: max_depth: 6, n_estimators: 50, learning_rate: 0.1, subsample: 0.8, colsample_bytree: 0.8


2024-09-19 21:26:59,128 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 6, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-19 21:27:04,735 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-19 21:27:11,763 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 6, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-19 21:27:17,356 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-19 21:27:24,228 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	bo

Average areaUnderROC: 0.9706 | Evaluating Parameters: max_depth: 6, n_estimators: 100, learning_rate: 0.1, subsample: 0.8, colsample_bytree: 0.8


2024-09-19 21:27:56,643 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 6, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 200}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-19 21:28:01,761 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-19 21:28:08,539 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 6, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 200}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-19 21:28:14,327 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-19 21:28:21,287 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	bo

Average areaUnderROC: 0.9698 | Evaluating Parameters: max_depth: 6, n_estimators: 200, learning_rate: 0.1, subsample: 0.8, colsample_bytree: 0.8


2024-09-19 21:28:54,883 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 9, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 50}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-19 21:28:59,774 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-19 21:29:06,755 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 9, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 50}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-19 21:29:12,353 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-19 21:29:19,218 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	boos

Average areaUnderROC: 0.9683 | Evaluating Parameters: max_depth: 9, n_estimators: 50, learning_rate: 0.1, subsample: 0.8, colsample_bytree: 0.8


2024-09-19 21:29:53,467 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 9, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-19 21:29:58,610 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-19 21:30:04,588 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 9, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-19 21:30:10,295 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-19 21:30:17,187 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	bo

Average areaUnderROC: 0.9693 | Evaluating Parameters: max_depth: 9, n_estimators: 100, learning_rate: 0.1, subsample: 0.8, colsample_bytree: 0.8


2024-09-19 21:30:54,652 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 9, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 200}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-19 21:30:59,860 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-19 21:31:05,215 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 9, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 200}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-19 21:31:10,460 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-19 21:31:17,215 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	bo

Average areaUnderROC: 0.9693 | Evaluating Parameters: max_depth: 9, n_estimators: 200, learning_rate: 0.1, subsample: 0.8, colsample_bytree: 0.8


2024-09-19 21:31:55,639 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 12, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 50}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-19 21:32:01,199 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-19 21:32:08,107 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 12, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 50}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-19 21:32:13,660 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-19 21:32:18,997 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	bo

Average areaUnderROC: 0.9701 | Evaluating Parameters: max_depth: 12, n_estimators: 50, learning_rate: 0.1, subsample: 0.8, colsample_bytree: 0.8


2024-09-19 21:32:54,760 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 12, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-19 21:33:00,564 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-19 21:33:07,495 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 12, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-19 21:33:13,356 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-19 21:33:20,425 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	

Average areaUnderROC: 0.9703 | Evaluating Parameters: max_depth: 12, n_estimators: 100, learning_rate: 0.1, subsample: 0.8, colsample_bytree: 0.8


2024-09-19 21:33:56,998 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 12, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 200}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-19 21:34:02,727 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-19 21:34:09,648 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 12, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 200}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-19 21:34:15,590 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-19 21:34:22,636 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	

Average areaUnderROC: 0.9707 | Evaluating Parameters: max_depth: 12, n_estimators: 200, learning_rate: 0.1, subsample: 0.8, colsample_bytree: 0.8
Best Overall Parameters: max_depth: 3, n_estimators: 50, learning_rate: 0.1, subsample: 0.8, colsample_bytree: 0.8
Best areaUnderROC: 0.9709

 Testing model on test dataset

Number of initial columns: 119, number of feature columns: 58



2024-09-19 21:34:59,246 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 3, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 50}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-19 21:35:04,998 INFO XGBoost-PySpark: _fit Finished xgboost training!


Final Model Evaluation on Test Data areaUnderROC: 0.9740
Final Accuracy: 0.9740, Precision: 0.9570, Recall: 0.9926

First 5 names of wrongly classified rows:
['auth_001-000084_001-000084_MG_BL_Series-3_Image-1-0.png', 'hcs_003-000245_003-000245_MG_BL_Series-1008_Image-1-1.png', 'hcs_003-000286_003-000286_MG_BL_Series-1010_Image-1-0.png', 'hcs_003-000971_003-000971_MG_BL_Series-1004_Image-1004-1.png', 'hcs_003-001204_003-001204_MG_BL_Series-1001_Image-1002-0.png']


#### 256 - with full mask

In [24]:
file_name = 'features_256_full_mask.csv'

In [25]:
decision_tree = DecisionTreeClassifier(labelCol = "class", featuresCol = "features")

# Set up parameter grid for hyperparameter tuning
param_grid = ParamGridBuilder() \
    .addGrid(decision_tree.maxDepth, [5, 10, 15, 20]) \
    .addGrid(decision_tree.maxBins, [32, 64]) \
    .build()

# Call the main function to train and evaluate the model
model = main(decision_tree, file_name, param_grid, is_tree=True)


 Setting up

No overlapping patients between training and test sets.
Training size: 2286 (81.82%), Test size: 508 rows (18.18%)

 Hyperparameter tunning

Fold 1: 494 rows (21.61%) | Fold 2: 398 rows (17.41%) | Fold 3: 448 rows (19.60%) | Fold 4: 480 rows (21.00%) | Fold 5: 466 rows (20.38%)


Hyperparameter Tuning:  12%|█▎        | 1/8 [00:17<02:05, 17.96s/it]

Average areaUnderROC: 0.7169 | Evaluating Parameters: maxDepth: 5, maxBins: 32


Hyperparameter Tuning:  25%|██▌       | 2/8 [00:37<01:53, 18.88s/it]

Average areaUnderROC: 0.7226 | Evaluating Parameters: maxDepth: 5, maxBins: 64


Hyperparameter Tuning:  38%|███▊      | 3/8 [00:56<01:34, 18.93s/it]

Average areaUnderROC: 0.6844 | Evaluating Parameters: maxDepth: 10, maxBins: 32


Hyperparameter Tuning:  50%|█████     | 4/8 [01:17<01:18, 19.60s/it]

Average areaUnderROC: 0.6656 | Evaluating Parameters: maxDepth: 10, maxBins: 64


Hyperparameter Tuning:  62%|██████▎   | 5/8 [01:39<01:01, 20.44s/it]

Average areaUnderROC: 0.6628 | Evaluating Parameters: maxDepth: 15, maxBins: 32


Hyperparameter Tuning:  75%|███████▌  | 6/8 [01:59<00:40, 20.40s/it]

Average areaUnderROC: 0.6569 | Evaluating Parameters: maxDepth: 15, maxBins: 64


Hyperparameter Tuning:  88%|████████▊ | 7/8 [02:23<00:21, 21.47s/it]

Average areaUnderROC: 0.6582 | Evaluating Parameters: maxDepth: 20, maxBins: 32


Hyperparameter Tuning: 100%|██████████| 8/8 [02:43<00:00, 20.41s/it]

Average areaUnderROC: 0.6489 | Evaluating Parameters: maxDepth: 20, maxBins: 64
Best Overall Parameters: maxDepth: 5, maxBins: 64
Best areaUnderROC: 0.7226

 Testing model on test dataset

Number of initial columns: 119, number of feature columns: 99





Final Model Evaluation on Test Data areaUnderROC: 0.7244
Final Accuracy: 0.7244, Precision: 0.7280, Recall: 0.7165

Sorted Feature Importances:
Feature: original_firstorder_90Percentile, Importance: 0.7217
Feature: original_firstorder_Variance, Importance: 0.0899
Feature: original_firstorder_Kurtosis, Importance: 0.0313
Feature: original_firstorder_MeanAbsoluteDeviation, Importance: 0.0222
Feature: original_glcm_ClusterProminence, Importance: 0.0189
Feature: original_glszm_LargeAreaEmphasis, Importance: 0.0148
Feature: original_gldm_SmallDependenceHighGrayLevelEmphasis, Importance: 0.0148
Feature: original_glrlm_LongRunHighGrayLevelEmphasis, Importance: 0.0127
Feature: original_glrlm_ShortRunHighGrayLevelEmphasis, Importance: 0.0123
Feature: original_glszm_SmallAreaLowGrayLevelEmphasis, Importance: 0.0111
Feature: original_glszm_ZoneEntropy, Importance: 0.0109
Feature: original_glrlm_RunLengthNonUniformity, Importance: 0.0101
Feature: original_glrlm_HighGrayLevelRunEmphasis, Importance

In [26]:
random_forest = RandomForestClassifier(labelCol="class", featuresCol="features")

# Set up parameter grid for hyperparameter tuning
rf_param_grid = ParamGridBuilder() \
    .addGrid(random_forest.numTrees, [50, 150]) \
    .addGrid(random_forest.maxDepth, [5, 10, 15]) \
    .addGrid(random_forest.maxBins, [32, 64, 128]) \
    .addGrid(random_forest.featureSubsetStrategy, ['auto', 'sqrt']) \
    .build()

# Call the main function to train and evaluate the model
model = main(random_forest, file_name, rf_param_grid, is_tree=True)


 Setting up

No overlapping patients between training and test sets.
Training size: 2286 (81.82%), Test size: 508 rows (18.18%)

 Hyperparameter tunning

Fold 1: 494 rows (21.61%) | Fold 2: 398 rows (17.41%) | Fold 3: 448 rows (19.60%) | Fold 4: 480 rows (21.00%) | Fold 5: 466 rows (20.38%)


Hyperparameter Tuning:   3%|▎         | 1/36 [00:18<10:45, 18.46s/it]

Average areaUnderROC: 0.7319 | Evaluating Parameters: numTrees: 50, maxDepth: 5, maxBins: 32, featureSubsetStrategy: auto


Hyperparameter Tuning:   6%|▌         | 2/36 [00:40<11:29, 20.29s/it]

Average areaUnderROC: 0.7319 | Evaluating Parameters: numTrees: 50, maxDepth: 5, maxBins: 32, featureSubsetStrategy: sqrt


Hyperparameter Tuning:   8%|▊         | 3/36 [01:01<11:20, 20.61s/it]

Average areaUnderROC: 0.7309 | Evaluating Parameters: numTrees: 50, maxDepth: 5, maxBins: 64, featureSubsetStrategy: auto


Hyperparameter Tuning:  11%|█         | 4/36 [01:21<10:53, 20.42s/it]

Average areaUnderROC: 0.7309 | Evaluating Parameters: numTrees: 50, maxDepth: 5, maxBins: 64, featureSubsetStrategy: sqrt


Hyperparameter Tuning:  14%|█▍        | 5/36 [01:45<11:15, 21.79s/it]

Average areaUnderROC: 0.7353 | Evaluating Parameters: numTrees: 50, maxDepth: 5, maxBins: 128, featureSubsetStrategy: auto


Hyperparameter Tuning:  17%|█▋        | 6/36 [02:04<10:27, 20.92s/it]

Average areaUnderROC: 0.7353 | Evaluating Parameters: numTrees: 50, maxDepth: 5, maxBins: 128, featureSubsetStrategy: sqrt


Hyperparameter Tuning:  19%|█▉        | 7/36 [02:34<11:36, 24.02s/it]

Average areaUnderROC: 0.7334 | Evaluating Parameters: numTrees: 50, maxDepth: 10, maxBins: 32, featureSubsetStrategy: auto


Hyperparameter Tuning:  22%|██▏       | 8/36 [03:07<12:26, 26.67s/it]

Average areaUnderROC: 0.7334 | Evaluating Parameters: numTrees: 50, maxDepth: 10, maxBins: 32, featureSubsetStrategy: sqrt


Hyperparameter Tuning:  25%|██▌       | 9/36 [03:43<13:19, 29.62s/it]

Average areaUnderROC: 0.7302 | Evaluating Parameters: numTrees: 50, maxDepth: 10, maxBins: 64, featureSubsetStrategy: auto


Hyperparameter Tuning:  28%|██▊       | 10/36 [04:23<14:11, 32.75s/it]

Average areaUnderROC: 0.7302 | Evaluating Parameters: numTrees: 50, maxDepth: 10, maxBins: 64, featureSubsetStrategy: sqrt


Hyperparameter Tuning:  31%|███       | 11/36 [05:10<15:28, 37.14s/it]

Average areaUnderROC: 0.7318 | Evaluating Parameters: numTrees: 50, maxDepth: 10, maxBins: 128, featureSubsetStrategy: auto


Hyperparameter Tuning:  33%|███▎      | 12/36 [05:58<16:09, 40.39s/it]

Average areaUnderROC: 0.7318 | Evaluating Parameters: numTrees: 50, maxDepth: 10, maxBins: 128, featureSubsetStrategy: sqrt


Hyperparameter Tuning:  36%|███▌      | 13/36 [06:48<16:41, 43.54s/it]

Average areaUnderROC: 0.7259 | Evaluating Parameters: numTrees: 50, maxDepth: 15, maxBins: 32, featureSubsetStrategy: auto


Hyperparameter Tuning:  39%|███▉      | 14/36 [07:35<16:19, 44.54s/it]

Average areaUnderROC: 0.7259 | Evaluating Parameters: numTrees: 50, maxDepth: 15, maxBins: 32, featureSubsetStrategy: sqrt


Hyperparameter Tuning:  42%|████▏     | 15/36 [08:35<17:11, 49.10s/it]

Average areaUnderROC: 0.7350 | Evaluating Parameters: numTrees: 50, maxDepth: 15, maxBins: 64, featureSubsetStrategy: auto


Hyperparameter Tuning:  44%|████▍     | 16/36 [09:30<16:59, 50.96s/it]

Average areaUnderROC: 0.7350 | Evaluating Parameters: numTrees: 50, maxDepth: 15, maxBins: 64, featureSubsetStrategy: sqrt


Hyperparameter Tuning:  47%|████▋     | 17/36 [10:45<18:24, 58.11s/it]

Average areaUnderROC: 0.7326 | Evaluating Parameters: numTrees: 50, maxDepth: 15, maxBins: 128, featureSubsetStrategy: auto


Hyperparameter Tuning:  50%|█████     | 18/36 [11:59<18:50, 62.78s/it]

Average areaUnderROC: 0.7326 | Evaluating Parameters: numTrees: 50, maxDepth: 15, maxBins: 128, featureSubsetStrategy: sqrt


Hyperparameter Tuning:  53%|█████▎    | 19/36 [12:30<15:04, 53.23s/it]

Average areaUnderROC: 0.7323 | Evaluating Parameters: numTrees: 150, maxDepth: 5, maxBins: 32, featureSubsetStrategy: auto


Hyperparameter Tuning:  56%|█████▌    | 20/36 [13:00<12:21, 46.37s/it]

Average areaUnderROC: 0.7323 | Evaluating Parameters: numTrees: 150, maxDepth: 5, maxBins: 32, featureSubsetStrategy: sqrt


Hyperparameter Tuning:  58%|█████▊    | 21/36 [13:34<10:40, 42.69s/it]

Average areaUnderROC: 0.7340 | Evaluating Parameters: numTrees: 150, maxDepth: 5, maxBins: 64, featureSubsetStrategy: auto


Hyperparameter Tuning:  61%|██████    | 22/36 [14:09<09:23, 40.27s/it]

Average areaUnderROC: 0.7340 | Evaluating Parameters: numTrees: 150, maxDepth: 5, maxBins: 64, featureSubsetStrategy: sqrt


Hyperparameter Tuning:  64%|██████▍   | 23/36 [14:49<08:45, 40.42s/it]

Average areaUnderROC: 0.7300 | Evaluating Parameters: numTrees: 150, maxDepth: 5, maxBins: 128, featureSubsetStrategy: auto


Hyperparameter Tuning:  67%|██████▋   | 24/36 [15:29<08:03, 40.25s/it]

Average areaUnderROC: 0.7300 | Evaluating Parameters: numTrees: 150, maxDepth: 5, maxBins: 128, featureSubsetStrategy: sqrt


Hyperparameter Tuning:  69%|██████▉   | 25/36 [16:38<08:57, 48.88s/it]

Average areaUnderROC: 0.7345 | Evaluating Parameters: numTrees: 150, maxDepth: 10, maxBins: 32, featureSubsetStrategy: auto


Hyperparameter Tuning:  72%|███████▏  | 26/36 [17:48<09:10, 55.08s/it]

Average areaUnderROC: 0.7345 | Evaluating Parameters: numTrees: 150, maxDepth: 10, maxBins: 32, featureSubsetStrategy: sqrt


Hyperparameter Tuning:  75%|███████▌  | 27/36 [19:19<09:52, 65.78s/it]

Average areaUnderROC: 0.7368 | Evaluating Parameters: numTrees: 150, maxDepth: 10, maxBins: 64, featureSubsetStrategy: auto


Hyperparameter Tuning:  78%|███████▊  | 28/36 [20:43<09:31, 71.48s/it]

Average areaUnderROC: 0.7368 | Evaluating Parameters: numTrees: 150, maxDepth: 10, maxBins: 64, featureSubsetStrategy: sqrt


Hyperparameter Tuning:  81%|████████  | 29/36 [22:53<10:22, 89.00s/it]

Average areaUnderROC: 0.7347 | Evaluating Parameters: numTrees: 150, maxDepth: 10, maxBins: 128, featureSubsetStrategy: auto


Hyperparameter Tuning:  83%|████████▎ | 30/36 [24:39<09:24, 94.13s/it]

Average areaUnderROC: 0.7347 | Evaluating Parameters: numTrees: 150, maxDepth: 10, maxBins: 128, featureSubsetStrategy: sqrt


Hyperparameter Tuning:  86%|████████▌ | 31/36 [26:27<08:11, 98.27s/it]

Average areaUnderROC: 0.7292 | Evaluating Parameters: numTrees: 150, maxDepth: 15, maxBins: 32, featureSubsetStrategy: auto


Hyperparameter Tuning:  89%|████████▉ | 32/36 [28:15<06:44, 101.06s/it]

Average areaUnderROC: 0.7292 | Evaluating Parameters: numTrees: 150, maxDepth: 15, maxBins: 32, featureSubsetStrategy: sqrt


Hyperparameter Tuning:  92%|█████████▏| 33/36 [30:21<05:25, 108.56s/it]

Average areaUnderROC: 0.7402 | Evaluating Parameters: numTrees: 150, maxDepth: 15, maxBins: 64, featureSubsetStrategy: auto


Hyperparameter Tuning:  94%|█████████▍| 34/36 [32:25<03:46, 113.11s/it]

Average areaUnderROC: 0.7402 | Evaluating Parameters: numTrees: 150, maxDepth: 15, maxBins: 64, featureSubsetStrategy: sqrt


Hyperparameter Tuning:  97%|█████████▋| 35/36 [35:21<02:12, 132.21s/it]

Average areaUnderROC: 0.7409 | Evaluating Parameters: numTrees: 150, maxDepth: 15, maxBins: 128, featureSubsetStrategy: auto


Hyperparameter Tuning: 100%|██████████| 36/36 [37:53<00:00, 63.14s/it] 

Average areaUnderROC: 0.7409 | Evaluating Parameters: numTrees: 150, maxDepth: 15, maxBins: 128, featureSubsetStrategy: sqrt
Best Overall Parameters: numTrees: 150, maxDepth: 15, maxBins: 128, featureSubsetStrategy: auto
Best areaUnderROC: 0.7409

 Testing model on test dataset

Number of initial columns: 119, number of feature columns: 99





Final Model Evaluation on Test Data areaUnderROC: 0.7441
Final Accuracy: 0.7441, Precision: 0.7562, Recall: 0.7205

Sorted Feature Importances:
Feature: original_firstorder_90Percentile, Importance: 0.0611
Feature: original_firstorder_MeanAbsoluteDeviation, Importance: 0.0532
Feature: original_firstorder_Variance, Importance: 0.0469
Feature: original_firstorder_RootMeanSquared, Importance: 0.0406
Feature: original_firstorder_RobustMeanAbsoluteDeviation, Importance: 0.0349
Feature: original_firstorder_InterquartileRange, Importance: 0.0347
Feature: original_firstorder_TotalEnergy, Importance: 0.0273
Feature: original_firstorder_Energy, Importance: 0.0255
Feature: original_firstorder_Kurtosis, Importance: 0.0255
Feature: original_firstorder_Mean, Importance: 0.0225
Feature: original_firstorder_Skewness, Importance: 0.0214
Feature: diagnostics_Image-original_Mean, Importance: 0.0213
Feature: original_firstorder_Median, Importance: 0.0194
Feature: original_glszm_SizeZoneNonUniformity, Impo

In [27]:
# Linear SVC with Standard Scaler
svc = LinearSVC(labelCol="class", featuresCol="features")

# Set up the parameter grid for hyperparameter tuning
svc_param_grid = ParamGridBuilder() \
    .addGrid(svc.maxIter, [100, 500, 1000]) \
    .addGrid(svc.regParam, [0.01, 0.1]) \
    .addGrid(svc.tol, [1e-4,  1e-2]) \
    .build()

# Call the main function with the XGBoost classifier
model = main(svc, file_name, svc_param_grid, is_tree=False, use_standard_scaler=True)



 Setting up

No overlapping patients between training and test sets.
Training size: 2286 (81.82%), Test size: 508 rows (18.18%)

 Hyperparameter tunning

Fold 1: 494 rows (21.61%) | Fold 2: 398 rows (17.41%) | Fold 3: 448 rows (19.60%) | Fold 4: 480 rows (21.00%) | Fold 5: 466 rows (20.38%)


Hyperparameter Tuning:   8%|▊         | 1/12 [00:39<07:19, 39.95s/it]

Average areaUnderROC: 0.7358 | Evaluating Parameters: maxIter: 100, regParam: 0.01, tol: 0.0001


Hyperparameter Tuning:  17%|█▋        | 2/12 [01:06<05:18, 31.89s/it]

Average areaUnderROC: 0.7392 | Evaluating Parameters: maxIter: 100, regParam: 0.01, tol: 0.01


Hyperparameter Tuning:  25%|██▌       | 3/12 [01:35<04:38, 30.93s/it]

Average areaUnderROC: 0.7364 | Evaluating Parameters: maxIter: 100, regParam: 0.1, tol: 0.0001


Hyperparameter Tuning:  33%|███▎      | 4/12 [01:58<03:41, 27.63s/it]

Average areaUnderROC: 0.7360 | Evaluating Parameters: maxIter: 100, regParam: 0.1, tol: 0.01


Hyperparameter Tuning:  42%|████▏     | 5/12 [02:59<04:38, 39.72s/it]

Average areaUnderROC: 0.7339 | Evaluating Parameters: maxIter: 500, regParam: 0.01, tol: 0.0001


Hyperparameter Tuning:  50%|█████     | 6/12 [03:21<03:20, 33.47s/it]

Average areaUnderROC: 0.7392 | Evaluating Parameters: maxIter: 500, regParam: 0.01, tol: 0.01


Hyperparameter Tuning:  58%|█████▊    | 7/12 [03:53<02:45, 33.03s/it]

Average areaUnderROC: 0.7364 | Evaluating Parameters: maxIter: 500, regParam: 0.1, tol: 0.0001


Hyperparameter Tuning:  67%|██████▋   | 8/12 [04:18<02:01, 30.44s/it]

Average areaUnderROC: 0.7360 | Evaluating Parameters: maxIter: 500, regParam: 0.1, tol: 0.01


Hyperparameter Tuning:  75%|███████▌  | 9/12 [05:16<01:57, 39.07s/it]

Average areaUnderROC: 0.7339 | Evaluating Parameters: maxIter: 1000, regParam: 0.01, tol: 0.0001


Hyperparameter Tuning:  83%|████████▎ | 10/12 [05:41<01:09, 34.90s/it]

Average areaUnderROC: 0.7392 | Evaluating Parameters: maxIter: 1000, regParam: 0.01, tol: 0.01


Hyperparameter Tuning:  92%|█████████▏| 11/12 [06:14<00:34, 34.32s/it]

Average areaUnderROC: 0.7364 | Evaluating Parameters: maxIter: 1000, regParam: 0.1, tol: 0.0001


Hyperparameter Tuning: 100%|██████████| 12/12 [06:39<00:00, 33.32s/it]

Average areaUnderROC: 0.7360 | Evaluating Parameters: maxIter: 1000, regParam: 0.1, tol: 0.01
Best Overall Parameters: maxIter: 100, regParam: 0.01, tol: 0.01
Best areaUnderROC: 0.7392

 Testing model on test dataset






Number of initial columns: 119, number of feature columns: 99
Final Model Evaluation on Test Data areaUnderROC: 0.7382
Final Accuracy: 0.7382, Precision: 0.7249, Recall: 0.7677

First 5 names of wrongly classified rows:
['auth_001-000061_001-000061_MG_BL_Series-8_Image-1-1.png', 'hcs_003-000024_003-000024_MG_BL_Series-1003_Image-1003-0.png', 'hcs_003-000029_003-000029_MG_BL_Series-1001_Image-1001-0.png', 'hcs_003-000029_003-000029_MG_BL_Series-1001_Image-1001-1.png', 'hcs_003-000029_003-000029_MG_BL_Series-1002_Image-1002-0.png']


In [28]:
xgb_classifier = SparkXGBClassifier(label_col="class", features_col="features", use_gpu=False) 

# Set up the parameter grid for hyperparameter tuning
xgb_param_grid = ParamGridBuilder() \
    .addGrid(xgb_classifier.max_depth, [3, 6, 9, 12]) \
    .addGrid(xgb_classifier.n_estimators, [50, 100, 200]) \
    .addGrid(xgb_classifier.learning_rate, [0.1]) \
    .addGrid(xgb_classifier.subsample, [0.8]) \
    .addGrid(xgb_classifier.colsample_bytree, [0.8]) \
    .build()

# Call the main function to train and evaluate the model
model = main(xgb_classifier, file_name, xgb_param_grid, is_tree=False)


 Setting up

No overlapping patients between training and test sets.
Training size: 2286 (81.82%), Test size: 508 rows (18.18%)

 Hyperparameter tunning

Fold 1: 494 rows (21.61%) | Fold 2: 398 rows (17.41%) | Fold 3: 448 rows (19.60%) | Fold 4: 480 rows (21.00%) | Fold 5: 466 rows (20.38%)


Hyperparameter Tuning:   0%|          | 0/12 [00:00<?, ?it/s]2024-09-19 22:25:06,036 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 3, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 50}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-19 22:25:22,296 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-19 22:25:27,596 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 3, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 50}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-19 22:25:32,490 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-19 22:25:37,797 INFO XGBoos

Average areaUnderROC: 0.7464 | Evaluating Parameters: max_depth: 3, n_estimators: 50, learning_rate: 0.1, subsample: 0.8, colsample_bytree: 0.8


2024-09-19 22:26:11,513 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 3, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-19 22:26:17,552 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-19 22:26:24,481 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 3, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-19 22:26:30,458 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-19 22:26:37,441 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	bo

Average areaUnderROC: 0.7427 | Evaluating Parameters: max_depth: 3, n_estimators: 100, learning_rate: 0.1, subsample: 0.8, colsample_bytree: 0.8


2024-09-19 22:27:16,429 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 3, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 200}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-19 22:27:22,078 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-19 22:27:28,310 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 3, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 200}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-19 22:27:34,262 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-19 22:27:41,249 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	bo

Average areaUnderROC: 0.7408 | Evaluating Parameters: max_depth: 3, n_estimators: 200, learning_rate: 0.1, subsample: 0.8, colsample_bytree: 0.8


2024-09-19 22:28:20,787 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 6, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 50}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-19 22:28:26,808 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-19 22:28:33,948 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 6, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 50}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-19 22:28:39,955 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-19 22:28:46,683 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	boos

Average areaUnderROC: 0.7442 | Evaluating Parameters: max_depth: 6, n_estimators: 50, learning_rate: 0.1, subsample: 0.8, colsample_bytree: 0.8


2024-09-19 22:29:29,183 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 6, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-19 22:29:36,554 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-19 22:29:44,245 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 6, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-19 22:29:50,922 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-19 22:29:58,145 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	bo

Average areaUnderROC: 0.7420 | Evaluating Parameters: max_depth: 6, n_estimators: 100, learning_rate: 0.1, subsample: 0.8, colsample_bytree: 0.8


2024-09-19 22:30:40,661 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 6, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 200}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-19 22:30:48,624 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-19 22:30:56,074 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 6, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 200}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-19 22:31:03,973 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-19 22:31:11,346 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	bo

Average areaUnderROC: 0.7382 | Evaluating Parameters: max_depth: 6, n_estimators: 200, learning_rate: 0.1, subsample: 0.8, colsample_bytree: 0.8


2024-09-19 22:31:57,026 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 9, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 50}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-19 22:32:03,968 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-19 22:32:11,304 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 9, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 50}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-19 22:32:18,298 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-19 22:32:25,556 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	boos

Average areaUnderROC: 0.7502 | Evaluating Parameters: max_depth: 9, n_estimators: 50, learning_rate: 0.1, subsample: 0.8, colsample_bytree: 0.8


2024-09-19 22:33:08,906 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 9, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-19 22:33:16,855 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-19 22:33:24,197 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 9, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-19 22:33:32,222 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-19 22:33:39,624 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	bo

Average areaUnderROC: 0.7547 | Evaluating Parameters: max_depth: 9, n_estimators: 100, learning_rate: 0.1, subsample: 0.8, colsample_bytree: 0.8


2024-09-19 22:34:26,325 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 9, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 200}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-19 22:34:35,893 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-19 22:34:43,343 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 9, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 200}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-19 22:34:53,028 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-19 22:35:00,404 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	bo

Average areaUnderROC: 0.7537 | Evaluating Parameters: max_depth: 9, n_estimators: 200, learning_rate: 0.1, subsample: 0.8, colsample_bytree: 0.8


2024-09-19 22:35:51,701 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 12, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 50}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-19 22:35:59,350 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-19 22:36:06,810 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 12, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 50}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-19 22:36:14,422 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-19 22:36:21,774 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	bo

Average areaUnderROC: 0.7515 | Evaluating Parameters: max_depth: 12, n_estimators: 50, learning_rate: 0.1, subsample: 0.8, colsample_bytree: 0.8


2024-09-19 22:37:06,740 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 12, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-19 22:37:15,537 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-19 22:37:23,042 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 12, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-19 22:37:31,882 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-19 22:37:39,220 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	

Average areaUnderROC: 0.7453 | Evaluating Parameters: max_depth: 12, n_estimators: 100, learning_rate: 0.1, subsample: 0.8, colsample_bytree: 0.8


2024-09-19 22:38:27,854 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 12, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 200}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-19 22:38:38,242 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-19 22:38:45,661 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 12, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 200}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-19 22:38:56,596 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-19 22:39:04,146 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	

Average areaUnderROC: 0.7414 | Evaluating Parameters: max_depth: 12, n_estimators: 200, learning_rate: 0.1, subsample: 0.8, colsample_bytree: 0.8
Best Overall Parameters: max_depth: 9, n_estimators: 100, learning_rate: 0.1, subsample: 0.8, colsample_bytree: 0.8
Best areaUnderROC: 0.7547

 Testing model on test dataset

Number of initial columns: 119, number of feature columns: 99



2024-09-19 22:39:55,469 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 9, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-19 22:40:03,828 INFO XGBoost-PySpark: _fit Finished xgboost training!


Final Model Evaluation on Test Data areaUnderROC: 0.7579
Final Accuracy: 0.7579, Precision: 0.7673, Recall: 0.7402

First 5 names of wrongly classified rows:
['auth_001-000074_001-000074_MG_TP1_Series-4_Image-1-1.png', 'auth_001-000084_001-000084_MG_BL_Series-1_Image-1-0.png', 'auth_001-000084_001-000084_MG_BL_Series-3_Image-1-0.png', 'hcs_003-000024_003-000024_MG_BL_Series-1003_Image-1003-0.png', 'hcs_003-000029_003-000029_MG_BL_Series-1001_Image-1001-0.png']


#### 128 - with lesion mask

In [14]:
file_name = 'features_128_lesion_mask.csv'

In [30]:
decision_tree = DecisionTreeClassifier(labelCol = "class", featuresCol = "features")

# Set up parameter grid for hyperparameter tuning
param_grid = ParamGridBuilder() \
    .addGrid(decision_tree.maxDepth, [5, 10, 15, 20]) \
    .addGrid(decision_tree.maxBins, [32, 64]) \
    .build()

# Call the main function to train and evaluate the model
model = main(decision_tree, file_name, param_grid, is_tree=True)


 Setting up

No overlapping patients between training and test sets.
Training size: 2261 (80.84%), Test size: 536 rows (19.16%)

 Hyperparameter tunning

Fold 1: 430 rows (19.02%) | Fold 2: 448 rows (19.81%) | Fold 3: 428 rows (18.93%) | Fold 4: 496 rows (21.94%) | Fold 5: 459 rows (20.30%)


Hyperparameter Tuning:  12%|█▎        | 1/8 [00:18<02:09, 18.52s/it]

Average areaUnderROC: 0.9066 | Evaluating Parameters: maxDepth: 5, maxBins: 32


Hyperparameter Tuning:  25%|██▌       | 2/8 [00:39<02:01, 20.18s/it]

Average areaUnderROC: 0.9093 | Evaluating Parameters: maxDepth: 5, maxBins: 64


Hyperparameter Tuning:  38%|███▊      | 3/8 [00:57<01:34, 18.90s/it]

Average areaUnderROC: 0.8883 | Evaluating Parameters: maxDepth: 10, maxBins: 32


Hyperparameter Tuning:  50%|█████     | 4/8 [01:19<01:20, 20.16s/it]

Average areaUnderROC: 0.8926 | Evaluating Parameters: maxDepth: 10, maxBins: 64


Hyperparameter Tuning:  62%|██████▎   | 5/8 [01:36<00:57, 19.05s/it]

Average areaUnderROC: 0.8712 | Evaluating Parameters: maxDepth: 15, maxBins: 32


Hyperparameter Tuning:  75%|███████▌  | 6/8 [01:59<00:40, 20.43s/it]

Average areaUnderROC: 0.8719 | Evaluating Parameters: maxDepth: 15, maxBins: 64


Hyperparameter Tuning:  88%|████████▊ | 7/8 [02:17<00:19, 19.55s/it]

Average areaUnderROC: 0.8703 | Evaluating Parameters: maxDepth: 20, maxBins: 32


Hyperparameter Tuning: 100%|██████████| 8/8 [02:41<00:00, 20.13s/it]


Average areaUnderROC: 0.8706 | Evaluating Parameters: maxDepth: 20, maxBins: 64
Best Overall Parameters: maxDepth: 5, maxBins: 64
Best areaUnderROC: 0.9093

 Testing model on test dataset

Number of initial columns: 119, number of feature columns: 99
Final Model Evaluation on Test Data areaUnderROC: 0.9459
Final Accuracy: 0.9459, Precision: 0.9314, Recall: 0.9627

Sorted Feature Importances:
Feature: diagnostics_Mask-original_VoxelNum, Importance: 0.7278
Feature: original_firstorder_Energy, Importance: 0.2294
Feature: original_firstorder_90Percentile, Importance: 0.0163
Feature: original_glcm_Correlation, Importance: 0.0122
Feature: original_glrlm_HighGrayLevelRunEmphasis, Importance: 0.0121
Feature: original_firstorder_Skewness, Importance: 0.0021
Feature: diagnostics_Image-original_Dimensionality, Importance: 0.0000
Feature: diagnostics_Image-original_Mean, Importance: 0.0000
Feature: diagnostics_Image-original_Minimum, Importance: 0.0000
Feature: diagnostics_Image-original_Maximum, 

In [31]:
random_forest = RandomForestClassifier(labelCol="class", featuresCol="features")

# Set up parameter grid for hyperparameter tuning
rf_param_grid = ParamGridBuilder() \
    .addGrid(random_forest.numTrees, [50, 150]) \
    .addGrid(random_forest.maxDepth, [5, 10, 15]) \
    .addGrid(random_forest.maxBins, [32, 64, 128]) \
    .addGrid(random_forest.featureSubsetStrategy, ['auto', 'sqrt']) \
    .build()

# Call the main function to train and evaluate the model
model = main(random_forest, file_name, rf_param_grid, is_tree=True)


 Setting up

No overlapping patients between training and test sets.
Training size: 2261 (80.84%), Test size: 536 rows (19.16%)

 Hyperparameter tunning

Fold 1: 430 rows (19.02%) | Fold 2: 448 rows (19.81%) | Fold 3: 428 rows (18.93%) | Fold 4: 496 rows (21.94%) | Fold 5: 459 rows (20.30%)


Hyperparameter Tuning:   3%|▎         | 1/36 [00:22<12:57, 22.23s/it]

Average areaUnderROC: 0.8810 | Evaluating Parameters: numTrees: 50, maxDepth: 5, maxBins: 32, featureSubsetStrategy: auto


Hyperparameter Tuning:   6%|▌         | 2/36 [00:38<10:45, 18.97s/it]

Average areaUnderROC: 0.8810 | Evaluating Parameters: numTrees: 50, maxDepth: 5, maxBins: 32, featureSubsetStrategy: sqrt


Hyperparameter Tuning:   8%|▊         | 3/36 [01:00<11:12, 20.37s/it]

Average areaUnderROC: 0.8822 | Evaluating Parameters: numTrees: 50, maxDepth: 5, maxBins: 64, featureSubsetStrategy: auto


Hyperparameter Tuning:  11%|█         | 4/36 [01:18<10:10, 19.07s/it]

Average areaUnderROC: 0.8822 | Evaluating Parameters: numTrees: 50, maxDepth: 5, maxBins: 64, featureSubsetStrategy: sqrt


Hyperparameter Tuning:  14%|█▍        | 5/36 [01:41<10:38, 20.58s/it]

Average areaUnderROC: 0.8807 | Evaluating Parameters: numTrees: 50, maxDepth: 5, maxBins: 128, featureSubsetStrategy: auto


Hyperparameter Tuning:  17%|█▋        | 6/36 [01:59<09:53, 19.79s/it]

Average areaUnderROC: 0.8807 | Evaluating Parameters: numTrees: 50, maxDepth: 5, maxBins: 128, featureSubsetStrategy: sqrt


Hyperparameter Tuning:  19%|█▉        | 7/36 [02:25<10:34, 21.89s/it]

Average areaUnderROC: 0.8886 | Evaluating Parameters: numTrees: 50, maxDepth: 10, maxBins: 32, featureSubsetStrategy: auto


Hyperparameter Tuning:  22%|██▏       | 8/36 [02:52<10:53, 23.32s/it]

Average areaUnderROC: 0.8886 | Evaluating Parameters: numTrees: 50, maxDepth: 10, maxBins: 32, featureSubsetStrategy: sqrt


Hyperparameter Tuning:  25%|██▌       | 9/36 [03:15<10:32, 23.41s/it]

Average areaUnderROC: 0.8945 | Evaluating Parameters: numTrees: 50, maxDepth: 10, maxBins: 64, featureSubsetStrategy: auto


Hyperparameter Tuning:  28%|██▊       | 10/36 [03:44<10:50, 25.00s/it]

Average areaUnderROC: 0.8945 | Evaluating Parameters: numTrees: 50, maxDepth: 10, maxBins: 64, featureSubsetStrategy: sqrt


Hyperparameter Tuning:  31%|███       | 11/36 [04:17<11:26, 27.48s/it]

Average areaUnderROC: 0.8939 | Evaluating Parameters: numTrees: 50, maxDepth: 10, maxBins: 128, featureSubsetStrategy: auto


Hyperparameter Tuning:  33%|███▎      | 12/36 [04:48<11:27, 28.64s/it]

Average areaUnderROC: 0.8939 | Evaluating Parameters: numTrees: 50, maxDepth: 10, maxBins: 128, featureSubsetStrategy: sqrt


Hyperparameter Tuning:  36%|███▌      | 13/36 [05:17<11:00, 28.70s/it]

Average areaUnderROC: 0.8865 | Evaluating Parameters: numTrees: 50, maxDepth: 15, maxBins: 32, featureSubsetStrategy: auto


Hyperparameter Tuning:  39%|███▉      | 14/36 [05:49<10:51, 29.60s/it]

Average areaUnderROC: 0.8865 | Evaluating Parameters: numTrees: 50, maxDepth: 15, maxBins: 32, featureSubsetStrategy: sqrt


Hyperparameter Tuning:  42%|████▏     | 15/36 [06:23<10:50, 30.96s/it]

Average areaUnderROC: 0.8931 | Evaluating Parameters: numTrees: 50, maxDepth: 15, maxBins: 64, featureSubsetStrategy: auto


Hyperparameter Tuning:  44%|████▍     | 16/36 [06:57<10:36, 31.85s/it]

Average areaUnderROC: 0.8931 | Evaluating Parameters: numTrees: 50, maxDepth: 15, maxBins: 64, featureSubsetStrategy: sqrt


Hyperparameter Tuning:  47%|████▋     | 17/36 [07:35<10:41, 33.79s/it]

Average areaUnderROC: 0.8936 | Evaluating Parameters: numTrees: 50, maxDepth: 15, maxBins: 128, featureSubsetStrategy: auto


Hyperparameter Tuning:  50%|█████     | 18/36 [08:17<10:50, 36.12s/it]

Average areaUnderROC: 0.8936 | Evaluating Parameters: numTrees: 50, maxDepth: 15, maxBins: 128, featureSubsetStrategy: sqrt


Hyperparameter Tuning:  53%|█████▎    | 19/36 [08:37<08:51, 31.28s/it]

Average areaUnderROC: 0.8808 | Evaluating Parameters: numTrees: 150, maxDepth: 5, maxBins: 32, featureSubsetStrategy: auto


Hyperparameter Tuning:  56%|█████▌    | 20/36 [09:00<07:43, 28.96s/it]

Average areaUnderROC: 0.8808 | Evaluating Parameters: numTrees: 150, maxDepth: 5, maxBins: 32, featureSubsetStrategy: sqrt


Hyperparameter Tuning:  58%|█████▊    | 21/36 [09:24<06:51, 27.40s/it]

Average areaUnderROC: 0.8839 | Evaluating Parameters: numTrees: 150, maxDepth: 5, maxBins: 64, featureSubsetStrategy: auto


Hyperparameter Tuning:  61%|██████    | 22/36 [09:47<06:05, 26.14s/it]

Average areaUnderROC: 0.8839 | Evaluating Parameters: numTrees: 150, maxDepth: 5, maxBins: 64, featureSubsetStrategy: sqrt


Hyperparameter Tuning:  64%|██████▍   | 23/36 [10:16<05:52, 27.08s/it]

Average areaUnderROC: 0.8850 | Evaluating Parameters: numTrees: 150, maxDepth: 5, maxBins: 128, featureSubsetStrategy: auto


Hyperparameter Tuning:  67%|██████▋   | 24/36 [10:46<05:35, 27.94s/it]

Average areaUnderROC: 0.8850 | Evaluating Parameters: numTrees: 150, maxDepth: 5, maxBins: 128, featureSubsetStrategy: sqrt


Hyperparameter Tuning:  69%|██████▉   | 25/36 [11:29<05:56, 32.42s/it]

Average areaUnderROC: 0.8954 | Evaluating Parameters: numTrees: 150, maxDepth: 10, maxBins: 32, featureSubsetStrategy: auto


Hyperparameter Tuning:  72%|███████▏  | 26/36 [12:12<05:54, 35.48s/it]

Average areaUnderROC: 0.8954 | Evaluating Parameters: numTrees: 150, maxDepth: 10, maxBins: 32, featureSubsetStrategy: sqrt


Hyperparameter Tuning:  75%|███████▌  | 27/36 [13:02<06:00, 40.02s/it]

Average areaUnderROC: 0.8965 | Evaluating Parameters: numTrees: 150, maxDepth: 10, maxBins: 64, featureSubsetStrategy: auto


Hyperparameter Tuning:  78%|███████▊  | 28/36 [13:57<05:55, 44.39s/it]

Average areaUnderROC: 0.8965 | Evaluating Parameters: numTrees: 150, maxDepth: 10, maxBins: 64, featureSubsetStrategy: sqrt


Hyperparameter Tuning:  81%|████████  | 29/36 [15:05<06:00, 51.48s/it]

Average areaUnderROC: 0.8980 | Evaluating Parameters: numTrees: 150, maxDepth: 10, maxBins: 128, featureSubsetStrategy: auto


Hyperparameter Tuning:  83%|████████▎ | 30/36 [16:16<05:43, 57.22s/it]

Average areaUnderROC: 0.8980 | Evaluating Parameters: numTrees: 150, maxDepth: 10, maxBins: 128, featureSubsetStrategy: sqrt


Hyperparameter Tuning:  86%|████████▌ | 31/36 [17:22<05:00, 60.05s/it]

Average areaUnderROC: 0.8916 | Evaluating Parameters: numTrees: 150, maxDepth: 15, maxBins: 32, featureSubsetStrategy: auto


Hyperparameter Tuning:  89%|████████▉ | 32/36 [18:30<04:09, 62.27s/it]

Average areaUnderROC: 0.8916 | Evaluating Parameters: numTrees: 150, maxDepth: 15, maxBins: 32, featureSubsetStrategy: sqrt


Hyperparameter Tuning:  92%|█████████▏| 33/36 [19:48<03:21, 67.16s/it]

Average areaUnderROC: 0.8943 | Evaluating Parameters: numTrees: 150, maxDepth: 15, maxBins: 64, featureSubsetStrategy: auto


Hyperparameter Tuning:  94%|█████████▍| 34/36 [21:06<02:20, 70.28s/it]

Average areaUnderROC: 0.8943 | Evaluating Parameters: numTrees: 150, maxDepth: 15, maxBins: 64, featureSubsetStrategy: sqrt


Hyperparameter Tuning:  97%|█████████▋| 35/36 [22:42<01:18, 78.02s/it]

Average areaUnderROC: 0.8960 | Evaluating Parameters: numTrees: 150, maxDepth: 15, maxBins: 128, featureSubsetStrategy: auto


Hyperparameter Tuning: 100%|██████████| 36/36 [24:23<00:00, 40.65s/it]

Average areaUnderROC: 0.8960 | Evaluating Parameters: numTrees: 150, maxDepth: 15, maxBins: 128, featureSubsetStrategy: sqrt
Best Overall Parameters: numTrees: 150, maxDepth: 10, maxBins: 128, featureSubsetStrategy: auto
Best areaUnderROC: 0.8980

 Testing model on test dataset

Number of initial columns: 119, number of feature columns: 99





Final Model Evaluation on Test Data areaUnderROC: 0.9347
Final Accuracy: 0.9347, Precision: 0.9299, Recall: 0.9403

Sorted Feature Importances:
Feature: diagnostics_Mask-original_VoxelNum, Importance: 0.1259
Feature: original_firstorder_RootMeanSquared, Importance: 0.0838
Feature: original_firstorder_90Percentile, Importance: 0.0615
Feature: original_firstorder_Mean, Importance: 0.0599
Feature: original_firstorder_Median, Importance: 0.0583
Feature: original_glszm_SizeZoneNonUniformity, Importance: 0.0507
Feature: original_glszm_GrayLevelNonUniformity, Importance: 0.0368
Feature: original_glrlm_RunLengthNonUniformity, Importance: 0.0301
Feature: original_glrlm_GrayLevelNonUniformity, Importance: 0.0293
Feature: original_firstorder_Skewness, Importance: 0.0287
Feature: original_gldm_GrayLevelNonUniformity, Importance: 0.0234
Feature: original_glrlm_ShortRunLowGrayLevelEmphasis, Importance: 0.0200
Feature: original_gldm_DependenceNonUniformity, Importance: 0.0142
Feature: original_firsto

In [32]:
# Linear SVC with Standard Scaler
svc = LinearSVC(labelCol="class", featuresCol="features")

# Set up the parameter grid for hyperparameter tuning
svc_param_grid = ParamGridBuilder() \
    .addGrid(svc.maxIter, [100, 500, 1000]) \
    .addGrid(svc.regParam, [0.01, 0.1]) \
    .addGrid(svc.tol, [1e-4,  1e-2]) \
    .build()

# Call the main function with the XGBoost classifier
model = main(svc, file_name, svc_param_grid, is_tree=False, use_standard_scaler=True)



 Setting up

No overlapping patients between training and test sets.
Training size: 2261 (80.84%), Test size: 536 rows (19.16%)

 Hyperparameter tunning

Fold 1: 430 rows (19.02%) | Fold 2: 448 rows (19.81%) | Fold 3: 428 rows (18.93%) | Fold 4: 496 rows (21.94%) | Fold 5: 459 rows (20.30%)


Hyperparameter Tuning:   8%|▊         | 1/12 [00:43<07:57, 43.43s/it]

Average areaUnderROC: 0.8778 | Evaluating Parameters: maxIter: 100, regParam: 0.01, tol: 0.0001


Hyperparameter Tuning:  17%|█▋        | 2/12 [01:11<05:42, 34.20s/it]

Average areaUnderROC: 0.8782 | Evaluating Parameters: maxIter: 100, regParam: 0.01, tol: 0.01


Hyperparameter Tuning:  25%|██▌       | 3/12 [01:42<04:57, 33.03s/it]

Average areaUnderROC: 0.8662 | Evaluating Parameters: maxIter: 100, regParam: 0.1, tol: 0.0001


Hyperparameter Tuning:  33%|███▎      | 4/12 [02:09<04:03, 30.46s/it]

Average areaUnderROC: 0.8653 | Evaluating Parameters: maxIter: 100, regParam: 0.1, tol: 0.01


Hyperparameter Tuning:  42%|████▏     | 5/12 [03:05<04:36, 39.57s/it]

Average areaUnderROC: 0.8791 | Evaluating Parameters: maxIter: 500, regParam: 0.01, tol: 0.0001


Hyperparameter Tuning:  50%|█████     | 6/12 [03:33<03:34, 35.83s/it]

Average areaUnderROC: 0.8782 | Evaluating Parameters: maxIter: 500, regParam: 0.01, tol: 0.01


Hyperparameter Tuning:  58%|█████▊    | 7/12 [04:07<02:56, 35.20s/it]

Average areaUnderROC: 0.8662 | Evaluating Parameters: maxIter: 500, regParam: 0.1, tol: 0.0001


Hyperparameter Tuning:  67%|██████▋   | 8/12 [04:32<02:07, 31.87s/it]

Average areaUnderROC: 0.8653 | Evaluating Parameters: maxIter: 500, regParam: 0.1, tol: 0.01


Hyperparameter Tuning:  75%|███████▌  | 9/12 [05:31<02:01, 40.44s/it]

Average areaUnderROC: 0.8791 | Evaluating Parameters: maxIter: 1000, regParam: 0.01, tol: 0.0001


Hyperparameter Tuning:  83%|████████▎ | 10/12 [06:00<01:13, 36.80s/it]

Average areaUnderROC: 0.8782 | Evaluating Parameters: maxIter: 1000, regParam: 0.01, tol: 0.01


Hyperparameter Tuning:  92%|█████████▏| 11/12 [06:31<00:35, 35.24s/it]

Average areaUnderROC: 0.8662 | Evaluating Parameters: maxIter: 1000, regParam: 0.1, tol: 0.0001


Hyperparameter Tuning: 100%|██████████| 12/12 [06:54<00:00, 34.53s/it]

Average areaUnderROC: 0.8653 | Evaluating Parameters: maxIter: 1000, regParam: 0.1, tol: 0.01
Best Overall Parameters: maxIter: 500, regParam: 0.01, tol: 0.0001
Best areaUnderROC: 0.8791

 Testing model on test dataset






Number of initial columns: 119, number of feature columns: 99
Final Model Evaluation on Test Data areaUnderROC: 0.9086
Final Accuracy: 0.9086, Precision: 0.8842, Recall: 0.9403

First 5 names of wrongly classified rows:
['auth_001-000071_001-000071_MG_TP3_Series-4_Image-1-0.png', 'hcs_003-000242_003-000242_MG_BL_Series-3_Image-1-0.png', 'hcs_003-000245_003-000245_MG_BL_Series-1008_Image-1-0.png', 'hcs_003-000245_003-000245_MG_BL_Series-1008_Image-2-0.png', 'hcs_003-000277_003-000277_MG_BL_Series-3_Image-1-0.png']


In [15]:
xgb_classifier = SparkXGBClassifier(label_col="class", features_col="features", use_gpu=False) 

# Set up the parameter grid for hyperparameter tuning
xgb_param_grid = ParamGridBuilder() \
    .addGrid(xgb_classifier.max_depth, [6]) \
    .addGrid(xgb_classifier.n_estimators, [50]) \
    .addGrid(xgb_classifier.learning_rate, [0.1]) \
    .addGrid(xgb_classifier.subsample, [0.8]) \
    .addGrid(xgb_classifier.colsample_bytree, [0.8]) \
    .build()

# Call the main function to train and evaluate the model
model = main(xgb_classifier, file_name, xgb_param_grid, is_tree=False)


 Setting up

No overlapping patients between training and test sets.
Training size: 2261 (80.84%), Test size: 536 rows (19.16%)

 Hyperparameter tunning

Fold 1: 430 rows (19.02%) | Fold 2: 448 rows (19.81%) | Fold 3: 428 rows (18.93%) | Fold 4: 496 rows (21.94%) | Fold 5: 459 rows (20.30%)


Hyperparameter Tuning:   0%|          | 0/1 [00:00<?, ?it/s]2024-09-20 17:59:30,111 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 6, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 50}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-20 17:59:37,630 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-20 17:59:44,669 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 6, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 50}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-20 17:59:49,984 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-20 17:59:55,786 INFO XGBoost

Average areaUnderROC: 0.8806 | Evaluating Parameters: max_depth: 6, n_estimators: 50, learning_rate: 0.1, subsample: 0.8, colsample_bytree: 0.8
Best Overall Parameters: max_depth: 6, n_estimators: 50, learning_rate: 0.1, subsample: 0.8, colsample_bytree: 0.8
Best areaUnderROC: 0.8806

 Testing model on test dataset

Number of initial columns: 119, number of feature columns: 98



2024-09-20 18:00:27,233 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 6, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 50}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-20 18:00:32,705 INFO XGBoost-PySpark: _fit Finished xgboost training!


Final Model Evaluation on Test Data areaUnderROC: 0.9104
Final Accuracy: 0.9104, Precision: 0.9074, Recall: 0.9142

First 5 names of wrongly classified rows:
['hcs_003-000245_003-000245_MG_BL_Series-1008_Image-1-0.png', 'hcs_003-000247_003-000247_MG_BL_Series-1005_Image-1-1.png', 'hcs_003-000247_003-000247_MG_BL_Series-1005_Image-2-1.png', 'hcs_003-000252_003-000252_MG_TP3_Series-2_Image-1-1.png', 'hcs_003-000257_003-000257_MG_BL_Series-1010_Image-3-1.png']


#### 128 - with full mask

In [34]:
file_name = 'features_128_full_mask.csv'

In [35]:
decision_tree = DecisionTreeClassifier(labelCol = "class", featuresCol = "features")

# Set up parameter grid for hyperparameter tuning
param_grid = ParamGridBuilder() \
    .addGrid(decision_tree.maxDepth, [5, 10, 15, 20]) \
    .addGrid(decision_tree.maxBins, [32, 64]) \
    .build()

# Call the main function to train and evaluate the model
model = main(decision_tree, file_name, param_grid, is_tree=True)


 Setting up

No overlapping patients between training and test sets.
Training size: 2260 (80.77%), Test size: 538 rows (19.23%)

 Hyperparameter tunning

Fold 1: 522 rows (23.10%) | Fold 2: 426 rows (18.85%) | Fold 3: 420 rows (18.58%) | Fold 4: 474 rows (20.97%) | Fold 5: 418 rows (18.50%)


Hyperparameter Tuning:  12%|█▎        | 1/8 [00:19<02:13, 19.09s/it]

Average areaUnderROC: 0.7558 | Evaluating Parameters: maxDepth: 5, maxBins: 32


Hyperparameter Tuning:  25%|██▌       | 2/8 [00:41<02:07, 21.17s/it]

Average areaUnderROC: 0.7671 | Evaluating Parameters: maxDepth: 5, maxBins: 64


Hyperparameter Tuning:  38%|███▊      | 3/8 [00:58<01:36, 19.20s/it]

Average areaUnderROC: 0.7288 | Evaluating Parameters: maxDepth: 10, maxBins: 32


Hyperparameter Tuning:  50%|█████     | 4/8 [01:22<01:23, 20.95s/it]

Average areaUnderROC: 0.7387 | Evaluating Parameters: maxDepth: 10, maxBins: 64


Hyperparameter Tuning:  62%|██████▎   | 5/8 [01:41<01:01, 20.34s/it]

Average areaUnderROC: 0.7058 | Evaluating Parameters: maxDepth: 15, maxBins: 32


Hyperparameter Tuning:  75%|███████▌  | 6/8 [02:04<00:42, 21.32s/it]

Average areaUnderROC: 0.7144 | Evaluating Parameters: maxDepth: 15, maxBins: 64


Hyperparameter Tuning:  88%|████████▊ | 7/8 [02:28<00:22, 22.11s/it]

Average areaUnderROC: 0.7020 | Evaluating Parameters: maxDepth: 20, maxBins: 32


Hyperparameter Tuning: 100%|██████████| 8/8 [02:48<00:00, 21.07s/it]

Average areaUnderROC: 0.7129 | Evaluating Parameters: maxDepth: 20, maxBins: 64
Best Overall Parameters: maxDepth: 5, maxBins: 64
Best areaUnderROC: 0.7671

 Testing model on test dataset

Number of initial columns: 119, number of feature columns: 99





Final Model Evaluation on Test Data areaUnderROC: 0.7807
Final Accuracy: 0.7807, Precision: 0.8159, Recall: 0.7249

Sorted Feature Importances:
Feature: original_firstorder_90Percentile, Importance: 0.7645
Feature: original_firstorder_Variance, Importance: 0.0742
Feature: original_gldm_LargeDependenceLowGrayLevelEmphasis, Importance: 0.0277
Feature: original_firstorder_10Percentile, Importance: 0.0240
Feature: original_glcm_Correlation, Importance: 0.0164
Feature: original_firstorder_Skewness, Importance: 0.0163
Feature: original_ngtdm_Contrast, Importance: 0.0159
Feature: original_glrlm_GrayLevelNonUniformityNormalized, Importance: 0.0108
Feature: original_glszm_HighGrayLevelZoneEmphasis, Importance: 0.0087
Feature: original_firstorder_Kurtosis, Importance: 0.0084
Feature: original_glszm_SizeZoneNonUniformity, Importance: 0.0073
Feature: original_glrlm_HighGrayLevelRunEmphasis, Importance: 0.0068
Feature: original_gldm_SmallDependenceHighGrayLevelEmphasis, Importance: 0.0039
Feature: 

In [36]:
random_forest = RandomForestClassifier(labelCol="class", featuresCol="features")

# Set up parameter grid for hyperparameter tuning
rf_param_grid = ParamGridBuilder() \
    .addGrid(random_forest.numTrees, [50, 150]) \
    .addGrid(random_forest.maxDepth, [5, 10, 15]) \
    .addGrid(random_forest.maxBins, [32, 64, 128]) \
    .addGrid(random_forest.featureSubsetStrategy, ['auto', 'sqrt']) \
    .build()

# Call the main function to train and evaluate the model
model = main(random_forest, file_name, rf_param_grid, is_tree=True)


 Setting up

No overlapping patients between training and test sets.
Training size: 2260 (80.77%), Test size: 538 rows (19.23%)

 Hyperparameter tunning

Fold 1: 522 rows (23.10%) | Fold 2: 426 rows (18.85%) | Fold 3: 420 rows (18.58%) | Fold 4: 474 rows (20.97%) | Fold 5: 418 rows (18.50%)


Hyperparameter Tuning:   3%|▎         | 1/36 [00:17<10:26, 17.90s/it]

Average areaUnderROC: 0.7757 | Evaluating Parameters: numTrees: 50, maxDepth: 5, maxBins: 32, featureSubsetStrategy: auto


Hyperparameter Tuning:   6%|▌         | 2/36 [00:39<11:21, 20.06s/it]

Average areaUnderROC: 0.7757 | Evaluating Parameters: numTrees: 50, maxDepth: 5, maxBins: 32, featureSubsetStrategy: sqrt


Hyperparameter Tuning:   8%|▊         | 3/36 [00:57<10:26, 18.99s/it]

Average areaUnderROC: 0.7761 | Evaluating Parameters: numTrees: 50, maxDepth: 5, maxBins: 64, featureSubsetStrategy: auto


Hyperparameter Tuning:  11%|█         | 4/36 [01:19<10:53, 20.41s/it]

Average areaUnderROC: 0.7761 | Evaluating Parameters: numTrees: 50, maxDepth: 5, maxBins: 64, featureSubsetStrategy: sqrt


Hyperparameter Tuning:  14%|█▍        | 5/36 [01:40<10:30, 20.35s/it]

Average areaUnderROC: 0.7782 | Evaluating Parameters: numTrees: 50, maxDepth: 5, maxBins: 128, featureSubsetStrategy: auto


Hyperparameter Tuning:  17%|█▋        | 6/36 [02:02<10:27, 20.92s/it]

Average areaUnderROC: 0.7782 | Evaluating Parameters: numTrees: 50, maxDepth: 5, maxBins: 128, featureSubsetStrategy: sqrt


Hyperparameter Tuning:  19%|█▉        | 7/36 [02:31<11:26, 23.68s/it]

Average areaUnderROC: 0.7727 | Evaluating Parameters: numTrees: 50, maxDepth: 10, maxBins: 32, featureSubsetStrategy: auto


Hyperparameter Tuning:  22%|██▏       | 8/36 [02:58<11:31, 24.68s/it]

Average areaUnderROC: 0.7727 | Evaluating Parameters: numTrees: 50, maxDepth: 10, maxBins: 32, featureSubsetStrategy: sqrt


Hyperparameter Tuning:  25%|██▌       | 9/36 [03:29<11:59, 26.64s/it]

Average areaUnderROC: 0.7718 | Evaluating Parameters: numTrees: 50, maxDepth: 10, maxBins: 64, featureSubsetStrategy: auto


Hyperparameter Tuning:  28%|██▊       | 10/36 [04:02<12:27, 28.73s/it]

Average areaUnderROC: 0.7718 | Evaluating Parameters: numTrees: 50, maxDepth: 10, maxBins: 64, featureSubsetStrategy: sqrt


Hyperparameter Tuning:  31%|███       | 11/36 [04:44<13:39, 32.76s/it]

Average areaUnderROC: 0.7709 | Evaluating Parameters: numTrees: 50, maxDepth: 10, maxBins: 128, featureSubsetStrategy: auto


Hyperparameter Tuning:  33%|███▎      | 12/36 [05:28<14:28, 36.18s/it]

Average areaUnderROC: 0.7709 | Evaluating Parameters: numTrees: 50, maxDepth: 10, maxBins: 128, featureSubsetStrategy: sqrt


Hyperparameter Tuning:  36%|███▌      | 13/36 [06:14<15:03, 39.29s/it]

Average areaUnderROC: 0.7702 | Evaluating Parameters: numTrees: 50, maxDepth: 15, maxBins: 32, featureSubsetStrategy: auto


Hyperparameter Tuning:  39%|███▉      | 14/36 [07:06<15:44, 42.94s/it]

Average areaUnderROC: 0.7702 | Evaluating Parameters: numTrees: 50, maxDepth: 15, maxBins: 32, featureSubsetStrategy: sqrt


Hyperparameter Tuning:  42%|████▏     | 15/36 [08:00<16:12, 46.31s/it]

Average areaUnderROC: 0.7707 | Evaluating Parameters: numTrees: 50, maxDepth: 15, maxBins: 64, featureSubsetStrategy: auto


Hyperparameter Tuning:  44%|████▍     | 16/36 [08:58<16:34, 49.75s/it]

Average areaUnderROC: 0.7707 | Evaluating Parameters: numTrees: 50, maxDepth: 15, maxBins: 64, featureSubsetStrategy: sqrt


Hyperparameter Tuning:  47%|████▋     | 17/36 [10:12<18:05, 57.12s/it]

Average areaUnderROC: 0.7626 | Evaluating Parameters: numTrees: 50, maxDepth: 15, maxBins: 128, featureSubsetStrategy: auto


Hyperparameter Tuning:  50%|█████     | 18/36 [11:19<18:04, 60.23s/it]

Average areaUnderROC: 0.7626 | Evaluating Parameters: numTrees: 50, maxDepth: 15, maxBins: 128, featureSubsetStrategy: sqrt


Hyperparameter Tuning:  53%|█████▎    | 19/36 [11:43<13:54, 49.09s/it]

Average areaUnderROC: 0.7781 | Evaluating Parameters: numTrees: 150, maxDepth: 5, maxBins: 32, featureSubsetStrategy: auto


Hyperparameter Tuning:  56%|█████▌    | 20/36 [12:10<11:20, 42.56s/it]

Average areaUnderROC: 0.7781 | Evaluating Parameters: numTrees: 150, maxDepth: 5, maxBins: 32, featureSubsetStrategy: sqrt


Hyperparameter Tuning:  58%|█████▊    | 21/36 [12:41<09:46, 39.11s/it]

Average areaUnderROC: 0.7769 | Evaluating Parameters: numTrees: 150, maxDepth: 5, maxBins: 64, featureSubsetStrategy: auto


Hyperparameter Tuning:  61%|██████    | 22/36 [13:11<08:31, 36.52s/it]

Average areaUnderROC: 0.7769 | Evaluating Parameters: numTrees: 150, maxDepth: 5, maxBins: 64, featureSubsetStrategy: sqrt


Hyperparameter Tuning:  64%|██████▍   | 23/36 [13:46<07:45, 35.84s/it]

Average areaUnderROC: 0.7783 | Evaluating Parameters: numTrees: 150, maxDepth: 5, maxBins: 128, featureSubsetStrategy: auto


Hyperparameter Tuning:  67%|██████▋   | 24/36 [14:19<07:01, 35.11s/it]

Average areaUnderROC: 0.7783 | Evaluating Parameters: numTrees: 150, maxDepth: 5, maxBins: 128, featureSubsetStrategy: sqrt


Hyperparameter Tuning:  69%|██████▉   | 25/36 [15:21<07:54, 43.14s/it]

Average areaUnderROC: 0.7734 | Evaluating Parameters: numTrees: 150, maxDepth: 10, maxBins: 32, featureSubsetStrategy: auto


Hyperparameter Tuning:  72%|███████▏  | 26/36 [16:23<08:06, 48.68s/it]

Average areaUnderROC: 0.7734 | Evaluating Parameters: numTrees: 150, maxDepth: 10, maxBins: 32, featureSubsetStrategy: sqrt


Hyperparameter Tuning:  75%|███████▌  | 27/36 [17:35<08:21, 55.76s/it]

Average areaUnderROC: 0.7701 | Evaluating Parameters: numTrees: 150, maxDepth: 10, maxBins: 64, featureSubsetStrategy: auto


Hyperparameter Tuning:  78%|███████▊  | 28/36 [18:47<08:05, 60.64s/it]

Average areaUnderROC: 0.7701 | Evaluating Parameters: numTrees: 150, maxDepth: 10, maxBins: 64, featureSubsetStrategy: sqrt


Hyperparameter Tuning:  81%|████████  | 29/36 [20:31<08:36, 73.79s/it]

Average areaUnderROC: 0.7753 | Evaluating Parameters: numTrees: 150, maxDepth: 10, maxBins: 128, featureSubsetStrategy: auto


Hyperparameter Tuning:  83%|████████▎ | 30/36 [22:17<08:19, 83.27s/it]

Average areaUnderROC: 0.7753 | Evaluating Parameters: numTrees: 150, maxDepth: 10, maxBins: 128, featureSubsetStrategy: sqrt


Hyperparameter Tuning:  86%|████████▌ | 31/36 [23:51<07:12, 86.55s/it]

Average areaUnderROC: 0.7700 | Evaluating Parameters: numTrees: 150, maxDepth: 15, maxBins: 32, featureSubsetStrategy: auto


Hyperparameter Tuning:  89%|████████▉ | 32/36 [25:23<05:52, 88.24s/it]

Average areaUnderROC: 0.7700 | Evaluating Parameters: numTrees: 150, maxDepth: 15, maxBins: 32, featureSubsetStrategy: sqrt


Hyperparameter Tuning:  92%|█████████▏| 33/36 [27:18<04:48, 96.15s/it]

Average areaUnderROC: 0.7670 | Evaluating Parameters: numTrees: 150, maxDepth: 15, maxBins: 64, featureSubsetStrategy: auto


Hyperparameter Tuning:  94%|█████████▍| 34/36 [29:12<03:23, 101.70s/it]

Average areaUnderROC: 0.7670 | Evaluating Parameters: numTrees: 150, maxDepth: 15, maxBins: 64, featureSubsetStrategy: sqrt


Hyperparameter Tuning:  97%|█████████▋| 35/36 [31:58<02:00, 120.90s/it]

Average areaUnderROC: 0.7679 | Evaluating Parameters: numTrees: 150, maxDepth: 15, maxBins: 128, featureSubsetStrategy: auto


Hyperparameter Tuning: 100%|██████████| 36/36 [34:43<00:00, 57.88s/it] 

Average areaUnderROC: 0.7679 | Evaluating Parameters: numTrees: 150, maxDepth: 15, maxBins: 128, featureSubsetStrategy: sqrt
Best Overall Parameters: numTrees: 150, maxDepth: 5, maxBins: 128, featureSubsetStrategy: auto
Best areaUnderROC: 0.7783

 Testing model on test dataset

Number of initial columns: 119, number of feature columns: 99





Final Model Evaluation on Test Data areaUnderROC: 0.7844
Final Accuracy: 0.7844, Precision: 0.7823, Recall: 0.7881

Sorted Feature Importances:
Feature: original_firstorder_90Percentile, Importance: 0.1212
Feature: original_firstorder_RootMeanSquared, Importance: 0.1071
Feature: original_firstorder_MeanAbsoluteDeviation, Importance: 0.0766
Feature: original_firstorder_Mean, Importance: 0.0662
Feature: original_firstorder_TotalEnergy, Importance: 0.0642
Feature: original_firstorder_Energy, Importance: 0.0586
Feature: original_firstorder_Variance, Importance: 0.0561
Feature: diagnostics_Image-original_Mean, Importance: 0.0414
Feature: original_firstorder_Median, Importance: 0.0369
Feature: original_firstorder_RobustMeanAbsoluteDeviation, Importance: 0.0342
Feature: original_firstorder_InterquartileRange, Importance: 0.0315
Feature: original_firstorder_Skewness, Importance: 0.0256
Feature: original_glszm_SizeZoneNonUniformity, Importance: 0.0202
Feature: original_glszm_GrayLevelNonUniform

In [37]:
# Linear SVC with Standard Scaler
svc = LinearSVC(labelCol="class", featuresCol="features")

# Set up the parameter grid for hyperparameter tuning
svc_param_grid = ParamGridBuilder() \
    .addGrid(svc.maxIter, [100, 500, 1000]) \
    .addGrid(svc.regParam, [0.01, 0.1]) \
    .addGrid(svc.tol, [1e-4,  1e-2]) \
    .build()

# Call the main function with the XGBoost classifier
model = main(svc, file_name, svc_param_grid, is_tree=False, use_standard_scaler=True)



 Setting up

No overlapping patients between training and test sets.
Training size: 2260 (80.77%), Test size: 538 rows (19.23%)

 Hyperparameter tunning

Fold 1: 522 rows (23.10%) | Fold 2: 426 rows (18.85%) | Fold 3: 420 rows (18.58%) | Fold 4: 474 rows (20.97%) | Fold 5: 418 rows (18.50%)


Hyperparameter Tuning:   8%|▊         | 1/12 [00:39<07:09, 39.09s/it]

Average areaUnderROC: 0.7765 | Evaluating Parameters: maxIter: 100, regParam: 0.01, tol: 0.0001


Hyperparameter Tuning:  17%|█▋        | 2/12 [01:04<05:10, 31.01s/it]

Average areaUnderROC: 0.7746 | Evaluating Parameters: maxIter: 100, regParam: 0.01, tol: 0.01


Hyperparameter Tuning:  25%|██▌       | 3/12 [01:34<04:34, 30.54s/it]

Average areaUnderROC: 0.7734 | Evaluating Parameters: maxIter: 100, regParam: 0.1, tol: 0.0001


Hyperparameter Tuning:  33%|███▎      | 4/12 [01:54<03:30, 26.37s/it]

Average areaUnderROC: 0.7752 | Evaluating Parameters: maxIter: 100, regParam: 0.1, tol: 0.01


Hyperparameter Tuning:  42%|████▏     | 5/12 [02:41<03:57, 33.95s/it]

Average areaUnderROC: 0.7767 | Evaluating Parameters: maxIter: 500, regParam: 0.01, tol: 0.0001


Hyperparameter Tuning:  50%|█████     | 6/12 [03:07<03:06, 31.03s/it]

Average areaUnderROC: 0.7746 | Evaluating Parameters: maxIter: 500, regParam: 0.01, tol: 0.01


Hyperparameter Tuning:  58%|█████▊    | 7/12 [03:37<02:34, 30.85s/it]

Average areaUnderROC: 0.7734 | Evaluating Parameters: maxIter: 500, regParam: 0.1, tol: 0.0001


Hyperparameter Tuning:  67%|██████▋   | 8/12 [03:57<01:49, 27.39s/it]

Average areaUnderROC: 0.7752 | Evaluating Parameters: maxIter: 500, regParam: 0.1, tol: 0.01


Hyperparameter Tuning:  75%|███████▌  | 9/12 [04:50<01:46, 35.42s/it]

Average areaUnderROC: 0.7767 | Evaluating Parameters: maxIter: 1000, regParam: 0.01, tol: 0.0001


Hyperparameter Tuning:  83%|████████▎ | 10/12 [05:11<01:02, 31.01s/it]

Average areaUnderROC: 0.7746 | Evaluating Parameters: maxIter: 1000, regParam: 0.01, tol: 0.01


Hyperparameter Tuning:  92%|█████████▏| 11/12 [05:43<00:31, 31.08s/it]

Average areaUnderROC: 0.7734 | Evaluating Parameters: maxIter: 1000, regParam: 0.1, tol: 0.0001


Hyperparameter Tuning: 100%|██████████| 12/12 [06:08<00:00, 30.69s/it]

Average areaUnderROC: 0.7752 | Evaluating Parameters: maxIter: 1000, regParam: 0.1, tol: 0.01
Best Overall Parameters: maxIter: 500, regParam: 0.01, tol: 0.0001
Best areaUnderROC: 0.7767

 Testing model on test dataset






Number of initial columns: 119, number of feature columns: 99
Final Model Evaluation on Test Data areaUnderROC: 0.8086
Final Accuracy: 0.8086, Precision: 0.7923, Recall: 0.8364

First 5 names of wrongly classified rows:
['auth_001-000084_001-000084_MG_BL_Series-3_Image-1-0.png', 'hcs_003-000029_003-000029_MG_BL_Series-1001_Image-1001-0.png', 'hcs_003-000029_003-000029_MG_BL_Series-1002_Image-1002-0.png', 'hcs_003-000029_003-000029_MG_BL_Series-1003_Image-1003-0.png', 'hcs_003-000107_003-000107_MG_BL_Series-1001_Image-1001-1.png']


In [38]:
xgb_classifier = SparkXGBClassifier(label_col="class", features_col="features", use_gpu=False) 

# Set up the parameter grid for hyperparameter tuning
xgb_param_grid = ParamGridBuilder() \
    .addGrid(xgb_classifier.max_depth, [3, 6, 9, 12]) \
    .addGrid(xgb_classifier.n_estimators, [50, 100]) \
    .addGrid(xgb_classifier.learning_rate, [0.1]) \
    .addGrid(xgb_classifier.subsample, [0.8]) \
    .addGrid(xgb_classifier.colsample_bytree, [0.8]) \
    .build()

# Call the main function to train and evaluate the model
model = main(xgb_classifier, file_name, xgb_param_grid, is_tree=False)


 Setting up

No overlapping patients between training and test sets.
Training size: 2260 (80.77%), Test size: 538 rows (19.23%)

 Hyperparameter tunning

Fold 1: 522 rows (23.10%) | Fold 2: 426 rows (18.85%) | Fold 3: 420 rows (18.58%) | Fold 4: 474 rows (20.97%) | Fold 5: 418 rows (18.50%)


Hyperparameter Tuning:   0%|          | 0/8 [00:00<?, ?it/s]2024-09-20 00:16:18,898 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 3, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 50}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-20 00:16:27,804 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-20 00:16:33,157 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 3, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 50}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-20 00:16:37,906 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-20 00:16:43,257 INFO XGBoost

Average areaUnderROC: 0.7759 | Evaluating Parameters: max_depth: 3, n_estimators: 50, learning_rate: 0.1, subsample: 0.8, colsample_bytree: 0.8


2024-09-20 00:17:14,879 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 3, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-20 00:17:20,527 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-20 00:17:27,560 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 3, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-20 00:17:33,193 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-20 00:17:40,073 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	bo

Average areaUnderROC: 0.7785 | Evaluating Parameters: max_depth: 3, n_estimators: 100, learning_rate: 0.1, subsample: 0.8, colsample_bytree: 0.8


2024-09-20 00:18:18,326 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 6, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 50}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-20 00:18:24,234 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-20 00:18:29,746 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 6, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 50}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-20 00:18:35,093 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-20 00:18:41,363 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	boos

Average areaUnderROC: 0.7696 | Evaluating Parameters: max_depth: 6, n_estimators: 50, learning_rate: 0.1, subsample: 0.8, colsample_bytree: 0.8


2024-09-20 00:19:20,341 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 6, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-20 00:19:26,696 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-20 00:19:33,703 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 6, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-20 00:19:40,444 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-20 00:19:47,442 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	bo

Average areaUnderROC: 0.7663 | Evaluating Parameters: max_depth: 6, n_estimators: 100, learning_rate: 0.1, subsample: 0.8, colsample_bytree: 0.8


2024-09-20 00:20:27,913 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 9, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 50}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-20 00:20:34,522 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-20 00:20:41,547 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 9, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 50}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-20 00:20:48,258 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-20 00:20:55,302 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	boos

Average areaUnderROC: 0.7787 | Evaluating Parameters: max_depth: 9, n_estimators: 50, learning_rate: 0.1, subsample: 0.8, colsample_bytree: 0.8


2024-09-20 00:21:36,049 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 9, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-20 00:21:43,543 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-20 00:21:50,555 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 9, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-20 00:21:58,032 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-20 00:22:05,082 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	bo

Average areaUnderROC: 0.7705 | Evaluating Parameters: max_depth: 9, n_estimators: 100, learning_rate: 0.1, subsample: 0.8, colsample_bytree: 0.8


2024-09-20 00:22:48,437 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 12, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 50}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-20 00:22:55,555 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-20 00:23:02,610 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 12, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 50}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-20 00:23:09,909 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-20 00:23:17,000 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	bo

Average areaUnderROC: 0.7670 | Evaluating Parameters: max_depth: 12, n_estimators: 50, learning_rate: 0.1, subsample: 0.8, colsample_bytree: 0.8


2024-09-20 00:23:59,541 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 12, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-20 00:24:07,582 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-20 00:24:15,086 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 12, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-20 00:24:23,210 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-20 00:24:30,218 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	

Average areaUnderROC: 0.7717 | Evaluating Parameters: max_depth: 12, n_estimators: 100, learning_rate: 0.1, subsample: 0.8, colsample_bytree: 0.8
Best Overall Parameters: max_depth: 9, n_estimators: 50, learning_rate: 0.1, subsample: 0.8, colsample_bytree: 0.8
Best areaUnderROC: 0.7787

 Testing model on test dataset

Number of initial columns: 119, number of feature columns: 99



2024-09-20 00:25:13,469 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 9, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 50}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-20 00:25:20,424 INFO XGBoost-PySpark: _fit Finished xgboost training!


Final Model Evaluation on Test Data areaUnderROC: 0.7807
Final Accuracy: 0.7807, Precision: 0.7849, Recall: 0.7732

First 5 names of wrongly classified rows:
['auth_001-000084_001-000084_MG_BL_Series-3_Image-1-0.png', 'hcs_003-000029_003-000029_MG_BL_Series-1001_Image-1001-0.png', 'hcs_003-000029_003-000029_MG_BL_Series-1002_Image-1002-1.png', 'hcs_003-000029_003-000029_MG_BL_Series-1003_Image-1003-0.png', 'hcs_003-000029_003-000029_MG_BL_Series-1003_Image-1003-1.png']


In [39]:
spark.stop()