## Classification

In [1]:
import os
import random
import findspark

from tqdm import tqdm
from dotenv import load_dotenv

from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.types import DoubleType, FloatType, IntegerType

from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, PCA, StandardScaler
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier, LinearSVC

from xgboost.spark import SparkXGBClassifier

# Load environment variables from .env file
_ = load_dotenv()

# Retrieve environment variables
TARGET_DIR_NAME = os.getenv("TARGET_DIR_NAME")

In [2]:
findspark.init()

# Initialize Spark session to run locally
spark = SparkSession.builder \
    .appName("Big Data Classification") \
    .config("spark.master", "local[*]") \
    .config("spark.ui.port", "4050") \
    .getOrCreate()

# Print the number of cores being used by Spark
print(f"Spark Session running on {spark.sparkContext.defaultParallelism} cores. UI is available at: {spark.sparkContext.uiWebUrl}")

Spark Session running on 12 cores. UI is available at: http://172.20.0.1:4050


### Defining functions for infrastructure

In [3]:
def apply_column_types(df):
    """Assign StringType, IntegerType, and set default as DoubleType for other columns."""

    # Define specific StringType and IntegerType columns
    string_columns = ["name", "provider"]
    integer_columns = ["patient", "class"]

    # Apply column types: cast to StringType, IntegerType, and default to DoubleType for others in one line
    df = df.select([col(c).cast(StringType()) if c in string_columns
                    else col(c).cast(IntegerType()) if c in integer_columns
                    else col(c).cast(DoubleType()) for c in df.columns])
    
    return df

In [4]:
def preprocess_data(data: DataFrame, lesion_mask: bool) -> DataFrame:
    """Preprocess the data by removing rows with NaN values."""    

    # Remove rows with NaN values and count the number of rows before and after
    total_rows_before = data.count()
    clean_data = data.dropna() 
    
    if lesion_mask:
        clean_data = clean_data.filter(col('name') != 'hcs_003-001341_003-001341_MG_BL_Series-1005_Image-1005-0.png')
    
    # Apply column types to the cleaned data
    clean_data = apply_column_types(clean_data)
    total_rows_after = clean_data.count()

    if total_rows_before != total_rows_after:
        print(f"Number of rows with NaN values: {total_rows_before - total_rows_after}")

    return clean_data

In [5]:
def check_patient_overlap(train_df: DataFrame, test_df: DataFrame, patient_column: str = 'patient', verbose_just_error: bool = False) -> None:
    """Checks for overlapping patients between train and test DataFrames."""

    # Get unique patients from train and test DataFrames
    train_patients = train_df.select(patient_column).distinct().rdd.flatMap(lambda x: x).collect()
    test_patients = test_df.select(patient_column).distinct().rdd.flatMap(lambda x: x).collect()

    # Convert lists to sets and check for any overlap
    overlapping_patients = set(train_patients).intersection(set(test_patients))

    # Print overlapping patients, if any
    if overlapping_patients:
        print(f"Overlapping patients found: {overlapping_patients}")
    elif not verbose_just_error:
        print("No overlapping patients between training and test sets.")

In [6]:
def calculate_class_distribution(df1: DataFrame, df2: DataFrame, class_column: str = 'class') -> None:
    """Calculates and prints class distribution percentages for two DataFrames in a single line."""
    
    # Calculate class counts for both DataFrames
    df1_counts = df1.groupBy(class_column).count().orderBy(class_column).collect()
    df2_counts = df2.groupBy(class_column).count().orderBy(class_column).collect()
    
    # Calculate total counts
    total_count_df1 = df1.count()
    total_count_df2 = df2.count()
    
    # Prepare class distributions as strings
    df1_distribution = ', '.join([f"{row[class_column]}: {row['count']} ({(row['count'] / total_count_df1) * 100:.2f}%)" for row in df1_counts])
    df2_distribution = ', '.join([f"{row[class_column]}: {row['count']} ({(row['count'] / total_count_df2) * 100:.2f}%)" for row in df2_counts])
    
    # Print class distributions in one line
    print(f"Class distribution in train_df: [{df1_distribution}] | test_df: [{df2_distribution}]")

In [7]:
def train_test_split_by_patient(data: DataFrame, test_ratio: float = 0.2, patient_column: str = 'patient', seed: int = 42) -> tuple[DataFrame, DataFrame]:
    """ Splits the DataFrame into train and test sets based on unique patient IDs, ensuring no patient data overlap."""

    random.seed(seed) # Set seed for reproducibility
    
    # Extract unique patient IDs and randomly shuffle them
    unique_patients = data.select(patient_column).distinct().rdd.flatMap(lambda x: x).collect()
    random.shuffle(unique_patients)
    
    # Split the patients into train and test sets
    split_index = int(len(unique_patients) * (1 - test_ratio))
    train_patients = unique_patients[:split_index]
    test_patients = unique_patients[split_index:]
    
    # Create train and test DataFrames 
    train_df = data.filter(col(patient_column).isin(train_patients))
    test_df = data.filter(col(patient_column).isin(test_patients))
    check_patient_overlap(train_df, test_df)
    
    # Calculate and print the percentages of train and test sets
    total_rows, train_rows, test_rows = data.count(), train_df.count(), test_df.count()
    print(f"Training size: {train_rows} ({(train_rows / total_rows) * 100:.2f}%), Test size: {test_rows} rows ({(test_rows / total_rows) * 100:.2f}%)") 
    
    calculate_class_distribution(train_df, test_df)
    
    return train_df, test_df

In [8]:
def create_folds_by_patient(data: DataFrame, num_folds: int = 5, patient_column: str = 'patient', seed: int = 42) -> list:
    """Splits the data into train-test folds based on unique patient IDs, ensuring no patient overlap across folds."""

    random.seed(seed) # Set seed for reproducibility

    # Extract unique patient IDs and randomly shuffle them
    unique_patients = data.select(patient_column).distinct().rdd.flatMap(lambda x: x).collect()
    random.shuffle(unique_patients)

    # Split patients evenly into folds
    fold_size = len(unique_patients) // num_folds
    folds = [unique_patients[i * fold_size:(i + 1) * fold_size] for i in range(num_folds)]

    # Handle any remaining patients by adding them to the last fold
    for i in range(len(unique_patients) % num_folds):
        folds[i].append(unique_patients[-(i + 1)])

    patient_folds, fold_summaries = [], []

    # Create train-test splits for each fold
    total_rows = data.count()
    for i in range(num_folds):

        # Get the test patients for the current fold and assign the rest to the training set
        test_patients = set(folds[i])
        train_patients = set(unique_patients) - test_patients

        # Filter the data based on the train and test patients
        train_df = data.filter(col(patient_column).isin(train_patients))
        test_df = data.filter(col(patient_column).isin(test_patients))
        check_patient_overlap(train_df, test_df, verbose_just_error=True)

        # Append the train-test split to the list of folds
        patient_folds.append((train_df, test_df))

        # Accumulate test set summary for this fold
        fold_summaries.append(f"Fold {i + 1}: {test_df.count()} rows ({(test_df.count() / total_rows) * 100:.2f}%)")

    print(" | ".join(fold_summaries))

    # Return the list of train-test tuples for each fold
    return patient_folds

In [9]:
def apply_pca(data: DataFrame, output_column: str = 'pca_features', k: int = 10) -> DataFrame:
    """Applies PCA to reduce the dimensionality of the feature set to k principal components."""
    
    # Initialize PCA with the specified number of components
    pca = PCA(k=k, inputCol='features', outputCol=output_column)
    
    # Fit PCA on the data and transform the features
    pca_model = pca.fit(data)
    transformed_data = pca_model.transform(data)
    
    print(f"PCA explained variance ratio: {pca_model.explainedVariance.toArray()}")
    
    return transformed_data

In [10]:
def vectorize_features(data: DataFrame, class_column: str = 'class', patient_column: str = 'patient', output_column: str = 'features', verbose: bool = True, standard_scaler: bool = False) -> DataFrame:
    """Assembles feature columns into a single feature vector column, excluding patient and class columns."""
    
    # Get the numeric columns from the DataFrame
    numeric_columns = [field.name for field in data.schema.fields if isinstance(field.dataType, (DoubleType, FloatType, IntegerType))]

    # Exclude class and patient columns
    feature_columns = [
        col for col in numeric_columns 
        if not col.startswith("diagnostics_") and col not in [class_column, patient_column]
    ]

    if verbose:
        print(f"Number of initial columns: {len(data.columns)}, number of feature columns: {len(feature_columns)}")
    
    # Assemble features into a feature vector
    assembler = VectorAssembler(inputCols = feature_columns, outputCol = 'assembled_features')
    assembled_data = assembler.transform(data)

    # Standardize features
    if standard_scaler:
        scaler = StandardScaler(inputCol="assembled_features", outputCol = output_column, withStd = True, withMean = True)
        scaler_model = scaler.fit(assembled_data)
        scaled_data = scaler_model.transform(assembled_data)
    else:
        # Rename directly if no standardization is applied
        assembled_data = assembled_data.withColumnRenamed('assembled_features', 'features')
        scaled_data = assembled_data

    return scaled_data

In [11]:
def calculate_metrics(predictions: DataFrame) -> tuple:
    """Calculates accuracy, precision, and recall from the predictions DataFrame."""

    # Calculate true positives, true negatives, false positives, and false negatives
    tp = predictions.filter((col('prediction') == 1) & (col('class') == 1)).count()
    tn = predictions.filter((col('prediction') == 0) & (col('class') == 0)).count()
    fp = predictions.filter((col('prediction') == 1) & (col('class') == 0)).count()
    fn = predictions.filter((col('prediction') == 0) & (col('class') == 1)).count()

    # Calculate accuracy, precision, and recall
    accuracy = (tp + tn) / (tp + tn + fp + fn) if (tp + tn + fp + fn) > 0 else 0.0
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0

    return accuracy, precision, recall

In [12]:
def determine_best_parameters_on_folds(model, train_df: DataFrame, paramGrid: list[dict], num_folds: int = 5, patient_column: str = 'patient', standard_scaler: bool = False):
    """Evaluates the Decision Tree model using patient-wise cross-validation with specified hyperparameter tuning."""

    folds = create_folds_by_patient(train_df, num_folds, patient_column)

    # Use BinaryClassificationEvaluator to measure performance
    evaluator = BinaryClassificationEvaluator(labelCol = "class", rawPredictionCol = "prediction", metricName = "areaUnderROC")

    best_params, best_metric = None, float("-inf")

    # Iterate over each parameter combination in the grid
    for params in tqdm(paramGrid, desc="Hyperparameter Tuning", leave = True):
        fold_metrics = []

        # Print parameter combination being evaluated
        param_items = [f"{param.name}: {value}" for param, value in params.items()]

        # For each fold, evaluate the parameter combination
        for _, (train_df, test_df) in enumerate(folds):

            # Vectorize features
            train_df = vectorize_features(train_df, verbose = False, standard_scaler = standard_scaler)
            test_df = vectorize_features(test_df, verbose = False, standard_scaler = standard_scaler)
            check_patient_overlap(train_df, test_df, verbose_just_error = True)

            # Train the model with the current parameters
            current_model = model.copy(params).fit(train_df)

            # Evaluate the model on the test data using the specified metric
            predictions = current_model.transform(test_df)
            metric = evaluator.evaluate(predictions)
            fold_metrics.append(metric)
        
        # Calculate average metric across all folds for the current parameter combination
        avg_metric = __builtins__.sum(fold_metrics) / len(fold_metrics)

        #print(f"Average {evaluator.getMetricName()}: {avg_metric:.4f} | Evaluating Parameters:", ', '.join(param_items))

        # Update the best parameters if the current average metric is better
        if avg_metric > best_metric:
            best_metric = avg_metric
            best_params = params

    # Print the best parameters after evaluating all combinations
    param_items = [f"{param.name}: {value}" for param, value in best_params.items()]
    print("Best Overall Parameters:", ', '.join(param_items))
    print(f"Best {evaluator.getMetricName()}: {best_metric:.4f}")

    return best_params 

In [13]:
def train_and_evaluate_best_model(model, best_params: list[dict], train_df: DataFrame, test_df: DataFrame, is_tree: bool = False, standard_scaler: bool = False):
    """Train the model on the full training data using the best parameters and evaluate on the test data."""

    # Vectorize features
    train_df = vectorize_features(train_df, verbose = False, standard_scaler = standard_scaler)
    test_df = vectorize_features(test_df, standard_scaler=standard_scaler)

    # Train the model using the best parameters
    best_model = model.copy(best_params).fit(train_df)

    # Evaluate the model on the test data
    evaluator = BinaryClassificationEvaluator(labelCol="class", rawPredictionCol="prediction", metricName="areaUnderROC")
    predictions = best_model.transform(test_df)
    metric = evaluator.evaluate(predictions)

    # Calculate additional metrics: accuracy, precision, recall
    accuracy, precision, recall = calculate_metrics(predictions)

    print(f"Final Model Evaluation on Test Data {evaluator.getMetricName()}: {metric:.4f}")
    print(f"Final Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}")

    # Print feature importances if the model is a tree-based model
    if is_tree:
        # Extract the feature importances from the model
        feature_importances = best_model.featureImportances.toArray()

        # Get feature column names from the assembler
        feature_columns = train_df.schema['features'].metadata['ml_attr']['attrs']['numeric']
        feature_names = [attr['name'] for attr in feature_columns]

        # Combine feature names and their importances, then sort by importance
        sorted_features = sorted(zip(feature_names, feature_importances), key=lambda x: x[1], reverse=True)

        # Print sorted feature importances
        print("\nSorted Feature Importances:")
        for name, importance in sorted_features:
            print(f"Feature: {name}, Importance: {importance:.4f}")

    # Show the first 5 names of wrongly classified rows
    misclassified = predictions.filter(col('prediction') != col('class'))
    names = misclassified.select('name').limit(5).rdd.flatMap(lambda x: x).collect()
    print("\nFirst 5 names of wrongly classified rows:")
    print(names)

    return best_model

### Main section

In [14]:
def main(model: str, file_name: str, paramGrid: list[dict], is_tree: bool = False, use_standard_scaler: bool = False):

    # Load and preprocess the dataset
    file_path = os.path.join(os.getcwd(), TARGET_DIR_NAME, file_name)
    data = spark.read.csv(file_path, header=True, inferSchema=True)

    print(f"\n{'=' * 75}\n Setting up\n{'=' * 75}\n")
    if file_name.endswith('lesion_mask.csv'):
        clean_data = preprocess_data(data, lesion_mask = True)
    else:
        clean_data = preprocess_data(data, lesion_mask = False)
    #clean_data = clean_data.limit(100)

    # split the data into train and test sets
    train_df, test_df = train_test_split_by_patient(clean_data, test_ratio=0.2)

    # Evaluate the model using patient-wise cross-validation to find the best parameters
    print(f"\n{'=' * 75}\n Hyperparameter tunning\n{'=' * 75}\n")
    best_params = determine_best_parameters_on_folds(model, train_df, paramGrid, num_folds=5, patient_column='patient', standard_scaler=use_standard_scaler)

    # Train the best model on the full training data and evaluate on the test data
    print(f"\n{'=' * 75}\n Testing model on test dataset\n{'=' * 75}\n")
    best_model = train_and_evaluate_best_model(model, best_params, train_df, test_df, is_tree, standard_scaler=use_standard_scaler)

    return best_model

### Testing different models

#### 512 - with lesion mask

In [15]:
file_name = 'features_512_lesion_mask.csv'

In [16]:
decision_tree = DecisionTreeClassifier(labelCol = "class", featuresCol = "features")

# Set up parameter grid for hyperparameter tuning
param_grid = ParamGridBuilder() \
    .addGrid(decision_tree.maxDepth, [5, 10, 15, 20]) \
    .addGrid(decision_tree.maxBins, [32, 64]) \
    .build()

# Call the main function to train and evaluate the model
model = main(decision_tree, file_name, param_grid, is_tree=True)


 Setting up

Number of rows with NaN values: 1
No overlapping patients between training and test sets.
Training size: 2173 (79.51%), Test size: 560 rows (20.49%)

 Hyperparameter tunning

Fold 1: 420 rows (19.33%) | Fold 2: 467 rows (21.49%) | Fold 3: 470 rows (21.63%) | Fold 4: 398 rows (18.32%) | Fold 5: 418 rows (19.24%)


Hyperparameter Tuning: 100%|██████████| 8/8 [02:37<00:00, 19.72s/it]


Best Overall Parameters: maxDepth: 5, maxBins: 32
Best areaUnderROC: 0.9790

 Testing model on test dataset

Number of initial columns: 119, number of feature columns: 93
Final Model Evaluation on Test Data areaUnderROC: 0.9536
Final Accuracy: 0.9536, Precision: 0.9291, Recall: 0.9821

Sorted Feature Importances:
Feature: original_gldm_GrayLevelNonUniformity, Importance: 0.9163
Feature: original_gldm_DependenceNonUniformity, Importance: 0.0391
Feature: original_glrlm_RunLengthNonUniformity, Importance: 0.0352
Feature: original_firstorder_90Percentile, Importance: 0.0036
Feature: original_glszm_SizeZoneNonUniformity, Importance: 0.0022
Feature: original_glszm_HighGrayLevelZoneEmphasis, Importance: 0.0019
Feature: original_firstorder_Entropy, Importance: 0.0017
Feature: original_firstorder_10Percentile, Importance: 0.0000
Feature: original_firstorder_Energy, Importance: 0.0000
Feature: original_firstorder_InterquartileRange, Importance: 0.0000
Feature: original_firstorder_Kurtosis, Impor

In [17]:
random_forest = RandomForestClassifier(labelCol="class", featuresCol="features")

# Set up parameter grid for hyperparameter tuning
rf_param_grid = ParamGridBuilder() \
    .addGrid(random_forest.numTrees, [50, 150, 200]) \
    .addGrid(random_forest.maxDepth, [5, 10, 15, 20]) \
    .addGrid(random_forest.maxBins, [32, 64, 128]) \
    .addGrid(random_forest.featureSubsetStrategy, ['auto', 'sqrt']) \
    .build()

# Call the main function to train and evaluate the model
model = main(random_forest, file_name, rf_param_grid, is_tree=True)


 Setting up

Number of rows with NaN values: 1
No overlapping patients between training and test sets.
Training size: 2173 (79.51%), Test size: 560 rows (20.49%)

 Hyperparameter tunning

Fold 1: 420 rows (19.33%) | Fold 2: 467 rows (21.49%) | Fold 3: 470 rows (21.63%) | Fold 4: 398 rows (18.32%) | Fold 5: 418 rows (19.24%)


Hyperparameter Tuning: 100%|██████████| 72/72 [37:50<00:00, 31.53s/it]


Best Overall Parameters: numTrees: 150, maxDepth: 10, maxBins: 32, featureSubsetStrategy: auto
Best areaUnderROC: 0.9879

 Testing model on test dataset

Number of initial columns: 119, number of feature columns: 93
Final Model Evaluation on Test Data areaUnderROC: 0.9554
Final Accuracy: 0.9554, Precision: 0.9180, Recall: 1.0000

Sorted Feature Importances:
Feature: original_gldm_DependenceNonUniformity, Importance: 0.1591
Feature: original_gldm_GrayLevelNonUniformity, Importance: 0.1269
Feature: original_glrlm_GrayLevelNonUniformity, Importance: 0.0960
Feature: original_ngtdm_Busyness, Importance: 0.0706
Feature: original_glrlm_RunLengthNonUniformity, Importance: 0.0677
Feature: original_glszm_GrayLevelNonUniformity, Importance: 0.0529
Feature: original_ngtdm_Coarseness, Importance: 0.0416
Feature: original_glszm_SizeZoneNonUniformity, Importance: 0.0333
Feature: original_ngtdm_Strength, Importance: 0.0327
Feature: original_glrlm_LowGrayLevelRunEmphasis, Importance: 0.0233
Feature: or

In [18]:
xgb_classifier = SparkXGBClassifier(label_col="class", features_col="features", use_gpu=False) 

# Set up the parameter grid for hyperparameter tuning
xgb_param_grid = ParamGridBuilder() \
    .addGrid(xgb_classifier.max_depth, [3, 6, 9, 12, 15]) \
    .addGrid(xgb_classifier.n_estimators, [50, 100, 200]) \
    .addGrid(xgb_classifier.learning_rate, [0.1]) \
    .addGrid(xgb_classifier.subsample, [0.8]) \
    .addGrid(xgb_classifier.colsample_bytree, [0.8]) \
    .build()

# Call the main function to train and evaluate the model
model = main(xgb_classifier, file_name, xgb_param_grid, is_tree=False)


 Setting up

Number of rows with NaN values: 1
No overlapping patients between training and test sets.
Training size: 2173 (79.51%), Test size: 560 rows (20.49%)

 Hyperparameter tunning

Fold 1: 420 rows (19.33%) | Fold 2: 467 rows (21.49%) | Fold 3: 470 rows (21.63%) | Fold 4: 398 rows (18.32%) | Fold 5: 418 rows (19.24%)


Hyperparameter Tuning:   0%|          | 0/15 [00:00<?, ?it/s]2024-09-23 18:46:11,769 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 3, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 50}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-23 18:46:20,580 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-23 18:46:27,933 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 3, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 50}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-23 18:46:32,728 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-23 18:46:38,032 INFO XGBoos

Best Overall Parameters: max_depth: 6, n_estimators: 200, learning_rate: 0.1, subsample: 0.8, colsample_bytree: 0.8
Best areaUnderROC: 0.9896

 Testing model on test dataset

Number of initial columns: 119, number of feature columns: 93


2024-09-23 19:02:01,937 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 6, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 200}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-23 19:02:07,988 INFO XGBoost-PySpark: _fit Finished xgboost training!


Final Model Evaluation on Test Data areaUnderROC: 0.9589
Final Accuracy: 0.9589, Precision: 0.9241, Recall: 1.0000

First 5 names of wrongly classified rows:
['hcs_003-001862_003-001862_MG_BL_Series-1001_Image-1001-0.png', 'hcs_003-001862_003-001862_MG_BL_Series-1001_Image-1003-0.png', 'hcs_003-001987_003-001987_MG_BL_Series-1_Image-1-0.png', 'hcs_003-001987_003-001987_MG_BL_Series-3_Image-1-0.png', 'hcs_003-001987_003-001987_MG_BL_Series-4_Image-1-0.png']


In [19]:
# Linear SVC with Standard Scaler
svc = LinearSVC(labelCol="class", featuresCol="features")

# Set up the parameter grid for hyperparameter tuning
svc_param_grid = ParamGridBuilder() \
    .addGrid(svc.maxIter, [100, 500, 1000]) \
    .addGrid(svc.regParam, [0.01, 0.1]) \
    .addGrid(svc.tol, [1e-4,  1e-2]) \
    .build()

# Call the main function with the XGBoost classifier
model = main(svc, file_name, svc_param_grid, is_tree=False, use_standard_scaler=True)


 Setting up

Number of rows with NaN values: 1
No overlapping patients between training and test sets.
Training size: 2173 (79.51%), Test size: 560 rows (20.49%)

 Hyperparameter tunning

Fold 1: 420 rows (19.33%) | Fold 2: 467 rows (21.49%) | Fold 3: 470 rows (21.63%) | Fold 4: 398 rows (18.32%) | Fold 5: 418 rows (19.24%)


Hyperparameter Tuning: 100%|██████████| 12/12 [06:13<00:00, 31.10s/it]


Best Overall Parameters: maxIter: 100, regParam: 0.01, tol: 0.0001
Best areaUnderROC: 0.9858

 Testing model on test dataset

Number of initial columns: 119, number of feature columns: 93
Final Model Evaluation on Test Data areaUnderROC: 0.9554
Final Accuracy: 0.9554, Precision: 0.9180, Recall: 1.0000

First 5 names of wrongly classified rows:
['hcs_003-001369_003-001369_MG_BL_Series-1001_Image-1003-0.png', 'hcs_003-001862_003-001862_MG_BL_Series-1001_Image-1001-0.png', 'hcs_003-001862_003-001862_MG_BL_Series-1001_Image-1003-0.png', 'hcs_003-001987_003-001987_MG_BL_Series-1_Image-1-0.png', 'hcs_003-001987_003-001987_MG_BL_Series-3_Image-1-0.png']


#### 512 - with full mask

In [20]:
file_name = 'features_512_full_mask.csv'

In [21]:
decision_tree = DecisionTreeClassifier(labelCol = "class", featuresCol = "features")

# Set up parameter grid for hyperparameter tuning
param_grid = ParamGridBuilder() \
    .addGrid(decision_tree.maxDepth, [5, 10, 15, 20]) \
    .addGrid(decision_tree.maxBins, [32, 64]) \
    .build()

# Call the main function to train and evaluate the model
model = main(decision_tree, file_name, param_grid, is_tree=True)


 Setting up

No overlapping patients between training and test sets.
Training size: 2162 (79.08%), Test size: 572 rows (20.92%)

 Hyperparameter tunning

Fold 1: 424 rows (19.61%) | Fold 2: 422 rows (19.52%) | Fold 3: 450 rows (20.81%) | Fold 4: 398 rows (18.41%) | Fold 5: 468 rows (21.65%)


Hyperparameter Tuning: 100%|██████████| 8/8 [02:38<00:00, 19.83s/it]


Best Overall Parameters: maxDepth: 5, maxBins: 32
Best areaUnderROC: 0.6755

 Testing model on test dataset

Number of initial columns: 119, number of feature columns: 93
Final Model Evaluation on Test Data areaUnderROC: 0.6731
Final Accuracy: 0.6731, Precision: 0.6813, Recall: 0.6503

Sorted Feature Importances:
Feature: original_firstorder_Variance, Importance: 0.5438
Feature: original_firstorder_90Percentile, Importance: 0.0845
Feature: original_glcm_Imc2, Importance: 0.0819
Feature: original_firstorder_Skewness, Importance: 0.0680
Feature: original_firstorder_10Percentile, Importance: 0.0508
Feature: original_glrlm_RunEntropy, Importance: 0.0288
Feature: original_glszm_HighGrayLevelZoneEmphasis, Importance: 0.0242
Feature: original_firstorder_Kurtosis, Importance: 0.0232
Feature: original_glcm_ClusterProminence, Importance: 0.0216
Feature: original_glszm_SmallAreaEmphasis, Importance: 0.0172
Feature: original_firstorder_Entropy, Importance: 0.0120
Feature: original_glrlm_GrayLevelN

In [22]:
random_forest = RandomForestClassifier(labelCol="class", featuresCol="features")

# Set up parameter grid for hyperparameter tuning
rf_param_grid = ParamGridBuilder() \
    .addGrid(random_forest.numTrees, [50, 150]) \
    .addGrid(random_forest.maxDepth, [5, 10, 15]) \
    .addGrid(random_forest.maxBins, [32, 64, 128]) \
    .addGrid(random_forest.featureSubsetStrategy, ['auto', 'sqrt']) \
    .build()

# Call the main function to train and evaluate the model
model = main(random_forest, file_name, rf_param_grid, is_tree=True)


 Setting up

No overlapping patients between training and test sets.
Training size: 2162 (79.08%), Test size: 572 rows (20.92%)

 Hyperparameter tunning

Fold 1: 424 rows (19.61%) | Fold 2: 422 rows (19.52%) | Fold 3: 450 rows (20.81%) | Fold 4: 398 rows (18.41%) | Fold 5: 468 rows (21.65%)


Hyperparameter Tuning: 100%|██████████| 36/36 [33:44<00:00, 56.23s/it] 


Best Overall Parameters: numTrees: 150, maxDepth: 10, maxBins: 64, featureSubsetStrategy: auto
Best areaUnderROC: 0.6857

 Testing model on test dataset

Number of initial columns: 119, number of feature columns: 93
Final Model Evaluation on Test Data areaUnderROC: 0.6941
Final Accuracy: 0.6941, Precision: 0.6881, Recall: 0.7098

Sorted Feature Importances:
Feature: original_firstorder_Variance, Importance: 0.0627
Feature: original_firstorder_90Percentile, Importance: 0.0457
Feature: original_firstorder_MeanAbsoluteDeviation, Importance: 0.0456
Feature: original_firstorder_InterquartileRange, Importance: 0.0366
Feature: original_firstorder_RobustMeanAbsoluteDeviation, Importance: 0.0354
Feature: original_firstorder_Skewness, Importance: 0.0345
Feature: original_firstorder_Energy, Importance: 0.0289
Feature: original_firstorder_TotalEnergy, Importance: 0.0273
Feature: original_firstorder_Kurtosis, Importance: 0.0236
Feature: original_firstorder_RootMeanSquared, Importance: 0.0225
Featur

In [23]:
xgb_classifier = SparkXGBClassifier(label_col="class", features_col="features", use_gpu=False) 

# Set up the parameter grid for hyperparameter tuning
xgb_param_grid = ParamGridBuilder() \
    .addGrid(xgb_classifier.max_depth, [3, 6, 9, 12, 15]) \
    .addGrid(xgb_classifier.n_estimators, [50, 100, 200]) \
    .addGrid(xgb_classifier.learning_rate, [0.1]) \
    .addGrid(xgb_classifier.subsample, [0.8]) \
    .addGrid(xgb_classifier.colsample_bytree, [0.8]) \
    .build()

# Call the main function to train and evaluate the model
model = main(xgb_classifier, file_name, xgb_param_grid, is_tree=False)


 Setting up

No overlapping patients between training and test sets.
Training size: 2162 (79.08%), Test size: 572 rows (20.92%)

 Hyperparameter tunning

Fold 1: 424 rows (19.61%) | Fold 2: 422 rows (19.52%) | Fold 3: 450 rows (20.81%) | Fold 4: 398 rows (18.41%) | Fold 5: 468 rows (21.65%)


Hyperparameter Tuning:   0%|          | 0/15 [00:00<?, ?it/s]2024-09-23 19:47:08,243 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 3, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 50}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-23 19:47:13,099 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-23 19:47:19,461 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 3, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 50}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-23 19:47:24,962 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-23 19:47:31,702 INFO XGBoos

Best Overall Parameters: max_depth: 3, n_estimators: 100, learning_rate: 0.1, subsample: 0.8, colsample_bytree: 0.8
Best areaUnderROC: 0.7016

 Testing model on test dataset

Number of initial columns: 119, number of feature columns: 93


2024-09-23 20:05:13,279 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 3, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-23 20:05:19,417 INFO XGBoost-PySpark: _fit Finished xgboost training!


Final Model Evaluation on Test Data areaUnderROC: 0.7133
Final Accuracy: 0.7133, Precision: 0.6930, Recall: 0.7657

First 5 names of wrongly classified rows:
['auth_001-000061_001-000061_MG_BL_Series-8_Image-1-0.png', 'auth_001-000084_001-000084_MG_BL_Series-1_Image-1-1.png', 'auth_001-000084_001-000084_MG_BL_Series-3_Image-1-0.png', 'hcs_003-000029_003-000029_MG_BL_Series-1001_Image-1001-0.png', 'hcs_003-000029_003-000029_MG_BL_Series-1001_Image-1001-1.png']


In [24]:
# Linear SVC with Standard Scaler
svc = LinearSVC(labelCol="class", featuresCol="features")

# Set up the parameter grid for hyperparameter tuning
svc_param_grid = ParamGridBuilder() \
    .addGrid(svc.maxIter, [100, 500, 1000]) \
    .addGrid(svc.regParam, [0.01, 0.1]) \
    .addGrid(svc.tol, [1e-4, 1e-2]) \
    .build()

# Call the main function with the XGBoost classifier
model = main(svc, file_name, svc_param_grid, is_tree=False, use_standard_scaler=True)


 Setting up

No overlapping patients between training and test sets.
Training size: 2162 (79.08%), Test size: 572 rows (20.92%)

 Hyperparameter tunning

Fold 1: 424 rows (19.61%) | Fold 2: 422 rows (19.52%) | Fold 3: 450 rows (20.81%) | Fold 4: 398 rows (18.41%) | Fold 5: 468 rows (21.65%)


Hyperparameter Tuning: 100%|██████████| 12/12 [07:08<00:00, 35.74s/it]


Best Overall Parameters: maxIter: 500, regParam: 0.01, tol: 0.0001
Best areaUnderROC: 0.6994

 Testing model on test dataset

Number of initial columns: 119, number of feature columns: 93
Final Model Evaluation on Test Data areaUnderROC: 0.6993
Final Accuracy: 0.6993, Precision: 0.6887, Recall: 0.7273

First 5 names of wrongly classified rows:
['auth_001-000061_001-000061_MG_BL_Series-8_Image-1-0.png', 'auth_001-000084_001-000084_MG_BL_Series-3_Image-1-0.png', 'hcs_003-000029_003-000029_MG_BL_Series-1001_Image-1001-0.png', 'hcs_003-000029_003-000029_MG_BL_Series-1001_Image-1001-1.png', 'hcs_003-000029_003-000029_MG_BL_Series-1002_Image-1002-0.png']


#### 256 - with lesion mask

In [25]:
file_name = 'features_256_lesion_mask.csv'

In [26]:
decision_tree = DecisionTreeClassifier(labelCol = "class", featuresCol = "features")

# Set up parameter grid for hyperparameter tuning
param_grid = ParamGridBuilder() \
    .addGrid(decision_tree.maxDepth, [5, 10, 15, 20]) \
    .addGrid(decision_tree.maxBins, [32, 64]) \
    .build()

# Call the main function to train and evaluate the model
model = main(decision_tree, file_name, param_grid, is_tree=True)


 Setting up

Number of rows with NaN values: 1
No overlapping patients between training and test sets.
Training size: 2254 (80.73%), Test size: 538 rows (19.27%)

 Hyperparameter tunning

Fold 1: 510 rows (22.63%) | Fold 2: 414 rows (18.37%) | Fold 3: 417 rows (18.50%) | Fold 4: 474 rows (21.03%) | Fold 5: 439 rows (19.48%)


Hyperparameter Tuning: 100%|██████████| 8/8 [02:26<00:00, 18.31s/it]


Best Overall Parameters: maxDepth: 5, maxBins: 32
Best areaUnderROC: 0.9199

 Testing model on test dataset

Number of initial columns: 119, number of feature columns: 93
Final Model Evaluation on Test Data areaUnderROC: 0.9108
Final Accuracy: 0.9108, Precision: 0.8850, Recall: 0.9442

Sorted Feature Importances:
Feature: original_glrlm_RunLengthNonUniformity, Importance: 0.5965
Feature: original_gldm_GrayLevelNonUniformity, Importance: 0.2463
Feature: original_firstorder_90Percentile, Importance: 0.1001
Feature: original_gldm_LargeDependenceLowGrayLevelEmphasis, Importance: 0.0144
Feature: original_gldm_DependenceNonUniformity, Importance: 0.0097
Feature: original_firstorder_Minimum, Importance: 0.0085
Feature: original_gldm_DependenceEntropy, Importance: 0.0068
Feature: original_ngtdm_Coarseness, Importance: 0.0063
Feature: original_glszm_SmallAreaHighGrayLevelEmphasis, Importance: 0.0041
Feature: original_firstorder_InterquartileRange, Importance: 0.0033
Feature: original_glszm_Gray

In [27]:
random_forest = RandomForestClassifier(labelCol="class", featuresCol="features")

# Set up parameter grid for hyperparameter tuning
rf_param_grid = ParamGridBuilder() \
    .addGrid(random_forest.numTrees, [50, 150]) \
    .addGrid(random_forest.maxDepth, [5, 10, 15]) \
    .addGrid(random_forest.maxBins, [32, 64, 128]) \
    .addGrid(random_forest.featureSubsetStrategy, ['auto', 'sqrt']) \
    .build()

# Call the main function to train and evaluate the model
model = main(random_forest, file_name, rf_param_grid, is_tree=True)


 Setting up

Number of rows with NaN values: 1
No overlapping patients between training and test sets.
Training size: 2254 (80.73%), Test size: 538 rows (19.27%)

 Hyperparameter tunning

Fold 1: 510 rows (22.63%) | Fold 2: 414 rows (18.37%) | Fold 3: 417 rows (18.50%) | Fold 4: 474 rows (21.03%) | Fold 5: 439 rows (19.48%)


Hyperparameter Tuning: 100%|██████████| 36/36 [21:19<00:00, 35.55s/it]


Best Overall Parameters: numTrees: 150, maxDepth: 15, maxBins: 64, featureSubsetStrategy: auto
Best areaUnderROC: 0.9404

 Testing model on test dataset

Number of initial columns: 119, number of feature columns: 93
Final Model Evaluation on Test Data areaUnderROC: 0.9461
Final Accuracy: 0.9461, Precision: 0.9348, Recall: 0.9591

Sorted Feature Importances:
Feature: original_glrlm_RunLengthNonUniformity, Importance: 0.1019
Feature: original_gldm_GrayLevelNonUniformity, Importance: 0.0830
Feature: original_glrlm_GrayLevelNonUniformity, Importance: 0.0710
Feature: original_firstorder_Median, Importance: 0.0596
Feature: original_gldm_DependenceNonUniformity, Importance: 0.0527
Feature: original_ngtdm_Busyness, Importance: 0.0438
Feature: original_glszm_GrayLevelNonUniformity, Importance: 0.0429
Feature: original_firstorder_RootMeanSquared, Importance: 0.0376
Feature: original_firstorder_Mean, Importance: 0.0353
Feature: original_glszm_SizeZoneNonUniformity, Importance: 0.0326
Feature: ori

In [28]:
xgb_classifier = SparkXGBClassifier(label_col="class", features_col="features", use_gpu=False) 

# Set up the parameter grid for hyperparameter tuning
xgb_param_grid = ParamGridBuilder() \
    .addGrid(xgb_classifier.max_depth, [3, 6, 9, 12, 15]) \
    .addGrid(xgb_classifier.n_estimators, [50, 100, 200]) \
    .addGrid(xgb_classifier.learning_rate, [0.1]) \
    .addGrid(xgb_classifier.subsample, [0.8]) \
    .addGrid(xgb_classifier.colsample_bytree, [0.8]) \
    .build()

# Call the main function to train and evaluate the model
model = main(xgb_classifier, file_name, xgb_param_grid, is_tree=False)


 Setting up

Number of rows with NaN values: 1
No overlapping patients between training and test sets.
Training size: 2254 (80.73%), Test size: 538 rows (19.27%)

 Hyperparameter tunning

Fold 1: 510 rows (22.63%) | Fold 2: 414 rows (18.37%) | Fold 3: 417 rows (18.50%) | Fold 4: 474 rows (21.03%) | Fold 5: 439 rows (19.48%)


Hyperparameter Tuning:   0%|          | 0/15 [00:00<?, ?it/s]2024-09-23 20:38:40,627 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 3, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 50}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-23 20:38:45,305 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-23 20:38:50,598 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 3, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 50}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-23 20:38:55,327 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-23 20:39:00,564 INFO XGBoos

Best Overall Parameters: max_depth: 3, n_estimators: 200, learning_rate: 0.1, subsample: 0.8, colsample_bytree: 0.8
Best areaUnderROC: 0.9490

 Testing model on test dataset

Number of initial columns: 119, number of feature columns: 93


2024-09-23 20:55:25,413 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 3, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 200}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-23 20:55:31,385 INFO XGBoost-PySpark: _fit Finished xgboost training!


Final Model Evaluation on Test Data areaUnderROC: 0.9628
Final Accuracy: 0.9628, Precision: 0.9462, Recall: 0.9814

First 5 names of wrongly classified rows:
['auth_001-000084_001-000084_MG_BL_Series-1_Image-1-0.png', 'auth_001-000084_001-000084_MG_BL_Series-3_Image-1-0.png', 'hcs_003-000245_003-000245_MG_BL_Series-1008_Image-1-0.png', 'hcs_003-000259_003-000259_MG_BL_Series-2_Image-1-1.png', 'hcs_003-001181_003-001181_MG_BL_Series-1694_Image-1694-1.png']


In [29]:
# Linear SVC with Standard Scaler
svc = LinearSVC(labelCol="class", featuresCol="features")

# Set up the parameter grid for hyperparameter tuning
svc_param_grid = ParamGridBuilder() \
    .addGrid(svc.maxIter, [100, 500, 1000]) \
    .addGrid(svc.regParam, [0.01, 0.1]) \
    .addGrid(svc.tol, [1e-4,  1e-2]) \
    .build()

# Call the main function with the XGBoost classifier
model = main(svc, file_name, svc_param_grid, is_tree=False, use_standard_scaler=True)


 Setting up

Number of rows with NaN values: 1
No overlapping patients between training and test sets.
Training size: 2254 (80.73%), Test size: 538 rows (19.27%)

 Hyperparameter tunning

Fold 1: 510 rows (22.63%) | Fold 2: 414 rows (18.37%) | Fold 3: 417 rows (18.50%) | Fold 4: 474 rows (21.03%) | Fold 5: 439 rows (19.48%)


Hyperparameter Tuning: 100%|██████████| 12/12 [06:03<00:00, 30.31s/it]


Best Overall Parameters: maxIter: 500, regParam: 0.01, tol: 0.0001
Best areaUnderROC: 0.9411

 Testing model on test dataset

Number of initial columns: 119, number of feature columns: 93
Final Model Evaluation on Test Data areaUnderROC: 0.9572
Final Accuracy: 0.9572, Precision: 0.9241, Recall: 0.9963

First 5 names of wrongly classified rows:
['auth_001-000084_001-000084_MG_BL_Series-3_Image-1-0.png', 'hcs_003-000245_003-000245_MG_BL_Series-1008_Image-1-0.png', 'hcs_003-000247_003-000247_MG_BL_Series-1005_Image-1-1.png', 'hcs_003-001862_003-001862_MG_BL_Series-1001_Image-1001-0.png', 'hcs_003-001862_003-001862_MG_BL_Series-1001_Image-1003-0.png']


#### 256 - with full mask

In [30]:
file_name = 'features_256_full_mask.csv'

In [31]:
decision_tree = DecisionTreeClassifier(labelCol = "class", featuresCol = "features")

# Set up parameter grid for hyperparameter tuning
param_grid = ParamGridBuilder() \
    .addGrid(decision_tree.maxDepth, [5, 10, 15, 20]) \
    .addGrid(decision_tree.maxBins, [32, 64]) \
    .build()

# Call the main function to train and evaluate the model
model = main(decision_tree, file_name, param_grid, is_tree=True)


 Setting up

No overlapping patients between training and test sets.
Training size: 2286 (81.82%), Test size: 508 rows (18.18%)

 Hyperparameter tunning

Fold 1: 494 rows (21.61%) | Fold 2: 398 rows (17.41%) | Fold 3: 448 rows (19.60%) | Fold 4: 480 rows (21.00%) | Fold 5: 466 rows (20.38%)


Hyperparameter Tuning: 100%|██████████| 8/8 [02:35<00:00, 19.49s/it]


Best Overall Parameters: maxDepth: 5, maxBins: 64
Best areaUnderROC: 0.7221

 Testing model on test dataset

Number of initial columns: 119, number of feature columns: 93
Final Model Evaluation on Test Data areaUnderROC: 0.7244
Final Accuracy: 0.7244, Precision: 0.7280, Recall: 0.7165

Sorted Feature Importances:
Feature: original_firstorder_90Percentile, Importance: 0.7217
Feature: original_firstorder_Variance, Importance: 0.0899
Feature: original_firstorder_Kurtosis, Importance: 0.0313
Feature: original_firstorder_MeanAbsoluteDeviation, Importance: 0.0222
Feature: original_glcm_ClusterProminence, Importance: 0.0189
Feature: original_glszm_LargeAreaEmphasis, Importance: 0.0148
Feature: original_gldm_SmallDependenceHighGrayLevelEmphasis, Importance: 0.0148
Feature: original_glrlm_LongRunHighGrayLevelEmphasis, Importance: 0.0127
Feature: original_glrlm_ShortRunHighGrayLevelEmphasis, Importance: 0.0123
Feature: original_glszm_SmallAreaLowGrayLevelEmphasis, Importance: 0.0111
Feature: ori

In [32]:
random_forest = RandomForestClassifier(labelCol="class", featuresCol="features")

# Set up parameter grid for hyperparameter tuning
rf_param_grid = ParamGridBuilder() \
    .addGrid(random_forest.numTrees, [50, 150]) \
    .addGrid(random_forest.maxDepth, [5, 10, 15]) \
    .addGrid(random_forest.maxBins, [32, 64, 128]) \
    .addGrid(random_forest.featureSubsetStrategy, ['auto', 'sqrt']) \
    .build()

# Call the main function to train and evaluate the model
model = main(random_forest, file_name, rf_param_grid, is_tree=True)


 Setting up

No overlapping patients between training and test sets.
Training size: 2286 (81.82%), Test size: 508 rows (18.18%)

 Hyperparameter tunning

Fold 1: 494 rows (21.61%) | Fold 2: 398 rows (17.41%) | Fold 3: 448 rows (19.60%) | Fold 4: 480 rows (21.00%) | Fold 5: 466 rows (20.38%)


Hyperparameter Tuning: 100%|██████████| 36/36 [36:55<00:00, 61.54s/it] 


Best Overall Parameters: numTrees: 150, maxDepth: 5, maxBins: 64, featureSubsetStrategy: auto
Best areaUnderROC: 0.7388

 Testing model on test dataset

Number of initial columns: 119, number of feature columns: 93
Final Model Evaluation on Test Data areaUnderROC: 0.7480
Final Accuracy: 0.7480, Precision: 0.7582, Recall: 0.7283

Sorted Feature Importances:
Feature: original_firstorder_90Percentile, Importance: 0.1140
Feature: original_firstorder_MeanAbsoluteDeviation, Importance: 0.0955
Feature: original_firstorder_Variance, Importance: 0.0943
Feature: original_firstorder_RobustMeanAbsoluteDeviation, Importance: 0.0732
Feature: original_firstorder_InterquartileRange, Importance: 0.0702
Feature: original_firstorder_Energy, Importance: 0.0651
Feature: original_firstorder_TotalEnergy, Importance: 0.0648
Feature: original_firstorder_Mean, Importance: 0.0506
Feature: original_firstorder_RootMeanSquared, Importance: 0.0450
Feature: original_firstorder_Median, Importance: 0.0304
Feature: orig

In [33]:
xgb_classifier = SparkXGBClassifier(label_col="class", features_col="features", use_gpu=False) 

# Set up the parameter grid for hyperparameter tuning
xgb_param_grid = ParamGridBuilder() \
    .addGrid(xgb_classifier.max_depth, [3, 6, 9, 12, 15]) \
    .addGrid(xgb_classifier.n_estimators, [50, 100, 200]) \
    .addGrid(xgb_classifier.learning_rate, [0.1]) \
    .addGrid(xgb_classifier.subsample, [0.8]) \
    .addGrid(xgb_classifier.colsample_bytree, [0.8]) \
    .build()

# Call the main function to train and evaluate the model
model = main(xgb_classifier, file_name, xgb_param_grid, is_tree=False)


 Setting up

No overlapping patients between training and test sets.
Training size: 2286 (81.82%), Test size: 508 rows (18.18%)

 Hyperparameter tunning

Fold 1: 494 rows (21.61%) | Fold 2: 398 rows (17.41%) | Fold 3: 448 rows (19.60%) | Fold 4: 480 rows (21.00%) | Fold 5: 466 rows (20.38%)


Hyperparameter Tuning:   0%|          | 0/15 [00:00<?, ?it/s]2024-09-23 21:43:12,626 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 3, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 50}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-23 21:43:17,659 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-23 21:43:23,334 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 3, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 50}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-23 21:43:28,491 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-23 21:43:35,835 INFO XGBoos

Best Overall Parameters: max_depth: 6, n_estimators: 50, learning_rate: 0.1, subsample: 0.8, colsample_bytree: 0.8
Best areaUnderROC: 0.7494

 Testing model on test dataset

Number of initial columns: 119, number of feature columns: 93


2024-09-23 22:01:00,139 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 6, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 50}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-23 22:01:06,301 INFO XGBoost-PySpark: _fit Finished xgboost training!


Final Model Evaluation on Test Data areaUnderROC: 0.7480
Final Accuracy: 0.7480, Precision: 0.7669, Recall: 0.7126

First 5 names of wrongly classified rows:
['auth_001-000074_001-000074_MG_TP1_Series-4_Image-1-1.png', 'auth_001-000084_001-000084_MG_BL_Series-1_Image-1-0.png', 'auth_001-000084_001-000084_MG_BL_Series-3_Image-1-0.png', 'hcs_003-000024_003-000024_MG_BL_Series-1003_Image-1003-0.png', 'hcs_003-000029_003-000029_MG_BL_Series-1001_Image-1001-0.png']


In [34]:
# Linear SVC with Standard Scaler
svc = LinearSVC(labelCol="class", featuresCol="features")

# Set up the parameter grid for hyperparameter tuning
svc_param_grid = ParamGridBuilder() \
    .addGrid(svc.maxIter, [100, 500, 1000]) \
    .addGrid(svc.regParam, [0.01, 0.1]) \
    .addGrid(svc.tol, [1e-4,  1e-2]) \
    .build()

# Call the main function with the XGBoost classifier
model = main(svc, file_name, svc_param_grid, is_tree=False, use_standard_scaler=True)


 Setting up

No overlapping patients between training and test sets.
Training size: 2286 (81.82%), Test size: 508 rows (18.18%)

 Hyperparameter tunning

Fold 1: 494 rows (21.61%) | Fold 2: 398 rows (17.41%) | Fold 3: 448 rows (19.60%) | Fold 4: 480 rows (21.00%) | Fold 5: 466 rows (20.38%)


Hyperparameter Tuning: 100%|██████████| 12/12 [06:30<00:00, 32.54s/it]


Best Overall Parameters: maxIter: 100, regParam: 0.01, tol: 0.01
Best areaUnderROC: 0.7380

 Testing model on test dataset

Number of initial columns: 119, number of feature columns: 93
Final Model Evaluation on Test Data areaUnderROC: 0.7343
Final Accuracy: 0.7343, Precision: 0.7196, Recall: 0.7677

First 5 names of wrongly classified rows:
['auth_001-000061_001-000061_MG_BL_Series-8_Image-1-1.png', 'hcs_003-000024_003-000024_MG_BL_Series-1003_Image-1003-0.png', 'hcs_003-000029_003-000029_MG_BL_Series-1001_Image-1001-0.png', 'hcs_003-000029_003-000029_MG_BL_Series-1001_Image-1001-1.png', 'hcs_003-000029_003-000029_MG_BL_Series-1002_Image-1002-0.png']


#### 128 - with lesion mask

In [35]:
file_name = 'features_128_lesion_mask.csv'

In [36]:
decision_tree = DecisionTreeClassifier(labelCol = "class", featuresCol = "features")

# Set up parameter grid for hyperparameter tuning
param_grid = ParamGridBuilder() \
    .addGrid(decision_tree.maxDepth, [5, 10, 15, 20]) \
    .addGrid(decision_tree.maxBins, [32, 64]) \
    .build()

# Call the main function to train and evaluate the model
model = main(decision_tree, file_name, param_grid, is_tree=True)


 Setting up

No overlapping patients between training and test sets.
Training size: 2261 (80.84%), Test size: 536 rows (19.16%)

 Hyperparameter tunning

Fold 1: 430 rows (19.02%) | Fold 2: 448 rows (19.81%) | Fold 3: 428 rows (18.93%) | Fold 4: 496 rows (21.94%) | Fold 5: 459 rows (20.30%)


Hyperparameter Tuning: 100%|██████████| 8/8 [02:26<00:00, 18.29s/it]


Best Overall Parameters: maxDepth: 5, maxBins: 64
Best areaUnderROC: 0.8538

 Testing model on test dataset

Number of initial columns: 119, number of feature columns: 93
Final Model Evaluation on Test Data areaUnderROC: 0.8937
Final Accuracy: 0.8937, Precision: 0.8754, Recall: 0.9179

Sorted Feature Importances:
Feature: original_firstorder_RootMeanSquared, Importance: 0.6193
Feature: original_gldm_GrayLevelNonUniformity, Importance: 0.1103
Feature: original_glrlm_RunLengthNonUniformity, Importance: 0.1018
Feature: original_firstorder_90Percentile, Importance: 0.0709
Feature: original_glszm_SmallAreaLowGrayLevelEmphasis, Importance: 0.0230
Feature: original_gldm_DependenceNonUniformity, Importance: 0.0157
Feature: original_firstorder_Skewness, Importance: 0.0137
Feature: original_glcm_Imc1, Importance: 0.0125
Feature: original_firstorder_10Percentile, Importance: 0.0094
Feature: original_glszm_SizeZoneNonUniformityNormalized, Importance: 0.0063
Feature: original_glcm_ClusterProminence

In [37]:
random_forest = RandomForestClassifier(labelCol="class", featuresCol="features")

# Set up parameter grid for hyperparameter tuning
rf_param_grid = ParamGridBuilder() \
    .addGrid(random_forest.numTrees, [50, 150]) \
    .addGrid(random_forest.maxDepth, [5, 10, 15]) \
    .addGrid(random_forest.maxBins, [32, 64, 128]) \
    .addGrid(random_forest.featureSubsetStrategy, ['auto', 'sqrt']) \
    .build()

# Call the main function to train and evaluate the model
model = main(random_forest, file_name, rf_param_grid, is_tree=True)


 Setting up

No overlapping patients between training and test sets.
Training size: 2261 (80.84%), Test size: 536 rows (19.16%)

 Hyperparameter tunning

Fold 1: 430 rows (19.02%) | Fold 2: 448 rows (19.81%) | Fold 3: 428 rows (18.93%) | Fold 4: 496 rows (21.94%) | Fold 5: 459 rows (20.30%)


Hyperparameter Tuning: 100%|██████████| 36/36 [25:43<00:00, 42.88s/it]


Best Overall Parameters: numTrees: 50, maxDepth: 10, maxBins: 32, featureSubsetStrategy: auto
Best areaUnderROC: 0.8697

 Testing model on test dataset

Number of initial columns: 119, number of feature columns: 93
Final Model Evaluation on Test Data areaUnderROC: 0.8955
Final Accuracy: 0.8955, Precision: 0.9015, Recall: 0.8881

Sorted Feature Importances:
Feature: original_firstorder_Median, Importance: 0.0918
Feature: original_firstorder_Skewness, Importance: 0.0646
Feature: original_glrlm_RunLengthNonUniformity, Importance: 0.0566
Feature: original_glrlm_GrayLevelNonUniformity, Importance: 0.0536
Feature: original_firstorder_Mean, Importance: 0.0501
Feature: original_firstorder_RootMeanSquared, Importance: 0.0494
Feature: original_firstorder_90Percentile, Importance: 0.0479
Feature: original_glszm_SizeZoneNonUniformity, Importance: 0.0433
Feature: original_gldm_SmallDependenceLowGrayLevelEmphasis, Importance: 0.0341
Feature: original_gldm_GrayLevelNonUniformity, Importance: 0.0340
F

In [38]:
xgb_classifier = SparkXGBClassifier(label_col="class", features_col="features", use_gpu=False) 

# Set up the parameter grid for hyperparameter tuning
xgb_param_grid = ParamGridBuilder() \
    .addGrid(xgb_classifier.max_depth, [3, 6, 9, 12, 15]) \
    .addGrid(xgb_classifier.n_estimators, [50, 100, 200]) \
    .addGrid(xgb_classifier.learning_rate, [0.1]) \
    .addGrid(xgb_classifier.subsample, [0.8]) \
    .addGrid(xgb_classifier.colsample_bytree, [0.8]) \
    .build()


# Call the main function to train and evaluate the model
model = main(xgb_classifier, file_name, xgb_param_grid, is_tree=False)


 Setting up

No overlapping patients between training and test sets.
Training size: 2261 (80.84%), Test size: 536 rows (19.16%)

 Hyperparameter tunning

Fold 1: 430 rows (19.02%) | Fold 2: 448 rows (19.81%) | Fold 3: 428 rows (18.93%) | Fold 4: 496 rows (21.94%) | Fold 5: 459 rows (20.30%)


Hyperparameter Tuning:   0%|          | 0/15 [00:00<?, ?it/s]2024-09-23 22:37:48,620 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 3, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 50}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-23 22:37:53,579 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-23 22:37:58,946 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 3, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 50}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-23 22:38:04,021 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-23 22:38:10,937 INFO XGBoos

Best Overall Parameters: max_depth: 3, n_estimators: 100, learning_rate: 0.1, subsample: 0.8, colsample_bytree: 0.8
Best areaUnderROC: 0.8818

 Testing model on test dataset

Number of initial columns: 119, number of feature columns: 93


2024-09-23 22:55:01,484 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 3, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-23 22:55:07,207 INFO XGBoost-PySpark: _fit Finished xgboost training!


Final Model Evaluation on Test Data areaUnderROC: 0.9086
Final Accuracy: 0.9086, Precision: 0.9041, Recall: 0.9142

First 5 names of wrongly classified rows:
['hcs_003-000245_003-000245_MG_BL_Series-1008_Image-1-0.png', 'hcs_003-000247_003-000247_MG_BL_Series-1005_Image-1-1.png', 'hcs_003-000252_003-000252_MG_TP3_Series-2_Image-1-1.png', 'hcs_003-000257_003-000257_MG_BL_Series-1010_Image-3-1.png', 'hcs_003-000286_003-000286_MG_BL_Series-1010_Image-1-0.png']


In [39]:
# Linear SVC with Standard Scaler
svc = LinearSVC(labelCol="class", featuresCol="features")

# Set up the parameter grid for hyperparameter tuning
svc_param_grid = ParamGridBuilder() \
    .addGrid(svc.maxIter, [100, 500, 1000]) \
    .addGrid(svc.regParam, [0.01, 0.1]) \
    .addGrid(svc.tol, [1e-4,  1e-2]) \
    .build()

# Call the main function with the XGBoost classifier
model = main(svc, file_name, svc_param_grid, is_tree=False, use_standard_scaler=True)


 Setting up

No overlapping patients between training and test sets.
Training size: 2261 (80.84%), Test size: 536 rows (19.16%)

 Hyperparameter tunning

Fold 1: 430 rows (19.02%) | Fold 2: 448 rows (19.81%) | Fold 3: 428 rows (18.93%) | Fold 4: 496 rows (21.94%) | Fold 5: 459 rows (20.30%)


Hyperparameter Tuning: 100%|██████████| 12/12 [06:03<00:00, 30.33s/it]


Best Overall Parameters: maxIter: 100, regParam: 0.01, tol: 0.0001
Best areaUnderROC: 0.8774

 Testing model on test dataset

Number of initial columns: 119, number of feature columns: 93
Final Model Evaluation on Test Data areaUnderROC: 0.9030
Final Accuracy: 0.9030, Precision: 0.8750, Recall: 0.9403

First 5 names of wrongly classified rows:
['auth_001-000071_001-000071_MG_TP3_Series-4_Image-1-0.png', 'hcs_003-000242_003-000242_MG_BL_Series-3_Image-1-0.png', 'hcs_003-000242_003-000242_MG_BL_Series-4_Image-1-0.png', 'hcs_003-000245_003-000245_MG_BL_Series-1008_Image-1-0.png', 'hcs_003-000257_003-000257_MG_BL_Series-1010_Image-4-0.png']


#### 128 - with full mask

In [40]:
file_name = 'features_128_full_mask.csv'

In [41]:
decision_tree = DecisionTreeClassifier(labelCol = "class", featuresCol = "features")

# Set up parameter grid for hyperparameter tuning
param_grid = ParamGridBuilder() \
    .addGrid(decision_tree.maxDepth, [5, 10, 15, 20]) \
    .addGrid(decision_tree.maxBins, [32, 64]) \
    .build()

# Call the main function to train and evaluate the model
model = main(decision_tree, file_name, param_grid, is_tree=True)


 Setting up

No overlapping patients between training and test sets.
Training size: 2260 (80.77%), Test size: 538 rows (19.23%)

 Hyperparameter tunning

Fold 1: 522 rows (23.10%) | Fold 2: 426 rows (18.85%) | Fold 3: 420 rows (18.58%) | Fold 4: 474 rows (20.97%) | Fold 5: 418 rows (18.50%)


Hyperparameter Tuning: 100%|██████████| 8/8 [02:52<00:00, 21.53s/it]


Best Overall Parameters: maxDepth: 5, maxBins: 64
Best areaUnderROC: 0.7671

 Testing model on test dataset

Number of initial columns: 119, number of feature columns: 93
Final Model Evaluation on Test Data areaUnderROC: 0.7807
Final Accuracy: 0.7807, Precision: 0.8159, Recall: 0.7249

Sorted Feature Importances:
Feature: original_firstorder_90Percentile, Importance: 0.7645
Feature: original_firstorder_Variance, Importance: 0.0742
Feature: original_gldm_LargeDependenceLowGrayLevelEmphasis, Importance: 0.0277
Feature: original_firstorder_10Percentile, Importance: 0.0274
Feature: original_glcm_Correlation, Importance: 0.0164
Feature: original_firstorder_Skewness, Importance: 0.0163
Feature: original_ngtdm_Contrast, Importance: 0.0159
Feature: original_glrlm_GrayLevelNonUniformityNormalized, Importance: 0.0108
Feature: original_glszm_HighGrayLevelZoneEmphasis, Importance: 0.0087
Feature: original_firstorder_Kurtosis, Importance: 0.0084
Feature: original_glszm_SizeZoneNonUniformity, Import

In [42]:
random_forest = RandomForestClassifier(labelCol="class", featuresCol="features")

# Set up parameter grid for hyperparameter tuning
rf_param_grid = ParamGridBuilder() \
    .addGrid(random_forest.numTrees, [50, 150]) \
    .addGrid(random_forest.maxDepth, [5, 10, 15]) \
    .addGrid(random_forest.maxBins, [32, 64, 128]) \
    .addGrid(random_forest.featureSubsetStrategy, ['auto', 'sqrt']) \
    .build()

# Call the main function to train and evaluate the model
model = main(random_forest, file_name, rf_param_grid, is_tree=True)


 Setting up

No overlapping patients between training and test sets.
Training size: 2260 (80.77%), Test size: 538 rows (19.23%)

 Hyperparameter tunning

Fold 1: 522 rows (23.10%) | Fold 2: 426 rows (18.85%) | Fold 3: 420 rows (18.58%) | Fold 4: 474 rows (20.97%) | Fold 5: 418 rows (18.50%)


Hyperparameter Tuning: 100%|██████████| 36/36 [34:05<00:00, 56.83s/it] 


Best Overall Parameters: numTrees: 150, maxDepth: 5, maxBins: 32, featureSubsetStrategy: auto
Best areaUnderROC: 0.7786

 Testing model on test dataset

Number of initial columns: 119, number of feature columns: 93
Final Model Evaluation on Test Data areaUnderROC: 0.7844
Final Accuracy: 0.7844, Precision: 0.7802, Recall: 0.7918

Sorted Feature Importances:
Feature: original_firstorder_90Percentile, Importance: 0.1049
Feature: original_firstorder_TotalEnergy, Importance: 0.0946
Feature: original_firstorder_Energy, Importance: 0.0918
Feature: original_firstorder_Variance, Importance: 0.0738
Feature: original_firstorder_Median, Importance: 0.0680
Feature: original_firstorder_Mean, Importance: 0.0636
Feature: original_firstorder_MeanAbsoluteDeviation, Importance: 0.0634
Feature: original_firstorder_RootMeanSquared, Importance: 0.0628
Feature: original_firstorder_InterquartileRange, Importance: 0.0498
Feature: original_firstorder_RobustMeanAbsoluteDeviation, Importance: 0.0396
Feature: orig

In [43]:
xgb_classifier = SparkXGBClassifier(label_col="class", features_col="features", use_gpu=False) 

# Set up the parameter grid for hyperparameter tuning
xgb_param_grid = ParamGridBuilder() \
    .addGrid(xgb_classifier.max_depth, [3, 6, 9, 12]) \
    .addGrid(xgb_classifier.n_estimators, [50, 100]) \
    .addGrid(xgb_classifier.learning_rate, [0.1]) \
    .addGrid(xgb_classifier.subsample, [0.8]) \
    .addGrid(xgb_classifier.colsample_bytree, [0.8]) \
    .build()

# Call the main function to train and evaluate the model
model = main(xgb_classifier, file_name, xgb_param_grid, is_tree=False)


 Setting up

No overlapping patients between training and test sets.
Training size: 2260 (80.77%), Test size: 538 rows (19.23%)

 Hyperparameter tunning

Fold 1: 522 rows (23.10%) | Fold 2: 426 rows (18.85%) | Fold 3: 420 rows (18.58%) | Fold 4: 474 rows (20.97%) | Fold 5: 418 rows (18.50%)


Hyperparameter Tuning:   0%|          | 0/8 [00:00<?, ?it/s]2024-09-23 23:40:20,357 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 3, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 50}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-23 23:40:25,138 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-23 23:40:30,366 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 3, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 50}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-23 23:40:35,092 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-09-23 23:40:40,376 INFO XGBoost

Best Overall Parameters: max_depth: 3, n_estimators: 100, learning_rate: 0.1, subsample: 0.8, colsample_bytree: 0.8
Best areaUnderROC: 0.7831

 Testing model on test dataset

Number of initial columns: 119, number of feature columns: 93


2024-09-23 23:49:28,211 INFO XGBoost-PySpark: _fit Running xgboost-2.1.1 on 1 workers with
	booster params: {'colsample_bytree': 0.8, 'device': 'cpu', 'learning_rate': 0.1, 'max_depth': 3, 'objective': 'binary:logistic', 'subsample': 0.8, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-09-23 23:49:34,141 INFO XGBoost-PySpark: _fit Finished xgboost training!


Final Model Evaluation on Test Data areaUnderROC: 0.7862
Final Accuracy: 0.7862, Precision: 0.7852, Recall: 0.7881

First 5 names of wrongly classified rows:
['auth_001-000084_001-000084_MG_BL_Series-3_Image-1-0.png', 'hcs_003-000029_003-000029_MG_BL_Series-1001_Image-1001-0.png', 'hcs_003-000029_003-000029_MG_BL_Series-1002_Image-1002-0.png', 'hcs_003-000029_003-000029_MG_BL_Series-1002_Image-1002-1.png', 'hcs_003-000029_003-000029_MG_BL_Series-1003_Image-1003-0.png']


In [44]:
# Linear SVC with Standard Scaler
svc = LinearSVC(labelCol="class", featuresCol="features")

# Set up the parameter grid for hyperparameter tuning
svc_param_grid = ParamGridBuilder() \
    .addGrid(svc.maxIter, [100, 500, 1000]) \
    .addGrid(svc.regParam, [0.01, 0.1]) \
    .addGrid(svc.tol, [1e-4,  1e-2]) \
    .build()

# Call the main function with the XGBoost classifier
model = main(svc, file_name, svc_param_grid, is_tree=False, use_standard_scaler=True)


 Setting up

No overlapping patients between training and test sets.
Training size: 2260 (80.77%), Test size: 538 rows (19.23%)

 Hyperparameter tunning

Fold 1: 522 rows (23.10%) | Fold 2: 426 rows (18.85%) | Fold 3: 420 rows (18.58%) | Fold 4: 474 rows (20.97%) | Fold 5: 418 rows (18.50%)


Hyperparameter Tuning: 100%|██████████| 12/12 [06:15<00:00, 31.26s/it]


Best Overall Parameters: maxIter: 100, regParam: 0.01, tol: 0.0001
Best areaUnderROC: 0.7775

 Testing model on test dataset

Number of initial columns: 119, number of feature columns: 93
Final Model Evaluation on Test Data areaUnderROC: 0.8030
Final Accuracy: 0.8030, Precision: 0.7860, Recall: 0.8327

First 5 names of wrongly classified rows:
['auth_001-000084_001-000084_MG_BL_Series-3_Image-1-0.png', 'hcs_003-000029_003-000029_MG_BL_Series-1001_Image-1001-0.png', 'hcs_003-000029_003-000029_MG_BL_Series-1002_Image-1002-0.png', 'hcs_003-000029_003-000029_MG_BL_Series-1003_Image-1003-0.png', 'hcs_003-000107_003-000107_MG_BL_Series-1001_Image-1001-1.png']


In [45]:
spark.stop()