## Classification

In [1]:
import os
import random
import findspark

from tqdm import tqdm
from dotenv import load_dotenv

from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.types import DoubleType, FloatType, IntegerType

from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, PCA, StandardScaler
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier, LinearSVC

from xgboost.spark import SparkXGBClassifier

# Load environment variables from .env file
_ = load_dotenv()

# Retrieve environment variables
TARGET_DIR_NAME = os.getenv("TARGET_DIR_NAME")

In [2]:
findspark.init()

# Initialize Spark session to run locally
spark = SparkSession.builder \
    .appName("Big Data Classification") \
    .config("spark.master", "local[*]") \
    .config("spark.ui.port", "4050") \
    .getOrCreate()

# Print the number of cores being used by Spark
print(f"Spark Session running on {spark.sparkContext.defaultParallelism} cores. UI is available at: {spark.sparkContext.uiWebUrl}")

Spark Session running on 12 cores. UI is available at: http://DESKTOP-684SCQF:4050


### Defining functions for infrastructure

In [3]:
def apply_column_types(df):
    """Assign StringType, IntegerType, and set default as DoubleType for other columns."""

    # Define specific StringType and IntegerType columns
    string_columns = ["name", "provider"]
    integer_columns = ["patient", "class"]

    # Apply column types: cast to StringType, IntegerType, and default to DoubleType for others in one line
    df = df.select([col(c).cast(StringType()) if c in string_columns
                    else col(c).cast(IntegerType()) if c in integer_columns
                    else col(c).cast(DoubleType()) for c in df.columns])
    
    return df

In [4]:
def preprocess_data1(data: DataFrame) -> DataFrame:
    """Preprocess the data by removing rows with NaN values."""    

    # Count NaN values in each column
    nan_count_df = data.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in data.columns])

    # Show the total number of NaN values for each column
    nan_count_df.show(truncate=False)

    # Remove rows with NaN values and count the number of rows before and after
    total_rows_before = data.count()
    clean_data = data.dropna()  
    clean_data = apply_column_types(clean_data)
    total_rows_after = clean_data.count()

    if total_rows_before != total_rows_after:
        print(f"Number of rows with NaN values: {total_rows_before - total_rows_after}")

    return clean_data

In [5]:
from pyspark.sql import DataFrame
from pyspark.sql.functions import isnan, col, when, count

def preprocess_data(data: DataFrame) -> DataFrame:
   

    # Count NaN and null values in each column for reference
    nan_count_df = data.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in data.columns])
    
    # Display the count of NaN/null values in each column
    nan_count_df.show(truncate=False)

    # Count total rows before cleaning
    total_rows_before = data.count()

    # Remove the row where 'name' is 'hcs_003-001341_003-001341_MG_BL_Series-1005_Image-1005-0.png'
    specific_value = 'hcs_003-001341_003-001341_MG_BL_Series-1005_Image-1005-0.png'
    data = data.filter(col('name') != specific_value)


    clean_data = data.dropna(how='any')
    print(f"Rows after cleaning: {clean_data.count()}")

    # Assuming 'data' is your DataFrame
    print('ISNULLL')
    clean_data.select('name', 'original_glcm_SumAverage').filter(
        (col("original_glcm_SumAverage").isNull())
    ).show()

    print('ISNAAAN')
    clean_data.select('name', 'original_glcm_SumAverage').filter(
        (isnan(col("original_glcm_SumAverage")))
    ).show()

    
    # Apply column type conversion if necessary
    clean_data = apply_column_types(clean_data)  # Assuming this function is defined elsewhere
    
    # Count total rows after cleaning
    total_rows_after = clean_data.count()

    # If any rows were removed, print the difference
    if total_rows_before != total_rows_after:
        print(f"Number of rows with NaN values removed: {total_rows_before - total_rows_after}")

    nan_count_df = clean_data.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in clean_data.columns])
    
    # Display the count of NaN/null values in each column
    nan_count_df.show(truncate=False)    

    return clean_data


In [6]:
def check_patient_overlap(train_df: DataFrame, test_df: DataFrame, patient_column: str = 'patient', verbose_just_error: bool = False) -> None:
    """Checks for overlapping patients between train and test DataFrames."""

    # Get unique patients from train and test DataFrames
    train_patients = train_df.select(patient_column).distinct().rdd.flatMap(lambda x: x).collect()
    test_patients = test_df.select(patient_column).distinct().rdd.flatMap(lambda x: x).collect()

    # Convert lists to sets and check for any overlap
    overlapping_patients = set(train_patients).intersection(set(test_patients))

    # Print overlapping patients, if any
    if overlapping_patients:
        print(f"Overlapping patients found: {overlapping_patients}")
    elif not verbose_just_error:
        print("No overlapping patients between training and test sets.")

In [7]:
def calculate_class_distribution(df1: DataFrame, df2: DataFrame, class_column: str = 'class') -> None:
    """Calculates and prints class distribution percentages for two DataFrames in a single line."""
    
    # Calculate class counts for both DataFrames
    df1_counts = df1.groupBy(class_column).count().orderBy(class_column).collect()
    df2_counts = df2.groupBy(class_column).count().orderBy(class_column).collect()
    
    # Calculate total counts
    total_count_df1 = df1.count()
    total_count_df2 = df2.count()
    
    # Prepare class distributions as strings
    df1_distribution = ', '.join([f"{row[class_column]}: {row['count']} ({(row['count'] / total_count_df1) * 100:.2f}%)" for row in df1_counts])
    df2_distribution = ', '.join([f"{row[class_column]}: {row['count']} ({(row['count'] / total_count_df2) * 100:.2f}%)" for row in df2_counts])
    
    # Print class distributions in one line
    print(f"Class distribution in train_df: [{df1_distribution}] | test_df: [{df2_distribution}]")

In [8]:
def train_test_split_by_patient(data: DataFrame, test_ratio: float = 0.2, patient_column: str = 'patient', seed: int = 42) -> tuple[DataFrame, DataFrame]:
    """ Splits the DataFrame into train and test sets based on unique patient IDs, ensuring no patient data overlap."""

    random.seed(seed) # Set seed for reproducibility
    
    # Extract unique patient IDs and randomly shuffle them
    unique_patients = data.select(patient_column).distinct().rdd.flatMap(lambda x: x).collect()
    random.shuffle(unique_patients)
    
    # Split the patients into train and test sets
    split_index = int(len(unique_patients) * (1 - test_ratio))
    train_patients = unique_patients[:split_index]
    test_patients = unique_patients[split_index:]
    
    # Create train and test DataFrames 
    train_df = data.filter(col(patient_column).isin(train_patients))
    test_df = data.filter(col(patient_column).isin(test_patients))
    check_patient_overlap(train_df, test_df)
    
    # Calculate and print the percentages of train and test sets
    total_rows, train_rows, test_rows = data.count(), train_df.count(), test_df.count()
    print(f"Training size: {train_rows} ({(train_rows / total_rows) * 100:.2f}%), Test size: {test_rows} rows ({(test_rows / total_rows) * 100:.2f}%)") 
    
    calculate_class_distribution(train_df, test_df)
    
    return train_df, test_df

In [9]:
def create_folds_by_patient(data: DataFrame, num_folds: int = 5, patient_column: str = 'patient', seed: int = 42) -> list:
    """Splits the data into train-test folds based on unique patient IDs, ensuring no patient overlap across folds."""

    random.seed(seed) # Set seed for reproducibility

    # Extract unique patient IDs and randomly shuffle them
    unique_patients = data.select(patient_column).distinct().rdd.flatMap(lambda x: x).collect()
    random.shuffle(unique_patients)

    # Split patients evenly into folds
    fold_size = len(unique_patients) // num_folds
    folds = [unique_patients[i * fold_size:(i + 1) * fold_size] for i in range(num_folds)]

    # Handle any remaining patients by adding them to the last fold
    for i in range(len(unique_patients) % num_folds):
        folds[i].append(unique_patients[-(i + 1)])

    patient_folds, fold_summaries = [], []

    # Create train-test splits for each fold
    total_rows = data.count()
    for i in range(num_folds):

        # Get the test patients for the current fold and assign the rest to the training set
        test_patients = set(folds[i])
        train_patients = set(unique_patients) - test_patients

        # Filter the data based on the train and test patients
        train_df = data.filter(col(patient_column).isin(train_patients))
        test_df = data.filter(col(patient_column).isin(test_patients))
        check_patient_overlap(train_df, test_df, verbose_just_error=True)

        # Append the train-test split to the list of folds
        patient_folds.append((train_df, test_df))

        # Accumulate test set summary for this fold
        fold_summaries.append(f"Fold {i + 1}: {test_df.count()} rows ({(test_df.count() / total_rows) * 100:.2f}%)")

    print(" | ".join(fold_summaries))

    # Return the list of train-test tuples for each fold
    return patient_folds

In [10]:
def apply_pca(data: DataFrame, output_column: str = 'pca_features', k: int = 10) -> DataFrame:
    """Applies PCA to reduce the dimensionality of the feature set to k principal components."""
    
    # Initialize PCA with the specified number of components
    pca = PCA(k=k, inputCol='features', outputCol=output_column)
    
    # Fit PCA on the data and transform the features
    pca_model = pca.fit(data)
    transformed_data = pca_model.transform(data)
    
    print(f"PCA explained variance ratio: {pca_model.explainedVariance.toArray()}")
    
    return transformed_data

In [11]:
def vectorize_features(data: DataFrame, class_column: str = 'class', patient_column: str = 'patient', output_column: str = 'features', verbose: bool = True, standard_scaler: bool = False) -> DataFrame:
    """Assembles feature columns into a single feature vector column, excluding patient and class columns."""
    

    print('VECTORIZEEEEEEEEEEEEEEEEEEEEEE')
    # Count NaN values in each column
    nan_count_df = data.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in data.columns])

    # Show the total number of NaN values for each column
    nan_count_df.show(truncate=False)

    # Get the numeric columns from the DataFrame
    numeric_columns = [field.name for field in data.schema.fields if isinstance(field.dataType, (DoubleType, FloatType, IntegerType))]

    # Exclude class and patient columns
    feature_columns = [
        col for col in numeric_columns 
        if not col.startswith("diagnostics_") and col not in [class_column, patient_column]
    ]

    if verbose:
        print(f"Number of initial columns: {len(data.columns)}, number of feature columns: {len(feature_columns)}")
    
    # Assemble features into a feature vector
    assembler = VectorAssembler(inputCols = feature_columns, outputCol = 'assembled_features')
    assembled_data = assembler.transform(data)

    # Standardize features
    if standard_scaler:
        scaler = StandardScaler(inputCol="assembled_features", outputCol = output_column, withStd = True, withMean = True)
        scaler_model = scaler.fit(assembled_data)
        scaled_data = scaler_model.transform(assembled_data)
    else:
        # Rename directly if no standardization is applied
        assembled_data = assembled_data.withColumnRenamed('assembled_features', 'features')
        scaled_data = assembled_data

    return scaled_data

In [12]:
def calculate_metrics(predictions: DataFrame) -> tuple:
    """Calculates accuracy, precision, and recall from the predictions DataFrame."""

    # Calculate true positives, true negatives, false positives, and false negatives
    tp = predictions.filter((col('prediction') == 1) & (col('class') == 1)).count()
    tn = predictions.filter((col('prediction') == 0) & (col('class') == 0)).count()
    fp = predictions.filter((col('prediction') == 1) & (col('class') == 0)).count()
    fn = predictions.filter((col('prediction') == 0) & (col('class') == 1)).count()

    # Calculate accuracy, precision, and recall
    accuracy = (tp + tn) / (tp + tn + fp + fn) if (tp + tn + fp + fn) > 0 else 0.0
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0

    return accuracy, precision, recall

In [13]:
def determine_best_parameters_on_folds(model, train_df: DataFrame, paramGrid: list[dict], num_folds: int = 5, patient_column: str = 'patient', standard_scaler: bool = False):
    """Evaluates the Decision Tree model using patient-wise cross-validation with specified hyperparameter tuning."""

    folds = create_folds_by_patient(train_df, num_folds, patient_column)

    # Use BinaryClassificationEvaluator to measure performance
    evaluator = BinaryClassificationEvaluator(labelCol = "class", rawPredictionCol = "prediction", metricName = "areaUnderROC")

    best_params, best_metric = None, float("-inf")

    # Iterate over each parameter combination in the grid
    for params in tqdm(paramGrid, desc="Hyperparameter Tuning", leave = True):
        fold_metrics = []

        # Print parameter combination being evaluated
        param_items = [f"{param.name}: {value}" for param, value in params.items()]

        # For each fold, evaluate the parameter combination
        for _, (train_df, test_df) in enumerate(folds):

            # Vectorize features
            train_df = vectorize_features(train_df, verbose = True, standard_scaler = standard_scaler)
            test_df = vectorize_features(test_df, verbose = True, standard_scaler = standard_scaler)
            check_patient_overlap(train_df, test_df, verbose_just_error = True)

            # Train the model with the current parameters
            current_model = model.copy(params).fit(train_df)

            # Evaluate the model on the test data using the specified metric
            predictions = current_model.transform(test_df)
            metric = evaluator.evaluate(predictions)
            fold_metrics.append(metric)
        
        # Calculate average metric across all folds for the current parameter combination
        avg_metric = __builtins__.sum(fold_metrics) / len(fold_metrics)

        print(f"Average {evaluator.getMetricName()}: {avg_metric:.4f} | Evaluating Parameters:", ', '.join(param_items))

        # Update the best parameters if the current average metric is better
        if avg_metric > best_metric:
            best_metric = avg_metric
            best_params = params

    # Print the best parameters after evaluating all combinations
    param_items = [f"{param.name}: {value}" for param, value in best_params.items()]
    print("Best Overall Parameters:", ', '.join(param_items))
    print(f"Best {evaluator.getMetricName()}: {best_metric:.4f}")

    return best_params 

In [14]:
def train_and_evaluate_best_model(model, best_params: list[dict], train_df: DataFrame, test_df: DataFrame, is_tree: bool = False, standard_scaler: bool = False):
    """Train the model on the full training data using the best parameters and evaluate on the test data."""

    # Vectorize features
    train_df = vectorize_features(train_df, verbose = False, standard_scaler = standard_scaler)
    test_df = vectorize_features(test_df, standard_scaler=standard_scaler)

    # Train the model using the best parameters
    best_model = model.copy(best_params).fit(train_df)

    # Evaluate the model on the test data
    evaluator = BinaryClassificationEvaluator(labelCol="class", rawPredictionCol="prediction", metricName="areaUnderROC")
    predictions = best_model.transform(test_df)
    metric = evaluator.evaluate(predictions)

    # Calculate additional metrics: accuracy, precision, recall
    accuracy, precision, recall = calculate_metrics(predictions)

    print(f"Final Model Evaluation on Test Data {evaluator.getMetricName()}: {metric:.4f}")
    print(f"Final Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}")

    # Print feature importances if the model is a tree-based model
    if is_tree:
        # Extract the feature importances from the model
        feature_importances = best_model.featureImportances.toArray()

        # Get feature column names from the assembler
        feature_columns = train_df.schema['features'].metadata['ml_attr']['attrs']['numeric']
        feature_names = [attr['name'] for attr in feature_columns]

        # Combine feature names and their importances, then sort by importance
        sorted_features = sorted(zip(feature_names, feature_importances), key=lambda x: x[1], reverse=True)

        # Print sorted feature importances
        print("\nSorted Feature Importances:")
        for name, importance in sorted_features:
            print(f"Feature: {name}, Importance: {importance:.4f}")

    # Show the first 5 names of wrongly classified rows
    misclassified = predictions.filter(col('prediction') != col('class'))
    names = misclassified.select('name').limit(5).rdd.flatMap(lambda x: x).collect()
    print("\nFirst 5 names of wrongly classified rows:")
    print(names)

    return best_model

### Main section

In [15]:
def main(model: str, file_name: str, paramGrid: list[dict], is_tree: bool = False, use_standard_scaler: bool = False):

    # Load and preprocess the dataset
    file_path = os.path.join(os.getcwd(), TARGET_DIR_NAME, file_name)
    data = spark.read.csv(file_path, header=True, inferSchema=True)

    print(f"\n{'=' * 75}\n Setting up\n{'=' * 75}\n")
    clean_data = preprocess_data(data)
    #clean_data = clean_data.limit(100)

    # split the data into train and test sets
    train_df, test_df = train_test_split_by_patient(clean_data, test_ratio=0.2)

    # Evaluate the model using patient-wise cross-validation to find the best parameters
    print(f"\n{'=' * 75}\n Hyperparameter tunning\n{'=' * 75}\n")
    best_params = determine_best_parameters_on_folds(model, train_df, paramGrid, num_folds=5, patient_column='patient', standard_scaler=use_standard_scaler)

    # Train the best model on the full training data and evaluate on the test data
    print(f"\n{'=' * 75}\n Testing model on test dataset\n{'=' * 75}\n")
    best_model = train_and_evaluate_best_model(model, best_params, train_df, test_df, is_tree, standard_scaler=use_standard_scaler)

    return best_model

In [16]:
file_name2 = 'features_512_lesion_mask.csv'
file_name1 = 'features_512_full_mask.csv'

In [17]:
decision_tree = DecisionTreeClassifier(labelCol = "class", featuresCol = "features")

# Set up parameter grid for hyperparameter tuning
param_grid = ParamGridBuilder() \
    .addGrid(decision_tree.maxDepth, [5, 10]) \
    .addGrid(decision_tree.maxBins, [32]) \
    .build()

# Call the main function to train and evaluate the model
model = main(decision_tree, file_name2, param_grid, is_tree=True)


 Setting up

+--------------------------------+--------------------------+------------------------------+------------------------------+---------------------------+----------------------------------+-------------------------------------------+-------------------------------+-----------------------------------------+----------------------------------+-------------------------------+-------------------------------+----------------------------------+----------------------------------+------------------------------+---------------------------------+------------------------------+-------------------------------------+----------------------------------+-----------------------------------+-------------------------------------------+--------------------------------------+--------------------------------+--------------------------------+--------------------------+---------------------------+--------------------------------------+----------------------------+---------------------------+--------

Hyperparameter Tuning:   0%|          | 0/2 [00:00<?, ?it/s]

VECTORIZEEEEEEEEEEEEEEEEEEEEEE
+--------------------------------+--------------------------+------------------------------+------------------------------+---------------------------+----------------------------------+-------------------------------------------+-------------------------------+-----------------------------------------+----------------------------------+-------------------------------+-------------------------------+----------------------------------+----------------------------------+------------------------------+---------------------------------+------------------------------+-------------------------------------+----------------------------------+-----------------------------------+-------------------------------------------+--------------------------------------+--------------------------------+--------------------------------+--------------------------+---------------------------+--------------------------------------+----------------------------+-------------------

Hyperparameter Tuning:  50%|█████     | 1/2 [01:01<01:01, 61.57s/it]

Average areaUnderROC: 0.9790 | Evaluating Parameters: maxDepth: 5, maxBins: 32
VECTORIZEEEEEEEEEEEEEEEEEEEEEE
+--------------------------------+--------------------------+------------------------------+------------------------------+---------------------------+----------------------------------+-------------------------------------------+-------------------------------+-----------------------------------------+----------------------------------+-------------------------------+-------------------------------+----------------------------------+----------------------------------+------------------------------+---------------------------------+------------------------------+-------------------------------------+----------------------------------+-----------------------------------+-------------------------------------------+--------------------------------------+--------------------------------+--------------------------------+--------------------------+---------------------------+--------

Hyperparameter Tuning: 100%|██████████| 2/2 [02:02<00:00, 61.11s/it]

Average areaUnderROC: 0.9740 | Evaluating Parameters: maxDepth: 10, maxBins: 32
Best Overall Parameters: maxDepth: 5, maxBins: 32
Best areaUnderROC: 0.9790

 Testing model on test dataset

VECTORIZEEEEEEEEEEEEEEEEEEEEEE





+--------------------------------+--------------------------+------------------------------+------------------------------+---------------------------+----------------------------------+-------------------------------------------+-------------------------------+-----------------------------------------+----------------------------------+-------------------------------+-------------------------------+----------------------------------+----------------------------------+------------------------------+---------------------------------+------------------------------+-------------------------------------+----------------------------------+-----------------------------------+-------------------------------------------+--------------------------------------+--------------------------------+--------------------------------+--------------------------+---------------------------+--------------------------------------+----------------------------+---------------------------+----------------------

In [18]:
file_name2 = 'features_256_lesion_mask.csv'
file_name1 = 'features_256_full_mask.csv'
decision_tree = DecisionTreeClassifier(labelCol = "class", featuresCol = "features")

# Set up parameter grid for hyperparameter tuning
param_grid = ParamGridBuilder() \
    .addGrid(decision_tree.maxDepth, [5, 10]) \
    .addGrid(decision_tree.maxBins, [32]) \
    .build()

# Call the main function to train and evaluate the model
model = main(decision_tree, file_name2, param_grid, is_tree=True)


 Setting up

+--------------------------------+--------------------------+------------------------------+------------------------------+---------------------------+----------------------------------+-------------------------------------------+-------------------------------+-----------------------------------------+----------------------------------+-------------------------------+-------------------------------+----------------------------------+----------------------------------+------------------------------+---------------------------------+------------------------------+-------------------------------------+----------------------------------+-----------------------------------+-------------------------------------------+--------------------------------------+--------------------------------+--------------------------------+--------------------------+---------------------------+--------------------------------------+----------------------------+---------------------------+--------

Hyperparameter Tuning:   0%|          | 0/2 [00:00<?, ?it/s]

VECTORIZEEEEEEEEEEEEEEEEEEEEEE
+--------------------------------+--------------------------+------------------------------+------------------------------+---------------------------+----------------------------------+-------------------------------------------+-------------------------------+-----------------------------------------+----------------------------------+-------------------------------+-------------------------------+----------------------------------+----------------------------------+------------------------------+---------------------------------+------------------------------+-------------------------------------+----------------------------------+-----------------------------------+-------------------------------------------+--------------------------------------+--------------------------------+--------------------------------+--------------------------+---------------------------+--------------------------------------+----------------------------+-------------------

Hyperparameter Tuning:  50%|█████     | 1/2 [00:34<00:34, 34.65s/it]

Average areaUnderROC: 0.9199 | Evaluating Parameters: maxDepth: 5, maxBins: 32
VECTORIZEEEEEEEEEEEEEEEEEEEEEE
+--------------------------------+--------------------------+------------------------------+------------------------------+---------------------------+----------------------------------+-------------------------------------------+-------------------------------+-----------------------------------------+----------------------------------+-------------------------------+-------------------------------+----------------------------------+----------------------------------+------------------------------+---------------------------------+------------------------------+-------------------------------------+----------------------------------+-----------------------------------+-------------------------------------------+--------------------------------------+--------------------------------+--------------------------------+--------------------------+---------------------------+--------

Hyperparameter Tuning: 100%|██████████| 2/2 [01:21<00:00, 40.84s/it]

Average areaUnderROC: 0.9103 | Evaluating Parameters: maxDepth: 10, maxBins: 32
Best Overall Parameters: maxDepth: 5, maxBins: 32
Best areaUnderROC: 0.9199

 Testing model on test dataset

VECTORIZEEEEEEEEEEEEEEEEEEEEEE





+--------------------------------+--------------------------+------------------------------+------------------------------+---------------------------+----------------------------------+-------------------------------------------+-------------------------------+-----------------------------------------+----------------------------------+-------------------------------+-------------------------------+----------------------------------+----------------------------------+------------------------------+---------------------------------+------------------------------+-------------------------------------+----------------------------------+-----------------------------------+-------------------------------------------+--------------------------------------+--------------------------------+--------------------------------+--------------------------+---------------------------+--------------------------------------+----------------------------+---------------------------+----------------------

In [19]:
file_name2 = 'features_128_lesion_mask.csv'
file_name1 = 'features_128_full_mask.csv'
decision_tree = DecisionTreeClassifier(labelCol = "class", featuresCol = "features")

# Set up parameter grid for hyperparameter tuning
param_grid = ParamGridBuilder() \
    .addGrid(decision_tree.maxDepth, [5, 10]) \
    .addGrid(decision_tree.maxBins, [32]) \
    .build()

# Call the main function to train and evaluate the model
model = main(decision_tree, file_name2, param_grid, is_tree=True)


 Setting up

+--------------------------------+--------------------------+------------------------------+------------------------------+---------------------------+----------------------------------+-------------------------------------------+-------------------------------+-----------------------------------------+----------------------------------+-------------------------------+-------------------------------+----------------------------------+----------------------------------+------------------------------+---------------------------------+------------------------------+-------------------------------------+----------------------------------+-----------------------------------+-------------------------------------------+--------------------------------------+--------------------------------+--------------------------------+--------------------------+---------------------------+--------------------------------------+----------------------------+---------------------------+--------

Hyperparameter Tuning:   0%|          | 0/2 [00:00<?, ?it/s]

VECTORIZEEEEEEEEEEEEEEEEEEEEEE
+--------------------------------+--------------------------+------------------------------+------------------------------+---------------------------+----------------------------------+-------------------------------------------+-------------------------------+-----------------------------------------+----------------------------------+-------------------------------+-------------------------------+----------------------------------+----------------------------------+------------------------------+---------------------------------+------------------------------+-------------------------------------+----------------------------------+-----------------------------------+-------------------------------------------+--------------------------------------+--------------------------------+--------------------------------+--------------------------+---------------------------+--------------------------------------+----------------------------+-------------------

Hyperparameter Tuning:  50%|█████     | 1/2 [00:36<00:36, 36.22s/it]

Average areaUnderROC: 0.8536 | Evaluating Parameters: maxDepth: 5, maxBins: 32
VECTORIZEEEEEEEEEEEEEEEEEEEEEE
+--------------------------------+--------------------------+------------------------------+------------------------------+---------------------------+----------------------------------+-------------------------------------------+-------------------------------+-----------------------------------------+----------------------------------+-------------------------------+-------------------------------+----------------------------------+----------------------------------+------------------------------+---------------------------------+------------------------------+-------------------------------------+----------------------------------+-----------------------------------+-------------------------------------------+--------------------------------------+--------------------------------+--------------------------------+--------------------------+---------------------------+--------

### Testing different models

#### 512 - with lesion mask

In [39]:
file_name = 'features_512_full_mask.csv'

In [None]:
decision_tree = DecisionTreeClassifier(labelCol = "class", featuresCol = "features")

# Set up parameter grid for hyperparameter tuning
param_grid = ParamGridBuilder() \
    .addGrid(decision_tree.maxDepth, [5, 10, 15, 20]) \
    .addGrid(decision_tree.maxBins, [32, 64]) \
    .build()

# Call the main function to train and evaluate the model
model = main(decision_tree, file_name, param_grid, is_tree=True)


 Setting up

+--------------------------------+--------------------------+------------------------------+------------------------------+---------------------------+----------------------------------+-------------------------------------------+-------------------------------+-----------------------------------------+----------------------------------+-------------------------------+-------------------------------+----------------------------------+----------------------------------+------------------------------+---------------------------------+------------------------------+-------------------------------------+----------------------------------+-----------------------------------+-------------------------------------------+--------------------------------------+--------------------------------+--------------------------------+--------------------------+---------------------------+--------------------------------------+----------------------------+---------------------------+--------

Hyperparameter Tuning:   0%|          | 0/8 [00:00<?, ?it/s]

Number of initial columns: 119, number of feature columns: 93
Number of initial columns: 119, number of feature columns: 93


In [None]:
random_forest = RandomForestClassifier(labelCol="class", featuresCol="features")

# Set up parameter grid for hyperparameter tuning
rf_param_grid = ParamGridBuilder() \
    .addGrid(random_forest.numTrees, [50, 150, 200]) \
    .addGrid(random_forest.maxDepth, [5, 10, 15, 20]) \
    .addGrid(random_forest.maxBins, [32, 64, 128]) \
    .addGrid(random_forest.featureSubsetStrategy, ['auto', 'sqrt']) \
    .build()

# Call the main function to train and evaluate the model
model = main(random_forest, file_name, rf_param_grid, is_tree=True)


 Setting up

+--------------------------------+--------------------------+------------------------------+------------------------------+---------------------------+----------------------------------+-------------------------------------------+-------------------------------+-----------------------------------------+----------------------------------+-------------------------------+-------------------------------+----------------------------------+----------------------------------+------------------------------+---------------------------------+------------------------------+-------------------------------------+----------------------------------+-----------------------------------+-------------------------------------------+--------------------------------------+--------------------------------+--------------------------------+--------------------------+---------------------------+--------------------------------------+----------------------------+---------------------------+--------

In [None]:
# Linear SVC with Standard Scaler
svc = LinearSVC(labelCol="class", featuresCol="features")

# Set up the parameter grid for hyperparameter tuning
svc_param_grid = ParamGridBuilder() \
    .addGrid(svc.maxIter, [100, 500, 1000]) \
    .addGrid(svc.regParam, [0.01, 0.1]) \
    .addGrid(svc.tol, [1e-4,  1e-2]) \
    .build()

# Call the main function with the XGBoost classifier
model = main(svc, file_name, svc_param_grid, is_tree=False, use_standard_scaler=True)


In [None]:
xgb_classifier = SparkXGBClassifier(label_col="class", features_col="features", use_gpu=False) 

# Set up the parameter grid for hyperparameter tuning
xgb_param_grid = ParamGridBuilder() \
    .addGrid(xgb_classifier.max_depth, [3, 6, 9, 12, 15]) \
    .addGrid(xgb_classifier.n_estimators, [50, 100, 200]) \
    .addGrid(xgb_classifier.learning_rate, [0.1]) \
    .addGrid(xgb_classifier.subsample, [0.8]) \
    .addGrid(xgb_classifier.colsample_bytree, [0.8]) \
    .build()

# Call the main function to train and evaluate the model
model = main(xgb_classifier, file_name, xgb_param_grid, is_tree=False)

#### 512 - with full mask

In [14]:
file_name = 'features_512_full_mask.csv'

In [None]:
decision_tree = DecisionTreeClassifier(labelCol = "class", featuresCol = "features")

# Set up parameter grid for hyperparameter tuning
param_grid = ParamGridBuilder() \
    .addGrid(decision_tree.maxDepth, [5, 10, 15, 20]) \
    .addGrid(decision_tree.maxBins, [32, 64]) \
    .build()

# Call the main function to train and evaluate the model
model = main(decision_tree, file_name, param_grid, is_tree=True)

In [None]:
random_forest = RandomForestClassifier(labelCol="class", featuresCol="features")

# Set up parameter grid for hyperparameter tuning
rf_param_grid = ParamGridBuilder() \
    .addGrid(random_forest.numTrees, [50, 150]) \
    .addGrid(random_forest.maxDepth, [5, 10, 15]) \
    .addGrid(random_forest.maxBins, [32, 64, 128]) \
    .addGrid(random_forest.featureSubsetStrategy, ['auto', 'sqrt']) \
    .build()

# Call the main function to train and evaluate the model
model = main(random_forest, file_name, rf_param_grid, is_tree=True)

In [None]:
# Linear SVC with Standard Scaler
svc = LinearSVC(labelCol="class", featuresCol="features")

# Set up the parameter grid for hyperparameter tuning
svc_param_grid = ParamGridBuilder() \
    .addGrid(svc.maxIter, [100, 500, 1000]) \
    .addGrid(svc.regParam, [0.01, 0.1]) \
    .addGrid(svc.tol, [1e-4, 1e-2]) \
    .build()

# Call the main function with the XGBoost classifier
model = main(svc, file_name, svc_param_grid, is_tree=False, use_standard_scaler=True)


In [None]:
xgb_classifier = SparkXGBClassifier(label_col="class", features_col="features", use_gpu=False) 

# Set up the parameter grid for hyperparameter tuning
xgb_param_grid = ParamGridBuilder() \
    .addGrid(xgb_classifier.max_depth, [3, 6, 9, 12, 15]) \
    .addGrid(xgb_classifier.n_estimators, [50, 100, 200]) \
    .addGrid(xgb_classifier.learning_rate, [0.1]) \
    .addGrid(xgb_classifier.subsample, [0.8]) \
    .addGrid(xgb_classifier.colsample_bytree, [0.8]) \
    .build()

# Call the main function to train and evaluate the model
model = main(xgb_classifier, file_name, xgb_param_grid, is_tree=False)

In [None]:
decision_tree = DecisionTreeClassifier(labelCol = "class", featuresCol = "features")

# Set up parameter grid for hyperparameter tuning
param_grid = ParamGridBuilder() \
    .addGrid(decision_tree.maxDepth, [5, 10, 15, 20]) \
    .addGrid(decision_tree.maxBins, [32, 64]) \
    .build()

# Call the main function to train and evaluate the model
model = main(decision_tree, file_name, param_grid, is_tree=True)

#### 256 - with lesion mask

In [19]:
file_name = 'features_256_lesion_mask.csv'

In [None]:
decision_tree = DecisionTreeClassifier(labelCol = "class", featuresCol = "features")

# Set up parameter grid for hyperparameter tuning
param_grid = ParamGridBuilder() \
    .addGrid(decision_tree.maxDepth, [5, 10, 15, 20]) \
    .addGrid(decision_tree.maxBins, [32, 64]) \
    .build()

# Call the main function to train and evaluate the model
model = main(decision_tree, file_name, param_grid, is_tree=True)

In [None]:
random_forest = RandomForestClassifier(labelCol="class", featuresCol="features")

# Set up parameter grid for hyperparameter tuning
rf_param_grid = ParamGridBuilder() \
    .addGrid(random_forest.numTrees, [50, 150]) \
    .addGrid(random_forest.maxDepth, [5, 10, 15]) \
    .addGrid(random_forest.maxBins, [32, 64, 128]) \
    .addGrid(random_forest.featureSubsetStrategy, ['auto', 'sqrt']) \
    .build()

# Call the main function to train and evaluate the model
model = main(random_forest, file_name, rf_param_grid, is_tree=True)

In [None]:
# Linear SVC with Standard Scaler
svc = LinearSVC(labelCol="class", featuresCol="features")

# Set up the parameter grid for hyperparameter tuning
svc_param_grid = ParamGridBuilder() \
    .addGrid(svc.maxIter, [100, 500, 1000]) \
    .addGrid(svc.regParam, [0.01, 0.1]) \
    .addGrid(svc.tol, [1e-4,  1e-2]) \
    .build()

# Call the main function with the XGBoost classifier
model = main(svc, file_name, svc_param_grid, is_tree=False, use_standard_scaler=True)


In [None]:
xgb_classifier = SparkXGBClassifier(label_col="class", features_col="features", use_gpu=False) 

# Set up the parameter grid for hyperparameter tuning
xgb_param_grid = ParamGridBuilder() \
    .addGrid(xgb_classifier.max_depth, [3, 6, 9, 12, 15]) \
    .addGrid(xgb_classifier.n_estimators, [50, 100, 200]) \
    .addGrid(xgb_classifier.learning_rate, [0.1]) \
    .addGrid(xgb_classifier.subsample, [0.8]) \
    .addGrid(xgb_classifier.colsample_bytree, [0.8]) \
    .build()

# Call the main function to train and evaluate the model
model = main(xgb_classifier, file_name, xgb_param_grid, is_tree=False)

#### 256 - with full mask

In [24]:
file_name = 'features_256_full_mask.csv'

In [None]:
decision_tree = DecisionTreeClassifier(labelCol = "class", featuresCol = "features")

# Set up parameter grid for hyperparameter tuning
param_grid = ParamGridBuilder() \
    .addGrid(decision_tree.maxDepth, [5, 10, 15, 20]) \
    .addGrid(decision_tree.maxBins, [32, 64]) \
    .build()

# Call the main function to train and evaluate the model
model = main(decision_tree, file_name, param_grid, is_tree=True)

In [None]:
random_forest = RandomForestClassifier(labelCol="class", featuresCol="features")

# Set up parameter grid for hyperparameter tuning
rf_param_grid = ParamGridBuilder() \
    .addGrid(random_forest.numTrees, [50, 150]) \
    .addGrid(random_forest.maxDepth, [5, 10, 15]) \
    .addGrid(random_forest.maxBins, [32, 64, 128]) \
    .addGrid(random_forest.featureSubsetStrategy, ['auto', 'sqrt']) \
    .build()

# Call the main function to train and evaluate the model
model = main(random_forest, file_name, rf_param_grid, is_tree=True)

In [None]:
# Linear SVC with Standard Scaler
svc = LinearSVC(labelCol="class", featuresCol="features")

# Set up the parameter grid for hyperparameter tuning
svc_param_grid = ParamGridBuilder() \
    .addGrid(svc.maxIter, [100, 500, 1000]) \
    .addGrid(svc.regParam, [0.01, 0.1]) \
    .addGrid(svc.tol, [1e-4,  1e-2]) \
    .build()

# Call the main function with the XGBoost classifier
model = main(svc, file_name, svc_param_grid, is_tree=False, use_standard_scaler=True)


In [None]:
xgb_classifier = SparkXGBClassifier(label_col="class", features_col="features", use_gpu=False) 

# Set up the parameter grid for hyperparameter tuning
xgb_param_grid = ParamGridBuilder() \
    .addGrid(xgb_classifier.max_depth, [3, 6, 9, 12, 15]) \
    .addGrid(xgb_classifier.n_estimators, [50, 100, 200]) \
    .addGrid(xgb_classifier.learning_rate, [0.1]) \
    .addGrid(xgb_classifier.subsample, [0.8]) \
    .addGrid(xgb_classifier.colsample_bytree, [0.8]) \
    .build()

# Call the main function to train and evaluate the model
model = main(xgb_classifier, file_name, xgb_param_grid, is_tree=False)

#### 128 - with lesion mask

In [14]:
file_name = 'features_128_lesion_mask.csv'

In [None]:
decision_tree = DecisionTreeClassifier(labelCol = "class", featuresCol = "features")

# Set up parameter grid for hyperparameter tuning
param_grid = ParamGridBuilder() \
    .addGrid(decision_tree.maxDepth, [5, 10, 15, 20]) \
    .addGrid(decision_tree.maxBins, [32, 64]) \
    .build()

# Call the main function to train and evaluate the model
model = main(decision_tree, file_name, param_grid, is_tree=True)

In [None]:
random_forest = RandomForestClassifier(labelCol="class", featuresCol="features")

# Set up parameter grid for hyperparameter tuning
rf_param_grid = ParamGridBuilder() \
    .addGrid(random_forest.numTrees, [50, 150]) \
    .addGrid(random_forest.maxDepth, [5, 10, 15]) \
    .addGrid(random_forest.maxBins, [32, 64, 128]) \
    .addGrid(random_forest.featureSubsetStrategy, ['auto', 'sqrt']) \
    .build()

# Call the main function to train and evaluate the model
model = main(random_forest, file_name, rf_param_grid, is_tree=True)

In [None]:
# Linear SVC with Standard Scaler
svc = LinearSVC(labelCol="class", featuresCol="features")

# Set up the parameter grid for hyperparameter tuning
svc_param_grid = ParamGridBuilder() \
    .addGrid(svc.maxIter, [100, 500, 1000]) \
    .addGrid(svc.regParam, [0.01, 0.1]) \
    .addGrid(svc.tol, [1e-4,  1e-2]) \
    .build()

# Call the main function with the XGBoost classifier
model = main(svc, file_name, svc_param_grid, is_tree=False, use_standard_scaler=True)


In [None]:
xgb_classifier = SparkXGBClassifier(label_col="class", features_col="features", use_gpu=False) 

# Set up the parameter grid for hyperparameter tuning
xgb_param_grid = ParamGridBuilder() \
    .addGrid(xgb_classifier.max_depth, [3, 6, 9, 12, 15]) \
    .addGrid(xgb_classifier.n_estimators, [50, 100, 200]) \
    .addGrid(xgb_classifier.learning_rate, [0.1]) \
    .addGrid(xgb_classifier.subsample, [0.8]) \
    .addGrid(xgb_classifier.colsample_bytree, [0.8]) \
    .build()


# Call the main function to train and evaluate the model
model = main(xgb_classifier, file_name, xgb_param_grid, is_tree=False)

#### 128 - with full mask

In [34]:
file_name = 'features_128_full_mask.csv'

In [None]:
decision_tree = DecisionTreeClassifier(labelCol = "class", featuresCol = "features")

# Set up parameter grid for hyperparameter tuning
param_grid = ParamGridBuilder() \
    .addGrid(decision_tree.maxDepth, [5, 10, 15, 20]) \
    .addGrid(decision_tree.maxBins, [32, 64]) \
    .build()

# Call the main function to train and evaluate the model
model = main(decision_tree, file_name, param_grid, is_tree=True)

In [None]:
random_forest = RandomForestClassifier(labelCol="class", featuresCol="features")

# Set up parameter grid for hyperparameter tuning
rf_param_grid = ParamGridBuilder() \
    .addGrid(random_forest.numTrees, [50, 150]) \
    .addGrid(random_forest.maxDepth, [5, 10, 15]) \
    .addGrid(random_forest.maxBins, [32, 64, 128]) \
    .addGrid(random_forest.featureSubsetStrategy, ['auto', 'sqrt']) \
    .build()

# Call the main function to train and evaluate the model
model = main(random_forest, file_name, rf_param_grid, is_tree=True)

In [None]:
# Linear SVC with Standard Scaler
svc = LinearSVC(labelCol="class", featuresCol="features")

# Set up the parameter grid for hyperparameter tuning
svc_param_grid = ParamGridBuilder() \
    .addGrid(svc.maxIter, [100, 500, 1000]) \
    .addGrid(svc.regParam, [0.01, 0.1]) \
    .addGrid(svc.tol, [1e-4,  1e-2]) \
    .build()

# Call the main function with the XGBoost classifier
model = main(svc, file_name, svc_param_grid, is_tree=False, use_standard_scaler=True)


In [None]:
xgb_classifier = SparkXGBClassifier(label_col="class", features_col="features", use_gpu=False) 

# Set up the parameter grid for hyperparameter tuning
xgb_param_grid = ParamGridBuilder() \
    .addGrid(xgb_classifier.max_depth, [3, 6, 9, 12]) \
    .addGrid(xgb_classifier.n_estimators, [50, 100]) \
    .addGrid(xgb_classifier.learning_rate, [0.1]) \
    .addGrid(xgb_classifier.subsample, [0.8]) \
    .addGrid(xgb_classifier.colsample_bytree, [0.8]) \
    .build()

# Call the main function to train and evaluate the model
model = main(xgb_classifier, file_name, xgb_param_grid, is_tree=False)

In [39]:
spark.stop()