## Feature Extraction

In [1]:
import os
import cv2
import shutil
import logging
import warnings
import radiomics
import numpy as np
import pandas as pd

from tqdm import tqdm
from dotenv import load_dotenv
from radiomics import featureextractor

# Load environment variables from .env file
_ = load_dotenv()

In [2]:
# Retrieve environment variables (os.getenv returns None for any variable that is not set)
BASE_DIR_NAME = os.getenv('BASE_DIR_NAME')

# BASE_SCALE_NAMES is a comma-separated list. Tolerate an unset variable, surrounding
# whitespace and empty entries (e.g. "128, 256,") instead of failing on None.split(...)
# or producing names such as " 256" or "".
BASE_SCALE_NAMES = [
    scale_name.strip()
    for scale_name in (os.getenv('BASE_SCALE_NAMES') or '').split(',')
    if scale_name.strip()
]

MASK_DIR_NAME = os.getenv('MASK_DIR_NAME')
IMAGE_DIR_NAME = os.getenv('IMAGE_DIR_NAME')
PROCESSED_MASK_DIR_NAME = os.getenv('PROCESSED_MASK_DIR_NAME')
TARGET_DIR_NAME = os.getenv('TARGET_DIR_NAME')

# Report missing configuration up front so that a later failure is easy to trace back
_missing_env_vars = [
    env_name
    for env_name in (
        'BASE_DIR_NAME', 'BASE_SCALE_NAMES', 'MASK_DIR_NAME',
        'IMAGE_DIR_NAME', 'PROCESSED_MASK_DIR_NAME', 'TARGET_DIR_NAME',
    )
    if not os.getenv(env_name)
]
if _missing_env_vars:
    warnings.warn(f"Missing or empty environment variables: {', '.join(_missing_env_vars)}")

### Converting masks

In [3]:
def add_black_border(image: np.ndarray, border_thickness: int = 1) -> None:
    """Overwrite `image` in place with a white rectangle surrounded by a black border.

    The array is first reset to black and its interior is then painted white (255),
    so the result is the same whether the caller passes an entirely white mask or an
    already blanked (black) one.

    Args:
        image: Array whose first two axes are (height, width); any further axes
            (e.g. colour channels) are filled uniformly. Modified in place.
        border_thickness: Width of the black border in pixels on every side.
            0 yields an entirely white image.

    Raises:
        ValueError: If `border_thickness` is negative.
    """
    if border_thickness < 0:
        raise ValueError(f"border_thickness must be non-negative, got {border_thickness}")

    height, width = image.shape[:2]

    # Start from a black canvas so the border is black regardless of the input content
    image[...] = 0

    # Exclusive end indices of the white interior. Slicing keeps the last
    # `border_thickness` rows/columns black, so all four sides get a border.
    inner_bottom = height - border_thickness
    inner_right = width - border_thickness

    # If the image is too small to hold an interior it simply stays black
    if inner_bottom > border_thickness and inner_right > border_thickness:
        image[border_thickness:inner_bottom, border_thickness:inner_right] = 255

In [4]:
def process_masks(mask_dir: str, processed_mask_dir: str, verbose: bool = False) -> None:
    """Copy every PNG mask to `processed_mask_dir`, adding a black border where needed.

    Masks whose name ends in "1.png" (images without tumor) are always replaced by a
    white mask with a black border. Masks whose name ends in "0.png" (images with
    tumor) get the same treatment only when they are entirely white; otherwise they
    are written unchanged. PNG files with any other suffix are skipped.

    Args:
        mask_dir: Directory containing the original mask PNG files.
        processed_mask_dir: Directory the processed masks are written to
            (created if it does not exist).
        verbose: When True, print the names of the tumor masks that were entirely white.
    """
    # cv2.imwrite does not create missing directories, so make sure the target exists
    os.makedirs(processed_mask_dir, exist_ok=True)

    # Get a sorted list of all PNG files in the mask directory (deterministic order)
    mask_files = sorted(f for f in os.listdir(mask_dir) if f.endswith(".png"))

    entirely_white_images, unreadable_masks, unwritten_masks = [], [], []

    # Iterate over all mask files
    for mask_filename in tqdm(mask_files, desc = "Processing masks", unit = "file"):
        mask_path = os.path.join(mask_dir, mask_filename)
        processed_mask_path = os.path.join(processed_mask_dir, mask_filename)

        # Read the mask image into a numpy array; cv2.imread returns None on failure
        image = cv2.imread(mask_path)
        if image is None:
            unreadable_masks.append(mask_filename)
            continue

        # Images without tumor always receive the bordered full mask
        needs_border = mask_filename.endswith("1.png")

        if not needs_border:
            # Anything that is neither a "1.png" nor a "0.png" mask is ignored
            if not mask_filename.endswith("0.png"):
                continue

            # Images with tumor only need a border when the mask is entirely white
            if (image == 255).all():
                entirely_white_images.append(mask_filename)
                needs_border = True

        if needs_border:
            # Start from a black canvas in both cases: add_black_border draws a white
            # rectangle, which would be invisible on a mask that is still white
            image.fill(0)
            add_black_border(image)

        # Save the processed mask and remember the files that could not be written
        if not cv2.imwrite(processed_mask_path, image):
            unwritten_masks.append(mask_filename)

    if verbose and entirely_white_images:
        print("List of entirely white images processed:", entirely_white_images)

    if unreadable_masks:
        warnings.warn(f"{len(unreadable_masks)} mask file(s) could not be read and were skipped: {unreadable_masks}")
    if unwritten_masks:
        warnings.warn(f"{len(unwritten_masks)} processed mask file(s) could not be written: {unwritten_masks}")

### Feature extraction

In [5]:
def initialize_feature_extractor() -> featureextractor.RadiomicsFeatureExtractor:
    """Create a radiomics feature extractor limited to the non-shape feature classes.

    Side effects: lowers the pyradiomics logger to ERROR, installs two "ignore"
    warning filters, and prints the resulting extractor configuration.

    Returns:
        The configured RadiomicsFeatureExtractor instance.
    """
    # Keep the notebook output readable: only pyradiomics errors are logged, and
    # RuntimeWarnings plus the shape-feature notice are filtered out
    radiomics.logger.setLevel(logging.ERROR)
    warnings.filterwarnings("ignore", category=RuntimeWarning)
    warnings.filterwarnings("ignore", message="Shape features are only available")

    # Every class enabled below; the 2D and 3D shape classes are deliberately left out
    non_shape_classes = ('firstorder', 'glcm', 'gldm', 'glrlm', 'glszm', 'ngtdm')

    # Start from a clean slate, then switch the wanted classes on one by one
    radiomics_extractor = featureextractor.RadiomicsFeatureExtractor()
    radiomics_extractor.disableAllFeatures()
    for class_name in non_shape_classes:
        radiomics_extractor.enableFeatureClassByName(class_name)

    # Show the effective configuration
    print("Extractor parameters:", radiomics_extractor.enabledFeatures)
    print('Extractor settings:', radiomics_extractor.settings)

    return radiomics_extractor

In [6]:
def extract_features(
    image_dir: str,
    processed_mask_dir: str,
    use_template_mask: bool,
    extractor: "featureextractor.RadiomicsFeatureExtractor",
    template_mask_name: str = 'auth_001-000061_001-000061_MG_BL_Series-8_Image-1-1.png',
    label: int = 255,
) -> "tuple[pd.DataFrame, list[tuple[str, str]]]":
    """Extract radiomics features for all PNG images in the specified directory.

    Args:
        image_dir: Directory containing the input PNG images.
        processed_mask_dir: Directory containing the processed masks. Unless the
            template mask is used, each image is paired with the mask of the same name.
        use_template_mask: When True, every image is paired with the single mask
            `template_mask_name` instead of its own mask.
        extractor: Object whose `execute(image_path, mask_path, label=...)` returns a
            mapping of feature name to value.
        template_mask_name: File name (inside `processed_mask_dir`) of the mask used
            when `use_template_mask` is True.
        label: Mask value passed to the extractor as the region of interest.

    Returns:
        A `(features_df, errors)` tuple. `features_df` has one row per successfully
        processed image: the extractor's features plus the `name`, `provider`,
        `patient` and `class` columns parsed from the file name. It is empty when no
        image could be processed. `errors` is a list of `(filename, message)` pairs
        for the images that raised an exception.
    """
    # Get a sorted list of all PNG files in the image directory (deterministic row order)
    image_files = sorted(f for f in os.listdir(image_dir) if f.endswith(".png"))
    feature_records, errors = [], []

    # If use_template_mask is enabled, define the template mask path once
    template_mask_path = os.path.join(processed_mask_dir, template_mask_name) if use_template_mask else None
    mask_description = "without lesion mask" if use_template_mask else "with lesion mask"

    # Iterate over all image files
    for filename in tqdm(image_files, desc=f"Extracting features {mask_description}", unit="file"):
        try:
            image_path = os.path.join(image_dir, filename)
            mask_path = template_mask_path if use_template_mask else os.path.join(processed_mask_dir, filename)

            # Extract features for the current image as a plain dict (one future row)
            record = dict(extractor.execute(image_path, mask_path, label=label))

            # Extract additional metadata from the filename
            filename_parts = filename.split("_")
            record['name'] = filename
            record['provider'] = filename_parts[0]
            record['patient'] = filename_parts[1].split("-")[1]
            record['class'] = filename_parts[-1].split(".")[0].split("-")[-1]

            feature_records.append(record)

        except Exception as e:
            # Best effort: remember the failure and carry on with the next image
            errors.append((filename, str(e)))

    # Build the DataFrame once; unlike pd.concat this also works when there are no rows
    features_df = pd.DataFrame(feature_records)
    return features_df, errors

### Main section

In [7]:
def process_and_extract_features():
    """Run mask processing and feature extraction for every scale in BASE_SCALE_NAMES.

    For each scale the processed-mask directory is recreated from scratch, the masks
    are processed, and features are extracted twice: once with the template (full)
    mask and once with each image's own lesion mask. Each result is written to
    `features_<scale>_<full_mask|lesion_mask>.csv` inside TARGET_DIR_NAME. Files that
    failed during extraction are summarised on standard output.

    Raises:
        ValueError: If the processed-mask directory would coincide with the base,
            image or mask directory, because it is deleted and recreated on every run.
    """
    # Number of failed files listed per CSV before the summary is truncated
    max_reported_errors = 10

    # The processed-mask directory is wiped below; an empty name would resolve to the
    # scale's base directory and delete the source data
    if not PROCESSED_MASK_DIR_NAME:
        raise ValueError("PROCESSED_MASK_DIR_NAME must be set to a non-empty directory name")

    # Initialize the feature extractor and define an empty list to store errors
    extractor = initialize_feature_extractor()
    all_errors = []

    # Create the output directory once, before any long-running extraction starts
    os.makedirs(TARGET_DIR_NAME, exist_ok=True)

    # Iterate over each scale
    for scale_name in BASE_SCALE_NAMES:
        print(f"\n{'=' * 75}\n Starting processing for scale: {scale_name}\n{'=' * 75}\n")

        # Define the base directory for the current scale
        base_dir = os.path.join(os.getcwd(), BASE_DIR_NAME, scale_name)

        # Define the image and mask directories
        image_dir, mask_dir = os.path.join(base_dir, IMAGE_DIR_NAME), os.path.join(base_dir, MASK_DIR_NAME)
        processed_mask_dir = os.path.join(base_dir, PROCESSED_MASK_DIR_NAME)

        # Never delete a directory that holds input data
        protected_dirs = {os.path.normpath(path) for path in (base_dir, image_dir, mask_dir)}
        if os.path.normpath(processed_mask_dir) in protected_dirs:
            raise ValueError(f"Refusing to recreate '{processed_mask_dir}': it is also an input directory")

        # Recreate the 'processed_mask' directory if it exists, or create it if it doesn't
        if os.path.exists(processed_mask_dir):
            shutil.rmtree(processed_mask_dir)
        os.makedirs(processed_mask_dir)

        # Process masks and extract features
        process_masks(mask_dir, processed_mask_dir, verbose=True)

        # Extract features for both mask types
        for use_template_mask, name in [(True, "full_mask"), (False, "lesion_mask")]:
            # Extract features for the current mask type
            features_df, errors = extract_features(
                image_dir, processed_mask_dir, use_template_mask=use_template_mask, extractor=extractor
            )
            all_errors.extend(errors)

            # Save the features to a CSV file in the target directory
            features_df.to_csv(os.path.join(TARGET_DIR_NAME, f'features_{scale_name}_{name}.csv'), index=False)
            print(f"Feature extraction completed. Data saved to 'features_{scale_name}_{name}.csv' file.")

            # Surface the per-file failures instead of silently dropping them
            if errors:
                print(f"{len(errors)} file(s) failed for 'features_{scale_name}_{name}.csv':")
                for failed_name, message in errors[:max_reported_errors]:
                    print(f"  - {failed_name}: {message}")
                if len(errors) > max_reported_errors:
                    print(f"  ... and {len(errors) - max_reported_errors} more")

    if all_errors:
        print(f"\nTotal number of files that failed during feature extraction: {len(all_errors)}")

In [8]:
# Run the whole pipeline (mask processing followed by feature extraction) for every
# scale in BASE_SCALE_NAMES. In the recorded run below each extraction pass took
# between roughly 3 and 20 minutes, depending on the scale.
process_and_extract_features()

Extractor parameters: {'firstorder': [], 'glcm': [], 'gldm': [], 'glrlm': [], 'glszm': [], 'ngtdm': []}
Extractor settings: {'minimumROIDimensions': 2, 'minimumROISize': None, 'normalize': False, 'normalizeScale': 1, 'removeOutliers': None, 'resampledPixelSpacing': None, 'interpolator': 'sitkBSpline', 'preCrop': False, 'padDistance': 5, 'distances': [1], 'force2D': False, 'force2Ddimension': 0, 'resegmentRange': None, 'label': 1, 'additionalInfo': True}

 Starting processing for scale: 128



Processing masks: 100%|██████████| 2798/2798 [00:01<00:00, 1963.97file/s]


List of entirely white images processed: ['auth_001-000068_001-000068_MG_BL_Series-2_Image-1-0.png', 'auth_001-000078_001-000078_MG_BL_Series-3_Image-1-0.png', 'auth_001-000078_001-000078_MG_TP1_1_Series-2_Image-1-0.png', 'auth_001-000082_001-000082_MG_TP1_1_Series-2_Image-1-0.png', 'auth_001-000084_001-000084_MG_BL_Series-1_Image-1-0.png', 'hcs_003-000243_003-000243_MG_BL_Series-2_Image-3-0.png', 'hcs_003-000248_003-000248_MG_BL_Series-2_Image-1-0.png', 'hcs_003-000249_003-000249_MG_BL_Series-2_Image-2-0.png', 'hcs_003-000251_003-000251_MG_BL_Series-1_Image-1-0.png', 'hcs_003-000251_003-000251_MG_BL_Series-3_Image-1-0.png', 'hcs_003-000255_003-000255_MG_BL_Series-2_Image-1-0.png', 'hcs_003-000255_003-000255_MG_TP2_Series-2_Image-1-0.png', 'hcs_003-000256_003-000256_MG_BL_Series-3_Image-1-0.png', 'hcs_003-000256_003-000256_MG_BL_Series-4_Image-1-0.png', 'hcs_003-000257_003-000257_MG_BL_Series-1010_Image-5-0.png', 'hcs_003-000257_003-000257_MG_BL_Series-1010_Image-6-0.png', 'hcs_003-000

Extracting features without lesion mask: 100%|██████████| 2798/2798 [03:23<00:00, 13.72file/s]


Feature extraction completed. Data saved to 'features_128_full_mask.csv' file.


Extracting features with lesion mask: 100%|██████████| 2798/2798 [03:19<00:00, 14.05file/s]


Feature extraction completed. Data saved to 'features_128_lesion_mask.csv' file.

 Starting processing for scale: 256



Processing masks: 100%|██████████| 2794/2794 [00:04<00:00, 678.01file/s]


List of entirely white images processed: ['auth_001-000084_001-000084_MG_BL_Series-3_Image-1-0.png', 'hcs_003-000243_003-000243_MG_BL_Series-2_Image-1-0.png', 'hcs_003-000243_003-000243_MG_BL_Series-2_Image-3-0.png', 'hcs_003-000303_003-000303_MG_BL_Series-3_Image-1-0.png', 'hcs_003-000673_003-000673_MG_BL_Series-1002_Image-1002-0.png', 'hcs_003-001204_003-001204_MG_BL_Series-1001_Image-1002-0.png', 'hcs_003-001210_003-001210_MG_BL_Series-1001_Image-1003-0.png', 'hcs_003-001222_003-001222_MG_BL_Series-1001_Image-1001-0.png', 'hcs_003-001222_003-001222_MG_TP2_Series-1001_Image-1001-0.png', 'hcs_003-001310_003-001310_MG_BL_Series-1001_Image-1001-0.png', 'hcs_003-001310_003-001310_MG_BL_Series-1001_Image-1003-0.png', 'hcs_003-001354_003-001354_MG_BL_Series-1001_Image-1003-0.png', 'hcs_003-001367_003-001367_MG_BL_Series-1001_Image-1001-0.png', 'hcs_003-001852_003-001852_MG_BL_Series-1001_Image-1002-0.png', 'hcs_003-001852_003-001852_MG_BL_Series-1001_Image-1004-0.png', 'hcs_003-001862_003-

Extracting features without lesion mask: 100%|██████████| 2794/2794 [06:32<00:00,  7.11file/s]


Feature extraction completed. Data saved to 'features_256_full_mask.csv' file.


Extracting features with lesion mask: 100%|██████████| 2794/2794 [05:23<00:00,  8.65file/s]


Feature extraction completed. Data saved to 'features_256_lesion_mask.csv' file.

 Starting processing for scale: 512



Processing masks: 100%|██████████| 2734/2734 [00:12<00:00, 223.32file/s]


List of entirely white images processed: ['auth_001-000084_001-000084_MG_BL_Series-3_Image-1-0.png', 'uns_005-000002_005-000002_MG_TP2_Series-71300000_Image-72-0.png', 'uns_005-000002_005-000002_MG_TP3_Series-71300000_Image-48-0.png', 'uns_005-000003_005-000003_MG_TP2_Series-1_Image-49-0.png', 'uns_005-000005_005-000005_MG_TP3_1_Series-1_Image-162-0.png']


Extracting features without lesion mask: 100%|██████████| 2734/2734 [19:39<00:00,  2.32file/s]


Feature extraction completed. Data saved to 'features_512_full_mask.csv' file.


Extracting features with lesion mask: 100%|██████████| 2734/2734 [13:25<00:00,  3.40file/s]


Feature extraction completed. Data saved to 'features_512_lesion_mask.csv' file.
