## Feature Extraction

In [9]:
import os
import cv2
import shutil
import logging
import warnings
import radiomics
import numpy as np
import pandas as pd

from tqdm import tqdm
from dotenv import load_dotenv
from radiomics import featureextractor

# Load environment variables from the .env file. The return value is bound to `_`
# so that this last statement of the cell does not echo it as cell output.
_ = load_dotenv()

In [10]:
# Retrieve environment variables, failing fast with a clear message when any is
# missing or empty: a None value would make os.path.join raise an opaque TypeError,
# and an empty PROCESSED_MASK_DIR_NAME would make processed_mask_dir resolve to the
# base data directory, which the rmtree below would then delete.
_REQUIRED_ENV_VARS = ('BASE_DIR_NAME', 'MASK_DIR_NAME', 'IMAGE_DIR_NAME', 'PROCESSED_MASK_DIR_NAME')
_missing_env_vars = [var_name for var_name in _REQUIRED_ENV_VARS if not os.getenv(var_name)]
if _missing_env_vars:
    raise EnvironmentError(f"Missing or empty environment variables: {', '.join(_missing_env_vars)}")

BASE_DIR_NAME, MASK_DIR_NAME, IMAGE_DIR_NAME = os.getenv('BASE_DIR_NAME'), os.getenv('MASK_DIR_NAME'), os.getenv('IMAGE_DIR_NAME')
PROCESSED_MASK_DIR_NAME = os.getenv('PROCESSED_MASK_DIR_NAME')

# Define the image and mask directories
image_dir = os.path.join(os.getcwd(), BASE_DIR_NAME, IMAGE_DIR_NAME)
mask_dir = os.path.join(os.getcwd(), BASE_DIR_NAME, MASK_DIR_NAME)
processed_mask_dir = os.path.join(os.getcwd(), BASE_DIR_NAME, PROCESSED_MASK_DIR_NAME)

# Refuse to wipe a directory that holds source data: the processed-mask directory
# is deleted and recreated below, so it must not coincide with the base, image or
# raw-mask directories.
_protected_dirs = {
    os.path.normpath(os.path.join(os.getcwd(), BASE_DIR_NAME)),
    os.path.normpath(image_dir),
    os.path.normpath(mask_dir),
}
if os.path.normpath(processed_mask_dir) in _protected_dirs:
    raise ValueError(f"processed_mask_dir must be a dedicated directory, got: {processed_mask_dir}")

# Recreate the 'processed_mask' directory if it exists, or create it if it doesn't
if os.path.exists(processed_mask_dir):
    shutil.rmtree(processed_mask_dir)
os.makedirs(processed_mask_dir)

### Converting masks

In [11]:
def add_black_border(image: np.ndarray, border_thickness: int = 1) -> None:
    """Turn `image`, in place, into a white image framed by a black border.

    The whole array is first set to 0 (black) and then every pixel that lies at
    least `border_thickness` pixels away from each edge is set to 255 (white).
    The previous contents of `image` are discarded.

    Args:
        image: Array whose first two axes are (rows, columns); any further axes
            (e.g. colour channels) are written uniformly. Modified in place.
        border_thickness: Width of the black frame in pixels on every side.

    Raises:
        ValueError: If `border_thickness` is negative.
    """
    if border_thickness < 0:
        raise ValueError("border_thickness must be non-negative")

    height, width = image.shape[:2]

    # Start from an all-black canvas so the frame is black whatever the input held
    # (an entirely white input would otherwise keep a white frame).
    image[...] = 0

    # Paint the interior white. Slice ends are exclusive, so the last
    # `border_thickness` rows/columns stay black as well as the first ones;
    # when the border is too thick for the image the slice is empty and the
    # image simply stays black.
    image[border_thickness:height - border_thickness, border_thickness:width - border_thickness] = 255

In [12]:
def process_masks(mask_dir: str, processed_mask_dir: str, verbose: bool = False) -> None:
    """Write border-corrected copies of the PNG masks in `mask_dir` to `processed_mask_dir`.

    Handling depends on the file name suffix:
      * ``*1.png`` (images without tumor): the mask is blanked and redrawn through
        `add_black_border`, then saved.
      * ``*0.png`` (images with tumor): the mask is saved as read, unless every
        pixel equals 255, in which case it is blanked and redrawn through
        `add_black_border` first.
      * any other ``.png``: nothing is written.

    Files that `cv2.imread` cannot decode are skipped and reported in a warning.

    Args:
        mask_dir: Directory containing the source mask PNG files.
        processed_mask_dir: Existing directory that receives the processed masks
            under the same file names.
        verbose: When True, print the names of the ``*0.png`` masks that were
            entirely white.
    """

    # Get a list of all PNG files in the mask directory
    mask_files = [f for f in os.listdir(mask_dir) if f.endswith(".png")]
    entirely_white_images, unreadable_masks = [], []

    # Iterate over all mask files
    for mask_filename in tqdm(mask_files, desc = "Processing masks", unit = "file"):
        mask_path = os.path.join(mask_dir, mask_filename)
        processed_mask_path = os.path.join(processed_mask_dir, mask_filename)

        # Read the mask image into a numpy array; imread signals failure by
        # returning None instead of raising, so check before touching the array
        image = cv2.imread(mask_path)
        if image is None:
            unreadable_masks.append(mask_filename)
            continue

        # Images without tumor: blank the mask first, exactly as the entirely
        # white tumor masks below, so the border is black rather than left as read
        if mask_filename.endswith("1.png"):
            image.fill(0)
            add_black_border(image)

        # Images with tumor: only entirely white masks need the border
        elif mask_filename.endswith("0.png"):
            if (image == 255).all():
                entirely_white_images.append(mask_filename)
                image.fill(0)
                add_black_border(image)

        # Any other suffix is not a recognised mask and is not copied
        else:
            continue

        # Save the processed mask
        cv2.imwrite(processed_mask_path, image)

    if unreadable_masks:
        warnings.warn(f"Skipped {len(unreadable_masks)} unreadable mask file(s): {unreadable_masks}", stacklevel = 2)

    if verbose and entirely_white_images:
        print("\nList of entirely white images processed:", entirely_white_images)

In [13]:
# Build the processed masks from the raw masks, without the verbose report
process_masks(mask_dir = mask_dir, processed_mask_dir = processed_mask_dir, verbose = False)

Processing masks: 100%|██████████| 2734/2734 [00:11<00:00, 235.86file/s]


### Feature extraction

In [14]:
# Keep the notebook output quiet: pyradiomics logs only at ERROR level, and
# RuntimeWarnings plus the "Shape features are only available" warning are ignored
radiomics.logger.setLevel(logging.ERROR)
warnings.filterwarnings(action = "ignore", category = RuntimeWarning)
warnings.filterwarnings(action = "ignore", message = "Shape features are only available")

# Build the feature extractor, enable every feature class and set force2D
extractor = featureextractor.RadiomicsFeatureExtractor()
extractor.enableAllFeatures()
extractor.settings.update(force2D = True)

# Show the resulting configuration
print("Extractor parameters:", extractor.enabledFeatures)
print('Extractor settings:', extractor.settings)

Extractor parameters: {'firstorder': [], 'glcm': [], 'gldm': [], 'glrlm': [], 'glszm': [], 'ngtdm': [], 'shape': [], 'shape2D': []}
Extractor settings: {'minimumROIDimensions': 2, 'minimumROISize': None, 'normalize': False, 'normalizeScale': 1, 'removeOutliers': None, 'resampledPixelSpacing': None, 'interpolator': 'sitkBSpline', 'preCrop': False, 'padDistance': 5, 'distances': [1], 'force2D': True, 'force2Ddimension': 0, 'resegmentRange': None, 'label': 1, 'additionalInfo': True}


In [15]:
def extract_features(image_dir: str, mask_dir: str, extractor: featureextractor.RadiomicsFeatureExtractor) -> tuple[pd.DataFrame, list[tuple[str, str]]]:
    """Extract radiomics features for every PNG image in `image_dir`.

    For each image, the mask with the same file name is passed to
    `extractor.execute` with label 255, and the returned features are stored as
    one row together with metadata parsed from the file name (`name`,
    `provider`, `patient`, `class`).

    Args:
        image_dir: Directory containing the PNG images.
        mask_dir: Currently unused; see the review note in the loop body.
        extractor: Configured pyradiomics feature extractor.

    Returns:
        A tuple ``(features_df, errors)``: `features_df` has one row per image
        that was processed successfully (empty when there is none), and
        `errors` is a list of ``(filename, error message)`` pairs for the
        images that raised during extraction or file-name parsing.
    """

    # Get a list of all PNG files in the image directory
    image_files = [f for f in os.listdir(image_dir) if f.endswith(".png")]
    feature_records, errors = [], []

    # Iterate over all image files
    for filename in tqdm(image_files, desc = "Extracting features", unit = "file"):
        try:
            image_path = os.path.join(image_dir, filename)

            # NOTE(review): the mask path is built from the module-level
            # `processed_mask_dir`, not from the `mask_dir` parameter. Kept as is
            # so results do not depend on what callers pass today; switch to
            # `mask_dir` once every caller passes the processed-mask directory.
            processed_mask_path = os.path.join(processed_mask_dir, filename)

            # Extract features for the current image
            features = extractor.execute(image_path, processed_mask_path, label = 255)

            # Collect the features plus the metadata parsed from the filename as
            # one plain record; the DataFrame is built once after the loop
            record = dict(features)
            filename_parts = filename.split("_")
            record['name'] = filename
            record['provider'] = filename_parts[0]
            record['patient'] = filename_parts[1].split("-")[1]
            record['class'] = filename_parts[-1].split(".")[0].split("-")[-1]

            feature_records.append(record)

        except Exception as e:
            # Best effort: remember the failure and carry on with the next file
            errors.append((filename, str(e)))

    # Build the table in one go; an empty record list yields an empty DataFrame
    # instead of the ValueError pd.concat raises on an empty list
    features_df = pd.DataFrame(feature_records)
    return features_df, errors

In [16]:
# extract_features reads the masks from the processed-mask directory, so pass
# that directory rather than the raw mask directory
features_df, errors = extract_features(image_dir, processed_mask_dir, extractor)

Extracting features: 100%|██████████| 2734/2734 [15:41<00:00,  2.90file/s]


### Extracted DataFrame info

In [18]:
# Report the size of the feature table and the files that failed extraction,
# then preview the first row as the cell output
print('Data frame shape is:', features_df.shape)
print("Error processing images:", errors)
features_df.head(n = 1)

Data frame shape is: (2734, 128)


Unnamed: 0,diagnostics_Versions_PyRadiomics,diagnostics_Versions_Numpy,diagnostics_Versions_SimpleITK,diagnostics_Versions_PyWavelet,diagnostics_Versions_Python,diagnostics_Configuration_Settings,diagnostics_Configuration_EnabledImageTypes,diagnostics_Image-original_Hash,diagnostics_Image-original_Dimensionality,diagnostics_Image-original_Spacing,...,original_glszm_ZoneVariance,original_ngtdm_Busyness,original_ngtdm_Coarseness,original_ngtdm_Complexity,original_ngtdm_Contrast,original_ngtdm_Strength,name,provider,patient,class
0,v3.0.1,2.1.0,2.4.0,1.7.0,3.10.9,"{'minimumROIDimensions': 2, 'minimumROISize': ...",{'Original': {}},13f501ab5e4d2d2bee6d5303f0d25f463bef4733,2D,"(1.0, 1.0)",...,8113.29267896797,55.37102726504246,0.004597197237489,0.0107815836433796,0.0001275586289649,0.0045766590389016,auth_001-000061_001-000061_MG_BL_Series-8_Imag...,auth,61,0


### Save dataframe to CSV

In [19]:
# Write the feature table to CSV. The output directory is created first so a
# missing "data" folder does not discard the results of the long extraction run.
output_file_name = os.path.join(os.getcwd(), "data", "extracted_features.csv")
os.makedirs(os.path.dirname(output_file_name), exist_ok = True)
features_df.to_csv(output_file_name, index = False)