# Feature extraction for the Cox model
"""
This script extracts the intensity and area features of Breast Arterial Calcification (BAC)
for use in a Cox proportional hazards model. The BAC count is used as a continuous variable, and the area
(the fraction of mask pixels that are non-zero) is categorized into three grades (mild, moderate, severe),
with BAC absence as the reference category.

- BAC_Absence: 0 if area <= 0.0025
- Mild: 1 if 0.0025 < area <= 0.01
- Moderate: 2 if 0.01 < area <= 0.025
- Severe: 3 if area > 0.025

In our data:
- PID represents a unique case of multiple mammograms belonging to one woman.
- File name represents the mammogram image.

The script processes mask images to extract these features and saves the results in an Excel file.
"""


In [None]:
# Standard library imports
import os
import random
import warnings

# Third-party imports
import numpy as np
import pandas as pd
from keras import backend as K
from PIL import Image

# Setting up matplotlib to work interactively in a Jupyter environment
%matplotlib inline

# Setting the seed for reproducibility
# NOTE(review): nothing in the visible part of this script draws random
# numbers; the seeds presumably matter for other cells — confirm before removing.
seed_value = 42
random.seed(seed_value)  # For Python's built-in random module
np.random.seed(seed_value)  # For NumPy

# Don't show warning messages
# NOTE: with no category given, this silences every warning for the rest of
# the session, including any deprecation warnings raised by libraries.
warnings.filterwarnings('ignore')


In [None]:
# Define the paths (placeholders: replace with real directories before running)
input_data_path = r'< PATH_TO_INPUT >'                   # original mammogram images
output_mask_path_orig_size = r'< PATH_TO_OUTPUT_MASK >'  # BAC masks, same file names as the inputs
new_intensity_path = r'< Path_to_intensity >'            # where the masked images are written

# Ensure the New Intensity folder exists. exist_ok=True makes this a single
# call that succeeds when the folder is already there, avoiding the
# check-then-create race of os.path.exists() followed by os.makedirs().
os.makedirs(new_intensity_path, exist_ok=True)

# Define the map_area_to_grades function
def map_area_to_grades(average_area):
    """Map a BAC area fraction to an ordinal severity grade.

    Returns 0 (BAC absence) for area <= 0.0025, 1 (mild) for
    0.0025 < area <= 0.01, 2 (moderate) for 0.01 < area <= 0.025,
    and 3 (severe) otherwise.
    """
    # Inclusive upper bounds of grades 0, 1 and 2, in ascending order
    upper_bounds = (0.0025, 0.01, 0.025)
    for grade, upper_bound in enumerate(upper_bounds):
        if average_area <= upper_bound:
            return grade
    # Not within any bound: severe
    return len(upper_bounds)


# Column order of the exported spreadsheet. Passing it explicitly also keeps
# the header row when no image/mask pair could be processed.
result_columns = ['Image Name', 'PID', 'Intensity Value', 'BAC_Count', 'Area',
                  'BAC_Absence', 'Mild', 'Moderate', 'Severe']

# One dict per processed image. Rows are collected in a plain list and turned
# into a DataFrame once at the end: DataFrame.append was removed in pandas 2.0
# and copied the whole frame on every call.
result_rows = []

# Iterate over the mask images to process. Sorting makes the row order
# reproducible, since os.listdir returns entries in arbitrary order.
for file_name in sorted(os.listdir(output_mask_path_orig_size)):
    input_file_path = os.path.join(input_data_path, file_name)
    mask_file_path = os.path.join(output_mask_path_orig_size, file_name)
    result_file_path = os.path.join(new_intensity_path, file_name)

    # Skip masks that have no corresponding input image
    if not os.path.exists(input_file_path):
        continue

    try:
        # Load both images as 8-bit grayscale. The context managers close the
        # underlying files once the pixel data has been copied into arrays.
        with Image.open(input_file_path) as input_img:
            input_array = np.array(input_img.convert('L'))
        with Image.open(mask_file_path) as mask_img:
            mask_array = np.array(mask_img.convert('L'))

        # A mask of a different size cannot be applied pixel by pixel;
        # report it and move on instead of aborting the whole run.
        if input_array.shape != mask_array.shape:
            print(f"Error processing file {file_name}: image shape {input_array.shape} "
                  f"does not match mask shape {mask_array.shape}")
            continue

        # Binarise the mask the same way the count/area features do (> 0).
        # Multiplying the raw uint8 arrays would wrap around modulo 256 for
        # any mask value above 1 (e.g. masks stored as 0/255).
        bac_pixels = mask_array > 0
        multiplied_array = input_array * bac_pixels  # uint8 * bool stays uint8

        # Save the masked image
        Image.fromarray(multiplied_array).save(result_file_path)
    except IOError as e:
        print(f"Error processing file {file_name}: {e}")
        continue

    # Calculate intensity, BAC_Count and area. The intensity sum uses an
    # explicit 64-bit accumulator so it cannot overflow on large images.
    intensity_value = int(multiplied_array.sum(dtype=np.int64))
    BAC_Count = int(bac_pixels.sum())
    area = BAC_Count / float(mask_array.size)

    # Map area to grades
    grade = map_area_to_grades(area)

    # Extract PID from file name (its first 8 characters)
    pid = file_name[:8]

    # One-hot encode the grade: BAC absence, mild, moderate, severe
    result_rows.append({
        'Image Name': file_name,
        'PID': pid,
        'Intensity Value': intensity_value,
        'BAC_Count': BAC_Count,
        'Area': area,
        'BAC_Absence': int(grade == 0),
        'Mild': int(grade == 1),
        'Moderate': int(grade == 2),
        'Severe': int(grade == 3),
    })

# DataFrame holding the results
results_df = pd.DataFrame(result_rows, columns=result_columns)

# Save results to Excel
results_excel_path = r'path_to_your_file.xlsx'
results_df.to_excel(results_excel_path, index=False)
