# 3D Radiomic Feature Extraction
## Extract radiomics features from 3D medical imaging volumes

This notebook extracts radiomic features from 3D medical images:
- **Input:** Raw 3D NIfTI volumes from imagesTr/labelsTr
- **Features:** Shape, first-order (intensity), and texture (GLCM, GLRLM, GLSZM, NGTDM, GLDM) radiomic features
- **Output:** Feature matrices saved as pickle and CSV
- **Attention Mask:** All non-background labels treated as single region of interest

In [None]:
import nibabel as nib
import numpy as np
import pandas as pd
from pathlib import Path
import radiomics
from radiomics import featureextractor
import logging
import warnings
import pickle
from collections import defaultdict
from tqdm import tqdm
import json

# Keep the notebook output quiet: only ERROR-level (or higher) records from
# the 'radiomics' logger are let through, and every Python warning is hidden.
logging.getLogger('radiomics').setLevel(logging.ERROR)
warnings.filterwarnings('ignore')

# Printing the version doubles as a check that the import above worked.
print("PyRadiomics version: {}".format(radiomics.__version__))
print("Radiomics successfully imported!")

## 1. Configuration

In [None]:
# Paths: images and labels are read from the nested imagesTr/labelsTr folders
# under BASE_DIR; all results are written to BASE_DIR/radiomics_3d.
BASE_DIR = Path('C:/FeatureEx')
IMAGES_DIR = BASE_DIR.joinpath('imagesTr', 'imagesTr')
LABELS_DIR = BASE_DIR.joinpath('labelsTr', 'labelsTr')
OUTPUT_DIR = BASE_DIR.joinpath('radiomics_3d')
# Only the final folder is created; BASE_DIR itself has to exist already.
OUTPUT_DIR.mkdir(exist_ok=True)

# Settings
# Label values that count as structures; 0 is treated as background.
STRUCTURE_CLASSES = list(range(1, 5))

print(
    "Input directories:",
    f"  Images: {IMAGES_DIR}",
    f"  Labels: {LABELS_DIR}",
    f"Output directory: {OUTPUT_DIR}",
    "\nExtraction settings:",
    f"  Structure classes: {STRUCTURE_CLASSES}",
    "  Attention mask: All non-background labels combined into single ROI",
    sep="\n",
)

## 2. Initialize Radiomics Extractor

In [None]:
# Parameter dictionary describing the intended 3D radiomics configuration.
# Every requested feature class maps to an empty dict (no per-class options).
_FEATURE_CLASS_NAMES = (
    'shape',
    'firstorder',
    'glcm',
    'glrlm',
    'glszm',
    'ngtdm',
    'gldm',
)

params = dict(
    binWidth=25,                  # fixed bin width used for gray-level discretisation
    resampledPixelSpacing=None,   # None: keep the original voxel spacing
    interpolator='sitkBSpline',
    label=1,                      # mask value that marks the region of interest
    imageType={
        'Original': {},
        'Wavelet': {'wavelet': 'coif1'},
    },
    featureClass={class_name: {} for class_name in _FEATURE_CLASS_NAMES},
)

print(
    "Radiomics parameters configured:",
    f"  Bin width: {params['binWidth']}",
    f"  Image types: {list(params['imageType'])}",
    f"  Feature classes: {list(params['featureClass'])}",
    sep="\n",
)

## 3. Get File Pairs

In [None]:
# Get matching image-label pairs.
#
# Files are paired by their name with the NIfTI extension removed. Path.stem
# only strips the last suffix ('case.nii.gz' -> 'case.nii'), so it cannot pair
# 'case.nii' with 'case.nii.gz', and the '*.nii*' glob also picks up
# non-NIfTI neighbours such as 'case.nii.json'. Both are handled explicitly.
_NIFTI_SUFFIXES = ('.nii.gz', '.nii')


def _nifti_case_id(path):
    """Return the file name without its .nii/.nii.gz suffix, or None if it has neither."""
    lowered = path.name.lower()
    for suffix in _NIFTI_SUFFIXES:
        if lowered.endswith(suffix):
            return path.name[:-len(suffix)]
    return None


def _index_nifti_files(directory):
    """Map case id -> path for every NIfTI file directly inside *directory*."""
    index = {}
    for path in sorted(directory.glob('*.nii*')):
        case_id = _nifti_case_id(path)
        if case_id is None:
            continue  # matched the glob but is not a NIfTI file
        # Keep the first file for a case id so duplicates resolve deterministically.
        index.setdefault(case_id, path)
    return index


# Create mapping
image_map = _index_nifti_files(IMAGES_DIR)
label_map = _index_nifti_files(LABELS_DIR)
image_files = sorted(image_map.values())
label_files = sorted(label_map.values())

# Find matching pairs
matching_pairs = set(image_map) & set(label_map)
file_pairs = [(image_map[name], label_map[name]) for name in sorted(matching_pairs)]

print(f"File pairs found: {len(file_pairs)}")
print(f"\nFirst 5 pairs:")
for img_path, lbl_path in file_pairs[:5]:
    print(f"  {img_path.name} <-> {lbl_path.name}")

## 4. Helper Functions for Combined-ROI Masking

In [None]:
def create_aoi_mask(label_data, structure_classes):
    """
    Build the combined region-of-interest mask from a label volume.

    Every voxel whose label value appears in ``structure_classes`` is marked
    with 1; all remaining voxels are marked with 0.

    Args:
        label_data: Label array (array-like)
        structure_classes: Label values that belong to the region of interest

    Returns:
        uint8 array with the same shape as ``label_data`` holding 0/1 values
    """
    # Boolean membership test per voxel, then stored as 0/1 bytes.
    belongs_to_roi = np.isin(label_data, structure_classes)
    return belongs_to_roi.astype(np.uint8)

def validate_mask(mask_data, min_voxels=100):
    """
    Check that a mask contains enough non-zero voxels.

    Args:
        mask_data: Mask array (array-like); non-zero entries are counted
        min_voxels: Smallest non-zero voxel count that is still accepted

    Returns:
        True when the non-zero count reaches ``min_voxels``, otherwise False
    """
    roi_size = np.count_nonzero(mask_data)
    is_large_enough = min_voxels <= roi_size
    return is_large_enough

print("Mask helper functions defined.")

## 5. Radiomics Extraction Function

In [None]:
def extract_features(image_path, label_path, extractor, structure_classes):
    """
    Extract radiomics features for the combined area of interest.

    All voxels whose label is listed in ``structure_classes`` are merged into
    one binary mask. The mask is written to a private temporary directory and
    handed to ``extractor.execute`` together with the image path.

    Args:
        image_path: Path to image NIfTI
        label_path: Path to label NIfTI
        extractor: RadiomicsFeatureExtractor instance
        structure_classes: List of structure class labels

    Returns:
        Dictionary of features, or None when the combined mask fails
        ``validate_mask`` or when loading/extraction raises. Failures caused
        by an exception are logged as warnings before None is returned.
    """
    # Local import keeps this cell self-contained.
    import tempfile

    try:
        # Only the label volume is needed in memory; the extractor reads the
        # image itself from ``image_path``.
        label_nib = nib.load(label_path)
        label_array = label_nib.get_fdata()

        # Create binary mask for all non-background structures
        mask_array = create_aoi_mask(label_array, structure_classes)

        # Validate mask
        if not validate_mask(mask_array):
            return None

        mask_nib = nib.Nifti1Image(mask_array.astype(np.uint8), label_nib.affine)

        # A per-call temporary directory works on every platform, cannot
        # collide with another run, and is removed even if extraction raises.
        with tempfile.TemporaryDirectory(prefix='radiomics_mask_') as tmp_dir:
            mask_path = Path(tmp_dir) / 'temp_mask.nii.gz'
            nib.save(mask_nib, mask_path)
            features = extractor.execute(str(image_path), str(mask_path))

        return dict(features)

    except Exception as exc:
        # Best-effort batch processing: report the reason, then let the caller
        # record this sample as failed.
        logging.getLogger(__name__).warning(
            "Feature extraction failed for %s: %s", image_path, exc
        )
        return None

print("Feature extraction function defined.")

## 6. Extract Features from All Images

In [None]:
# Initialize the extractor from the `params` dictionary configured above.
# A bare RadiomicsFeatureExtractor() would silently ignore that configuration
# (in particular the Wavelet image type). Plain settings are passed as keyword
# overrides; image types and feature classes go through the enable* methods.
# NOTE: enabling Wavelet adds filtered images, so extraction takes noticeably
# longer and yields many more features than the Original image type alone.
_NON_SETTING_KEYS = ('imageType', 'featureClass')
extractor = featureextractor.RadiomicsFeatureExtractor(
    **{key: value for key, value in params.items() if key not in _NON_SETTING_KEYS}
)
extractor.enableImageTypes(**params['imageType'])
extractor.disableAllFeatures()
for feature_class_name in params['featureClass']:
    extractor.enableFeatureClassByName(feature_class_name)

# Initialize storage
all_features = []      # one feature dict per successfully processed sample
feature_names = None   # keys of the first successful extraction
sample_ids = []        # sample names aligned with all_features
extraction_log = []    # one status entry per processed pair

print(f"Starting feature extraction from {len(file_pairs)} image-label pairs...\n")

# Extract features from each pair
for img_path, lbl_path in tqdm(file_pairs):
    sample_name = img_path.stem

    # Extract features from combined ROI (all non-background structures)
    features_dict = extract_features(img_path, lbl_path, extractor, STRUCTURE_CLASSES)
    extracted = bool(features_dict)

    if extracted:
        # Store features
        all_features.append(features_dict)
        sample_ids.append(sample_name)

        # Extract feature names from first successful extraction
        if feature_names is None:
            feature_names = list(features_dict.keys())

    extraction_log.append({
        'sample': sample_name,
        'status': 'success' if extracted else 'failed',
        'features_extracted': extracted,
    })

# Guard the percentage so an empty input folder reports 0% instead of raising.
success_rate_pct = len(all_features) / len(file_pairs) * 100 if file_pairs else 0.0

print(f"\nFeature extraction complete!")
print(f"Total samples with extracted features: {len(all_features)}")
print(f"Success rate: {success_rate_pct:.1f}%")
print(f"Total feature types: {len(feature_names) if feature_names else 0}")

## 7. Convert to DataFrames

In [None]:
# Create DataFrame from extracted features.
# `features_df` is defined on both branches because later cells read it even
# when no sample could be processed.
if all_features:
    features_df = pd.DataFrame(all_features)
    features_df['sample_id'] = sample_ids

    print(f"Features DataFrame created:")
    print(f"  Samples: {len(features_df)}")
    print(f"  Features: {len(feature_names) if feature_names else 0}")
    print(f"  Total columns: {len(features_df.columns)}")
    print(f"  Shape: {features_df.shape}")
    print(f"\nFirst 5 feature columns: {list(features_df.columns[:5])}")
else:
    # Empty placeholder with the same identifying column.
    features_df = pd.DataFrame(columns=['sample_id'])
    print("No features were successfully extracted!")

## 8. Combine and Normalize Features

In [None]:
from sklearn.preprocessing import StandardScaler

# This notebook extracts ONE feature vector per sample from the combined ROI,
# so there are no per-class frames to stack (the previous version of this cell
# read an undefined `feature_dfs` and always raised NameError). The cell now
# z-score normalises the numeric radiomic features of `features_df`.


def _to_float_or_nan(value):
    """Convert a feature value to float; anything non-numeric becomes NaN."""
    try:
        return float(value)
    except (TypeError, ValueError):
        return float('nan')


combined_df = None          # normalised feature matrix (None if unavailable)
sample_ids_combined = []    # sample ids aligned with the rows of combined_df

if all_features:
    # PyRadiomics prefixes provenance/diagnostic entries with 'diagnostics_';
    # they are not radiomic features and are often strings, tuples or dicts.
    radiomic_columns = [
        column for column in features_df.columns
        if column != 'sample_id' and not str(column).startswith('diagnostics_')
    ]
    numeric_df = features_df[radiomic_columns].apply(
        lambda column: column.map(_to_float_or_nan)
    )
    # Drop columns that hold no numeric value at all.
    numeric_df = numeric_df.dropna(axis=1, how='all')

    if numeric_df.shape[1] > 0:
        scaled_values = StandardScaler().fit_transform(numeric_df.to_numpy(dtype=float))
        combined_df = pd.DataFrame(scaled_values, columns=numeric_df.columns)
        sample_ids_combined = list(sample_ids)
        combined_df.insert(0, 'sample_id', sample_ids_combined)

        print(f"Normalized feature matrix:")
        print(f"  Shape: {combined_df.shape}")
        print(f"  Samples: {len(combined_df)}")
        print(f"  Features: {numeric_df.shape[1]}")

if combined_df is None:
    print("No numeric features available to normalize.")

## 9. Feature Statistics and Summary

In [None]:
# Feature statistics


def _feature_value_as_float(value):
    """Convert a feature value to float; anything non-numeric becomes NaN."""
    try:
        return float(value)
    except (TypeError, ValueError):
        return float('nan')


if feature_names:
    print(f"Feature statistics:")
    print(f"\nTop 10 features by variance:")

    # Variance is only meaningful for numeric radiomic features. Entries
    # prefixed with 'diagnostics_' (versions, spacing, configuration, ...) are
    # excluded and the remaining values are coerced to float first.
    radiomic_names = [name for name in feature_names if not name.startswith('diagnostics_')]
    if radiomic_names:
        numeric_features = features_df[radiomic_names].apply(
            lambda column: column.map(_feature_value_as_float)
        )
        feature_variance = numeric_features.var()
    else:
        feature_variance = pd.Series(dtype=float)
    top_features = feature_variance.nlargest(10)

    for idx, (feat_name, variance) in enumerate(top_features.items(), 1):
        print(f"  {idx}. {feat_name}: {variance:.4f}")

    # PyRadiomics names features '<imageType>_<featureClass>_<featureName>',
    # so the feature class is the SECOND underscore-separated component.
    feature_classes = {
        parts[1]
        for parts in (name.split('_') for name in radiomic_names)
        if len(parts) >= 3
    }

    print(f"\nFeature summary:")
    print(f"  Total feature types: {len(feature_names)}")
    print(f"  Total radiomic features: {len(feature_names) * len(features_df)}")
    print(f"  Feature classes: {len(feature_classes)}")

## 10. Save Features to Files

In [None]:
# Write the feature table to CSV: once with the sample_id column and once
# restricted to the feature columns. Nothing is written when no sample was
# extracted.
if not all_features:
    print("No features to save!")
else:
    # Full table, including sample_id
    output_csv = OUTPUT_DIR.joinpath('radiomics_3d_features.csv')
    features_df.to_csv(output_csv, index=False)
    print("Saved features: {}".format(output_csv.name))

    # Feature columns only, for feature analysis
    output_features_only = OUTPUT_DIR.joinpath('radiomics_3d_features_only.csv')
    feature_columns_only = features_df[feature_names]
    feature_columns_only.to_csv(output_features_only, index=False)
    print("Saved features (no IDs): {}".format(output_features_only.name))

## 11. Save Features as Pickle (for sklearn compatibility)

In [None]:
# Save as pickle for later use with sklearn
pickle_data = {
    'features_df': features_df,
    'feature_names': feature_names,
    'sample_ids': sample_ids,
    'metadata': {
        'total_samples': len(file_pairs),
        'successful_extractions': len(all_features),
        'success_rate': len(all_features) / len(file_pairs) * 100,
        'total_features': len(feature_names) if feature_names else 0,
        'roi_type': 'combined_all_structures',
        'structure_classes': STRUCTURE_CLASSES,
        'extraction_log': extraction_log
    }
}

pickle_path = OUTPUT_DIR / 'radiomics_3d_features.pkl'
with open(pickle_path, 'wb') as f:
    pickle.dump(pickle_data, f)

print(f"\nSaved pickle: {pickle_path.name}")
print(f"Pickle contains:")
print(f"  - Features DataFrame")
print(f"  - Feature names")
print(f"  - Sample IDs")
print(f"  - Metadata and extraction log")

## 12. Create Configuration Summary

In [None]:
# Human-readable summary of the run, written next to the feature files.
# The success rate is guarded against an empty input set, and the preview
# size is derived from the names actually listed instead of a fixed 10.
success_rate_pct = len(all_features) / len(file_pairs) * 100 if file_pairs else 0.0
feature_names_preview = feature_names[:10] if feature_names else []
total_feature_count = len(feature_names) if feature_names else 0

config_summary = {
    'extraction_info': {
        'total_samples': len(file_pairs),
        'successful_extractions': len(all_features),
        'success_rate': f"{success_rate_pct:.1f}%",
        'extraction_log_path': str(OUTPUT_DIR / 'extraction_log.json')
    },
    'region_of_interest': {
        'roi_type': 'combined_all_structures',
        'description': 'All non-background labels (1-4) treated as single attention mask',
        'structure_classes': STRUCTURE_CLASSES
    },
    'features': {
        'total_feature_types': total_feature_count,
        'feature_names_sample': feature_names_preview,
        'num_features_in_sample': len(feature_names_preview),
        # Number of feature names left out of the preview above.
        'total_features_truncated': total_feature_count - len(feature_names_preview)
    },
    'output_files': {
        'features_csv': 'radiomics_3d_features.csv',
        'features_only_csv': 'radiomics_3d_features_only.csv',
        'pickle': 'radiomics_3d_features.pkl',
        'config_summary': 'radiomics_3d_config.json',
        'extraction_log': 'extraction_log.json'
    }
}

# Save config
config_path = OUTPUT_DIR / 'radiomics_3d_config.json'
with open(config_path, 'w') as f:
    json.dump(config_summary, f, indent=2)

print(f"Configuration summary:")
for key, val in config_summary.items():
    print(f"\n{key}:")
    if isinstance(val, dict):
        for k, v in val.items():
            print(f"  {k}: {v}")
    else:
        print(f"  {val}")

## 13. Save Extraction Log

In [None]:
# Save detailed extraction log (one entry per processed image-label pair)
log_path = OUTPUT_DIR / 'extraction_log.json'
with open(log_path, 'w') as f:
    json.dump(extraction_log, f, indent=2)

print(f"Extraction log saved to: {log_path.name}")

# Print summary
processed = len(extraction_log)
successful = sum(1 for entry in extraction_log if entry['features_extracted'])
# An empty log (no input pairs) reports 0% instead of dividing by zero.
log_success_rate = (successful / processed) * 100 if processed else 0.0

print(f"\nExtraction summary:")
print(f"  Total samples processed: {processed}")
print(f"  Successful extractions: {successful}")
print(f"  Failed extractions: {processed - successful}")
print(f"  Success rate: {log_success_rate:.1f}%")

In [None]:
# Test loading the pickle file.
# NOTE: pickle.load executes arbitrary code from the file; this is acceptable
# here only because the file was written by this notebook a moment ago.
with open(pickle_path, 'rb') as f:
    loaded_data = pickle.load(f)

# 'feature_names' is stored as None when no sample was extracted.
loaded_feature_names = loaded_data['feature_names'] or []

print(f"Verification - Loaded pickle contents:")
print(f"  Keys: {list(loaded_data.keys())}")
print(f"  DataFrame shape: {loaded_data['features_df'].shape}")
print(f"  Number of features: {len(loaded_feature_names)}")
print(f"  Number of samples: {len(loaded_data['sample_ids'])}")
print(f"  ROI type: {loaded_data['metadata']['roi_type']}")
print(f"  Success rate: {loaded_data['metadata']['success_rate']:.1f}%")

print(f"\nDataFrame preview:")
print(loaded_data['features_df'].head(3))

In [None]:
# Example: Using extracted features for analysis
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report


def _coerce_feature_value(value):
    """Convert a feature value to float; anything non-numeric becomes NaN."""
    try:
        return float(value)
    except (TypeError, ValueError):
        return float('nan')


print("Example: Feature analysis with extracted radiomic features\n")

if loaded_data['features_df'] is not None and loaded_data['feature_names']:
    df = loaded_data['features_df']
    feature_names_list = loaded_data['feature_names']

    print(f"Features extracted from combined ROI (all non-background structures):")
    print(f"  Total samples: {len(df)}")
    print(f"  Total features: {len(feature_names_list)}")

    # The stored columns mix radiomic values with 'diagnostics_' entries
    # (strings, tuples, dicts). Aggregations are computed on the radiomic
    # columns only, after coercing their values to float.
    radiomic_names = [name for name in feature_names_list if not name.startswith('diagnostics_')]
    numeric_features = (
        df[radiomic_names]
        .apply(lambda column: column.map(_coerce_feature_value))
        .dropna(axis=1, how='all')
    )

    if numeric_features.shape[1] > 0:
        # Basic statistics
        print(f"\nFeature statistics:")
        print(f"  Min values: {numeric_features.min().min():.4f}")
        print(f"  Max values: {numeric_features.max().max():.4f}")
        print(f"  Mean values: {numeric_features.mean().mean():.4f}")
        print(f"  Std values: {numeric_features.std().mean():.4f}")

        print(f"\nTop 10 features by variance:")
        feature_var = numeric_features.var()
        for i, (feat, var) in enumerate(feature_var.nlargest(10).items(), 1):
            print(f"  {i}. {feat}: {var:.6f}")
    else:
        print("\nNo numeric radiomic features available for statistics.")

In [None]:
# Final run report: ROI configuration, extraction results, generated files
# and suggested next steps.
print("\n" + "="*70)
print("3D RADIOMIC FEATURE EXTRACTION - COMPLETE")
print("="*70)

print(f"\nOutput Directory: {OUTPUT_DIR}")
print(f"\nROI Configuration:")
print(f"  Type: Combined all structures (attention mask)")
print(f"  Description: All non-background labels (1-4) as single region")
print(f"  Classes: {STRUCTURE_CLASSES}")

# Guarded so a run without any input pairs still prints its report.
final_success_rate = len(all_features) / len(file_pairs) * 100 if file_pairs else 0.0

print(f"\nExtraction Results:")
print(f"  Total samples: {len(file_pairs)}")
print(f"  Successfully extracted: {len(all_features)}")
print(f"  Success rate: {final_success_rate:.1f}%")
print(f"  Features per sample: {len(feature_names) if feature_names else 0}")

print(f"\nGenerated Files:")
output_files = list(OUTPUT_DIR.glob('*'))
for file in sorted(output_files):
    if file.is_file():
        size_mb = file.stat().st_size / (1024*1024)
        print(f"  - {file.name} ({size_mb:.2f} MB)")

print(f"\nNext Steps:")
# The path is printed with forward slashes so the snippet stays valid when
# pasted into Python on Windows (backslashes would be read as escapes).
print(f"  1. Load features: pickle.load(open('{pickle_path.as_posix()}', 'rb'))")
print(f"  2. Access DataFrame: loaded_data['features_df']")
print(f"  3. Get features: loaded_data['features_df'][loaded_data['feature_names']]")
print(f"  4. Combine with CNN features from ResNet3D_Classification.ipynb")
print(f"  5. Perform multi-modal classification and fusion analysis")