In [1]:
import os
import time
import pandas as pd
import numpy as np
import radiomics
from radiomics import featureextractor
from tqdm import tqdm
import concurrent.futures
from pingouin import intraclass_corr

def process_patient(patient_dir, patient_id, extractor, phase):
    """Run the extractor on one patient's image/mask pair for one phase.

    The image and mask are expected at
    ``<patient_dir>/<patient_id>-<phase>-image.nrrd`` and
    ``<patient_dir>/<patient_id>-<phase>-label.nrrd``.

    Returns a dict mapping ``"<phase>_<feature name>"`` to the extracted
    value, with every key starting with ``diagnostics`` left out. Returns
    ``None`` (after printing a message) when either file is missing or the
    extraction raises.
    """
    stem = f"{patient_id}-{phase}"
    image_path = os.path.join(patient_dir, f"{stem}-image.nrrd")
    mask_path = os.path.join(patient_dir, f"{stem}-label.nrrd")

    if not all(os.path.exists(path) for path in (image_path, mask_path)):
        print(f"Missing files: {patient_id} {phase}")
        return None

    try:
        raw_features = extractor.execute(image_path, mask_path)
        prefixed = {}
        for name, value in raw_features.items():
            # Diagnostic entries describe the run, not the lesion; skip them.
            if name.startswith('diagnostics'):
                continue
            prefixed[f"{phase}_{name}"] = value
        return prefixed
    except Exception as e:
        print(f"Error processing {patient_id} {phase}: {str(e)}")
        return None

def calculate_delta_features(df):
    """Append AP-minus-VP difference columns to a feature table.

    For every column named ``AP_<feature>`` that has a matching
    ``VP_<feature>`` column, a new column ``dm_<feature>`` holding
    ``AP_<feature> - VP_<feature>`` is added. AP columns without a VP
    counterpart are skipped.

    Args:
        df: DataFrame whose column names are strings.

    Returns:
        A new DataFrame with the original columns followed by the
        ``dm_`` columns, sharing the index of ``df``.
    """
    ap_prefix = 'AP_'
    delta_data = {}

    for ap_col in df.columns:
        if not ap_col.startswith(ap_prefix):
            continue

        # Strip only the leading prefix. str.replace would also delete any
        # later "AP_" inside the feature name and break the VP lookup.
        feat = ap_col[len(ap_prefix):]
        vp_col = f'VP_{feat}'
        if vp_col not in df.columns:
            continue

        # Calculate difference features only
        delta_data[f'dm_{feat}'] = df[ap_col] - df[vp_col]

    # Build all delta columns first and attach them with a single concat.
    return pd.concat([df, pd.DataFrame(delta_data, index=df.index)], axis=1)

def extract_features_for_directory(directory, output_prefix, config_path, results_dir):
    """Extract AP and VP features for every patient folder and save them.

    Each sub-directory of ``directory`` is treated as one patient, with the
    folder name used as the patient ID. ``process_patient`` is run for the
    'AP' and 'VP' phases of every patient on a thread pool, the per-phase
    results are merged into one row per patient, delta features are added,
    and the table is written to ``<results_dir>/<output_prefix>.csv``.

    Args:
        directory: Folder containing one sub-directory per patient.
        output_prefix: Base name of the CSV file; also shown in the
            progress bar and log messages.
        config_path: Parameter file passed to ``RadiomicsFeatureExtractor``.
        results_dir: Folder the CSV is written to.

    Returns:
        The feature DataFrame, indexed by ``PatientID``. A phase that
        failed or was missing contributes no values to the patient's row.
    """
    start_time = time.time()
    extractor = featureextractor.RadiomicsFeatureExtractor(config_path)

    patients = [d for d in os.listdir(directory)
                if os.path.isdir(os.path.join(directory, d))]

    results = {p: {} for p in patients}

    # Use about half the cores, but never fewer than one worker:
    # os.cpu_count() can return None, and 1 // 2 == 0 is rejected by
    # ThreadPoolExecutor with a ValueError.
    worker_count = max(1, (os.cpu_count() or 1) // 2)

    with concurrent.futures.ThreadPoolExecutor(max_workers=worker_count) as executor:
        # Map each future back to the (patient, phase) it was submitted for.
        futures = {}
        for patient in patients:
            patient_dir = os.path.join(directory, patient)
            for phase in ['AP', 'VP']:
                futures[executor.submit(process_patient, patient_dir, patient, extractor, phase)] = (patient, phase)

        for future in tqdm(concurrent.futures.as_completed(futures),
                           total=len(futures),
                           desc=f"Processing {output_prefix}"):
            patient, phase = futures[future]
            try:
                result = future.result()
                if result:
                    results[patient].update(result)
            except Exception as e:
                # Keep going: one failed job should not discard the others.
                print(f"Error processing {patient} {phase}: {str(e)}")

    df = pd.DataFrame.from_dict(results, orient='index')
    df.index.name = 'PatientID'
    df = calculate_delta_features(df)

    # Save as CSV
    output_file = os.path.join(results_dir, f"{output_prefix}.csv")
    df.to_csv(output_file)

    print(f"Completed {output_prefix} | Time: {time.time()-start_time:.2f}s")
    print(f"File saved: {output_file}")
    return df

def load_and_prepare_data(results_dir):
    """Read both reader CSVs and align the second one to the first.

    Loads ``radiomics1.csv`` and ``radiomics2.csv`` from ``results_dir``,
    reorders the rows of the second table so its ``PatientID`` column
    follows the order of the first, and lists the feature columns.

    Returns:
        ``(df1, df2, feature_columns)`` where ``feature_columns`` is every
        column of ``df1`` except ``PatientID``.
    """
    first_path = os.path.join(results_dir, 'radiomics1.csv')
    second_path = os.path.join(results_dir, 'radiomics2.csv')
    df1 = pd.read_csv(first_path)
    df2 = pd.read_csv(second_path)

    # Put the second table's rows in the same patient order as the first.
    reference_ids = df1['PatientID']
    by_patient = df2.set_index('PatientID')
    df2 = by_patient.reindex(reference_ids).reset_index()

    # Everything except the ID column counts as a feature.
    feature_columns = df1.columns.drop('PatientID').tolist()
    return df1, df2, feature_columns

def calculate_icc(feature, df1, df2):
    """Return the ICC2 value of one feature between the two readers.

    The values of ``feature`` from ``df1`` (rater 'R1') and ``df2``
    (rater 'R2') are stacked into a long table keyed by ``PatientID`` and
    passed to ``intraclass_corr``; the ``ICC`` entry of the row whose
    ``Type`` is ``'ICC2'`` is returned.
    """
    rater_labels = np.concatenate([['R1'] * len(df1), ['R2'] * len(df2)])
    patient_ids = np.concatenate([df1['PatientID'], df2['PatientID']])
    ratings = np.concatenate([df1[feature], df2[feature]])

    long_table = pd.DataFrame({
        'raters': rater_labels,
        'patients': patient_ids,
        'values': ratings,
    })

    icc_table = intraclass_corr(
        data=long_table,
        targets='patients',
        raters='raters',
        ratings='values',
    )
    is_icc2 = icc_table['Type'] == 'ICC2'
    return icc_table.loc[is_icc2, 'ICC'].values[0]

def calculate_icc_parallel(features, df1, df2):
    """Compute the ICC of every feature on a thread pool.

    Returns a dict mapping each feature name to the value returned by
    ``calculate_icc`` for that feature, in the order of ``features``.
    """
    def _score(name):
        # Pair the name with its ICC so the results can be turned into a dict.
        return name, calculate_icc(name, df1, df2)

    with concurrent.futures.ThreadPoolExecutor() as pool:
        scored = list(pool.map(_score, features))
    return dict(scored)

def calculate_icc_statistics(icc_results):
    """Summarise ICC values: mean, median and counts per agreement band.

    Bands are: excellent (> 0.9), good (0.75 < ICC <= 0.9),
    moderate (0.5 < ICC <= 0.75) and poor (<= 0.5).

    Returns:
        ``(mean_icc, median_icc, icc_distribution)`` where
        ``icc_distribution`` maps ``'excellent_icc'``, ``'good_icc'``,
        ``'moderate_icc'`` and ``'poor_icc'`` to the number of features in
        each band.
    """
    scores = list(icc_results.values())

    # Tally each score into its band in a single pass.
    icc_distribution = {
        'excellent_icc': 0,
        'good_icc': 0,
        'moderate_icc': 0,
        'poor_icc': 0,
    }
    for score in scores:
        if score > 0.9:
            icc_distribution['excellent_icc'] += 1
        elif score > 0.75:
            icc_distribution['good_icc'] += 1
        elif score > 0.5:
            icc_distribution['moderate_icc'] += 1
        elif score <= 0.5:
            # Explicit test so values that compare false everywhere
            # (NaN) are not counted in any band.
            icc_distribution['poor_icc'] += 1

    return np.mean(scores), np.median(scores), icc_distribution

def main():
    """Run the complete pipeline.

    Steps: extract features for both label sets into ``./results``, reload
    the two CSVs, compute the ICC of every feature between them, print
    summary statistics, and save the features with ICC > 0.75 (taken from
    the first label set) to ``./results/radiomics.csv``.

    If no feature columns are found, a message is printed and the ICC and
    selection steps are skipped.
    """
    # Create results directory if not exists
    results_dir = './results'
    os.makedirs(results_dir, exist_ok=True)

    # Define paths
    config_path = './configs/CT.yaml'
    directory1 = './data/imagelabel_1'
    directory2 = './data/imagelabel_2'

    # Extract features for both directories. The returned frames are not
    # kept: the tables are reloaded from the CSVs just below.
    print("Starting feature extraction...")
    extract_features_for_directory(directory1, 'radiomics1', config_path, results_dir)
    extract_features_for_directory(directory2, 'radiomics2', config_path, results_dir)

    # Load data for ICC calculation
    df1, df2, feature_columns = load_and_prepare_data(results_dir)
    original_features_count = len(feature_columns)
    print(f"\nOriginal features count: {original_features_count}")

    # With no features, every percentage below would divide by zero.
    if original_features_count == 0:
        print("No features were extracted; skipping ICC calculation and feature selection.")
        return

    # Calculate ICC
    print("\nCalculating ICC...")
    icc_results = calculate_icc_parallel(feature_columns, df1, df2)

    # Calculate ICC statistics
    mean_icc, median_icc, icc_distribution = calculate_icc_statistics(icc_results)

    # Print ICC statistics
    print("\nICC Statistics:")
    print(f"Mean ICC: {mean_icc:.3f}")
    print(f"Median ICC: {median_icc:.3f}")
    print("ICC Distribution:")
    bands = (
        ('Excellent (ICC > 0.9)', 'excellent_icc'),
        ('Good (0.75 < ICC ≤ 0.9)', 'good_icc'),
        ('Moderate (0.5 < ICC ≤ 0.75)', 'moderate_icc'),
        ('Poor (ICC ≤ 0.5)', 'poor_icc'),
    )
    for label, key in bands:
        count = icc_distribution[key]
        print(f"  {label}: {count} features ({count/original_features_count:.1%})")

    # Filter features with ICC > 0.75 (good consistency)
    selected_features = [feat for feat, icc in icc_results.items() if icc > 0.75]
    selected_count = len(selected_features)
    retention_ratio = selected_count / original_features_count

    print("\nFeature Selection Results:")
    print(f"Selected features count: {selected_count}")
    print(f"Retention ratio: {retention_ratio:.2%}")

    # Save selected features only
    final_df = df1[['PatientID'] + selected_features]
    final_output_path = os.path.join(results_dir, 'radiomics.csv')
    final_df.to_csv(final_output_path, index=False)
    print(f"Selected features saved to {final_output_path}")

# Run the pipeline only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()

Starting feature extraction...


Processing radiomics1: 100%|█████████████████████████████████████████████████████████| 410/410 [36:17<00:00,  5.31s/it]


Completed radiomics1 | Time: 2180.31s
File saved: ./results\radiomics1.csv


Processing radiomics2: 100%|█████████████████████████████████████████████████████████| 410/410 [36:31<00:00,  5.34s/it]


Completed radiomics2 | Time: 2194.02s
File saved: ./results\radiomics2.csv

Original features count: 3864

Calculating ICC...

ICC Statistics:
Mean ICC: 0.739
Median ICC: 0.847
ICC Distribution:
  Excellent (ICC > 0.9): 1564 features (40.5%)
  Good (0.75 < ICC ≤ 0.9): 807 features (20.9%)
  Moderate (0.5 < ICC ≤ 0.75): 809 features (20.9%)
  Poor (ICC ≤ 0.5): 684 features (17.7%)

Feature Selection Results:
Selected features count: 2371
Retention ratio: 61.36%
Selected features saved to ./results\radiomics.csv
