In [None]:
import pandas as pd
import pydicom
import os
from tqdm import tqdm

# Folder holding the source images; each one is looked up as "<image_id>.dicom".
ORIGINAL_DICOM_FOLDER = r"D:\vinbigdata-chest-xray-abnormalities-detection\train"
# Annotation CSV that is read as input (provides image_id and the box columns).
ORIGINAL_CSV_PATH = r"D:\vinbigdata-chest-xray-abnormalities-detection\train.csv"
# Destination CSV for the annotations after their boxes have been rescaled.
NEW_CSV_PATH = r"D:\vinbigdata-chest-xray-abnormalities-detection\train224.csv"

def scale_coordinates(row, *, target_size=224, dims=None):
    """Rescale one annotation row's bounding box to a square resized image.

    Args:
        row: A pandas Series (one CSV row) with the keys ``image_id``,
            ``x_min``, ``y_min``, ``x_max`` and ``y_max``.
        target_size: Side length, in pixels, of the resized square image.
            Defaults to 224, so coordinates are clamped to ``0..223``.
        dims: Optional mapping ``image_id -> (width, height)`` of the original
            image sizes. When omitted, the module-level ``original_dims``
            mapping is used.

    Returns:
        A copy of ``row``; the input row is never modified.

        * If ``x_min`` is NaN (the row has no bounding box) the copy is
          returned unchanged.
        * If the recorded width or height is missing or zero, the four box
          columns are set to ``None``.
        * Otherwise each coordinate is multiplied by ``target_size / size``
          along its axis, rounded with the built-in ``round`` and clamped to
          ``[0, target_size - 1]``.

    Raises:
        KeyError: If ``row['image_id']`` is not present in the size mapping.
    """
    new_row = row.copy()

    # Rows without a bounding box carry NaN coordinates; nothing to rescale.
    if pd.isna(row['x_min']):
        return new_row

    # Resolve the lookup lazily so the global is only required when no
    # explicit mapping was supplied.
    size_lookup = original_dims if dims is None else dims
    original_width, original_height = size_lookup[row['image_id']]

    # A None/0 dimension means the image size is unknown, so the box cannot
    # be scaled; blank the coordinates instead of dividing by zero.
    if not original_width or not original_height:
        for column in ('x_min', 'y_min', 'x_max', 'y_max'):
            new_row[column] = None
        return new_row

    # Largest valid pixel index in the resized image (223 for 224x224).
    last_pixel = target_size - 1
    scale_x = target_size / original_width
    scale_y = target_size / original_height

    # NOTE(review): rounding can collapse a very small box to zero width or
    # height (x_min == x_max); confirm downstream code tolerates that.
    for column, scale in (
        ('x_min', scale_x),
        ('y_min', scale_y),
        ('x_max', scale_x),
        ('y_max', scale_y),
    ):
        new_row[column] = max(0, min(round(row[column] * scale), last_pixel))

    return new_row


# Load the annotations; several rows can refer to the same image_id.
df = pd.read_csv(ORIGINAL_CSV_PATH)

# image_id -> (width, height) of the original DICOM image, or (None, None)
# when the file could not be read.
original_dims = {}

# Read each distinct DICOM once to collect its original size.
for image_id in tqdm(df['image_id'].unique(), desc="Processing DICOMs"):
    dicom_path = os.path.join(ORIGINAL_DICOM_FOLDER, f"{image_id}.dicom")
    try:
        # Only the Rows/Columns header fields are needed, so stop parsing
        # before the pixel data instead of loading the full image.
        ds = pydicom.dcmread(dicom_path, stop_before_pixels=True)
        original_dims[image_id] = (ds.Columns, ds.Rows)
    except Exception as e:
        # Best effort: report the failure and record unknown dimensions so
        # the remaining images are still processed.
        print(f"Error at {image_id}: {e}")
        original_dims[image_id] = (None, None)

# Rescale every annotation row, then write the result without the index column.
scaled_df = df.apply(scale_coordinates, axis=1)
scaled_df.to_csv(NEW_CSV_PATH, index=False)

Processing DICOMs:   5%|▍         | 676/15000 [00:30<10:55, 21.86it/s]


KeyboardInterrupt: 