In [None]:
# Import Libraries

In [None]:
import os
import pandas as pd
import pydicom
import numpy as np


# Load Data

In [None]:
# Read the study table from disk and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='cq500.csv')
df.head(n=5)

# Detailed Information About the Data

In [None]:
# Read the DICOM headers of one study, located via the `name` and `Source Folder` columns
def get_dicom_info(row):
    """Return the DICOM file names of one series, ordered along the z-axis.

    The series directory is built from the row as
    ``./archive/<name>/Unknown Study/<Source Folder>``. Every ``.dcm`` file in
    it is read (header only) and the files are ordered by the third component
    of ``ImagePositionPatient``; ties are broken by file name.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series)
        Must provide the keys ``'name'`` and ``'Source Folder'``.

    Returns
    -------
    tuple[list[str], list[str]] or None
        ``(file_names, positions)`` where ``positions[i]`` is
        ``str(ImagePositionPatient)`` of ``file_names[i]``.  ``None`` is
        returned when the directory or any file in it cannot be processed.
    """
    try:
        src_dir = './archive/' + row['name'] + '/Unknown Study/' + row['Source Folder']
        src_files = [f for f in os.listdir(src_dir) if f.endswith('.dcm')]

        slices = []
        for f in src_files:
            # Only header metadata is used here, so skip decoding the pixel data.
            dicom = pydicom.dcmread(os.path.join(src_dir, f), stop_before_pixels=True)
            position = dicom.ImagePositionPatient
            slices.append((float(position[2]), f, str(position)))

        # Sort on (z, file name) only. Sorting tuples that contain the dataset
        # itself would try to compare datasets whenever two slices share a z
        # value, which raises TypeError and used to discard the whole study.
        slices.sort(key=lambda item: (item[0], item[1]))
        return [item[1] for item in slices], [item[2] for item in slices]

    except Exception as exc:
        # Best effort: report the problem and signal failure with None.
        # (KeyboardInterrupt / SystemExit are deliberately not caught.)
        print(f"get_dicom_info failed: {exc!r}")
        return None

# Get the information of dicom files of first row.
# get_dicom_info returns None when the series cannot be read, so check the
# result before unpacking it instead of failing with a TypeError.
first_row_info = get_dicom_info(df.iloc[0])
if first_row_info is None:
    print('Could not read the DICOM files of the first row')
else:
    src_file, img_pos = first_row_info
    print(src_file, img_pos)

In [None]:
import matplotlib.pyplot as plt
import pydicom
import os

def plot_dicom(row, images_per_row=8):
    """Show every slice of one series in a grid of subplots.

    File names come from ``get_dicom_info(row)`` (already ordered along z);
    each file is read from
    ``./archive/<name>/Unknown Study/<Source Folder>/<file>`` and drawn with
    the ``bone`` colormap, titled with its file name minus the extension.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series)
        Must provide the keys ``'name'`` and ``'Source Folder'``.
    images_per_row : int, default 8
        Number of subplot columns.  The figure width stays fixed at 25 inches.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.  Nothing is drawn when
        the series cannot be read or contains no ``.dcm`` files.
    """
    info = get_dicom_info(row)
    if info is None:
        print('No DICOM information available for this row; nothing to plot')
        return
    src_file, _ = info
    num_images = len(src_file)
    if num_images == 0:
        print('Series contains no DICOM files; nothing to plot')
        return

    # Ceiling division: one extra row only when the last row is partially filled.
    rows = (num_images + images_per_row - 1) // images_per_row

    # squeeze=False always yields a 2-D axes array, so flattening is uniform
    # even for a single row.
    fig, axs = plt.subplots(rows, images_per_row, figsize=(25, 3 * rows), squeeze=False)
    axs = axs.flatten()

    series_dir = os.path.join('./archive', row['name'], 'Unknown Study', row['Source Folder'])
    for ax, f in zip(axs, src_file):
        dicom = pydicom.dcmread(os.path.join(series_dir, f))
        ax.imshow(dicom.pixel_array, cmap='bone')
        # splitext keeps dotted names intact (e.g. UID-style file names).
        ax.set_title(os.path.splitext(f)[0], fontsize=8)
        ax.axis('off')

    # Hide the unused cells of the last row.
    for ax in axs[num_images:]:
        ax.axis('off')

    plt.tight_layout()
    plt.show()

# Visual sanity check: draw every slice of the first study in the table.
plot_dicom(row=df.iloc[0])

In [None]:
# # Add src_files and img_pos columns to the dataframe
# df[['filename', 'img_pos']] = df.apply(get_dicom_info, axis=1, result_type='expand')

In [None]:
# Write the table back to the CSV it was loaded from (this overwrites the
# input file in place), then preview the first five rows.
df.to_csv(path_or_buf='cq500.csv', index=False)
df.head(n=5)

# Remove Redundant Data

In [None]:
import ast

def _literal_list(value):
    """Return ``value`` as a list, evaluating it first when it is a string literal."""
    if isinstance(value, str):
        value = ast.literal_eval(value)
    if not isinstance(value, (list, tuple)):
        raise ValueError(f"Expected a list, got {value!r}")
    return list(value)


def _parse_position(pos):
    """Convert one stored position (an ``"[x, y, z]"`` string or a sequence) to three floats."""
    parts = pos.strip('[]').split(',') if isinstance(pos, str) else pos
    coords = [float(c) for c in parts]
    if len(coords) != 3:
        raise ValueError(f"Invalid position format: {pos}")
    return coords


def remove_redundant_slices(row, min_slices=10, max_slices=32, min_spacing=5.0):
    """Thin out the slice list of one study to at most ``max_slices`` slices.

    ``row['filename']`` and ``row['img_pos']`` hold parallel lists (either as
    Python lists or as their string representation, as read back from CSV).
    The slices are ordered by z and then:

    * fewer than ``min_slices`` slices -> the study is rejected;
    * at most ``max_slices`` slices    -> everything is kept;
    * more than ``max_slices`` slices  -> walking up from the lowest slice,
      a slice is kept when it lies at least ``min_spacing`` above the
      previously kept one, stopping at ``max_slices``.  If that yields fewer
      than ``max_slices`` slices, ``max_slices`` evenly spaced indices over
      the whole stack are used instead.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series)
        Must provide the keys ``'filename'`` and ``'img_pos'``.
    min_slices, max_slices : int
        Lower rejection threshold and upper bound on kept slices.
    min_spacing : float
        Minimum z distance between kept slices, in the units of the stored
        positions.

    Returns
    -------
    tuple
        ``(file_names, positions)`` with positions rendered as
        ``str([x, y, z])``, or ``(None, None)`` when the row cannot be parsed
        (missing values included) or has too few slices.
    """
    # Parse both columns inside the same guard: a missing cell (NaN/None) or a
    # malformed literal must reject this row, not abort the whole df.apply.
    try:
        src_files = _literal_list(row['filename'])
        parsed_positions = [_parse_position(p) for p in _literal_list(row['img_pos'])]
        if len(src_files) != len(parsed_positions):
            raise ValueError(
                f"{len(src_files)} files but {len(parsed_positions)} positions")
    except (SyntaxError, ValueError, TypeError, AttributeError) as e:
        print(f"Error parsing positions: {str(e)}")
        return None, None

    # Order files and positions together by z (stable for equal z values).
    paired = sorted(zip(src_files, parsed_positions), key=lambda pair: pair[1][2])
    src_files = [name for name, _ in paired]
    parsed_positions = [pos for _, pos in paired]

    num_slices = len(src_files)
    if num_slices < min_slices:
        print(f"Removing row with {num_slices} slices")
        return None, None

    if num_slices > max_slices:
        z = [pos[2] for pos in parsed_positions]
        # Float subtraction of positions that are exactly min_spacing apart
        # can come out marginally smaller (e.g. 67.6 - 62.6 < 5), so compare
        # with a tiny tolerance.
        tolerance = 1e-6

        keep_indices = [0]
        last_z = z[0]
        for i in range(1, num_slices):
            if len(keep_indices) == max_slices:
                break
            if z[i] - last_z >= min_spacing - tolerance:
                keep_indices.append(i)
                last_z = z[i]

        # Not enough sufficiently spaced slices: fall back to max_slices
        # evenly spaced indices across the whole stack (indices are distinct
        # because num_slices > max_slices).
        if len(keep_indices) < max_slices:
            keep_indices = np.linspace(0, num_slices - 1, max_slices, dtype=int).tolist()

        return ([src_files[i] for i in keep_indices],
                [str(parsed_positions[i]) for i in keep_indices])

    return src_files, [str(p) for p in parsed_positions]

# Thin out every study's slice list row by row; the two values returned per
# row are expanded into the 'filename' and 'img_pos' columns. The result is
# saved to a separate CSV.
df[['filename', 'img_pos']] = df.apply(
    func=remove_redundant_slices,
    axis='columns',
    result_type='expand',
)
df.to_csv(path_or_buf='cq500_redundancy.csv', index=False)