In [None]:
#Attention: This code runs on Python 3.11.7. It is necessary to use current versions of the packages (SimpleITK, etc.)
#This is because some packages need to be used in different versions and it is easier to this process in two steps.

#Annotations:
#What is malignant, what is benign? I have decided to consider a scan as malignant if there is at least one malignant tumour on the CT. We classify tumours with a malignancy rating of >3 as malignant and tumours <3 as benign.
#Voxels or World Coordinates? The CT-GAN takes both, the data is in world coordinates. Calculation from world coordinates to voxels is made within CT-GAN. Therefore, we export world coordinates and configure the config.py

In [1]:
! python --version

Python 3.11.10


In [18]:
#Install pyldic if you have not done that so far
pip install --upgrade pylidc

Note: you may need to restart the kernel to use updated packages.


In [None]:
# Preparation
import pylidc as pl

import numpy as np
np.int = int 
np.bool = bool 
import matplotlib.pyplot as plt

In [None]:
#Load scans
scans = pl.query(pl.Scan).all()
annotations = pl.query(pl.Annotation).all()

In [5]:
# Zähle gutartige und bösartige Scans basierend auf dem Malignitätswert in den Annotationen
benign_count = 0
malignant_count = 0

# Iteriere durch alle Annotationen und klassifiziere basierend auf Malignität
for annotation in annotations:
    if annotation.malignancy > 3:
        malignant_count += 1
    elif annotation.malignancy < 3:
        benign_count += 1

print(f"Number of benign nodes: {benign_count}")
print(f"Number of malicious  nodes: {malignant_count}")

Number of benign nodes: 2600
Number of malicious  nodes: 1653


In [7]:
import pylidc as pl
from collections import defaultdict
import pandas as pd

# Load all annotations with pylidc
annotations = pl.query(pl.Annotation).all()

# Counting the annotations per patient
annotations_per_patient_full = defaultdict(int)
for annotation in annotations:
    patient_id = annotation.scan.patient_id  # Annahme: patient_id ist über scan abrufbar
    annotations_per_patient_full[patient_id] += 1

# Aggregate information on how many patients have how many annotations
annotations_count_distribution_full = defaultdict(int)
for count in annotations_per_patient_full.values():
    annotations_count_distribution_full[count] += 1

# Create a DataFrame for the output
annotations_df_full = pd.DataFrame(list(annotations_count_distribution_full.items()), 
                                   columns=['Annotations per Patient', 'Number of Patients'])
annotations_df_full = annotations_df_full.sort_values(by='Annotations per Patient').reset_index(drop=True)

# Output of the table
print(annotations_df_full)


    Annotations per Patient  Number of Patients
0                         1                  55
1                         2                  70
2                         3                  67
3                         4                 174
4                         5                  62
5                         6                  58
6                         7                  62
7                         8                  62
8                         9                  34
9                        10                  34
10                       11                  21
11                       12                  28
12                       13                  23
13                       14                  15
14                       15                  12
15                       16                   9
16                       17                  10
17                       18                   7
18                       19                   6
19                       20             

In [8]:
import pylidc as pl
from collections import defaultdict
import pandas as pd

# Query all scans in the data set
scans = pl.query(pl.Scan).all()

# Dictionary for storing the number of nodules per patient
nodules_per_patient = defaultdict(int)

# Iterate over each scan and count the nodules (without cluster_annotations)
for scan in scans:
    patient_id = scan.patient_id  # Angenommen, patient_id is avaailable
    annotations = scan.annotations 
    nodules_per_patient[patient_id] = len(annotations)

# Aggregate the number of patients per number of nodules
nodules_count_distribution = defaultdict(int)
for count in nodules_per_patient.values():
    nodules_count_distribution[count] += 1

# Create a DataFrame for a tabular display
nodules_df = pd.DataFrame(list(nodules_count_distribution.items()), 
                          columns=['Nodules per Patient', 'Number of Patients'])
nodules_df = nodules_df.sort_values(by='Nodules per Patient').reset_index(drop=True)

# Output of the aggregated table
print(nodules_df)


    Nodules per Patient  Number of Patients
0                     0                 135
1                     1                  55
2                     2                  70
3                     3                  68
4                     4                 175
5                     5                  63
6                     6                  58
7                     7                  61
8                     8                  61
9                     9                  36
10                   10                  33
11                   11                  21
12                   12                  28
13                   13                  23
14                   14                  15
15                   15                  12
16                   16                   9
17                   17                  10
18                   18                   7
19                   19                   6
20                   20                   7
21                   21         

In [9]:
import pylidc as pl
from collections import defaultdict

# Query all annotations in the dataset
annotations = pl.query(pl.Annotation).all()

# Dictionary to store malignancy values per patient
malignancy_per_patient = defaultdict(set)

# Iterate through all annotations and store the malignancy values for each patient
for annotation in annotations:
    patient_id = annotation.scan.patient_id  # Assumption: patient_id can be accessed via scan
    malignancy_per_patient[patient_id].add(annotation.malignancy)

# Check if there are patients who have both malignancy=1 and malignancy=5
patients_with_mixed_malignancy = [patient_id for patient_id, malignancies in malignancy_per_patient.items() if {1, 5}.issubset(malignancies)]

# Output patients with malignancy=1 and malignancy=5
print("Patients with malignancy=1 and malignancy=5:", patients_with_mixed_malignancy)

# Result: There are patients who have both malignant and benign cancer

Patients with malignancy=1 and malignancy=5: ['LIDC-IDRI-0078', 'LIDC-IDRI-0132', 'LIDC-IDRI-0011', 'LIDC-IDRI-0018', 'LIDC-IDRI-0041', 'LIDC-IDRI-0044', 'LIDC-IDRI-0045', 'LIDC-IDRI-0052', 'LIDC-IDRI-0063', 'LIDC-IDRI-0073', 'LIDC-IDRI-0081', 'LIDC-IDRI-0087', 'LIDC-IDRI-0098', 'LIDC-IDRI-0108', 'LIDC-IDRI-0117', 'LIDC-IDRI-0130', 'LIDC-IDRI-0134', 'LIDC-IDRI-0137', 'LIDC-IDRI-0141', 'LIDC-IDRI-0181', 'LIDC-IDRI-0186', 'LIDC-IDRI-0201', 'LIDC-IDRI-0203', 'LIDC-IDRI-0229', 'LIDC-IDRI-0250', 'LIDC-IDRI-0264', 'LIDC-IDRI-0267', 'LIDC-IDRI-0285', 'LIDC-IDRI-0300', 'LIDC-IDRI-0309', 'LIDC-IDRI-0314', 'LIDC-IDRI-0332', 'LIDC-IDRI-0347', 'LIDC-IDRI-0353', 'LIDC-IDRI-0368', 'LIDC-IDRI-0377', 'LIDC-IDRI-0407', 'LIDC-IDRI-0421', 'LIDC-IDRI-0436', 'LIDC-IDRI-0439', 'LIDC-IDRI-0447', 'LIDC-IDRI-0454', 'LIDC-IDRI-0458', 'LIDC-IDRI-0462', 'LIDC-IDRI-0464', 'LIDC-IDRI-0466', 'LIDC-IDRI-0469', 'LIDC-IDRI-0481', 'LIDC-IDRI-0484', 'LIDC-IDRI-0491', 'LIDC-IDRI-0493', 'LIDC-IDRI-0521', 'LIDC-IDRI-0526', 

In [10]:
import pylidc as pl

# Query all scans in the dataset
scans = pl.query(pl.Scan).all()

# Counters for the number of healthy and unhealthy scans
healthy_count = 0
unhealthy_count = 0

# Function to classify a scan as healthy or unhealthy based on malignancy without clustering
def classify_scan(scan):
    for annotation in scan.annotations:  # Direct access to annotations
        # Check if the annotation has a high malignancy
        if annotation.malignancy >= 4:
            return "Unhealthy"
    return "Healthy"

# Iterate through all scans and classify them
for scan in scans:
    classification = classify_scan(scan)
    if classification == "Healthy":
        healthy_count += 1
    else:
        unhealthy_count += 1

# Output the summary
print(f"Number of healthy patients: {healthy_count}")
print(f"Number of unhealthy patients: {unhealthy_count}")

Number of healthy patients: 448
Number of unhealthy patients: 570


In [1]:
#Number of observations
import pylidc as pl

# Initialize counter
count_between_10_and_16mm = 0

# Query all scans
scans = pl.query(pl.Scan)

for scan in scans:
    annotations = scan.annotations
    for ann in annotations:
        diameter = ann.diameter  # Diameter in millimeters
        
        # Count nodules between 10mm and 16mm (inclusive)
        if 10 <= diameter <= 16:
            count_between_10_and_16mm += 1

# Print the result
print(f"Number of nodules between 10mm and 16mm: {count_between_10_and_16mm}")

Number of nodules between 10mm and 16mm: 1454


In [5]:
import csv
import pylidc as pl

# Query all scans
scans = pl.query(pl.Scan).all()

# Lists for healthy and unhealthy data
healthy_data = []
unhealthy_data = []

# Function to classify the scan based on all nodules
def is_scan_unhealthy(scan):
    for annotation in scan.annotations:
        if annotation.malignancy >= 4:
            return True  # The scan is unhealthy
    return False  # The scan is healthy

# Iterate over all scans, classify them, and collect coordinates
for scan in scans:
    # Classify the scan based on all nodules
    classification = "Unhealthy" if is_scan_unhealthy(scan) else "Healthy"

    # Iterate over all annotations in the scan
    for annotation in scan.annotations:
        diameter = annotation.diameter  # Diameter in millimeters

        # Apply the diameter filter: only between 10 mm and 16 mm (inclusive)
        if not (10 <= diameter <= 16):
            continue  # Skip annotations outside the range

        centroid = annotation.centroid  # Coordinates of the nodule (use directly)

        # Prepare the data row
        data_row = [scan.patient_id, centroid[0], centroid[1], centroid[2]]

        # Add the data to the corresponding list
        if classification == "Healthy":
            healthy_data.append(data_row)
        else:
            unhealthy_data.append(data_row)

# Create CSV files for healthy and unhealthy scans
header = ["filename", "x", "y", "z"]

with open("healthy_coords.csv", "w", newline="") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(header)
    writer.writerows(healthy_data)

with open("unhealthy_coords.csv", "w", newline="") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(header)
    writer.writerows(unhealthy_data)

print("CSV files created: healthy_coords.csv and unhealthy_coords.csv")


CSV-Dateien erstellt: healthy_coords.csv und unhealthy_coords.csv


In [26]:
#Create directory
import pylidc as pl
import os
import shutil

# Path to the LIDC-IDRI dataset
lidc_dataset_path = r"C:\Users\Quent\Desktop\data\Data\manifest-1600709154662\LIDC-IDRI"

# Paths to the target directories
healthy_dir = r"C:\Users\Quent\Desktop\CT-GAN_alt\CT-GAN-master\data\healthy_scans"
unhealthy_dir = r"C:\Users\Quent\Desktop\CT-GAN_alt\CT-GAN-master\data\unhealthy_scans"

# Create target directories if they don't exist
os.makedirs(healthy_dir, exist_ok=True)
os.makedirs(unhealthy_dir, exist_ok=True)

# Function to determine if a scan is unhealthy
def is_scan_unhealthy(scan):
    for annotation in scan.annotations:
        if annotation.malignancy >= 4:
            return True  # The scan is unhealthy
    return False  # The scan is healthy

# Iterate over all scans
scans = pl.query(pl.Scan).all()

for scan in scans:
    # Determine the target directory based on the scan's classification
    if is_scan_unhealthy(scan):
        target_dir = unhealthy_dir
    else:
        target_dir = healthy_dir

    # Construct the path to the scan's DICOM directory
    scan_dicom_dir = os.path.join(lidc_dataset_path, scan.patient_id)

    if os.path.exists(scan_dicom_dir):
        # Destination directory for this scan
        dest_scan_dir = os.path.join(target_dir, scan.patient_id)

        if not os.path.exists(dest_scan_dir):
            # Copy the scan's DICOM directory to the destination
            shutil.copytree(scan_dicom_dir, dest_scan_dir)
            print(f"Copied scan {scan.patient_id} to {dest_scan_dir}")
        else:
            print(f"Destination directory {dest_scan_dir} already exists. Skipping.")
    else:
        print(f"Directory not found for scan {scan.patient_id}")


Copied scan LIDC-IDRI-0078 to C:\Users\Quent\Desktop\CT-GAN_alt\CT-GAN-master\data\unhealthy_scans\LIDC-IDRI-0078
Copied scan LIDC-IDRI-0069 to C:\Users\Quent\Desktop\CT-GAN_alt\CT-GAN-master\data\unhealthy_scans\LIDC-IDRI-0069
Copied scan LIDC-IDRI-0079 to C:\Users\Quent\Desktop\CT-GAN_alt\CT-GAN-master\data\healthy_scans\LIDC-IDRI-0079
Copied scan LIDC-IDRI-0101 to C:\Users\Quent\Desktop\CT-GAN_alt\CT-GAN-master\data\healthy_scans\LIDC-IDRI-0101
Copied scan LIDC-IDRI-0110 to C:\Users\Quent\Desktop\CT-GAN_alt\CT-GAN-master\data\unhealthy_scans\LIDC-IDRI-0110
Copied scan LIDC-IDRI-0115 to C:\Users\Quent\Desktop\CT-GAN_alt\CT-GAN-master\data\unhealthy_scans\LIDC-IDRI-0115
Copied scan LIDC-IDRI-0132 to C:\Users\Quent\Desktop\CT-GAN_alt\CT-GAN-master\data\unhealthy_scans\LIDC-IDRI-0132
Copied scan LIDC-IDRI-0136 to C:\Users\Quent\Desktop\CT-GAN_alt\CT-GAN-master\data\unhealthy_scans\LIDC-IDRI-0136
Copied scan LIDC-IDRI-0150 to C:\Users\Quent\Desktop\CT-GAN_alt\CT-GAN-master\data\unhealthy

In [33]:
import os
import shutil

# Function to count the number of .dcm files in a directory (including nested ones)
def count_dcm_files_recursive(folder_path):
    dcm_count = 0
    for root, dirs, files in os.walk(folder_path):
        dcm_count += len([f for f in files if f.endswith('.dcm')])
    return dcm_count

# Function to move .dcm files to the main folder
def move_dcm_files_recursive(src_folder, dest_folder):
    for root, dirs, files in os.walk(src_folder):
        for file in files:
            if file.endswith('.dcm'):
                src_path = os.path.join(root, file)
                dest_path = os.path.join(dest_folder, file)
                shutil.move(src_path, dest_path)

# Function to process each LIDC-IDRI folder
def process_folder(main_folder):
    lidc_folders = [os.path.join(main_folder, f) for f in os.listdir(main_folder) if f.startswith("LIDC-IDRI-")]
    print(f"Found {len(lidc_folders)} LIDC-IDRI folders.")

    for lidc_folder in lidc_folders:
        print(f"Processing {lidc_folder}...")

        # Get all subdirectories in the current LIDC-IDRI folder
        subdirs = [os.path.join(lidc_folder, d) for d in os.listdir(lidc_folder) if os.path.isdir(os.path.join(lidc_folder, d))]
        if not subdirs:
            print(f"Skipping {lidc_folder} - no subdirectories found.")
            continue

        # Find the subdirectory with the most .dcm files (including nested ones)
        subdir_counts = {subdir: count_dcm_files_recursive(subdir) for subdir in subdirs}
        for subdir, count in subdir_counts.items():
            print(f"  {subdir} contains {count} .dcm files.")

        # Determine the subdirectory to use for moving files
        target_dir = max(subdir_counts, key=subdir_counts.get)
        print(f"  Moving files from {target_dir} to {lidc_folder}...")

        # Move all .dcm files from the target subdirectory to the main folder
        move_dcm_files_recursive(target_dir, lidc_folder)

        # Delete all subdirectories
        print(f"  Deleting all subdirectories in {lidc_folder}...")
        for subdir in subdirs:
            shutil.rmtree(subdir, ignore_errors=True)

        print(f"  Finished processing {lidc_folder}.\n")

# Define the paths to your healthy and unhealthy scan folders
healthy_folder = r"C:\Users\Quent\Desktop\CT-GAN_alt\CT-GAN-master\data\healthy_scans"
unhealthy_folder = r"C:\Users\Quent\Desktop\CT-GAN_alt\CT-GAN-master\data\unhealthy_scans"

print("Processing healthy scans folder...")
process_folder(healthy_folder)

print("Processing unhealthy scans folder...")
process_folder(unhealthy_folder)

print("Operation completed.")


Processing healthy scans folder...
Found 448 LIDC-IDRI folders.
Processing C:\Users\Quent\Desktop\CT-GAN_alt\CT-GAN-master\data\healthy_scans\LIDC-IDRI-0004...
  C:\Users\Quent\Desktop\CT-GAN_alt\CT-GAN-master\data\healthy_scans\LIDC-IDRI-0004\01-01-2000-NA-NA-88534 contains 1 .dcm files.
  C:\Users\Quent\Desktop\CT-GAN_alt\CT-GAN-master\data\healthy_scans\LIDC-IDRI-0004\01-01-2000-NA-NA-91780 contains 241 .dcm files.
  Moving files from C:\Users\Quent\Desktop\CT-GAN_alt\CT-GAN-master\data\healthy_scans\LIDC-IDRI-0004\01-01-2000-NA-NA-91780 to C:\Users\Quent\Desktop\CT-GAN_alt\CT-GAN-master\data\healthy_scans\LIDC-IDRI-0004...
  Deleting all subdirectories in C:\Users\Quent\Desktop\CT-GAN_alt\CT-GAN-master\data\healthy_scans\LIDC-IDRI-0004...
  Finished processing C:\Users\Quent\Desktop\CT-GAN_alt\CT-GAN-master\data\healthy_scans\LIDC-IDRI-0004.

Processing C:\Users\Quent\Desktop\CT-GAN_alt\CT-GAN-master\data\healthy_scans\LIDC-IDRI-0005...
  C:\Users\Quent\Desktop\CT-GAN_alt\CT-GAN-ma