In [13]:
import numpy as np
import cv2
import matplotlib.pyplot as plt
import pydicom
import os
import pandas as pd
from tqdm import tqdm

In [14]:
# Get names of all DICOM files

# Use your path to the kaggle dataset
path = "../../../data/train_images"

dicom_files = []
for root, dirs, files in os.walk(path):
    for file in files:
        if file.endswith(".dcm"):
            dicom_files.append(os.path.join(root, file))

In [15]:
train_label_coordinates = pd.read_csv('../../data/train_label_coordinates.csv')

In [16]:
# Parse dicom_file name to get study_id, series_id, and instance_number
def parse_dicom_file(dicom_file):
    study_id = dicom_file.split('/')[-3]
    study_id = int(study_id)
    series_id = dicom_file.split('/')[-2]
    series_id = int(series_id)
    instance_number = int(dicom_file.split('/')[-1].split('.')[0])
    return study_id, series_id, instance_number

original_path = '../../../data/train_images/'
# Add file path to train_label_coordinates dataframe
for i in tqdm(range(len(train_label_coordinates))):
    study_id = train_label_coordinates.loc[i, 'study_id']
    series_id = train_label_coordinates.loc[i, 'series_id']
    instance_number = train_label_coordinates.loc[i, 'instance_number']
    # Create file path
    file_path = original_path + str(study_id) + '/' + str(series_id) + '/' + str(instance_number) + '.dcm'
    train_label_coordinates.loc[i, 'file_path'] = file_path

100%|██████████| 48692/48692 [00:05<00:00, 9520.06it/s]


In [17]:
train_label_coordinates

Unnamed: 0,study_id,series_id,instance_number,condition,level,x,y,file_path
0,4003253,702807833,8,Spinal Canal Stenosis,L1/L2,322.831858,227.964602,../../../data/train_images/4003253/702807833/8...
1,4003253,702807833,8,Spinal Canal Stenosis,L2/L3,320.571429,295.714286,../../../data/train_images/4003253/702807833/8...
2,4003253,702807833,8,Spinal Canal Stenosis,L3/L4,323.030303,371.818182,../../../data/train_images/4003253/702807833/8...
3,4003253,702807833,8,Spinal Canal Stenosis,L4/L5,335.292035,427.327434,../../../data/train_images/4003253/702807833/8...
4,4003253,702807833,8,Spinal Canal Stenosis,L5/S1,353.415929,483.964602,../../../data/train_images/4003253/702807833/8...
...,...,...,...,...,...,...,...,...
48687,4290709089,4237840455,11,Left Neural Foraminal Narrowing,L1/L2,219.465940,97.831063,../../../data/train_images/4290709089/42378404...
48688,4290709089,4237840455,12,Left Neural Foraminal Narrowing,L2/L3,205.340599,140.207084,../../../data/train_images/4290709089/42378404...
48689,4290709089,4237840455,12,Left Neural Foraminal Narrowing,L3/L4,202.724796,181.013624,../../../data/train_images/4290709089/42378404...
48690,4290709089,4237840455,12,Left Neural Foraminal Narrowing,L4/L5,202.933333,219.733333,../../../data/train_images/4290709089/42378404...


In [18]:
# Check if the file path exists
for i in tqdm(range(len(train_label_coordinates))):
    file_path = train_label_coordinates.loc[i, 'file_path']
    if not os.path.exists(file_path):
        print(f"File path {file_path} does not exist")

# Great! They all exist. Now, let's create the preprocessed images and save them to a new directory

100%|██████████| 48692/48692 [00:01<00:00, 39364.77it/s]


In [19]:
# Function to preprocess the image
def preprocess_image(file_path, coord_row):
    # Read the dicom file
    dicom = pydicom.dcmread(file_path)
    # Get the pixel array
    image = dicom.pixel_array
    # Normalize the pixel array
    image = image - np.min(image)
    image = image / np.max(image)
    image = (image * 255).astype(np.uint8)

    # Crop the 100x100 region around the coordinates
    y = int(coord_row['y'])
    x = int(coord_row['x'])
    ymin = y - 50 if y - 50 >= 0 else 0
    ymax = y + 50 if y + 50 <= image.shape[0] else image.shape[0]
    xmin = x - 50 if x - 50 >= 0 else 0
    xmax = x + 50 if x + 50 <= image.shape[1] else image.shape[1]
    image = image[ymin:ymax, xmin:xmax]

    # Save the image as a .png file
    save_path = '../../data/preprocessed/' + str(coord_row['study_id']) + '_' + str(coord_row['series_id']) + '_' + str(coord_row['instance_number']) + '_' + str(x) + '_' + str(y) + '.png'
    save_path = save_path.replace('.dcm', '.png')
    # If the directory does not exist, create it
    save_dir = os.path.dirname(save_path)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    # Save the image
    try:
        cv2.imwrite(save_path, image)
    except:
        print(f"Error saving image at {save_path}")
        return None

    # Return the save path
    return save_path

In [20]:
for i in tqdm(range(len(train_label_coordinates))):
    file_path = train_label_coordinates.loc[i, 'file_path']
    coord_row = train_label_coordinates.loc[i]
    save_path = preprocess_image(file_path, coord_row)
    train_label_coordinates.loc[i, 'preprocessed_file_path'] = save_path

100%|██████████| 48692/48692 [07:25<00:00, 109.42it/s]


In [21]:
train_label_coordinates['preprocessed_file_path']

0        ../../data/preprocessed/4003253_702807833_8_32...
1        ../../data/preprocessed/4003253_702807833_8_32...
2        ../../data/preprocessed/4003253_702807833_8_32...
3        ../../data/preprocessed/4003253_702807833_8_33...
4        ../../data/preprocessed/4003253_702807833_8_35...
                               ...                        
48687    ../../data/preprocessed/4290709089_4237840455_...
48688    ../../data/preprocessed/4290709089_4237840455_...
48689    ../../data/preprocessed/4290709089_4237840455_...
48690    ../../data/preprocessed/4290709089_4237840455_...
48691    ../../data/preprocessed/4290709089_4237840455_...
Name: preprocessed_file_path, Length: 48692, dtype: object

In [22]:
train_label_coordinates

Unnamed: 0,study_id,series_id,instance_number,condition,level,x,y,file_path,preprocessed_file_path
0,4003253,702807833,8,Spinal Canal Stenosis,L1/L2,322.831858,227.964602,../../../data/train_images/4003253/702807833/8...,../../data/preprocessed/4003253_702807833_8_32...
1,4003253,702807833,8,Spinal Canal Stenosis,L2/L3,320.571429,295.714286,../../../data/train_images/4003253/702807833/8...,../../data/preprocessed/4003253_702807833_8_32...
2,4003253,702807833,8,Spinal Canal Stenosis,L3/L4,323.030303,371.818182,../../../data/train_images/4003253/702807833/8...,../../data/preprocessed/4003253_702807833_8_32...
3,4003253,702807833,8,Spinal Canal Stenosis,L4/L5,335.292035,427.327434,../../../data/train_images/4003253/702807833/8...,../../data/preprocessed/4003253_702807833_8_33...
4,4003253,702807833,8,Spinal Canal Stenosis,L5/S1,353.415929,483.964602,../../../data/train_images/4003253/702807833/8...,../../data/preprocessed/4003253_702807833_8_35...
...,...,...,...,...,...,...,...,...,...
48687,4290709089,4237840455,11,Left Neural Foraminal Narrowing,L1/L2,219.465940,97.831063,../../../data/train_images/4290709089/42378404...,../../data/preprocessed/4290709089_4237840455_...
48688,4290709089,4237840455,12,Left Neural Foraminal Narrowing,L2/L3,205.340599,140.207084,../../../data/train_images/4290709089/42378404...,../../data/preprocessed/4290709089_4237840455_...
48689,4290709089,4237840455,12,Left Neural Foraminal Narrowing,L3/L4,202.724796,181.013624,../../../data/train_images/4290709089/42378404...,../../data/preprocessed/4290709089_4237840455_...
48690,4290709089,4237840455,12,Left Neural Foraminal Narrowing,L4/L5,202.933333,219.733333,../../../data/train_images/4290709089/42378404...,../../data/preprocessed/4290709089_4237840455_...


In [23]:
# Add int_x and int_y columns to the dataframe
train_label_coordinates['int_x'] = train_label_coordinates['x'].astype(int)
train_label_coordinates['int_y'] = train_label_coordinates['y'].astype(int)

In [24]:
# Save the train_label_coordinates dataframe
train_label_coordinates.to_csv('../../data/train_label_coordinates_preprocessed.csv', index=False)