# 00_DICOM_DataFrame.ipynb

This notebook:
- extracts the metadata of the original CT scans and saves it in `CT_data/DICOM_Data.csv`, and
- derives the binary label (abnormal/normal) for the converted NIfTI files and saves it in `CT_data/label.csv`.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pydicom
from pydicom import dcmread
from pydicom.data import get_testdata_files
import glob
import os
from collections import defaultdict
import tqdm
import gc
import seaborn as sns
import ast

# Notebook display settings: never truncate the column list, and show up to
# 100 characters of text per cell.
pd.options.display.max_columns = None
pd.options.display.max_colwidth = 100

# Utility functions located in the src folder
from src.util_data import listdir_nohidden

In [2]:
# Project paths used throughout the notebook.
# NOTE: <PATH TO project_dir> is a placeholder, not valid Python - replace it with a string before running.
# The value must end with a path separator, because the sub-directories below are appended with `+`.
work_dir = <PATH TO project_dir> # path to the directory of the current project (project master path)
raw_data_dir = work_dir + 'raw_data/' # path to the original DICOM dataset
targetpath = work_dir + 'CT_data/' # path to the converted dataset; the CSV files of this notebook are written here

## Prepare for metadata extraction

In [4]:
# DICOM attribute keywords to pull from each scan header, one output column per
# keyword. The order below is the column order of the resulting table.
# 'PixelData' is deliberately left out (it would sit between
# 'PhotometricInterpretation' and 'PixelRepresentation').
col_name = [
    # A - B
    'AccessionNumber', 'AcquisitionNumber',
    'BitsAllocated', 'BitsStored', 'BodyPartExamined',
    # C
    'CTDIPhantomTypeCodeSequence', 'CTDIvol', 'Columns',
    'ContentDate', 'ContentTime',
    'ContrastBolusAgent', 'ContrastBolusIngredientConcentration',
    'ContrastBolusStartTime', 'ContrastBolusStopTime',
    'ContrastBolusTotalDose', 'ContrastBolusVolume',
    'ContrastFlowDuration', 'ContrastFlowRate', 'ConvolutionKernel',
    # D - H
    'DataCollectionCenterPatient', 'DataCollectionDiameter',
    'DateOfLastCalibration',
    'DistanceSourceToDetector', 'DistanceSourceToPatient',
    'EstimatedDoseSaving', 'Exposure', 'ExposureModulationType', 'ExposureTime',
    'FilterType', 'FocalSpots', 'FrameOfReferenceUID',
    'GantryDetectorTilt', 'GeneratorPower',
    'HighBit',
    # I - M
    'ImageOrientationPatient', 'ImagePositionPatient', 'ImageType',
    'InstanceNumber', 'IrradiationEventUID', 'IssuerOfPatientID',
    'KVP',
    'LargestImagePixelValue',
    'Manufacturer', 'ManufacturerModelName', 'Modality',
    # P
    'PatientBirthDate', 'PatientID', 'PatientName', 'PatientPosition', 'PatientSex',
    'PhotometricInterpretation',
    'PixelRepresentation', 'PixelSpacing', 'PositionReferenceIndicator',
    # R
    'ReconstructionDiameter', 'ReconstructionTargetCenterPatient',
    'ReferringPhysicianName',
    'RescaleIntercept', 'RescaleSlope', 'RescaleType',
    'RotationDirection', 'Rows',
    # S
    'SOPClassUID', 'SOPInstanceUID', 'SamplesPerPixel',
    'SeriesInstanceUID', 'SeriesNumber', 'SingleCollimationWidth',
    'SliceLocation', 'SliceThickness', 'SmallestImagePixelValue',
    'SoftwareVersions', 'SpecificCharacterSet', 'SpiralPitchFactor',
    'StudyDate', 'StudyID', 'StudyInstanceUID', 'StudyTime',
    # T - X
    'TableFeedPerRotation', 'TableHeight', 'TableSpeed',
    'TimeOfLastCalibration', 'TotalCollimationWidth',
    'WindowCenter', 'WindowCenterWidthExplanation', 'WindowWidth',
    'XRayTubeCurrent',
]

In [5]:
# Dictionary whose missing keys start out as empty lists (attribute name -> values).
my_dict = defaultdict(list)
# Empty table with one column per requested DICOM attribute.
df = pd.DataFrame(data=None, columns=col_name)

In [6]:
# Per-scan bookkeeping that does not come from the DICOM header itself:
# slice count, source folder, and the folder-name components of the path.
# The generator yields a fresh list for every name, so no two names share a list.
(numberofslices_list,
 location_orig_list,
 series_list,
 basename1_list,
 basename2_list,
 basename3_list,
 normal_pat_list) = ([] for _ in range(7))

## Extract the metadata from each CT scan

In [None]:
# Walk every CT series folder, read the header of one of its DICOM files and
# record the requested attributes plus folder-derived bookkeeping.

# Empty the bookkeeping lists in place so that re-running this cell does not
# leave them longer than the rebuilt DataFrame.
for bookkeeping in (numberofslices_list, location_orig_list, series_list,
                    normal_pat_list, basename1_list, basename2_list, basename3_list):
    bookkeeping.clear()

records = []  # one {attribute: value} dict per CT scan

# sorted() makes the row order reproducible; glob's own order is arbitrary.
for series_path in tqdm.tqdm(sorted(glob.glob(raw_data_dir+'*/*/DICOM/*/*/*/*'))):
    # List the folder once; the listing gives both the sample file and the slice count.
    slice_paths = listdir_nohidden(series_path)
    name = slice_paths[0] # Get 1 DICOM path within the CT scan

    # dcmread replaces the deprecated pydicom.read_file. Only header attributes
    # are extracted (PixelData is not in col_name), so skip loading the pixels.
    ds = dcmread(name, stop_before_pixels=True)

    # Attributes are stored as strings; absent ones become NaN.
    records.append({i: (str(ds[i].value) if i in ds else np.nan) for i in col_name})

    # Split on the platform separator so the folder components are found on
    # any OS. Counted from the end, the components follow the glob pattern:
    # [-7] top-level folder, [-4]/[-3]/[-2] intermediate folders, [-1] series folder.
    path_parts = os.path.normpath(series_path).split(os.sep)

    numberofslices_list.append(len(slice_paths))
    location_orig_list.append(series_path)
    series_list.append(os.path.basename(series_path))
    normal_pat_list.append(path_parts[-7])
    basename1_list.append(path_parts[-4])
    basename2_list.append(path_parts[-3])
    basename3_list.append(path_parts[-2])

# Build the table once instead of concatenating a one-row frame per scan
# (repeated pd.concat copies the whole table on every iteration).
df = pd.DataFrame(records, columns=col_name)
gc.collect()

In [9]:
# Attach the bookkeeping lists as extra columns.
# The mapping order is the order in which the columns are added.
extra_columns = {
    "numberofslices": numberofslices_list,
    "path_to_orig_series": location_orig_list,
    "basename1": basename1_list,
    "basename2": basename2_list,
    "basename3": basename3_list,
    "normal_patient": normal_pat_list,
    "filename_3d": series_list,
}
for column_label, column_values in extra_columns.items():
    df[column_label] = column_values

Get the number of CT scans (stacks of DICOM files).

In [None]:
# The row count of the metadata table is reported as the number of DICOM stacks.
number_of_dicom = df.shape[0]
print("Number of dicom stacks in the dataset: ", number_of_dicom)

### Record labels that are not directly obtained from the DICOM files.

The metadata for this attribute (`ImageType`) is stored as a list. Its entries are split into individual columns:

- `imagetype_1`: ORIGINAL
- `imagetype_2`: PRIMARY / SECONDARY
- `imagetype_3`: AXIAL / LOCALIZER

In [None]:
def _split_image_type(raw_value, filler='problemInput'):
    """Return the first three components of a stringified ImageType value.

    Parameters
    ----------
    raw_value : str or float
        One cell of the ``ImageType`` column: the ``str()`` of the DICOM value,
        or NaN when the attribute was absent from the file.
    filler : str
        Placeholder used for every component that is missing.

    Returns
    -------
    tuple
        Exactly three entries, so every row contributes to all three columns.
    """
    # NaN (attribute absent) or any other non-string cell: nothing to parse.
    if not isinstance(raw_value, str):
        return (filler, filler, filler)
    try:
        parsed = ast.literal_eval(raw_value)
    except (ValueError, SyntaxError):
        # Not a Python literal, e.g. a bare single value such as ORIGINAL.
        parsed = raw_value
    # A lone value is treated as a one-component ImageType.
    if not isinstance(parsed, (list, tuple)):
        parsed = [parsed]
    head = list(parsed[:3])
    return tuple(head + [filler] * (3 - len(head)))


# Iterate over the column itself: no dependency on a separately computed row
# count or on the index labels being 0..n-1.
image_type_parts = [_split_image_type(value) for value in tqdm.tqdm(df["ImageType"])]
imagetype_1_list = [parts[0] for parts in image_type_parts]
imagetype_2_list = [parts[1] for parts in image_type_parts]
imagetype_3_list = [parts[2] for parts in image_type_parts]

df["imagetype_1"]=imagetype_1_list
df["imagetype_2"]=imagetype_2_list
df["imagetype_3"]=imagetype_3_list

In [16]:
# save the metadata to csv
# to_csv does not create missing folders, so make sure the target exists first.
os.makedirs(targetpath, exist_ok=True)
df.to_csv(targetpath + 'DICOM_Data.csv',index=False)

## Get abnormal labels

Generate the `label.csv`. The format is:

| PatientName |   image_name | abnormal |
|:------------|-------------:|---------:|
|    Patient1 |      xxx.nii |        1 |
|    Patient1 |      yyy.nii |        1 |
|    Patient2 |      zzz.nii |        0 |

where `Patient1` has two CT scans (`xxx.nii` and `yyy.nii`) labeled as abnormal (1) and `Patient2` has one CT scan (`zzz.nii`) labeled as normal (0) in the dataset.

In [None]:
# Build the label table from the patient name, the series folder name and the
# top-level folder name of each scan.
# .copy() gives df_label its own data, so the column assignments below do not
# write into a slice of df (which triggers pandas' SettingWithCopyWarning).
df_label = df[["PatientName","filename_3d","normal_patient"]].copy()

# Top-level folder name -> binary label.
abnormal_map = {'normal':0, 'abnormal':1}
df_label["abnormal"] = df_label["normal_patient"].map(abnormal_map)

# Folder names outside abnormal_map end up as NaN labels; report them instead
# of writing them to disk unnoticed.
unmapped_folders = df_label.loc[df_label["abnormal"].isna(), "normal_patient"].unique()
if len(unmapped_folders) > 0:
    print("WARNING: no label defined for folder(s):", list(unmapped_folders))

# image_name = series folder name + ".nii"
# (presumably the file name given to the converted NIfTI volume - verify against the conversion step).
df_label["image_name"] = df_label["filename_3d"] + ".nii"
df_label = df_label[["PatientName","image_name","abnormal"]]

#save the table as label.csv
df_label.to_csv(targetpath + 'label.csv',index=False)