In [None]:
import os
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import pydicom
from tqdm import tqdm
from matplotlib import pyplot as plt

In [None]:
def load_csv(csv_path):
    """Read a long-format label CSV and return it in wide format.

    The ``ID`` column of the file is split on ``'_'`` into exactly three
    pieces; the second piece becomes ``Image`` and the third ``Diagnosis``.
    Duplicate rows are dropped, the table is pivoted so that every distinct
    ``Diagnosis`` value becomes a column filled from ``Label``, and the
    ``'ID_'`` prefix is put back in front of each ``Image`` value.

    Parameters
    ----------
    csv_path : str or file-like
        Anything accepted by ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        One row per image: an ``Image`` column followed by one column per
        diagnosis.
    """
    labels_long = pd.read_csv(csv_path)

    # Assigning to three columns at once requires the split to yield exactly
    # three parts; the first part overwrites ``ID`` and is discarded below.
    id_parts = labels_long['ID'].str.split('_', expand=True)
    labels_long[['ID', 'Image', 'Diagnosis']] = id_parts

    labels_long = labels_long.drop(['ID'], axis=1)
    labels_long = labels_long.drop_duplicates()

    labels_wide = labels_long.pivot(index='Image', columns='Diagnosis', values='Label')
    labels_wide = labels_wide.reset_index()
    labels_wide['Image'] = 'ID_' + labels_wide['Image']
    return labels_wide

def get_metadata(path, files):
    """Collect the header fields of a set of DICOM files into one DataFrame.

    Parameters
    ----------
    path : str
        Directory that contains the files (with or without a trailing
        separator).
    files : iterable of str
        File names relative to ``path``.

    Returns
    -------
    pandas.DataFrame
        One row per file. ``ID`` holds the file name without its extension;
        the remaining columns are the keywords reported by ``dcm.dir()``,
        except ``Rows``, ``Columns`` and ``PixelData``. A keyword that a
        given file does not carry is left missing (NaN) in that file's row.
    """
    ignored = {'Rows', 'Columns', 'PixelData'}
    records = []

    for filename in tqdm(files):
        # Pixel data is never used here, so do not read it from disk.
        dcm = pydicom.dcmread(os.path.join(path, filename), stop_before_pixels=True)

        # One dict per file keeps every value attached to its own row, even
        # when files expose different sets of keywords. Per-column lists would
        # end up with different lengths (or silently shifted rows) in that case.
        record = {'ID': os.path.splitext(filename)[0]}
        for name in dcm.dir():
            if name not in ignored:
                record[name] = dcm[name].value
        records.append(record)

    return pd.DataFrame(records)

In [None]:
# Root of the data directory and the two image folders read below.
# Every value keeps its trailing slash because file names are appended
# directly to these strings.
path = '../data/'
path_train = os.path.join(path, 'stage_2_train_images/')
path_test = os.path.join(path, 'stage_2_test_images/')

In [None]:
# os.listdir returns entries in arbitrary order; sorting makes the row order
# of the exported CSV reproducible from run to run.
train_files = sorted(os.listdir(path_train))
train_meta = get_metadata(path_train, train_files)
# NOTE(review): the file name says "stage1" although the images are read from
# the stage_2 directory; left unchanged because other code may read this
# name — confirm before renaming.
train_meta.to_csv('train_stage1_metadata.csv', index=False)

In [None]:
# os.listdir returns entries in arbitrary order; sorting makes the row order
# of the exported CSV reproducible from run to run.
test_files = sorted(os.listdir(path_test))
test_meta = get_metadata(path_test, test_files)
test_meta.to_csv('test_stage_2_metadata.csv', index=False)