In [8]:
import os
import numpy as np
import pandas as pd
import cv2
from sklearn.model_selection import train_test_split

In [9]:
csv_path = "datasets/csv"
csv_dirs = os.listdir(csv_path)

# understanding the data provided: load every CSV in the folder once so its
# contents can be inspected (uncomment the print/display lines to look at them)
for file in csv_dirs:
    full_path = os.path.join(csv_path, file)
    df_name, extension = os.path.splitext(file)
    # os.listdir returns every directory entry, including sub-folders and
    # non-CSV files; pd.read_csv would fail on those, so skip them
    if extension.lower() != ".csv" or not os.path.isfile(full_path):
        continue
    # print(df_name)
    df = pd.read_csv(full_path)
    # display(df.head(1))

In [10]:
# columns worth keeping from each kind of CSV
keep_set = ["patient_id", "pathology", "image file path"]
keep_dicom = ["image_path", "SeriesInstanceUID"]
keep_meta = ["SeriesInstanceUID", "StudyInstanceUID"]

# read each CSV as its own dataset, keep only the relevant columns and
# drop rows that become exact duplicates after the column selection
calc_case_description_test_set = (
    pd.read_csv("datasets/csv/calc_case_description_test_set.csv")[keep_set]
    .drop_duplicates()
)
calc_case_description_train_set = (
    pd.read_csv("datasets/csv/calc_case_description_train_set.csv")[keep_set]
    .drop_duplicates()
)
dicom_info = (
    pd.read_csv("datasets/csv/dicom_info.csv")[keep_dicom]
    .drop_duplicates()
)
mass_case_description_test_set = (
    pd.read_csv("datasets/csv/mass_case_description_test_set.csv")[keep_set]
    .drop_duplicates()
)
mass_case_description_train_set = (
    pd.read_csv("datasets/csv/mass_case_description_train_set.csv")[keep_set]
    .drop_duplicates()
)
meta = (
    pd.read_csv("datasets/csv/meta.csv")[keep_meta]
    .drop_duplicates()
)

In [11]:
def _remove_shared_patients(calc_df, mass_df):
    """Drop every patient that appears in both ``calc_df`` and ``mass_df``.

    Returns a 4-tuple ``(matching, shared_ids, calc_kept, mass_kept)``:
    the inner merge of the two frames on ``patient_id``, the list of unique
    patient ids found in that merge, and the two input frames with those
    patients filtered out.
    """
    # all entries whose patient_id occurs in both frames
    matching = pd.merge(
        calc_df, mass_df,
        how="inner", left_on="patient_id", right_on="patient_id",
    )
    # unique patient ids that have to be removed from the original sets
    shared_ids = matching["patient_id"].unique().tolist()
    # keep only the rows whose patient is NOT shared
    calc_kept = calc_df[~calc_df["patient_id"].isin(shared_ids)]
    mass_kept = mass_df[~mass_df["patient_id"].isin(shared_ids)]
    return matching, shared_ids, calc_kept, mass_kept


# test sets: remove patients present in both the calc and the mass table
(
    matching_test,
    unique_patient_ids,
    calc_case_description_test_set,
    mass_case_description_test_set,
) = _remove_shared_patients(calc_case_description_test_set, mass_case_description_test_set)

# train sets: same clean-up (unique_patient_ids ends up holding the train ids)
(
    matching_train,
    unique_patient_ids,
    calc_case_description_train_set,
    mass_case_description_train_set,
) = _remove_shared_patients(calc_case_description_train_set, mass_case_description_train_set)

In [12]:
def connectDicom(dataset: pd.DataFrame, dicom: "pd.DataFrame | None" = None) -> pd.DataFrame:
    """Attach DICOM image information to a case-description table.

    The third "/"-separated component of each "image file path" value is
    used as a join key and matched against "SeriesInstanceUID" in the DICOM
    table with an inner merge, so rows without a matching series are dropped.

    Parameters
    ----------
    dataset : DataFrame containing an "image file path" column. It is left
        unmodified; the work is done on a copy.
    dicom : DataFrame containing a "SeriesInstanceUID" column. Defaults to
        the notebook-level ``dicom_info`` when omitted.

    Returns
    -------
    A new DataFrame with the columns of ``dataset`` followed by the columns
    of the DICOM table, minus the helper "common_id" column and the original
    "image file path" column.
    """
    if dicom is None:
        dicom = dicom_info

    # assign() returns a copy, so the caller's frame (often a filtered slice
    # of another frame) is never written to and no SettingWithCopyWarning
    # is raised
    keyed = dataset.assign(
        common_id=dataset["image file path"].str.split("/").str[2]
    )
    # merging where common_id matches SeriesInstanceUID
    merged = pd.merge(
        keyed, dicom,
        how="inner", left_on="common_id", right_on="SeriesInstanceUID",
    )
    # dropping the helper key and the original "image file path"
    return merged.drop(columns=["common_id", "image file path"])

# running connectDicom over each of the four case tables, in the same order,
# and rebinding every name to its DICOM-enriched result
(
    calc_case_description_test_set,
    calc_case_description_train_set,
    mass_case_description_test_set,
    mass_case_description_train_set,
) = [
    connectDicom(case_table)
    for case_table in (
        calc_case_description_test_set,
        calc_case_description_train_set,
        mass_case_description_test_set,
        mass_case_description_train_set,
    )
]

In [14]:
# stacking the 4 case tables into one frame so a fresh train/test split can
# be drawn from all of them (each frame keeps its own row labels)
dataset = pd.concat(
    [
        calc_case_description_test_set,
        calc_case_description_train_set,
        mass_case_description_test_set,
        mass_case_description_train_set,
    ]
)

# modifying path to follow actual image path
def modify_path(img_path):
    """Re-root an image path under the local ``datasets/jpeg`` folder.

    Everything after the first "jpeg/" in ``img_path`` is kept and prefixed
    with "datasets/jpeg/".

    Raises
    ------
    IndexError
        If ``img_path`` does not contain "jpeg/".
    """
    # maxsplit=1: only the first "jpeg/" marks the root to replace, so a later
    # "jpeg/" inside the remaining path is preserved instead of truncating it
    relative_part = img_path.split("jpeg/", 1)[1]
    return f"datasets/jpeg/{relative_part}"

# point every image_path at the local jpeg folder
dataset["image_path"] = dataset["image_path"].map(modify_path)

# 80/20 train/test split with a fixed seed so the split is reproducible
# NOTE(review): the split is row-wise, so rows sharing a patient_id may end up
# in both sets — confirm this is acceptable or switch to a grouped split
train_data, test_data = train_test_split(
    dataset,
    test_size=0.2,
    random_state=42,
)

# persist both splits as CSV (row labels are not written)
for split_frame, csv_target in (
    (train_data, "datasets/csv/train_data.csv"),
    (test_data, "datasets/csv/test_data.csv"),
):
    split_frame.to_csv(csv_target, index=False)

# preview the first rows of each split
display(train_data.head(), test_data.head())

Unnamed: 0,patient_id,pathology,image_path,SeriesInstanceUID
176,P_00339,MALIGNANT,datasets/jpeg/1.3.6.1.4.1.9590.100.1.2.1925301...,1.3.6.1.4.1.9590.100.1.2.192530197711481513109...
182,P_00355,BENIGN,datasets/jpeg/1.3.6.1.4.1.9590.100.1.2.4105746...,1.3.6.1.4.1.9590.100.1.2.410574697111807072009...
398,P_00692,BENIGN,datasets/jpeg/1.3.6.1.4.1.9590.100.1.2.8590696...,1.3.6.1.4.1.9590.100.1.2.859069621116119183062...
893,P_01481,MALIGNANT,datasets/jpeg/1.3.6.1.4.1.9590.100.1.2.2596841...,1.3.6.1.4.1.9590.100.1.2.259684152811080267334...
101,P_00196,BENIGN,datasets/jpeg/1.3.6.1.4.1.9590.100.1.2.8708234...,1.3.6.1.4.1.9590.100.1.2.870823408111977721032...


Unnamed: 0,patient_id,pathology,image_path,SeriesInstanceUID
537,P_00901,MALIGNANT,datasets/jpeg/1.3.6.1.4.1.9590.100.1.2.3947922...,1.3.6.1.4.1.9590.100.1.2.394792236611156603001...
370,P_00667,MALIGNANT,datasets/jpeg/1.3.6.1.4.1.9590.100.1.2.3969023...,1.3.6.1.4.1.9590.100.1.2.396902344711303322128...
574,P_00961,MALIGNANT,datasets/jpeg/1.3.6.1.4.1.9590.100.1.2.3311828...,1.3.6.1.4.1.9590.100.1.2.331182811135153738204...
177,P_01390,MALIGNANT,datasets/jpeg/1.3.6.1.4.1.9590.100.1.2.2551878...,1.3.6.1.4.1.9590.100.1.2.255187836210844252836...
313,P_00518,BENIGN,datasets/jpeg/1.3.6.1.4.1.9590.100.1.2.1490924...,1.3.6.1.4.1.9590.100.1.2.149092480112927640720...
