### Import Libraries

In [27]:
# import libraries
import pandas as pd
import os

### Read description data for training and testing 

In [90]:
# Load the image metadata table
meta_path = 'CBIS-DDSM_Data/images_dataset/metadata.csv' 
metadata_df = pd.read_csv(meta_path)

# Load the four case-description tables (mass/calc x train/test) in one pass;
# the unpacking order matches the order of the paths below.
mass_train_orig, mass_test_orig, calc_train_orig, calc_test_orig = map(
    pd.read_csv,
    [
        "CBIS-DDSM_Data/mass_case_description_train_set.csv",
        "CBIS-DDSM_Data/mass_case_description_test_set.csv",
        "CBIS-DDSM_Data/calc_case_description_train_set.csv",
        "CBIS-DDSM_Data/calc_case_description_test_set.csv",
    ],
)

In [92]:
# Show the image file path of the first row (index label 0) of the calc test set
calc_test_orig.loc[0, "image file path"]

'Calc-Test_P_00038_LEFT_CC/1.3.6.1.4.1.9590.100.1.2.85935434310203356712688695661986996009/1.3.6.1.4.1.9590.100.1.2.374115997511889073021386151921807063992/000000.dcm'

### Combine mass and calcification data files  

In [48]:
# Unified column names used for both abnormality types
col_new_names = [
    "breast_density",
    "breast_side",
    "image_view",
    "abnormality_type",
    "pathology",
    "image_path",
    "cropped_image_path",
    "ROI_mask_path",
]

# Original headers, listed in the same order as col_new_names. The two file
# families differ only in the density header ("breast_density" vs "breast density").
mass_names = [
    "breast_density",
    "left or right breast",
    "image view",
    "abnormality type",
    "pathology",
    "image file path",
    "cropped image file path",
    "ROI mask file path",
]
calc_names = [
    "breast density",
    "left or right breast",
    "image view",
    "abnormality type",
    "pathology",
    "image file path",
    "cropped image file path",
    "ROI mask file path",
]

# Build each old-name -> new-name mapping once and reuse it for train and test
mass_rename_map = dict(zip(mass_names, col_new_names))
calc_rename_map = dict(zip(calc_names, col_new_names))

mass_train_renamed = mass_train_orig.rename(columns=mass_rename_map)
mass_test_renamed = mass_test_orig.rename(columns=mass_rename_map)
calc_train_renamed = calc_train_orig.rename(columns=calc_rename_map)
calc_test_renamed = calc_test_orig.rename(columns=calc_rename_map)

In [50]:
# Print the number of cases in each description file.
# The labels name the split explicitly so the test-set counts are not
# reported as training counts.
print("Mass training cases: ", len(mass_train_renamed))
print("Mass test cases: ", len(mass_test_renamed))
print("Calc training cases: ", len(calc_train_renamed))
print("Calc test cases: ", len(calc_test_renamed))

Mass trained cases:  1318
Mass trained cases:  378
Calc trained cases:  1546
Calc trained cases:  326


In [61]:
# Stack the mass and calcification rows (shared columns only) into one
# training frame and one test frame, renumbering the index from 0.
train_orig = pd.concat(
    (frame[col_new_names] for frame in (mass_train_renamed, calc_train_renamed)),
    ignore_index=True,
)
test_orig = pd.concat(
    (frame[col_new_names] for frame in (mass_test_renamed, calc_test_renamed)),
    ignore_index=True,
)

# Report the combined sizes
print("Train cases: ", train_orig.shape[0])
print("Test cases: ", test_orig.shape[0])

Train cases:  2864
Test cases:  704


### Extract series UIDs from paths 
Extract the series UIDs from the image paths listed in the description files and from the names of the image files stored in `images_png`.

In [94]:
# Image paths recorded in the description files
train_series = train_orig["image_path"]
test_series = test_orig["image_path"]

# The series UID is the third "/"-separated component of each path
train_series_list = train_series.map(lambda image_path: image_path.split("/")[2]).tolist()
test_series_list = test_series.map(lambda image_path: image_path.split("/")[2]).tolist()

# Show the first path of each split next to the UID extracted from it
print("Train path: ", train_series.iloc[0])
print("Train serie: ", train_series_list[0])
print()
print("Test path: ", test_series.iloc[0])
print("Test serie: ", test_series_list[0])

Train path:  Mass-Training_P_00001_LEFT_CC/1.3.6.1.4.1.9590.100.1.2.422112722213189649807611434612228974994/1.3.6.1.4.1.9590.100.1.2.342386194811267636608694132590482924515/000000.dcm
Train serie:  1.3.6.1.4.1.9590.100.1.2.342386194811267636608694132590482924515

Test path:  Mass-Test_P_00016_LEFT_CC/1.3.6.1.4.1.9590.100.1.2.416403281812750683720028031170500130104/1.3.6.1.4.1.9590.100.1.2.245063149211255120613007755642780114172/000000.dcm
Test serie:  1.3.6.1.4.1.9590.100.1.2.245063149211255120613007755642780114172


In [95]:
# Attach the extracted series UIDs as a new "series_uid" column on both sets
for frame, uids in ((train_orig, train_series_list), (test_orig, test_series_list)):
    frame["series_uid"] = uids

# Spot-check two training rows by position (the first row and row 2000)
for position in (0, 2000):
    print(train_orig.iloc[position])

breast_density                                                        3
breast_side                                                        LEFT
image_view                                                           CC
abnormality_type                                                   mass
pathology                                                     MALIGNANT
image_path            Mass-Training_P_00001_LEFT_CC/1.3.6.1.4.1.9590...
cropped_image_path    Mass-Training_P_00001_LEFT_CC_1/1.3.6.1.4.1.95...
ROI_mask_path         Mass-Training_P_00001_LEFT_CC_1/1.3.6.1.4.1.95...
series_uid            1.3.6.1.4.1.9590.100.1.2.342386194811267636608...
Name: 0, dtype: object
breast_density                                                        3
breast_side                                                       RIGHT
image_view                                                           CC
abnormality_type                                          calcification
pathology                                

In [96]:
# Collect the file names of the images stored in the images directory.
path_to_imgs = "images_png/"
# os.listdir returns entries in arbitrary order, so slicing off the first
# entry ([1:]) would drop an unpredictable file. Instead keep only the PNG
# files explicitly and sort them so the listing is reproducible.
paths_to_files = sorted(
    entry for entry in os.listdir(path_to_imgs) if entry.lower().endswith(".png")
)
paths_to_files[:3]

['1.3.6.1.4.1.9590.100.1.2.1000188793118245351251151451524542911321-1.png',
 '1.3.6.1.4.1.9590.100.1.2.1000188793118245351251151451524542911321-2.png',
 '1.3.6.1.4.1.9590.100.1.2.1001312081106048061172717354220833515471-1.png']

In [97]:
# Build the list of series UIDs from the image file names by dropping the
# trailing 7 characters: the 3 characters before the extension plus ".png"
# (e.g. "...2911321-1.png" -> "...291132").
# Iterates over paths_to_files (the directory listing); the previously used
# name `paths_to_imgs` is not defined anywhere and raises NameError.
# NOTE(review): the fixed-width slice assumes every file name ends in exactly
# 3 characters + ".png" — confirm no file has a longer numeric suffix.
series_to_imgs = [file_name[:-7] for file_name in paths_to_files]
series_to_imgs[:3]

['1.3.6.1.4.1.9590.100.1.2.100018879311824535125115145152454291132',
 '1.3.6.1.4.1.9590.100.1.2.100018879311824535125115145152454291132',
 '1.3.6.1.4.1.9590.100.1.2.100131208110604806117271735422083351547']

In [98]:
# Check whether a UID taken from a description-file path is among the UIDs
# derived from the image file names.
# NOTE(review): this UID is the second "/"-separated component of the first
# calc test image path, whereas the series UIDs extracted for the description
# files use split("/")[2] (the third component) — confirm which component is
# meant to match the image file names.
"1.3.6.1.4.1.9590.100.1.2.85935434310203356712688695661986996009" in series_to_imgs

False

In [99]:
# Scratch check: inspect the Python type of a UID literal
type("1.3.6.1.4.1.9590.100.1.2.161465562211359959230647609981488894942")

str

In [100]:
# Show the image path of the first row (index label 0) of the renamed calc test set
calc_test_renamed.loc[0, "image_path"]

'Calc-Test_P_00038_LEFT_CC/1.3.6.1.4.1.9590.100.1.2.85935434310203356712688695661986996009/1.3.6.1.4.1.9590.100.1.2.374115997511889073021386151921807063992/000000.dcm'

In [14]:
# Test whether the second element of path_parts occurs in path.
# NOTE(review): neither `path_parts` nor `path` is defined in the visible
# cells of this notebook, so this cell fails on a fresh kernel — define them
# or remove the cell. TODO confirm the intended inputs.
path_parts[1] in path

AttributeError: 'list' object has no attribute 'values'

In [None]:
# Look up the metadata rows whose "Series UID" contains a given UID.
# The previous version referenced an undefined `df` and a placeholder column
# ('Column1'); it now queries metadata_df's "Series UID" column directly.
# NOTE(review): the original search term was a placeholder ('an'); the UID
# below is the one used in the membership check earlier in this notebook —
# replace it with the UID you actually want to find.
uid_query = "1.3.6.1.4.1.9590.100.1.2.85935434310203356712688695661986996009"

# regex=False: the dots in a UID must match literally, not as regex wildcards.
# na=False: rows with a missing "Series UID" are treated as non-matching.
result = metadata_df[
    metadata_df["Series UID"].str.contains(uid_query, case=False, na=False, regex=False)
]
result

In [7]:
# List the column names of the metadata table and of the original mass and
# calc training tables, one labelled line per table.
for label, frame in (
    ("meta", metadata_df),
    ("mass", mass_train_orig),
    ("calc", calc_train_orig),
):
    print(f"{label} columns: ", frame.columns)

meta columns:  Index(['Series UID', 'Collection', '3rd Party Analysis',
       'Data Description URI', 'Subject ID', 'Study UID', 'Study Description',
       'Study Date', 'Series Description', 'Manufacturer', 'Modality',
       'SOP Class Name', 'SOP Class UID', 'Number of Images', 'File Size',
       'File Location', 'Download Timestamp'],
      dtype='object')
mass columns:  Index(['patient_id', 'breast_density', 'left or right breast', 'image view',
       'abnormality id', 'abnormality type', 'mass shape', 'mass margins',
       'assessment', 'pathology', 'subtlety', 'image file path',
       'cropped image file path', 'ROI mask file path'],
      dtype='object')
calc columns:  Index(['patient_id', 'breast density', 'left or right breast', 'image view',
       'abnormality id', 'abnormality type', 'calc type', 'calc distribution',
       'assessment', 'pathology', 'subtlety', 'image file path',
       'cropped image file path', 'ROI mask file path'],
      dtype='object')
