In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import cv2 as cv
import numpy as np
from dataclasses import dataclass
import os
import random
from numba import njit, jit
from sklearn.utils import shuffle
import shutil



In [2]:
## Read EMBED dataset metadata files.
magview_path = '/data/mammo/png/magview_all_cohorts_anon_HITI.csv'
metadata_path = '/data/mammo/png/metadata_all_cohort_with_ROI_HITI.csv'


# In[ ]:





# In[3]:

## Read magview image findings file.
df_mag = pd.read_csv(magview_path)
df_mag['study_date_anon'] = pd.to_datetime(df_mag['study_date_anon'], errors='coerce', format= '%Y-%m-%d')
list(df_mag.columns)

  df_mag = pd.read_csv(magview_path)


['Unnamed: 0.2',
 'Unnamed: 0.1',
 'index',
 'Unnamed: 0',
 'massshape',
 'massmargin',
 'massdens',
 'calcfind',
 'calcdistri',
 'calcnumber',
 'otherfind',
 'implanfind',
 'consistent',
 'side',
 'size',
 'location',
 'depth',
 'distance',
 'numfind',
 'asses',
 'recc',
 'stable',
 'new',
 'changed',
 'loc_num',
 'tech_init',
 'init',
 'proccode',
 'desc',
 'vtype',
 'tissueden',
 'case',
 'type',
 'technique',
 'biopsite',
 'biop_loc',
 'bcomp',
 'path_loc',
 'diag_out',
 'surgery',
 'lymphsurg',
 'surg_loc',
 'pocomp',
 'ltcomp',
 'bside',
 'path1',
 'path2',
 'path3',
 'path4',
 'path5',
 'path6',
 'path7',
 'path8',
 'path9',
 'path10',
 'concord',
 'hgrade',
 'tnmpt',
 'tnmpn',
 'tnmm',
 'tnmdesc',
 'tnmr',
 'stage',
 'loc',
 'bdepth',
 'bdistance',
 'focality',
 'nfocal',
 'specsize',
 'specsize2',
 'specsize3',
 'dcissize',
 'invsize',
 'superior',
 'inferior',
 'anterior',
 'posterior',
 'medial',
 'lateral',
 'specinteg',
 'specnum',
 'specembed',
 'est',
 'estp',
 'her2',
 

In [9]:
# see racial distribution.
df_mag.groupby(['ETHNICITY_DESC'])['ETHNICITY_DESC'].count()

ETHNICITY_DESC
African American  or Black                   189572
American Indian or Alaskan Native               892
Asian                                         22597
Caucasian or White                           180183
Hispanic                                         34
Multiple                                       1378
Native Hawaiian or Other Pacific Islander      4053
Not Recorded                                     32
Patient Declines                                 20
Unknown, Unavailable or Unreported            28373
Name: ETHNICITY_DESC, dtype: int64

In [12]:
# df_mag['GENDER_DESC'].frequency()
df_mag.groupby(['GENDER_DESC'])['GENDER_DESC'].count()

GENDER_DESC
Female     425154
Male         1975
Unknown         5
Name: GENDER_DESC, dtype: int64

In [13]:
## Read file metadata files
df_meta = pd.read_csv(metadata_path)
list(df_meta.columns)

  df_meta = pd.read_csv(metadata_path)


['Unnamed: 0',
 'AcquisitionContextSequence',
 'AcquisitionTime',
 '0_AnatomicRegionSequence_CodeMeaning',
 '0_AnatomicRegionSequence_CodeValue',
 '0_AnatomicRegionSequence_CodingSchemeDesignator',
 'AnodeTargetMaterial',
 'BitsAllocated',
 'BitsStored',
 'BodyPartExamined',
 'BodyPartThickness',
 'BreastImplantPresent',
 'BurnedInAnnotation',
 'Columns',
 'CompressionForce',
 'ContentTime',
 'DetectorBinning',
 'DetectorConditionsNominalFlag',
 'DetectorTemperature',
 'DetectorType',
 'DistanceSourceToDetector',
 'DistanceSourceToPatient',
 'EntranceDose',
 'EntranceDoseInmGy',
 'EstimatedRadiographicMagnificationFactor',
 'Exposure',
 'ExposureControlMode',
 'ExposureControlModeDescription',
 'ExposureInuAs',
 'ExposureTime',
 'ExposureTimeInuS',
 'FieldOfViewHorizontalFlip',
 'FieldOfViewOrigin',
 'FieldOfViewRotation',
 'FilterMaterial',
 'FilterThicknessMaximum',
 'FilterThicknessMinimum',
 'FilterType',
 'FocalSpots',
 'Grid',
 'HalfValueLayer',
 'HighBit',
 'ImageLaterality',
 '

In [14]:

## Data integrity check. EMBED only contains cohort 1 and 2.

print(df_meta.cohort_num.unique())
print(df_mag.cohort_num.unique())

df_meta.cohort_num = df_meta.cohort_num.astype(str) 
df_mag.cohort_num = df_mag.cohort_num.astype(str)



df_meta_one_two = df_meta[df_meta.cohort_num.isin(['1','2'])]
df_mag_one_two = df_mag[df_mag.cohort_num.isin(['1','2'])]


[ 1  2  3  4  5  6  7  8  9 10]
['1' '2' '3' '4' '5' '6' '7' '8' '9' '10']


In [15]:
print(df_meta_one_two.cohort_num.unique())
print(df_mag_one_two.cohort_num.unique())


# In[11]:


# Get diagnostic exams with BIRADS 4/5/6
df_mag_diag_pos = df_mag_one_two[df_mag_one_two.asses.isin(['S','M','K']) & 
                         df_mag_one_two.desc.str.contains('diag', case=False)]

# Get and rename relevant columns to prepare for merge with screening exams
## racial information is included in the magview file. 
df_mag_diag_pos_empi = df_mag_diag_pos[['empi_anon', 
                                        'acc_anon', 
                                        'numfind', 
                                        'bside', 
                                        'study_date_anon', 'ETHNICITY_DESC', 'GENDER_DESC', 'MARITAL_STATUS_DESC',
                                        'asses']]
## column names are labelled in simple language. 
df_mag_diag_pos_empi.columns = ['empi_anon', 
                                'acc_anon_diag', 
                                'diag_num', 
                                'diag_side', 
                                'diag_study_date', 'race', 'gender', 'marriage',
                                'diag_asses']

# Get screening exams and left merge on empi_anon with df_mag_diag_pos_empi
df_mag_scr = df_mag_one_two[df_mag_one_two.desc.str.contains('screen', case=False)]
df_mag_scr_pos = df_mag_diag_pos_empi.merge(df_mag_scr, on='empi_anon', how='left')

# Keep only screening exams with time diff less than +180 days
df_mag_scr_pos = df_mag_scr_pos.loc[(df_mag_scr_pos.side == df_mag_scr_pos.diag_side)]
df_mag_scr_pos['study_date_diff'] = df_mag_scr_pos.diag_study_date - df_mag_scr_pos.study_date_anon

df_mag_scr_pos_rel = df_mag_scr_pos.loc[(df_mag_scr_pos.study_date_diff.dt.days >= 0) & 
                                        (df_mag_scr_pos.study_date_diff.dt.days <= 180)]

# Get relevant columns from df_meta or merge entire dataset
df_meta_rel = df_meta_one_two[['empi_anon', 
                       'acc_anon', 
                       'ViewPosition', 
                       'ImageLateralityFinal', 
                       'FinalImageType', 
                       'png_path', 
                       'num_roi', 
                       'ROI_coords']]

# Merge df_meta_rel with positive exams in magview
df_meta_scr_pos_rel = df_mag_scr_pos_rel.merge(df_meta_rel, 
                                               left_on=['empi_anon', 'acc_anon', 'diag_side'], 
                                               right_on=['empi_anon', 'acc_anon', 'ImageLateralityFinal'], 
                                               how='inner')

# Keep only images with 1 or 2 ROIs
df_meta_scr_pos_rel = df_meta_scr_pos_rel[(df_meta_scr_pos_rel.num_roi == 1) | 
                                          (df_meta_scr_pos_rel.num_roi == 2)]

# Keep only 2D images
df_meta_scr_pos_rel = df_meta_scr_pos_rel[df_meta_scr_pos_rel.FinalImageType == '2D'].reset_index()

print(df_meta_scr_pos_rel.num_roi.value_counts())
print("Positive number")
print(len(df_meta_scr_pos_rel))


['1' '2']
['1' '2']
1    1053
2     136
Name: num_roi, dtype: int64
Positive number
1189


In [16]:
## Eventual racial distribution displayed
df_meta_scr_pos_rel.groupby(['race'])['race'].count()

race
African American  or Black                   473
Asian                                         67
Caucasian or White                           589
Native Hawaiian or Other Pacific Islander      6
Unknown, Unavailable or Unreported            54
Name: race, dtype: int64

In [17]:
df_meta_scr_pos_rel.groupby(['GENDER_DESC'])['GENDER_DESC'].count()

GENDER_DESC
Female    1189
Name: GENDER_DESC, dtype: int64

In [18]:
## images are converted into 800 x 600 to make them fit into 8 GB GPUs.
import cv2 as cv

df = df_meta_scr_pos_rel
df = shuffle(df)
all_rois_list = []
all_file_empi_anon_pos = []
save_dir = '/home/jupyter-ihwan28/breast_simple_comparison_by_race/images/800x600/br12_456/'

def CoppFileToDirectory_Pos(Save_Dir):
    for i in range(len(df)):
        
        img_path = df.png_path[i]
        empi_anon = df.empi_anon[i]
        race = df.race[i]
        marriage = df.marriage[i]
        image_array = cv.imread(img_path)
        resized_image = cv.resize(image_array, (600, 800), interpolation= cv.INTER_AREA)
        if not os.path.exists(Save_Dir):
        # if the demo_folder directory is not present
        # then create it.
            os.makedirs(Save_Dir)
            os.makedirs(Save_Dir + "/pos/")
            os.makedirs(Save_Dir + "/neg/")
        print(str(i) + "th iteration")
        filename = os.path.basename(img_path)
        filename = filename +'_' +str(i) + '.png'
        newPath = Save_Dir+'/pos/' + filename
#         shutil.copy(img_path, newPath)
        cv.imwrite(newPath, resized_image)
        all_file_empi_anon_pos.append([empi_anon, newPath, race, marriage])

CoppFileToDirectory_Pos(save_dir)
## Information extracted are stored in separate csv files. 
pos_df = pd.DataFrame(all_file_empi_anon_pos, columns=['empi_anon', 'file_path', 'race', 'marriage'])
pos_df.to_csv('/home/jupyter-ihwan28/breast_simple_comparison_by_race/images/800x600/br12_456/pos_empi_path.csv')

0th iteration
1th iteration
2th iteration
3th iteration
4th iteration
5th iteration
6th iteration
7th iteration
8th iteration
9th iteration
10th iteration
11th iteration
12th iteration
13th iteration
14th iteration
15th iteration
16th iteration
17th iteration
18th iteration
19th iteration
20th iteration
21th iteration
22th iteration
23th iteration
24th iteration
25th iteration
26th iteration
27th iteration
28th iteration
29th iteration
30th iteration
31th iteration
32th iteration
33th iteration
34th iteration
35th iteration
36th iteration
37th iteration
38th iteration
39th iteration
40th iteration
41th iteration
42th iteration
43th iteration
44th iteration
45th iteration
46th iteration
47th iteration
48th iteration
49th iteration
50th iteration
51th iteration
52th iteration
53th iteration
54th iteration
55th iteration
56th iteration
57th iteration
58th iteration
59th iteration
60th iteration
61th iteration
62th iteration
63th iteration
64th iteration
65th iteration
66th iteration
67th 

In [19]:
### Same processes are conducted for negative labelled images.

df_mag_scr_neg = df_mag_one_two[df_mag_one_two.desc.str.contains('screen', case=False) & 
                        df_mag_one_two.asses.isin(['N','B'])]

# Exclude patients that are included in the positive set
df_mag_scr_neg_rel = df_mag_scr_neg[~df_mag_scr_neg.empi_anon.isin(df_meta_scr_pos_rel.empi_anon)].sort_index()

# Merge df_meta_rel with negative screening exams in magview
df_meta_scr_neg_rel = pd.merge(df_mag_scr_neg_rel, 
                               df_meta_rel,
                               left_on=['empi_anon', 'acc_anon', 'side'], 
                               right_on=['empi_anon', 'acc_anon', 'ImageLateralityFinal'], 
                               how='inner')

# Keep only 2D images
df_meta_scr_neg_rel = df_meta_scr_neg_rel[df_meta_scr_neg_rel.FinalImageType == '2D'].reset_index()

# df_meta_scr_neg_rel
print("Negative number")
print(len(df_meta_scr_neg_rel))

# In[ ]:


df = df_meta_scr_neg_rel
df = shuffle(df)
all_rois_list = []
all_file_empi_anon_neg = []
save_dir = '/home/jupyter-ihwan28/breast_simple_comparison_by_race/images/800x600/br12_456/'

def CoppFileToDirectory_Neg(Save_Dir):
    for i in range(len(df)):
        print(str(i) + "th iteration")
        img_path = df.png_path[i]
        empi_anon = df.empi_anon[i]
        race = df.ETHNICITY_DESC[i]
        marriage = df.MARITAL_STATUS_DESC[i]
        image_array = cv.imread(img_path)
        resized_image = cv.resize(image_array, (600, 800), interpolation= cv.INTER_AREA)
        if not os.path.exists(Save_Dir):
        # if the demo_folder directory is not present
        # then create it.
            os.makedirs(Save_Dir)
            os.makedirs(Save_Dir + "/pos/")
            os.makedirs(Save_Dir + "/neg/")
    
        filename = os.path.basename(img_path)
        filename = filename +'_' +str(i)  + '.png'
        newPath = Save_Dir+'/neg/' + filename
        cv.imwrite(newPath, resized_image)
#         shutil.copy(img_path, newPath)
        all_file_empi_anon_neg.append([empi_anon, newPath, race,marriage])
#         if i >= len(df_meta_scr_pos_rel)+1:
#             print("Neg loop broke out")
#             break

CoppFileToDirectory_Neg(save_dir)


# In[ ]:


len(all_file_empi_anon_neg)


# In[ ]:


neg_df = pd.DataFrame(all_file_empi_anon_neg, columns=['empi_anon', 'file_path','race','marriage'])
neg_df.to_csv('/home/jupyter-ihwan28/breast_simple_comparison_by_race/images/800x600/br12_456/neg_empi_path.csv')

Negative number
17016
0th iteration
1th iteration
2th iteration
3th iteration
4th iteration
5th iteration
6th iteration
7th iteration
8th iteration
9th iteration
10th iteration
11th iteration
12th iteration
13th iteration
14th iteration
15th iteration
16th iteration
17th iteration
18th iteration
19th iteration
20th iteration
21th iteration
22th iteration
23th iteration
24th iteration
25th iteration
26th iteration
27th iteration
28th iteration
29th iteration
30th iteration
31th iteration
32th iteration
33th iteration
34th iteration
35th iteration
36th iteration
37th iteration
38th iteration
39th iteration
40th iteration
41th iteration
42th iteration
43th iteration
44th iteration
45th iteration
46th iteration
47th iteration
48th iteration
49th iteration
50th iteration
51th iteration
52th iteration
53th iteration
54th iteration
55th iteration
56th iteration
57th iteration
58th iteration
59th iteration
60th iteration
61th iteration
62th iteration
63th iteration
64th iteration
65th iteratio


KeyboardInterrupt



In [None]:
list(df_meta_scr_neg_rel.columns)
