In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from pathlib import Path
from pydicom import dcmread
from skimage.io import imsave
from skimage.transform import resize
from tqdm import tqdm
import ast, sys
sys.path.append("/home/moritz/repositories/med_leak/")
np.random.seed(420)
tqdm.pandas()

In [2]:
%load_ext autoreload
%autoreload 2

In [None]:
base_dir = Path("/home/moritz/data_huge/embed_small/tables")
data_dir = Path("/home/moritz/data_huge/embed_small/images")
clinical_df = pd.read_csv(base_dir / "EMBED_OpenData_clinical_reduced.csv")
meta_df = pd.read_csv(base_dir / "EMBED_OpenData_metadata_reduced.csv")
len(clinical_df), len(meta_df)

In [4]:
def get_stats(cdf, image_df):
    n_patients = cdf.empi_anon.nunique()
    n_studies = cdf.acc_anon.nunique()
    n_images = image_df.anon_dicom_path.nunique()
    return n_patients, n_studies, n_images

In [5]:
n_patients, n_studies, n_images = get_stats(clinical_df, meta_df)
print(f"{n_images} images from {n_patients} patients")

480323 images from 23253 patients


In [6]:
clinical_df.head()

Unnamed: 0.1,Unnamed: 0,empi_anon,acc_anon,desc,numfind,side,asses,tissueden,bside,GENDER_DESC,...,ETHNIC_GROUP_DESC,MARITAL_STATUS_DESC,age_at_study,study_date_anon,procdate_anon,cohort_num,path_severity,total_L_find,total_R_find,first_3_zip
0,0,60696029,8099128854014801,MG Diagnostic Right,1,R,A,3.0,,Female,...,"Unreported, Unknown, Unavailable",Married,69.652354,2013-12-30,,1,,0.0,1.0,303.0
1,1,90986305,5156106114290009,MG Diagnostic Left,1,L,P,3.0,,Female,...,"Unreported, Unknown, Unavailable",Married,42.741466,2013-08-26,,1,,1.0,0.0,303.0
2,2,78097721,3065491565033830,MG Screening Bilateral w/CAD,1,,N,3.0,,Female,...,Non-Hispanic or Latino,Married,60.967713,2013-02-19,,1,,0.0,0.0,303.0
3,3,43734324,1878527667143380,MG Screening Bilateral w/CAD,1,R,A,3.0,R,Female,...,Non-Hispanic or Latino,Married,56.329699,2013-02-08,2013-03-08,1,2.0,0.0,1.0,303.0
4,4,28410573,8493926076199113,MG Screening Bilateral w/CAD,1,B,B,3.0,,Female,...,Non-Hispanic or Latino,Married,50.544501,2013-02-18,,1,,1.0,1.0,300.0


In [None]:
clinical_df.desc.value_counts()

desc
MG Screen Bilat w/Tomo/CAD Stnd Protocol    40674
MG Screening Bilateral                       9347
MG Diagnostic Bilateral w/ CAD               7581
MG Screening Bilateral w/CAD                 6414
MG Diagnostic Mammo Bilateral                3162
MG Diagnostic Left w/CAD                     3019
MG Diagnostic Right w/CAD                    2991
MG Diagnostic Left                           1853
MG Diagnostic Right                          1735
MG Diagnostic Bilateral w/Tomo/CAD           1333
MG Screening Left w/Tomo/CAD                  815
MG Screening Right w/Tomo/CAD                 738
MG Diagnostic  Left w/Tomo/CAD                462
MG Diagnostic  Right w/Tomo/CAD               436
MG Screening Right w/CAD                      318
MG Screening Left w/CAD                       255
MG Diagnostic Bilateral w/Tomosynthesis       246
MG Screening Bilat w/Tomosynthesis            123
MG Screening Left                             110
MG Screening Right                           

In [8]:
clinical_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81776 entries, 0 to 81775
Data columns (total 21 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Unnamed: 0           81776 non-null  int64  
 1   empi_anon            81776 non-null  int64  
 2   acc_anon             81776 non-null  int64  
 3   desc                 81776 non-null  object 
 4   numfind              81776 non-null  int64  
 5   side                 41626 non-null  object 
 6   asses                81776 non-null  object 
 7   tissueden            81200 non-null  float64
 8   bside                4799 non-null   object 
 9   GENDER_DESC          81553 non-null  object 
 10  RACE_DESC            81553 non-null  object 
 11  ETHNIC_GROUP_DESC    81553 non-null  object 
 12  MARITAL_STATUS_DESC  81553 non-null  object 
 13  age_at_study         81553 non-null  float64
 14  study_date_anon      81776 non-null  object 
 15  procdate_anon        4803 non-null  

In [9]:
meta_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 480323 entries, 0 to 480322
Data columns (total 29 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   Unnamed: 0                    480323 non-null  int64  
 1   empi_anon                     480323 non-null  int64  
 2   acc_anon                      480323 non-null  int64  
 3   cohort_num                    480323 non-null  int64  
 4   study_date_anon               480323 non-null  object 
 5   StudyDescription              480323 non-null  object 
 6   SeriesDescription             480054 non-null  object 
 7   FinalImageType                480323 non-null  object 
 8   ImageLateralityFinal          480323 non-null  object 
 9   ViewPosition                  472059 non-null  object 
 10  spot_mag                      34419 non-null   float64
 11  anon_dicom_path               480323 non-null  object 
 12  num_roi                       480323 non-nul

In [10]:
meta_df.ImageLateralityFinal.value_counts(), meta_df.Manufacturer.value_counts(), meta_df.FinalImageType.value_counts(), meta_df.ViewPosition.value_counts()

(ImageLateralityFinal
 R    240579
 L    239744
 Name: count, dtype: int64,
 Manufacturer
 HOLOGIC, Inc.               441213
 GE MEDICAL SYSTEMS           30344
 FUJIFILM Corporation          8239
 GE HEALTHCARE                  524
 Lorad, A Hologic Company         3
 Name: count, dtype: int64,
 FinalImageType
 2D       364564
 cview    115759
 Name: count, dtype: int64,
 ViewPosition
 MLO         228562
 CC          213629
 XCCL         12982
 ML           11681
 LM            2838
 CV             642
 XCCM           353
 MLOID          328
 AT             292
 CCID           286
 RL             204
 RM             201
 FB              15
 TAN             15
 MLID            12
 SIO              9
 LMO              6
 SPECIMEN         2
 ISO              1
 LMID             1
 Name: count, dtype: int64)

In [11]:
# categorize racial groups
clinical_df = clinical_df.replace("African American  or Black", "Black")
clinical_df = clinical_df.replace("Caucasian or White", "White")
clinical_df = clinical_df.replace("Native Hawaiian or Other Pacific Islander", "Other/Unknown")
clinical_df = clinical_df.replace("Unknown, Unavailable or Unreported", "Other/Unknown")
clinical_df = clinical_df.replace("Multiple", "Other/Unknown")
clinical_df = clinical_df.replace("American Indian or Alaskan Native", "Other/Unknown")
clinical_df = clinical_df.replace("Not Recorded", "Other/Unknown")
clinical_df = clinical_df.replace("Patient Declines", "Other/Unknown")

## Applying Inclusion Criteria  

**Inclusion Criteria:**
- ony left breast images
- first image of first study of each patient
- only MLO or CC view
- only female patients
- only studies with BIRADS Density A-D

In [12]:
# fill in missing clinical_df.side with info from clinical_df.desc
empty_side_mask = clinical_df.side.isnull()
contains_bilat_mask = clinical_df['desc'].str.contains('bilat', case=False, na=False)
to_fill = empty_side_mask & contains_bilat_mask
clinical_df.loc[to_fill, "side"] = "B"

In [13]:
n_patients, n_studies, n_images = get_stats(clinical_df, meta_df)
print(f"{n_images} images from {n_patients} patients")

480323 images from 23253 patients


In [14]:
#meta_df = meta_df.loc[meta_df.ViewPosition.isin(["MLO", "CC"])] # only MLO or CC view
#meta_df = meta_df.loc[meta_df.PatientSex=="F"] # only female patients
clinical_df = clinical_df.loc[clinical_df.acc_anon.isin(meta_df.acc_anon)] # only patients with images
n_patients, n_studies, n_images = get_stats(clinical_df, meta_df)
print(f"{n_images} images from {n_patients} patients, {n_studies} studies")

480323 images from 23253 patients, 72770 studies


In [15]:
data_df = pd.merge(clinical_df, meta_df, how='left', on=['acc_anon', 'empi_anon'])
data_df.drop_duplicates(subset="anon_dicom_path", inplace=True) # drop duplicates
data_df.rename(columns={"age_at_study": "age"}, inplace=True)
data_df.rename(columns={"PatientSex": "Sex"}, inplace=True)
# exclude patients with missing race info
data_df = data_df[~data_df.RACE_DESC.isna()]
data_df = data_df.loc[data_df.ViewPosition.isin(["MLO", "CC"])] # only MLO or CC view

In [16]:
# BI-RADS Density LABELS
data_df["BIRADS_DENSITY"] = data_df["tissueden"]
data_df.loc[:, "BIRADS_DENSITY"] = data_df.loc[:, "BIRADS_DENSITY"].replace(1.0, 0)
data_df.loc[:, "BIRADS_DENSITY"] = data_df.loc[:, "BIRADS_DENSITY"].replace(2.0, 1)
data_df.loc[:, "BIRADS_DENSITY"] = data_df.loc[:, "BIRADS_DENSITY"].replace(3.0, 2)
data_df.loc[:, "BIRADS_DENSITY"] = data_df.loc[:, "BIRADS_DENSITY"].replace(4.0, 3)
data_df.loc[:, "tissueden"] = data_df.loc[:, "tissueden"].replace(1.0, "BIRADS A")
data_df.loc[:, "tissueden"] = data_df.loc[:, "tissueden"].replace(2.0, "BIRADS B")
data_df.loc[:, "tissueden"] = data_df.loc[:, "tissueden"].replace(3.0, "BIRADS C")
data_df.loc[:, "tissueden"] = data_df.loc[:, "tissueden"].replace(4.0, "BIRADS D")

  data_df.loc[:, "tissueden"] = data_df.loc[:, "tissueden"].replace(1.0, "BIRADS A")


In [17]:
print(data_df["BIRADS_DENSITY"].value_counts())
nan_vals = data_df.BIRADS_DENSITY.isna()
data_df = data_df.loc[~nan_vals]

BIRADS_DENSITY
1.0    183316
2.0    181372
0.0     48560
3.0     24593
5.0       840
Name: count, dtype: int64


In [18]:
data_df["ViewPosition"].value_counts()

ViewPosition
MLO    226822
CC     211859
Name: count, dtype: int64

In [19]:
data_df = data_df.sample(frac=1).reset_index(drop=True)
data_df_all = data_df.copy()
print(f"{data_df_all.anon_dicom_path.nunique()} images from {data_df_all.empi_anon.nunique()} patients. Data from {data_df_all.acc_anon.nunique()} studies")

438681 images from 22311 patients. Data from 70222 studies


apply optional exclusion criteria (only one image (left breast) per patient)

In [20]:
# only left breasts
data_df = data_df.loc[(data_df.side=="L") | (data_df.side=="B")] # only findings related to the left or both breasts
data_df = data_df.loc[data_df.ImageLateralityFinal=="L"]
print(f"{data_df.anon_dicom_path.nunique()} left breast images from {data_df.empi_anon.nunique()} patients. Data from {data_df.acc_anon.nunique()} studies")

191392 left breast images from 19600 patients. Data from 56135 studies


In [21]:
# drop all images but the first image of the first study per patient
data_df = data_df.sort_values(by='acc_anon')
data_df = data_df.drop_duplicates(subset="empi_anon")
print(f"{data_df.anon_dicom_path.nunique()} left breast images from female {data_df.empi_anon.nunique()} patients. Data from {data_df.acc_anon.nunique()} studies")

19600 left breast images from female 19600 patients. Data from 19600 studies


In [22]:
data_df.side.value_counts(), data_df.ViewPosition.value_counts()

(side
 B    14658
 L     4942
 Name: count, dtype: int64,
 ViewPosition
 MLO    10043
 CC      9557
 Name: count, dtype: int64)

In [23]:
data_df

Unnamed: 0,Unnamed: 0_x,empi_anon,acc_anon,desc,numfind,side,asses,tissueden,bside,GENDER_DESC,...,SeriesNumber,SeriesTime,StudyID,WindowCenter,WindowWidth,has_pix_array,category,VOILUTFunction,WindowCenterWidthExplanation,BIRADS_DENSITY
402102,60012,11371531,1000443661283482,MG Screen Bilat w/Tomo/CAD Stnd Protocol,1,L,A,BIRADS C,,Female,...,71300000.0,95156.0,MG Screening Dig,540.0,580.0,False,no image,,,2.0
314843,21160,65317699,1000449157413064,MG Screening Bilateral,1,B,N,BIRADS B,,Female,...,5.0,145858.0,00006MG160035804,2047.0,4096.0,False,no image,,,1.0
32388,65878,81619488,1000461782014031,MG Screen Bilat w/Tomo/CAD Stnd Protocol,1,L,B,BIRADS C,,Female,...,71300000.0,144613.0,MG Screening Dig,2047.0,4096.0,False,no image,,,2.0
81251,10667,23567423,1000539429266213,MG Diagnostic Left w/CAD,1,L,P,BIRADS D,,Female,...,71100000.0,82122.0,MG Diagnostic Ma,2047.0,4096.0,False,no image,,,3.0
122153,28330,79046796,1000628717095292,MG Screen Bilat w/Tomo/CAD Stnd Protocol,1,B,N,BIRADS B,,Female,...,71300000.0,100528.0,MG Screen Bilat,540.0,580.0,False,no image,,,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14667,65879,65367456,9983051246439877,MG Screen Bilat w/Tomo/CAD Stnd Protocol,1,B,N,BIRADS B,,Female,...,71300000.0,155634.0,MG Screening Dig,2047.0,4096.0,False,no image,,,1.0
38935,12243,92696277,9983796813744318,MG Screening Bilateral,1,B,N,BIRADS B,,Female,...,28606.0,134815.0,MG Screening Dig,"(""2731.0"", ""2809.0"", ""2635.0"")","(""900.0"", ""750.0"", ""1050.0"")",False,no image,SIGMOID,"('NORMAL', 'HARDER', 'SOFTER')",1.0
245985,74367,67072999,9985020392597629,MG Screen Bilat w/Tomo/CAD Stnd Protocol,1,B,N,BIRADS B,,Female,...,71300000.0,134508.0,H1003134200019,2047.0,4096.0,False,no image,,,1.0
206127,57522,62070924,9996191246486565,MG Screening Bilateral,1,B,N,BIRADS C,,Female,...,4.0,121109.0,00006MG160057622,2047.0,4096.0,False,no image,,,2.0


In [24]:
data_df_all.Sex.value_counts()

Sex
F    436974
M      1707
Name: count, dtype: int64

### Train-test split

In [25]:
n_test_patients = 500
test_patients = np.random.choice(data_df.empi_anon.unique(), n_test_patients, replace=False)
data_df['split'] = "train"
data_df.loc[data_df.empi_anon.isin(test_patients), 'split'] = "test"
data_df_all['split'] = "train"
data_df_all.loc[data_df_all.empi_anon.isin(test_patients), 'split'] = "test"
data_df['split'].value_counts(normalize=False), data_df_all['split'].value_counts(normalize=False)

(split
 train    19100
 test       500
 Name: count, dtype: int64,
 split
 train    428393
 test      10288
 Name: count, dtype: int64)

In [26]:
data_df_all.asses.value_counts()

asses
N    289421
B     64714
A     56350
P     18894
S      7307
K      1140
M       806
X        49
Name: count, dtype: int64

In [4]:
data_df_all.RACE_DESC.value_counts(normalize=False)

NameError: name 'data_df_all' is not defined

In [27]:
#data_df.to_csv("/home/moritz/repositories/fair_leak/data/csv/embed.csv")
#data_df_all.to_csv("/home/moritz/repositories/fair_leak/data/csv/embed_all.csv")