In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from pathlib import Path
from pydicom import dcmread
from skimage.io import imsave
from skimage.transform import resize
from tqdm import tqdm
import ast, sys
sys.path.append("/home/moritz/repositories/med_leak/")
np.random.seed(420)
tqdm.pandas()

In [3]:
def dataframe_stats(df):
    n_patients = df.empi_anon.nunique()
    n_exams = df.acc_anon.nunique()
    print(f"Patients: {n_patients}")
    print(f"Exams: {n_exams}")
    if "anon_dicom_path" in df.columns:
        n_images = df.anon_dicom_path.nunique()
        print(f"Images: {n_images}")

In [4]:
base_dir = Path("/home/moritz/data_massive/embed_small/tables")
data_dir = Path("/home/moritz/data_massive/embed_small/images")
clinical_df = pd.read_csv(base_dir / "EMBED_OpenData_clinical_reduced.csv")
meta_df = pd.read_csv(base_dir / "EMBED_OpenData_metadata_reduced.csv")
len(clinical_df), len(meta_df)

  clinical_df = pd.read_csv(base_dir / "EMBED_OpenData_clinical_reduced.csv")
  meta_df = pd.read_csv(base_dir / "EMBED_OpenData_metadata_reduced.csv")


(81776, 480323)

In [5]:
dataframe_stats(clinical_df)

Patients: 23253
Exams: 72770


In [6]:
# standardize racial groups
clinical_df = clinical_df.replace("African American  or Black", "Black")
clinical_df = clinical_df.replace("Caucasian or White", "White")
clinical_df = clinical_df.replace("Native Hawaiian or Other Pacific Islander", "Other/Unknown")
clinical_df = clinical_df.replace("Unknown, Unavailable or Unreported", "Other/Unknown")
clinical_df = clinical_df.replace("Multiple", "Other/Unknown")
clinical_df = clinical_df.replace("American Indian or Alaskan Native", "Other/Unknown")
clinical_df = clinical_df.replace("Not Recorded", "Other/Unknown")
clinical_df = clinical_df.replace("Patient Declines", "Other/Unknown")

# helper column to idenftify screening exams
clinical_df['screen_exam'] = clinical_df.desc.str.contains('screen', case=False)

In [7]:
clinical_df = clinical_df[clinical_df['screen_exam'] == True]
dataframe_stats(clinical_df)

Patients: 20460
Exams: 55956


In [8]:
clinical_df.tissueden.value_counts(), clinical_df.asses.value_counts()

(tissueden
 2.0    24955
 3.0    24724
 1.0     6067
 4.0     2977
 5.0        6
 Name: count, dtype: int64,
 asses
 N    41779
 A    10845
 B     6224
 S       22
 K        9
 P        7
 M        1
 X        1
 Name: count, dtype: int64)

In [9]:
data_df = pd.merge(clinical_df, meta_df, how='left', on=['acc_anon', 'empi_anon'])
data_df.drop_duplicates(subset="anon_dicom_path", inplace=True) # drop duplicates
data_df.rename(columns={"age_at_study": "age"}, inplace=True)
data_df.rename(columns={"PatientSex": "Sex"}, inplace=True)
# exclude patients with missing race info
data_df = data_df[~data_df.RACE_DESC.isna()]
data_df = data_df.loc[data_df.ViewPosition.isin(["MLO", "CC"])] # only MLO or CC view
# only female patients with valid BIRADS density
invalid_densities = data_df.tissueden==5.0
invalid_gender = data_df.GENDER_DESC == "Male"
data_df.drop(data_df[invalid_densities].index, inplace=True)
data_df.drop(data_df[invalid_gender].index, inplace=True)
data_df = data_df.reset_index(drop=True)
dataframe_stats(data_df)

  data_df.drop(data_df[invalid_gender].index, inplace=True)


Patients: 19447
Exams: 53963
Images: 361153


In [10]:
data_df.tissueden.value_counts(), data_df.GENDER_DESC.value_counts()

(tissueden
 2.0    153523
 3.0    146108
 1.0     41899
 4.0     18629
 Name: count, dtype: int64,
 GENDER_DESC
 Female    361153
 Name: count, dtype: int64)

In [11]:
# BI-RADS Density LABELS
data_df["BIRADS_DENSITY"] = data_df["tissueden"]
data_df.loc[:, "BIRADS_DENSITY"] = data_df.loc[:, "BIRADS_DENSITY"].replace(1.0, 0)
data_df.loc[:, "BIRADS_DENSITY"] = data_df.loc[:, "BIRADS_DENSITY"].replace(2.0, 1)
data_df.loc[:, "BIRADS_DENSITY"] = data_df.loc[:, "BIRADS_DENSITY"].replace(3.0, 2)
data_df.loc[:, "BIRADS_DENSITY"] = data_df.loc[:, "BIRADS_DENSITY"].replace(4.0, 3)
data_df.loc[:, "tissueden"] = data_df.loc[:, "tissueden"].replace(1.0, "BIRADS A")
data_df.loc[:, "tissueden"] = data_df.loc[:, "tissueden"].replace(2.0, "BIRADS B")
data_df.loc[:, "tissueden"] = data_df.loc[:, "tissueden"].replace(3.0, "BIRADS C")
data_df.loc[:, "tissueden"] = data_df.loc[:, "tissueden"].replace(4.0, "BIRADS D")

  data_df.loc[:, "tissueden"] = data_df.loc[:, "tissueden"].replace(1.0, "BIRADS A")


In [12]:
print(data_df["BIRADS_DENSITY"].value_counts(dropna=False))
nan_vals = data_df.BIRADS_DENSITY.isna()
data_df = data_df.loc[~nan_vals]

BIRADS_DENSITY
1.0    153523
2.0    146108
0.0     41899
3.0     18629
NaN       994
Name: count, dtype: int64


In [13]:
data_df["ViewPosition"].value_counts()

ViewPosition
MLO    189134
CC     171025
Name: count, dtype: int64

In [14]:
f"{len(data_df)} images from {data_df.empi_anon.nunique()} patients and {data_df.acc_anon.nunique()} exams."

'360159 images from 19429 patients and 53817 exams.'

### Train-test split

In [15]:
n_test_patients = 500
test_patients = np.random.choice(data_df.empi_anon.unique(), n_test_patients, replace=False)
data_df['split'] = "train"
data_df.loc[data_df.empi_anon.isin(test_patients), 'split'] = "test"
data_df['split'] = "train"
data_df.loc[data_df.empi_anon.isin(test_patients), 'split'] = "test"
data_df['split'].value_counts(normalize=False), data_df['split'].value_counts(normalize=False)

(split
 train    351277
 test       8882
 Name: count, dtype: int64,
 split
 train    351277
 test       8882
 Name: count, dtype: int64)

In [16]:
data_df.RACE_DESC.value_counts(normalize=False)

RACE_DESC
Black            162012
White            152965
Other/Unknown     24363
Asian             20819
Name: count, dtype: int64

In [22]:
list(data_df.columns)

['Unnamed: 0_x',
 'empi_anon',
 'acc_anon',
 'desc',
 'numfind',
 'side',
 'asses',
 'tissueden',
 'bside',
 'GENDER_DESC',
 'RACE_DESC',
 'ETHNIC_GROUP_DESC',
 'MARITAL_STATUS_DESC',
 'age',
 'study_date_anon_x',
 'procdate_anon',
 'cohort_num_x',
 'path_severity',
 'total_L_find',
 'total_R_find',
 'first_3_zip',
 'screen_exam',
 'Unnamed: 0_y',
 'cohort_num_y',
 'study_date_anon_y',
 'StudyDescription',
 'SeriesDescription',
 'FinalImageType',
 'ImageLateralityFinal',
 'ViewPosition',
 'spot_mag',
 'anon_dicom_path',
 'num_roi',
 'ROI_coords',
 'SRC_DST',
 'match_level',
 'Manufacturer',
 'ManufacturerModelName',
 'Sex',
 'ProtocolName',
 'SeriesNumber',
 'SeriesTime',
 'StudyID',
 'WindowCenter',
 'WindowWidth',
 'has_pix_array',
 'category',
 'VOILUTFunction',
 'WindowCenterWidthExplanation',
 'preprocessed_img_exists',
 'BIRADS_DENSITY',
 'split']

In [18]:
#data_df.to_csv("/home/moritz/repositories/med_leak/data/csv/embed.csv")

### Cohort Table

In [19]:
from utils import cohort_table

In [24]:
cohort_df = cohort_table(
    data_df,
    strat_var="RACE_DESC",
    strat_var_vals=["Asian", "Black", "White", "Other/Unknown"],
    other_vars=["Sex", "tissueden", "asses", "ViewPosition", "FinalImageType"],
    patient_id_col="empi_anon",
)
cohort_df

OrderedDict([('Variable', ['Patients', 'Records', 'Median age (SD)']), ('All', ['19429', '360159', '60 (12)']), ('Asian', ['1299', '20819', '53 (11)']), ('Black', ['8120', '162012', '59 (11)']), ('White', ['7896', '152965', '62 (12)']), ('Other/Unknown', ['2114', '24363', '53 (10)'])])


Unnamed: 0,Variable,All,Asian,Black,White,Other/Unknown
0,Patients,19429,1299,8120,7896,2114
1,Records,360159,20819,162012,152965,24363
2,Median age (SD),60 (12),53 (11),59 (11),62 (12),53 (10)
3,F (\%),360159 (100.0),20819 (100.0),162012 (100.0),152965 (100.0),24363 (100.0)
4,BIRADS C (\%),146108 (40.6),12247 (58.8),54524 (33.7),67658 (44.2),11679 (47.9)
5,BIRADS D (\%),18629 (5.2),2891 (13.9),6069 (3.7),8161 (5.3),1508 (6.2)
6,BIRADS B (\%),153523 (42.6),5207 (25.0),75606 (46.7),63246 (41.3),9464 (38.8)
7,BIRADS A (\%),41899 (11.6),474 (2.3),25813 (15.9),13900 (9.1),1712 (7.0)
8,N (\%),271205 (75.3),15669 (75.3),123004 (75.9),115426 (75.5),17106 (70.2)
9,A (\%),54743 (15.2),3809 (18.3),24423 (15.1),20898 (13.7),5613 (23.0)


In [26]:
cohort_df.to_latex("./tables/embed.tex", index=False)