# ChiMEC Radiomics Input and Output Data for Mirai Validation 
This notebook describes the mammogram dataset in ChIMEC used to validate Mirai.

In [15]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
from utils.metadata_inspect import print_summary


ModuleNotFoundError: No module named 'utils'

In [2]:
# cut flow at /gpfs/data/huo-lab/Image/annawoodard/maicara/data/interim/mammo_v8/cutflow.txt
metadata_path = '/gpfs/data/huo-lab/Image/annawoodard/maicara/data/interim/mammo_v8/series_metadata.pkl'
metadata = pd.read_pickle(metadata_path)

In [58]:
latest = latest[latest.png_path.notnull()] #filter out those with non existent png paths
hologic_mask = latest['Manufacturer'].str.contains('hologic|lorad', case=False, na=False, regex=True)
total_exams = latest['exam_id'].nunique()
print(total_exams)
latest_hologic = latest[hologic_mask]
percent_hologic = latest_hologic['exam_id'].nunique() / total_exams * 100
print(f' total hologic exams is {total_exams} \n percentage hologic exams is {percent_hologic}')



7824
 total hologic exams is 7824 
 percentage hologic exams is 98.46625766871165


In [60]:
latest.Manufacturer.value_counts()

HOLOGIC, Inc.          29873
LORAD                   1128
KODAK                    136
SIEMENS                   80
R2 Technology, Inc.        3
GE MEDICAL SYSTEMS         2
UNKNOWN                    1
AMICAS-NSC                 1
Name: Manufacturer, dtype: int64

In [41]:
print(f'length before dropping na is {len(latest)}')
latest = latest.dropna(how='all')
print(f'length after dropping na is {len(latest)}')
latest.Manufacturer.value_counts()
len1 = len(latest)
hologic_mask = latest['Manufacturer'].str.contains('hologic|lorad', case=False, na=False, regex=True)
latest = latest[hologic_mask]
len2 = len(latest)
print(f'percentage of Hologic Machines in the dataset is about {(len2 / len1)* 100}')
latest.Manufacturer.value_counts()

length before dropping na is 180614
length after dropping na is 180614
percentage of Hologic Machines in the dataset is about 48.66012601459466


HOLOGIC, Inc.               85890
LORAD                        1965
Lorad, A Hologic Company       32
Name: Manufacturer, dtype: int64

In [7]:
data_path = "/gpfs/data/phs/groups/Projects/Huo_projects/SPORE/ojomoleye/data/CRDW_Registry_IndexDx_2020_Jul23.csv"
data = pd.read_csv(data_path)

metadata_path = "/gpfs/data/huo-lab/Image/ojomoleye/projects/mirai_validation/chimec_mammo_retry.csv"  # output of filter_and_convert.py script
metadata = pd.read_csv(metadata_path)

mrn_to_study_id = pd.read_csv(
    "/gpfs/data/phs/groups/Projects/Huo_projects/SPORE/ojomoleye/data/mrn_to_study_id.csv",
    names=["mrn", "study_id"],
)
mirai_loaded = pd.read_csv(
    "/gpfs/data/huo-lab/Image/ojomoleye/projects/mirai_validation/mirai_chimec_all.csv"
)
downloaded_images = pd.read_pickle(
    "/gpfs/data/phs/groups/Projects/Huo_projects/SPORE/ojomoleye/data/downloaded_imaging_metadata.pkl"
)
mirai_output = pd.read_csv(
    "/gpfs/data/huo-lab/Image/ojomoleye/projects/mirai_validation/data/interim_full_validation_output.csv"
)
cases_and_controls = pd.read_csv(
    "/gpfs/data/phs/groups/Projects/Huo_projects/SPORE/ojomoleye/data/dr_7934_pats.txt",
    sep="|",
)

spore_registration = pd.read_csv(
    "/gpfs/data/phs/groups/Projects/Huo_projects/SPORE/ojomoleye/data/SPORERegistrationDat_DATA_2021-06-21_0927.csv"
)

In [8]:
mirai_output["patient_id"] = mirai_output["patient_exam_id"].apply(
    lambda x: x.split("\t")[0]
)

CRDW DATASET 

In [9]:
total_n = len(data.index)
data = data[~pd.isnull(data.Age)]
print("filtered out {} entries without a valid age".format(total_n - len(data.index)))

filtered out 46 entries without a valid age


In [10]:
total_n = len(data.index)
data = data[~data.raceethnic.isin(["98", "Yes"])]

print(
    "filtered out {} entries without a valid race/ethnic entry".format(
        total_n - len(data.index)
    )
)

filtered out 2 entries without a valid race/ethnic entry


In [11]:
total_n = len(data.index)
data.mrn = pd.to_numeric(data.mrn, errors="coerce")
data = data[~pd.isnull(data.mrn)]
print("filtered out {} entries without a valid MRN".format(total_n - len(data.index)))

filtered out 0 entries without a valid MRN


In [12]:
total_n = len(data.index)
data = data[~pd.isnull(data.stage1)]
print(
    "filtered out {} entries without a valid stage at diagnosis".format(
        total_n - len(data.index)
    )
)

filtered out 85 entries without a valid stage at diagnosis


In [13]:
data.mrn = data.mrn.astype(int)
data.Age = data.Age.astype(int)
data.stage1 = data.stage1.astype(int)
data.her2 = pd.to_numeric(data.her2, errors="coerce")
data["datedx"] = pd.to_datetime(data["datedx"], infer_datetime_format=True)

In [14]:
data["binned_age"] = pd.cut(data.Age, bins=[0, 39, 49, 59, 69, data.Age.max()])
data["binned_age"].value_counts(sort=False)

(0, 39]      432
(39, 49]    1013
(49, 59]    1129
(59, 69]     964
(69, 99]     832
Name: binned_age, dtype: int64

In [15]:
race_groups = {
    "White": ["White"],
    "Black": ["Black"],
    "Asian / Pacific Islander": [
        "Asian Indian",
        "Asian Indian or Pakistani, NOS",
        "Filipino",
        "Chinese",
        "Other Asian, including Asian/Oriental, NOS",
        "Japanese",
        "Korean",
        "Hawaiian",
        "Vietnamese",
        "Thai (effective with 1994 diagnoses)",
        "Pakistani",
    ],
    "Other": ["Other", "Unknown"],
}

for label, group in race_groups.items():
    print(label, len(data[data.race_1.str.contains("|".join(group))]))

White 2439
Black 1680
Asian / Pacific Islander 175
Other 85


In [16]:
print(
    "Hispanic: {}, non-Hispanic: {}".format(
        len(data[data.raceethnic == "Hispanic"]),
        len(data[data.raceethnic != "Hispanic"]),
    )
)

Hispanic: 115, non-Hispanic: 4255


In [17]:
data["hr_pos"] = (data.ER1 == 1) | (data.PR1 == 1)
data["hr_neg"] = (data.ER1 == 0) & (data.PR1 == 0)

In [18]:
print(
    "DCIS: {}\nHR+/HER2+: {}\nHR+/HER2-: {}\nHR-/HER2+: {}\nHR-/HER2-: {}\nUnknown: {}".format(
        len(data[data.stage1 == 0]),
        len(data[(data.stage1 != 0) & (data.hr_pos == True) & (data.her2 == 1)]),
        len(data[(data.stage1 != 0) & (data.hr_pos == True) & (data.her2 == 0)]),
        len(data[(data.stage1 != 0) & (data.hr_neg == True) & (data.her2 == 1)]),
        len(data[(data.stage1 != 0) & (data.hr_neg == True) & (data.her2 == 0)]),
        len(
            data[
                (data.stage1 != 0)
                & (pd.isnull(data.ER1) | pd.isnull(data.PR1) | pd.isnull(data.her2))
            ]
        ),
    )
)

DCIS: 678
HR+/HER2+: 329
HR+/HER2-: 1969
HR-/HER2+: 206
HR-/HER2-: 557
Unknown: 636


In [19]:
print("patients diagnosed from {} to {}".format(data.datedx.min(), data.datedx.max()))

patients diagnosed from 1992-02-03 00:00:00 to 2019-12-31 00:00:00


In [20]:
data = data.merge(mrn_to_study_id, how="left", on="mrn", indicator=True)
data._merge.value_counts()  # all patients in CRDW represented in mrn_to_studyid map

both          4424
left_only        0
right_only       0
Name: _merge, dtype: int64

In [21]:
# drop _merge column, no longer needed
data.drop(columns="_merge", axis=1, inplace=True)

In [22]:
mirai_loaded["study_id"] = mirai_loaded["patient_id"]

In [23]:
total_n = len(data.index)
data = data.drop_duplicates()
print("filtered out {} duplicated entries".format(total_n - len(data.index)))

filtered out 54 duplicated entries


In [24]:
mirai_loaded = mirai_loaded.merge(
    data, how="left", on="study_id", indicator=True, validate="m:1"
)

In [25]:
print(
    " of {} patients in mirai input set, {} match with CRDW data containing mrn which contains {} patients".format(
        mirai_loaded.study_id.nunique(),
        mirai_loaded[mirai_loaded._merge == "both"].study_id.nunique(),
        data.study_id.nunique(),
    )
)

 of 416 patients in mirai input set, 12 match with CRDW data containing mrn which contains 4370 patients


In [26]:
# rename for easier merging later
spore_registration["mrn"] = spore_registration["ucmc"]

In [27]:
d = spore_registration
total_n = len(d.index)
data.mrn = pd.to_numeric(d.mrn, errors="coerce")
data = d[~pd.isnull(d.mrn)]
print("filtered out {} entries without a valid MRN".format(total_n - len(data.index)))
spore_registration = d

filtered out 0 entries without a valid MRN


In [28]:
spore_registration = spore_registration.merge(
    mrn_to_study_id, how="left", on="mrn", indicator=True
)

In [29]:
spore_registration._merge.value_counts()
spore_registration.drop(columns="_merge", axis=1, inplace=True)
mirai_loaded.drop(columns="_merge", axis=1, inplace=True)

In [30]:
mirai_loaded = mirai_loaded.merge(
    spore_registration, how="left", on="mrn", indicator=True
)

In [31]:
mirai_loaded._merge.value_counts()

left_only     7404
both           188
right_only       0
Name: _merge, dtype: int64

In [32]:
mirai_loaded

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,patient_id,exam_id,laterality_x,view,years_to_cancer,years_to_last_followup,file_path,split_group,...,uncle_half_pat_cancer_type___6,uncle_half_pat_cancer_type___5,uncle_half_pat_cancer_type___10,uncle_half_pat_cancer_type___7,uncle_half_pat_cancer_type___8,uncle_half_pat_cancer_type___9,uncle_half_pat_cancer_age,cancer_risk_survey_complete,study_id_y,_merge
0,0,0,58982377,2O10729,L,CC,100,4,/gpfs/data/huo-lab/Image/ojomoleye/projects/mi...,test,...,,,,,,,,,,left_only
1,1,1,58982377,2O10729,R,MLO,100,4,/gpfs/data/huo-lab/Image/ojomoleye/projects/mi...,test,...,,,,,,,,,,left_only
2,2,2,58982377,2O10729,R,MLO,100,4,/gpfs/data/huo-lab/Image/ojomoleye/projects/mi...,test,...,,,,,,,,,,left_only
3,3,3,58982377,2O10729,R,CC,100,4,/gpfs/data/huo-lab/Image/ojomoleye/projects/mi...,test,...,,,,,,,,,,left_only
4,4,4,58982377,2O10729,R,CC,100,4,/gpfs/data/huo-lab/Image/ojomoleye/projects/mi...,test,...,,,,,,,,,,left_only
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7587,7555,11003,38461703,2O16486,L,MLO,100,2,/gpfs/data/huo-lab/Image/ojomoleye/projects/mi...,test,...,,,,,,,,,,left_only
7588,7556,11004,38461703,2O16486,R,MLO,100,2,/gpfs/data/huo-lab/Image/ojomoleye/projects/mi...,test,...,,,,,,,,,,left_only
7589,7557,11005,38461703,2O16486,R,CC,100,2,/gpfs/data/huo-lab/Image/ojomoleye/projects/mi...,test,...,,,,,,,,,,left_only
7590,7558,11006,38461703,2O16486,L,CC,100,2,/gpfs/data/huo-lab/Image/ojomoleye/projects/mi...,test,...,,,,,,,,,,left_only


In [34]:
print(
    " of {} patients in mirai input set, {} match with SPORE registration data which contains {} patients".format(
        mirai_loaded.patient_id.nunique(),
        mirai_loaded[mirai_loaded._merge == "both"].patient_id.nunique(),
        spore_registration.mrn.nunique(),
    )
)

 of 416 patients in mirai input set, 12 match with SPORE registration data which contains 6369 patients


In [75]:
data_path = "/gpfs/data/phs/groups/Projects/Huo_projects/SPORE/ojomoleye/data/CRDW_Registry_IndexDx_2020_Jul23.csv"
data = pd.read_csv(data_path)

metadata_path = "/gpfs/data/huo-lab/Image/ojomoleye/projects/mirai_validation/chimec_mammo_retry.csv"  # output of filter_and_convert.py script
metadata = pd.read_csv(metadata_path)

mrn_to_study_id = pd.read_csv(
    "/gpfs/data/phs/groups/Projects/Huo_projects/SPORE/ojomoleye/data/mrn_to_study_id.csv",
    names=["mrn", "study_id"],
)
mirai_loaded = pd.read_csv(
    "/gpfs/data/huo-lab/Image/ojomoleye/projects/mirai_validation/mirai_chimec_all.csv"
)
downloaded_images = pd.read_pickle(
    "/gpfs/data/phs/groups/Projects/Huo_projects/SPORE/ojomoleye/data/downloaded_imaging_metadata.pkl"
)
mirai_output = pd.read_csv(
    "/gpfs/data/huo-lab/Image/ojomoleye/projects/mirai_validation/data/interim_full_validation_output.csv"
)
cases_and_controls = pd.read_csv(
    "/gpfs/data/phs/groups/Projects/Huo_projects/SPORE/ojomoleye/data/dr_7934_pats.txt",
    sep="|",
)

spore_registration = pd.read_csv(
    "/gpfs/data/phs/groups/Projects/Huo_projects/SPORE/ojomoleye/data/SPORERegistrationDat_DATA_2021-06-21_0927.csv"
)

In [76]:
mirai_output["patient_id"] = mirai_output["patient_exam_id"].apply(
    lambda x: x.split("\t")[0]
)

In [77]:
mirai_output["study_id"] = mirai_output["patient_id"]
mirai_output["study_id"] = mirai_output["study_id"].astype("float").astype("Int64")

In [79]:
mirai_output = mirai_output.merge(metadata, how="left", on="study_id", indicator=True)

In [85]:
mirai_output = mirai_output[~pd.isnull(mirai_output.patient_exam_id)]

In [93]:
mirai_output[mirai_output.case == False].patient_id.nunique()

346

In [96]:
cases = mirai_output[mirai_output.case == True]

In [100]:
mirai_output

Unnamed: 0.1,patient_exam_id,1_year_risk,2_year_risk,3_year_risk,4_year_risk,5_year_risk,patient_id,study_id,Unnamed: 0,AccessionNumber,...,IssuerOfPatientIDQualifiersSequence,PatientReligiousPreference,PatientPosition,AcquisitionComments,DataSetSubtype,DataSetType,ScheduledProcedureStepID,MilitaryRank,EntranceDoseDerivation,_merge
0,58982377\t2O10729,0.024250,0.036742,0.046101,0.054766,0.062453,58982377,58982377,0,2O10730,...,,,,,,,,,,both
1,58982377\t2O10729,0.024250,0.036742,0.046101,0.054766,0.062453,58982377,58982377,1,2O10730,...,,,,,,,,,,both
2,58982377\t2O10729,0.024250,0.036742,0.046101,0.054766,0.062453,58982377,58982377,8,2O10729,...,,,,,,,,,,both
3,58982377\t2O10729,0.024250,0.036742,0.046101,0.054766,0.062453,58982377,58982377,9,2O10729,...,,,,,,,,,,both
4,58982377\t2O10729,0.024250,0.036742,0.046101,0.054766,0.062453,58982377,58982377,10,2O10729,...,,,,,,,,,,both
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43654,38461703\t2O16486,0.065335,0.090567,0.098347,0.118459,0.125670,38461703,38461703,11004,2O07053,...,,,,,,,,,,both
43655,38461703\t2O16486,0.065335,0.090567,0.098347,0.118459,0.125670,38461703,38461703,11005,2O07053,...,,,,,,,,,,both
43656,38461703\t2O16486,0.065335,0.090567,0.098347,0.118459,0.125670,38461703,38461703,11006,2O07053,...,,,,,,,,,,both
43657,38461703\t2O16486,0.065335,0.090567,0.098347,0.118459,0.125670,38461703,38461703,11007,2O07054,...,,,,,,,,,,both
