In [185]:
%load_ext lab_black
%matplotlib inline
%load_ext autoreload
%autoreload 2
import warnings

import pandas as pd
import matplotlib.pyplot as plt
from upsetplot import plot, from_contents

warnings.filterwarnings("ignore")

In [None]:
data_path = "/gpfs/data/phs/groups/Projects/Huo_projects/SPORE/ojomoleye/data/CRDW_Registry_IndexDx_2020_Jul23.csv"
data = pd.read_csv(data_path)

mrn_to_study_id = pd.read_csv(
    "/gpfs/data/phs/groups/Projects/Huo_projects/SPORE/ojomoleye/data/mrn_to_study_id.csv",
    names=["mrn", "study_id"],
)
mirai_loaded = pd.read_csv(
    "/gpfs/data/huo-lab/Image/ojomoleye/projects/mirai_validation/mirai_chimec_all.csv"
)
downloaded_images = pd.read_pickle(
    "/gpfs/data/phs/groups/Projects/Huo_projects/SPORE/ojomoleye/data/downloaded_imaging_metadata.pkl"
)
mirai_output = pd.read_csv(
    "/gpfs/data/huo-lab/Image/ojomoleye/projects/mirai_validation/data/interim_full_validation_output.csv"
)
cases_and_controls = pd.read_csv(
    "/gpfs/data/phs/groups/Projects/Huo_projects/SPORE/ojomoleye/data/dr_7934_pats.txt",
    sep="|",
)

spore_registration = pd.read_csv(
    "/gpfs/data/phs/groups/Projects/Huo_projects/SPORE/ojomoleye/data/SPORERegistrationDat_DATA_2021-06-21_0927.csv"
)

first_metadata = pd.read_csv(
    "/gpfs/data/huo-lab/Image/ojomoleye/projects/mirai_validation/chimec_mammo_retry.csv"
)

In [None]:
mirai_output["patient_id"] = mirai_output["patient_exam_id"].apply(
    lambda x: x.split("\t")[0]
)

In [None]:
total_n = len(data.index)
data = data[~pd.isnull(data.Age)]
print("filtered out {} entries without a valid age".format(total_n - len(data.index)))

filtered out 46 entries without a valid age


In [None]:
total_n = len(data.index)
data = data[~data.raceethnic.isin(["98", "Yes"])]

print(
    "filtered out {} entries without a valid race/ethnic entry".format(
        total_n - len(data.index)
    )
)

filtered out 2 entries without a valid race/ethnic entry


In [None]:
total_n = len(data.index)
data.mrn = pd.to_numeric(data.mrn, errors="coerce")
data = data[~pd.isnull(data.mrn)]
print("filtered out {} entries without a valid MRN".format(total_n - len(data.index)))

filtered out 0 entries without a valid MRN


In [None]:
total_n = len(data.index)
data = data[~pd.isnull(data.stage1)]
print(
    "filtered out {} entries without a valid stage at diagnosis".format(
        total_n - len(data.index)
    )
)

filtered out 85 entries without a valid stage at diagnosis


In [None]:
data.mrn = data.mrn.astype(int)
data.Age = data.Age.astype(int)
data.stage1 = data.stage1.astype(int)
data.her2 = pd.to_numeric(data.her2, errors="coerce")
data["datedx"] = pd.to_datetime(data["datedx"], infer_datetime_format=True)

In [None]:
data["binned_age"] = pd.cut(data.Age, bins=[0, 39, 49, 59, 69, data.Age.max()])
data["binned_age"].value_counts(sort=False)

(0, 39]      432
(39, 49]    1013
(49, 59]    1129
(59, 69]     964
(69, 99]     832
Name: binned_age, dtype: int64

In [None]:
race_groups = {
    "White": ["White"],
    "Black": ["Black"],
    "Asian / Pacific Islander": [
        "Asian Indian",
        "Asian Indian or Pakistani, NOS",
        "Filipino",
        "Chinese",
        "Other Asian, including Asian/Oriental, NOS",
        "Japanese",
        "Korean",
        "Hawaiian",
        "Vietnamese",
        "Thai (effective with 1994 diagnoses)",
        "Pakistani",
    ],
    "Other": ["Other", "Unknown"],
}

for label, group in race_groups.items():
    print(label, len(data[data.race_1.str.contains("|".join(group))]))

White 2439
Black 1680
Asian / Pacific Islander 175
Other 85


In [None]:
print(
    "Hispanic: {}, non-Hispanic: {}".format(
        len(data[data.raceethnic == "Hispanic"]),
        len(data[data.raceethnic != "Hispanic"]),
    )
)

Hispanic: 115, non-Hispanic: 4255


In [None]:
data["hr_pos"] = (data.ER1 == 1) | (data.PR1 == 1)
data["hr_neg"] = (data.ER1 == 0) & (data.PR1 == 0)

In [None]:
print(
    "DCIS: {}\nHR+/HER2+: {}\nHR+/HER2-: {}\nHR-/HER2+: {}\nHR-/HER2-: {}\nUnknown: {}".format(
        len(data[data.stage1 == 0]),
        len(data[(data.stage1 != 0) & (data.hr_pos == True) & (data.her2 == 1)]),
        len(data[(data.stage1 != 0) & (data.hr_pos == True) & (data.her2 == 0)]),
        len(data[(data.stage1 != 0) & (data.hr_neg == True) & (data.her2 == 1)]),
        len(data[(data.stage1 != 0) & (data.hr_neg == True) & (data.her2 == 0)]),
        len(
            data[
                (data.stage1 != 0)
                & (pd.isnull(data.ER1) | pd.isnull(data.PR1) | pd.isnull(data.her2))
            ]
        ),
    )
)

DCIS: 678
HR+/HER2+: 329
HR+/HER2-: 1969
HR-/HER2+: 206
HR-/HER2-: 557
Unknown: 636


In [None]:
print("patients diagnosed from {} to {}".format(data.datedx.min(), data.datedx.max()))

patients diagnosed from 1992-02-03 00:00:00 to 2019-12-31 00:00:00


In [None]:
data = data.merge(mrn_to_study_id, on="mrn")

In [None]:
mirai_loaded["study_id"] = mirai_loaded["patient_id"]

In [None]:
total_n = len(data.index)
data = data.drop_duplicates()
print("filtered out {} duplicated entries".format(total_n - len(data.index)))

filtered out 54 duplicated entries


In [None]:
mirai_loaded = mirai_loaded.merge(
    data, how="left", on="study_id", indicator=True, validate="m:m"
)

In [None]:
print(
    " of {} patients in mirai input set, {} match with CRDW data containing mrn which contains {} patients".format(
        mirai_loaded.study_id.nunique(),
        mirai_loaded[mirai_loaded._merge == "both"].study_id.nunique(),
        data.study_id.nunique()
    )
)

 of 416 patients in mirai input set, 12 match with data containing mrn
