In [1]:
import pathlib

import hydra
import numpy as np
import pandas as pd
import ray
import torch
import wandb
from omegaconf import DictConfig, OmegaConf
from torch_geometric import seed_everything
from tqdm.auto import tqdm

In [2]:
node = !hostname
if "sc" in node[0]:
    base_path = "/sc-projects/sc-proj-ukb-cvd"
else: 
    base_path = "/data/analysis/ag-reils/ag-reils-shared/cardioRS"
print(base_path)

project_label = "22_medical_records"
project_path = f"{base_path}/results/projects/{project_label}"
figure_path = f"{project_path}/figures"
output_path = f"{project_path}/data"

pathlib.Path(figure_path).mkdir(parents=True, exist_ok=True)
pathlib.Path(output_path).mkdir(parents=True, exist_ok=True)

/sc-projects/sc-proj-ukb-cvd


In [3]:
# Read the baseline records table from the project data directory, then
# index it by the "eid" column.
records = pd.read_feather(f"{output_path}/baseline_records_220412.feather")
records = records.set_index("eid")

In [4]:
# Summarize the loaded frame: index range, column count, dtypes and memory footprint.
records.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 502460 entries, 1000018 to 6025198
Columns: 68527 entries, OMOP_1000560 to OMOP_998415
dtypes: bool(68527)
memory usage: 32.1 GB


In [5]:
# Per-record frequency table: sum every column of `records` (one count per
# record), order from most to least frequent, and expose the counts as a
# single column "n" indexed by "record".
records_freq = (
    records
    .sum(axis=0)
    .sort_values(ascending=False)
    .rename("n")
    .rename_axis("record")
    .to_frame()
)
records_freq

Unnamed: 0_level_0,n
record,Unnamed: 1_level_1
OMOP_4081598,309295
OMOP_4052351,271672
OMOP_4061103,264875
OMOP_4144272,247882
OMOP_4057411,222759
...,...
OMOP_4125272,0
OMOP_4236239,0
OMOP_4236199,0
OMOP_4236188,0


In [6]:
# Load the Athena concept vocabulary (tab-separated). concept_id is read as a
# string so it can be concatenated into the "OMOP_<id>" record key below.
# low_memory=False makes pandas parse the file in one pass instead of in
# internal chunks, so each column's dtype is inferred once for the whole
# column rather than per chunk (which can yield mixed types in one column).
# NOTE(review): the truncated warning this cell used to emit looks like a
# pandas DtypeWarning about mixed column types - confirm on the next run.
concepts_raw = pd.read_csv(
    "/sc-projects/sc-proj-ukb-cvd/data/mapping/athena/CONCEPT.csv",
    sep="\t",
    engine="c",
    dtype={"concept_id": str},
    low_memory=False,
)
concepts_raw["record"] = "OMOP_" + concepts_raw["concept_id"]
# Index by the record key; the indexed copy is what the later merge uses.
concept_raw = concepts_raw.set_index("record")

  exec(code_obj, self.user_global_ns, self.user_ns)


In [7]:
# Attach concept metadata to each record frequency. Both frames are aligned on
# their index, and a left join keeps every row of records_freq even when no
# matching concept exists.
records_freq_md = pd.merge(
    left=records_freq,
    right=concept_raw,
    how="left",
    left_index=True,
    right_index=True,
)

In [17]:
# Shape (rows, columns) of the subset of records whose count n exceeds 1000.
records_freq_md.query("n>1000").shape

(3711, 11)

In [16]:
# Shape (rows, columns) of the subset of records whose count n exceeds 100.
records_freq_md.query("n>100").shape

(11762, 11)

In [15]:
# Shape (rows, columns) of the subset of records whose count n exceeds 50.
records_freq_md.query("n>50").shape

(15190, 11)

In [14]:
# Shape (rows, columns) of the subset of records whose count n exceeds 25.
records_freq_md.query("n>25").shape

(19122, 11)

In [18]:
# Shape (rows, columns) of the subset of records whose count n exceeds 10.
records_freq_md.query("n>10").shape

(25132, 11)

In [19]:
# Shape (rows, columns) of the subset of records whose count n exceeds 5.
records_freq_md.query("n>5").shape

(30023, 11)

In [None]:
# Display the record frequencies together with their merged concept metadata.
records_freq_md

In [22]:
# Display the record frequencies together with their merged concept metadata.
records_freq_md

Unnamed: 0_level_0,n,concept_id,concept_name,domain_id,vocabulary_id,concept_class_id,standard_concept,concept_code,valid_start_date,valid_end_date,invalid_reason
record,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
OMOP_4081598,309295,4081598,Notes summary on computer,Observation,SNOMED,Clinical Finding,S,184229000,20020131,20991231,
OMOP_4052351,271672,4052351,Alcohol intake,Observation,SNOMED,Observable Entity,S,160573003,20020131,20991231,
OMOP_4061103,264875,4061103,O/E - blood pressure reading,Condition,SNOMED,Clinical Finding,S,163020007,20020131,20991231,
OMOP_4144272,247882,4144272,Never smoked tobacco,Observation,SNOMED,Clinical Finding,S,266919005,20020131,20991231,
OMOP_4057411,222759,4057411,Review of medication,Procedure,SNOMED,Procedure,S,182836005,20020131,20991231,
...,...,...,...,...,...,...,...,...,...,...,...
OMOP_4125272,0,4125272,Able to swallow,Observation,SNOMED,Clinical Finding,S,288936000,20020131,20991231,
OMOP_4236239,0,4236239,Functional defects of methionine synthase,Condition,SNOMED,Clinical Finding,S,360376008,20020131,20991231,
OMOP_4236199,0,4236199,Tends not to be sociable,Condition,SNOMED,Clinical Finding,S,90716005,20020131,20991231,
OMOP_4236188,0,4236188,Coronavirus vaccination,Procedure,SNOMED,Procedure,S,90640007,20020131,20991231,


In [27]:
# Destination file (Feather format) for the exported record-frequency table.
artifact_path = "/sc-projects/sc-proj-ukb-cvd/data/2_datasets_pre/211110_anewbeginning/artifacts/record_frequencies_220412.feather"

In [28]:
# Write the frequency table to artifact_path. "record" is moved out of the
# index into a regular column first, and only the listed columns are exported.
(
    records_freq_md
    .reset_index()
    .loc[
        :,
        [
            "record",
            "n",
            "concept_id",
            "concept_name",
            "domain_id",
            "vocabulary_id",
            "concept_class_id",
            "standard_concept",
        ],
    ]
    .to_feather(artifact_path)
)

In [29]:
# Register the exported frequency table as a Weights & Biases artifact.
# The file is attached by reference (file:// URI) with checksumming enabled.
# The run is used as a context manager so it is finished even when adding the
# reference or logging the artifact raises; previously run.finish() was
# skipped on any exception, leaving the run open.
with wandb.init(project="RecordGraphs", entity="cardiors", tags=["artifacts"]) as run:
    artifact = wandb.Artifact("RecordFrequencies", type="prepare_records")
    artifact.add_reference(f"file://{artifact_path}", "RecordsMetadata", checksum=True)
    run.log_artifact(artifact)

[34m[1mwandb[0m: wandb version 0.12.14 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

In [30]:
# Display the same table with "record" moved from the index into a regular column.
records_freq_md.reset_index()

Unnamed: 0,record,n,concept_id,concept_name,domain_id,vocabulary_id,concept_class_id,standard_concept,concept_code,valid_start_date,valid_end_date,invalid_reason
0,OMOP_4081598,309295,4081598,Notes summary on computer,Observation,SNOMED,Clinical Finding,S,184229000,20020131,20991231,
1,OMOP_4052351,271672,4052351,Alcohol intake,Observation,SNOMED,Observable Entity,S,160573003,20020131,20991231,
2,OMOP_4061103,264875,4061103,O/E - blood pressure reading,Condition,SNOMED,Clinical Finding,S,163020007,20020131,20991231,
3,OMOP_4144272,247882,4144272,Never smoked tobacco,Observation,SNOMED,Clinical Finding,S,266919005,20020131,20991231,
4,OMOP_4057411,222759,4057411,Review of medication,Procedure,SNOMED,Procedure,S,182836005,20020131,20991231,
...,...,...,...,...,...,...,...,...,...,...,...,...
68522,OMOP_4125272,0,4125272,Able to swallow,Observation,SNOMED,Clinical Finding,S,288936000,20020131,20991231,
68523,OMOP_4236239,0,4236239,Functional defects of methionine synthase,Condition,SNOMED,Clinical Finding,S,360376008,20020131,20991231,
68524,OMOP_4236199,0,4236199,Tends not to be sociable,Condition,SNOMED,Clinical Finding,S,90716005,20020131,20991231,
68525,OMOP_4236188,0,4236188,Coronavirus vaccination,Procedure,SNOMED,Procedure,S,90640007,20020131,20991231,
