In [1]:
#library(Rmisc)
library(dtplyr)
library(tidyverse)
library(glue)
library(arrow)
library(patchwork)
library(data.table)
library("jsonlite")
library(ggthemes)

“package ‘tidyverse’ was built under R version 4.0.3”
── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.1 ──

[32m✔[39m [34mggplot2[39m 3.3.5     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.1.6     [32m✔[39m [34mdplyr  [39m 1.0.7
[32m✔[39m [34mtidyr  [39m 1.1.4     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.4.0     [32m✔[39m [34mforcats[39m 0.5.1

“package ‘ggplot2’ was built under R version 4.0.5”
“package ‘readr’ was built under R version 4.0.5”
“package ‘purrr’ was built under R version 4.0.3”
“package ‘dplyr’ was built under R version 4.0.5”
“package ‘stringr’ was built under R version 4.0.5”
“package ‘forcats’ was built under R version 4.0.3”
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag

In [2]:
# Pick the project root from the host name: hosts whose node name contains
# "sc" use the /sc-projects tree, all other hosts use the shared cardioRS tree.
base_path <- if (grepl("sc", Sys.info()[["nodename"]], fixed = TRUE)) {
  "/sc-projects/sc-proj-ukb-cvd"
} else {
  "/data/analysis/ag-reils/ag-reils-shared/cardioRS"
}
print(base_path)

# Project-specific locations derived from the root.
project_label <- "22_medical_records"
project_path <- glue("{base_path}/results/projects/{project_label}")
figure_path <- glue("{project_path}/figures")
output_path <- glue("{project_path}/data")

[1] "/sc-projects/sc-proj-ukb-cvd"


In [3]:
# Load the phecode definition table and derive an `endpoint` identifier for
# every row. The input is located relative to `base_path` so the cell follows
# the host-dependent project root instead of a fixed absolute path.
phecode_defs_path <- file.path(
  base_path, "data", "mapping", "phecodes", "phecode_strings_V2.csv"
)
phecode_defs <- fread(
  phecode_defs_path,
  colClasses = c(
    "character", "character", "character", "character",
    "integer", "character", "integer"
  )
)

# Append a synthetic row for all-cause death; columns not listed here are
# filled with NA by add_row().
phecode_defs <- phecode_defs %>%
  add_row(
    phecode = "4306655",
    phecode_string = "All-Cause Death",
    phecode_category = "Death",
    sex = "Both"
  )

# Build `endpoint`: the phecode with its separator replaced by "-" and a
# "phecode_" prefix, except for the death row, which gets an "OMOP_" prefix.
phecode_defs <- phecode_defs %>%
  as_tibble() %>%
  separate(
    phecode,
    into = c("first", "second"),
    remove = FALSE,
    # Codes without a sub-code have only one piece; fill the missing second
    # piece with NA on purpose instead of raising a warning for each such row.
    fill = "right"
  ) %>%
  # glue() renders a missing `second` as the text "NA", which is stripped again.
  mutate(comb = str_remove_all(glue("{first}-{second}"), "-NA")) %>%
  mutate(
    endpoint = case_when(
      comb == "4306655" ~ glue("OMOP_{comb}"),
      TRUE ~ glue("phecode_{comb}")
    )
  ) %>%
  select(phecode, endpoint, everything(), -first, -second, -comb)

# Spot-check a few random rows.
phecode_defs %>% sample_n(5)

“Expected 2 pieces. Missing pieces filled with `NA` in 691 rows [1, 2, 3, 4, 5, 6, 7, 8, 9, 15, 18, 20, 21, 24, 25, 26, 27, 29, 30, 31, ...].”


phecode,endpoint,phecode_string,phecode_category,sex,ICD10_only,phecode_top,leaf
<chr>,<glue>,<chr>,<chr>,<chr>,<int>,<chr>,<int>
361.25,phecode_361-25,Paralytic lagophthalmos,Eye,Both,0,361,1
859.1,phecode_859-1,Neonatal jaundice associated with preterm delivery,Neonate,Both,0,859,1
486.4,phecode_486-4,Disorders of diaphragm,Resp,Both,0,486,0
168.11,phecode_168-11,Hereditary hypo-coagulability,Blood,Both,0,168,0
773.7,phecode_773-7,"Lobulated, fused and horseshoe kidney*",Cong,Both,1,773,1


In [8]:
# List the definitions in the "Cardio" category, ordered by endpoint.
phecode_defs %>%
  filter(phecode_category == "Cardio") %>%
  arrange(endpoint)

phecode,endpoint,phecode_string,phecode_category,sex,ICD10_only,phecode_top,leaf
<chr>,<glue>,<chr>,<chr>,<chr>,<int>,<chr>,<int>
400,phecode_400,Rheumatic fever and chronic rheumatic heart diseases,Cardio,Both,0,400,0
400.1,phecode_400-1,Acute rheumatic fever,Cardio,Both,0,400,1
400.2,phecode_400-2,Chronic rheumatic heart diseases,Cardio,Both,0,400,1
401,phecode_401,Hypertension,Cardio,Both,0,401,0
401.1,phecode_401-1,Essential hypertension,Cardio,Both,0,401,1
401.2,phecode_401-2,Hypertensive heart disease,Cardio,Both,0,401,1
401.3,phecode_401-3,Hypertensive chronic kidney disease,Cardio,Both,0,401,1
401.4,phecode_401-4,Hypertensive encephalopathy,Cardio,Both,0,401,1
401.6,phecode_401-6,Secondary hypertension,Cardio,Both,0,401,0
401.61,phecode_401-61,Renovascular hypertension,Cardio,Both,0,401,1


In [5]:
# Save the phecode definitions (including the derived endpoint column) to the
# project data directory.
write_feather(
  phecode_defs,
  glue("{output_path}/phecode_defs_220301.feather")
)

In [164]:
# Load the ICD-10 -> phecode mapping. Both columns are read as character so the
# codes are kept as text. The file is located relative to `base_path` rather
# than through a fixed absolute path, so the cell follows the host-dependent
# project root.
phecode_icd10 <- fread(
  file.path(base_path, "data", "mapping", "phecodes", "ICD10_to_phecode_V2.csv"),
  colClasses = c("character", "character")
)

In [9]:
# Load the Athena vocabulary tables. The directory is derived from `base_path`
# so it follows the host-dependent project root instead of a fixed absolute
# path.
athena_path <- file.path(base_path, "data", "mapping", "athena")
concept <- fread(file.path(athena_path, "CONCEPT.csv"))
concept_relationship <- fread(file.path(athena_path, "CONCEPT_RELATIONSHIP.csv"))



In [140]:
# ICD10CM concepts: id plus code and name under ICD10-specific column names.
concept_ICD10 <- concept %>%
  filter(vocabulary_id == "ICD10CM") %>%
  select(concept_id, ICD10_code = concept_code, ICD10_name = concept_name) %>%
  as_tibble()

# SNOMED concepts: same layout, plus a copy of the concept id as `OMOP_id`.
concept_SNOMED <- concept %>%
  filter(vocabulary_id == "SNOMED") %>%
  select(concept_id, SNOMED_code = concept_code, SNOMED_name = concept_name) %>%
  mutate(OMOP_id = concept_id) %>%
  as_tibble()

In [141]:
# Keep only "Maps to" relationships that lead from an ICD10CM concept
# (concept_id_1) to a SNOMED concept (concept_id_2).
icd_sct_rel <- concept_relationship %>%
  filter(relationship_id == "Maps to") %>%
  filter(concept_id_1 %in% concept_ICD10[["concept_id"]]) %>%
  filter(concept_id_2 %in% concept_SNOMED[["concept_id"]]) %>%
  as_tibble()

In [142]:
# Attach ICD10 details to the source side and SNOMED details to the target
# side of each relationship, then keep the descriptive columns sorted by
# ICD10 code.
icd_snomed_map <- icd_sct_rel %>%
  left_join(concept_ICD10, by = c(concept_id_1 = "concept_id")) %>%
  left_join(concept_SNOMED, by = c(concept_id_2 = "concept_id")) %>%
  select(ICD10_name, ICD10_code, SNOMED_code, SNOMED_name, OMOP_id) %>%
  arrange(ICD10_code)

In [118]:
# Load the preprocessed record table. The file is located relative to
# `base_path` so the cell follows the host-dependent project root instead of a
# fixed absolute path.
final_records <- arrow::read_feather(
  file.path(
    base_path, "data", "2_datasets_pre", "211110_anewbeginning",
    "artifacts", "final_records_211205.feather"
  )
)

In [177]:
# Phecode concepts that occur in final_records, sorted by their numeric code
# and annotated with the phecode definitions.
phecodes_avail <- final_records %>%
  filter(vocabulary == "phecode") %>%
  distinct(concept_id) %>%
  arrange(as.numeric(str_remove_all(concept_id, "phecode_"))) %>%
  # Endpoint identifiers use "-" where the concept id uses ".".
  mutate(endpoint = str_replace_all(concept_id, "\\.", "-")) %>%
  # Name the key explicitly so the join cannot silently pick up additional
  # shared columns and does not emit a "Joining, by" message.
  left_join(phecode_defs, by = "endpoint")

Joining, by = "endpoint"



In [178]:
# Display the phecode concepts found in final_records with their definitions.
phecodes_avail

concept_id,endpoint,phecode,phecode_string,phecode_category,sex,ICD10_only,phecode_top,leaf
<chr>,<glue>,<chr>,<chr>,<chr>,<chr>,<int>,<chr>,<int>
phecode_001,phecode_001,001,Salmonella,ID,Both,0,001,1
phecode_002,phecode_002,002,Staphylococcus,ID,Both,0,002,0
phecode_002.1,phecode_002-1,002.1,Staphylococcus aureus,ID,Both,0,002,1
phecode_003,phecode_003,003,Escherichia coli,ID,Both,0,003,1
phecode_004,phecode_004,004,Streptococcus,ID,Both,0,004,0
phecode_004.1,phecode_004-1,004.1,Streptococcus pneumoniae,ID,Both,0,004,1
phecode_004.2,phecode_004-2,004.2,Group A Streptococcus,ID,Both,0,004,1
phecode_004.3,phecode_004-3,004.3,Group B Streptococcus,ID,Both,0,004,1
phecode_004.4,phecode_004-4,004.4,Scarlet fever,ID,Both,0,004,1
phecode_005,phecode_005,005,Mycobacteria,ID,Both,0,005,0


In [169]:
# Link each ICD10 -> SNOMED mapping row to a phecode (via the ICD10 code) and
# to that phecode's endpoint. Rows without a phecode or without an endpoint
# are dropped. `record` is the OMOP id with an "OMOP_" prefix.
omop_phecode_map <- icd_snomed_map %>%
  left_join(phecode_icd10, by = c(ICD10_code = "icd10")) %>%
  filter(!is.na(phecode)) %>%
  left_join(
    as_tibble(select(phecode_defs, phecode, endpoint)),
    by = "phecode"
  ) %>%
  filter(!is.na(endpoint)) %>%
  mutate(record = glue("OMOP_{OMOP_id}")) %>%
  select(
    OMOP_id, SNOMED_code, SNOMED_name,
    ICD10_code, ICD10_name,
    phecode, record, endpoint
  ) %>%
  arrange(as.numeric(phecode))

In [188]:
# Export the OMOP -> phecode mapping as CSV into the project data directory.
write_csv(
  omop_phecode_map,
  glue("{output_path}/omop_phecode_map_220224.csv")
)

In [170]:
# Distinct endpoints covered by the OMOP -> phecode mapping.
unique(omop_phecode_map[["endpoint"]])

OMOP_id,SNOMED_code,SNOMED_name,ICD10_code,ICD10_name,phecode,record,endpoint
<int>,<chr>,<chr>,<chr>,<chr>,<chr>,<glue>,<glue>
133685,302231008,Salmonella infection,A01,Typhoid and paratyphoid fevers,001,OMOP_133685,phecode_001
192819,4834000,Typhoid fever,A01.0,Typhoid fever,001,OMOP_192819,phecode_001
192819,4834000,Typhoid fever,A01.00,"Typhoid fever, unspecified",001,OMOP_192819,phecode_001
4100102,192648008,Meningitis due to typhoid fever,A01.01,Typhoid meningitis,001,OMOP_4100102,phecode_001
46269829,1084791000119106,Cardiac disorder due to typhoid fever,A01.02,Typhoid fever with heart involvement,001,OMOP_46269829,phecode_001
4166072,45312009,Pneumonia in typhoid fever,A01.03,Typhoid pneumonia,001,OMOP_4166072,phecode_001
141663,60168000,Osteomyelitis,A01.05,Typhoid osteomyelitis,001,OMOP_141663,phecode_001
192819,4834000,Typhoid fever,A01.05,Typhoid osteomyelitis,001,OMOP_192819,phecode_001
433128,116223007,Complication,A01.09,Typhoid fever with other complications,001,OMOP_433128,phecode_001
192819,4834000,Typhoid fever,A01.09,Typhoid fever with other complications,001,OMOP_192819,phecode_001


In [187]:
# Number of phecode concepts present in final_records.
phecodes_avail %>% nrow()

In [186]:
# Number of distinct mapped endpoints that also occur in phecodes_avail.
omop_phecode_map %>%
  filter(endpoint %in% phecodes_avail[["endpoint"]]) %>%
  distinct(endpoint) %>%
  nrow()

In [185]:
# Number of distinct mapped endpoints that also occur in phecode_defs.
omop_phecode_map %>%
  filter(endpoint %in% phecode_defs[["endpoint"]]) %>%
  distinct(endpoint) %>%
  nrow()

In [172]:
# Count of distinct endpoints in the OMOP -> phecode mapping.
omop_phecode_map[["endpoint"]] %>%
  unique() %>%
  length()

In [175]:
# Number of rows in the phecode definition table.
phecode_defs %>% nrow()

In [113]:
# Distinct phecodes in the definition table.
phecode_defs %>%
  distinct(phecode)

phecode
<chr>
008
010
079
041
054
069
061
099
078
097.1


In [107]:
# Display the OMOP -> phecode mapping table.
omop_phecode_map

OMOP_id,SNOMED_code,SNOMED_name,ICD10_code,ICD10_name,phecode,Phenotype,record,endpoint
<int>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<glue>,<glue>
198677,63650001,Cholera,A00,Cholera,008,Intestinal infection,OMOP_198677,phecode_008
4344638,240349003,Cholera due to Vibrio cholerae O1 Classical biotype,A00.0,"Cholera due to Vibrio cholerae 01, biovar cholerae",008,Intestinal infection,OMOP_4344638,phecode_008
200629,81020007,Cholera due to Vibrio cholerae El Tor,A00.1,"Cholera due to Vibrio cholerae 01, biovar eltor",008,Intestinal infection,OMOP_200629,phecode_008
198677,63650001,Cholera,A00.9,"Cholera, unspecified",008,Intestinal infection,OMOP_198677,phecode_008
133685,302231008,Salmonella infection,A01,Typhoid and paratyphoid fevers,008,Intestinal infection,OMOP_133685,phecode_008
195460,76623002,Paratyphoid A fever,A01.1,Paratyphoid fever A,008,Intestinal infection,OMOP_195460,phecode_008
193953,71085009,Paratyphoid B fever,A01.2,Paratyphoid fever B,008,Intestinal infection,OMOP_193953,phecode_008
442291,51254007,Paratyphoid C fever,A01.3,Paratyphoid fever C,008,Intestinal infection,OMOP_442291,phecode_008
195177,85904008,Paratyphoid fever,A01.4,"Paratyphoid fever, unspecified",008,Intestinal infection,OMOP_195177,phecode_008
441500,66107000,Bacterial food poisoning,A05.9,"Bacterial foodborne intoxication, unspecified",008,Intestinal infection,OMOP_441500,phecode_008


In [110]:
# Count of distinct phecodes in the OMOP -> phecode mapping.
omop_phecode_map[["phecode"]] %>%
  unique() %>%
  length()

In [81]:
# Show the ICD-10 -> phecode table with its `PheCode` column renamed to
# `phecode`.
# NOTE(review): the join that builds omop_phecode_map already refers to columns
# `icd10` and `phecode` on phecode_icd10, so this cell presumably targets an
# earlier version of the mapping file that had a `PheCode` column -- confirm
# before re-running.
phecode_icd10 %>% rename(phecode = "PheCode") %>% as_tibble()

ICD10,ICD10 String,phecode,Phenotype,Excl. Phecodes,Excl. Phenotypes
<chr>,<chr>,<dbl>,<chr>,<chr>,<chr>
A00,Cholera,8.00,Intestinal infection,001-009.99,Intestinal infection
A00.0,"Cholera due to Vibrio cholerae 01, biovar cholerae",8.00,Intestinal infection,001-009.99,Intestinal infection
A00.1,"Cholera due to Vibrio cholerae 01, biovar eltor",8.00,Intestinal infection,001-009.99,Intestinal infection
A00.9,"Cholera, unspecified",8.00,Intestinal infection,001-009.99,Intestinal infection
A01,Typhoid and paratyphoid fevers,8.00,Intestinal infection,001-009.99,Intestinal infection
A01.0,Typhoid fever,8.50,Bacterial enteritis,001-009.99,Intestinal infection
A01.1,Paratyphoid fever A,8.00,Intestinal infection,001-009.99,Intestinal infection
A01.2,Paratyphoid fever B,8.00,Intestinal infection,001-009.99,Intestinal infection
A01.3,Paratyphoid fever C,8.00,Intestinal infection,001-009.99,Intestinal infection
A01.4,"Paratyphoid fever, unspecified",8.00,Intestinal infection,001-009.99,Intestinal infection


In [76]:
# Phecode / endpoint pairs from the definition table.
phecode_defs %>%
  select(phecode, endpoint) %>%
  as_tibble()

phecode,endpoint
<chr>,<glue>
008,phecode_008
010,phecode_010
079,phecode_079
041,phecode_041
054,phecode_054
069,phecode_069
061,phecode_061
099,phecode_099
078,phecode_078
097.1,phecode_097-1


In [22]:
# Inspect the concepts registered under the plain "ICD10" vocabulary.
concept %>%
  filter(vocabulary_id == "ICD10") %>%
  as_tibble()

concept_id,concept_name,domain_id,vocabulary_id,concept_class_id,standard_concept,concept_code,valid_start_date,valid_end_date,invalid_reason
<int>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<int>,<int>,<chr>
45532996,"Invalid ICD10 Concept, do not use",Condition,ICD10,ICD10 code,,45532996,20061230,20160413,D
45533053,"Invalid ICD10 Concept, do not use",Condition,ICD10,ICD10 code,,45533053,20061230,20160413,D
45533054,"Invalid ICD10 Concept, do not use",Condition,ICD10,ICD10 code,,45533054,20061230,20160413,D
45533055,"Invalid ICD10 Concept, do not use",Condition,ICD10,ICD10 code,,45533055,20061230,20160413,D
45533056,"Invalid ICD10 Concept, do not use",Condition,ICD10,ICD10 code,,45533056,20061230,20160413,D
45533057,"Invalid ICD10 Concept, do not use",Condition,ICD10,ICD10 code,,45533057,20061230,20160413,D
45533059,"Invalid ICD10 Concept, do not use",Condition,ICD10,ICD10 code,,45533059,20061230,20160413,D
45533060,"Invalid ICD10 Concept, do not use",Condition,ICD10,ICD10 code,,45533060,20061230,20160413,D
45533061,"Invalid ICD10 Concept, do not use",Condition,ICD10,ICD10 code,,45533061,20061230,20160413,D
45533064,"Invalid ICD10 Concept, do not use",Condition,ICD10,ICD10 code,,45533064,20061230,20160413,D


In [None]:
# TODO: unfinished placeholders. The right-hand sides were never written, and
# an assignment without a value does not parse, so both lines are commented
# out until they are completed.
# sct_codes <-
# icd10_codes <-

In [16]:
# Preview the first rows of the relationship table. The previous
# `concept_relationship.query()` is pandas-style syntax; in R it is a call to
# an undefined function named `concept_relationship.query`.
head(concept_relationship, 10)

concept_id_1,concept_id_2,relationship_id,valid_start_date,valid_end_date,invalid_reason
<int>,<int>,<chr>,<int>,<int>,<lgl>
19060425,19060425,Mapped from,19700101,20991231,
19060425,19060425,Maps to,19700101,20991231,
19060425,19082573,RxNorm has dose form,20050404,20991231,
19060425,36219015,RxNorm is a,20160801,20991231,
19060425,42854649,Mapped from,20120702,20991231,
19060425,42854649,RxNorm - VAProd eq,20141201,20991231,
19060429,19060429,Mapped from,19700101,20991231,
19060429,19060429,Maps to,19700101,20991231,
19060431,2044187,Has marketed form,20200731,20991231,
19060431,19060431,Mapped from,19700101,20991231,
