# NCATS Translator Workflow 5, Modules 1-4 - Red Team (COHD)
## Template implementation 
This is a Red Team template implementation of NCATS Translator Workflow 5, Modules 1-4 using COHD. This demonstrates an approach for using COHD to find conditions enhanced within simple cohorts. 

In [1]:
import pandas as pd
import numpy as np
from cohd_requests import *

### Display settings (optional)

In [2]:
# Pandas display options: allow up to 255 characters per column and do not
# limit the number of rows shown
pd.options.display.max_colwidth = 255
pd.options.display.max_rows = None

# Wider notebook display
# `display` and `HTML` come from the public `IPython.display` module; importing
# them from `IPython.core.display` is deprecated and fails on recent IPython.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

## 1) Specify which dataset you want to work with

### 1.1) Show the available data sets to choose from

In [3]:
# Fetch the table of available data sets and render it below
df = datasets()
display(df)

Unnamed: 0,dataset_id,dataset_name,dataset_description
0,1,5-year non-hierarchical,Clinical data from 2013-2017. Each concept's count reflects the use of that specific concept.
1,2,Lifetime non-hierarchical,Clinical data from all years in the database. Each concept's count reflects the use of that specific concept.
2,3,5-year hierarchical,Clinical data from 2013-2017. Each concept's count includes use of that concept and descendant concepts.


In [4]:
# Choose the data set that you want by ID (the dataset_id column of the table above).
# ID 1 is listed there as "5-year non-hierarchical".
dataset_id = 1

## 2) A few options for specifying which concept to use to "define the cohort"

### 2.1) Define a cohort using demographics
COHD has basic demographic data (gender, ethnicity, and race) and data on conditions, drugs, and procedures. Here's a quick way of looking up the available concepts for demographic data.  
Note: There's a lot of missing data in race and ethnicity, hence the total frequencies in either domain are much less than 1. 

In [5]:
# Show the most frequent concepts (limit 1000) for each demographic domain, one table per domain
for demographic_domain in ('Gender', 'Ethnicity', 'Race'):
    display(most_frequent_concepts(limit=1000, dataset_id=dataset_id, domain_id=demographic_domain))

Unnamed: 0,concept_class_id,concept_count,concept_frequency,concept_id,concept_name,dataset_id,domain_id,vocabulary_id
0,Gender,1037034,0.579209,8532,FEMALE,1,Gender,Gender
1,Gender,753863,0.421051,8507,MALE,1,Gender,Gender
2,Gender,232,0.00013,8570,AMBIGUOUS,1,Gender,Gender
3,Gender,10,6e-06,8551,UNKNOWN,1,Gender,Gender


Unnamed: 0,concept_class_id,concept_count,concept_frequency,concept_id,concept_name,dataset_id,domain_id,vocabulary_id
0,Ethnicity,397386,0.22195,38003564,Not Hispanic or Latino,1,Ethnicity,Ethnicity
1,Ethnicity,194634,0.108708,38003563,Hispanic or Latino,1,Ethnicity,Ethnicity


Unnamed: 0,concept_class_id,concept_count,concept_frequency,concept_id,concept_name,dataset_id,domain_id,vocabulary_id
0,Race,418405,0.23369,8527,White,1,Race,Race
1,Race,106799,0.05965,8516,Black or African American,1,Race,Race
2,Race,32109,0.017934,8552,Unknown,1,Race,Race
3,Race,24020,0.013416,8515,Asian,1,Race,Race
4,Race,5282,0.00295,38003613,Other Pacific Islander,1,Race,Race
5,Race,1906,0.001065,8657,American Indian or Alaska Native,1,Race,Race
6,Race,1815,0.001014,38003574,Asian Indian,1,Race,Race
7,Race,1023,0.000571,38003586,Laotian,1,Race,Race
8,Race,834,0.000466,38003579,Chinese,1,Race,Race
9,Race,415,0.000232,38003585,Korean,1,Race,Race


In [6]:
# Choosing the concept for 'Female' (concept_id 8532 in the Gender table above).
# NOTE: sections 2.1-2.4 each assign `concept_id`; whichever of those cells
# runs last determines the concept used in the rest of the notebook.
concept_id = 8532

### 2.2) Start with a concept from an external ontology and try to map to OMOP concepts

In [7]:
# Map a CURIE from an external ontology to OMOP concepts and show all candidate mappings
external_curie = 'HP:0005110'  # Human Phenotype Ontology: Atrial fibrillation
df_xref = xref_to_omop(external_curie, distance=2)
display(df_xref)

# I'm being naive and choosing the first OMOP concept mapping (shortest distance)
# NOTE(review): row 0 is taken as-is, so this presumably relies on xref_to_omop
# returning rows ordered by total_distance, and it will fail on an empty result — confirm
concept_id = df_xref.iloc[0]['omop_standard_concept_id']

Unnamed: 0,source_oxo_id,source_oxo_label,intermediate_oxo_id,intermediate_oxo_label,omop_standard_concept_id,omop_concept_name,omop_domain_id,omop_distance,total_distance
0,HP:0005110,Atrial fibrillation,SNOMEDCT:49436004,Atrial fibrillation,313217,Atrial fibrillation,Condition,0,1
1,HP:0005110,Atrial fibrillation,MeSH:D001281,Atrial Fibrillation,313217,Atrial fibrillation,Condition,1,2
2,HP:0005110,Atrial fibrillation,UMLS:C0004238,Atrial Fibrillation,35204953,Atrial fibrillation,Condition,1,2
3,HP:0005110,Atrial fibrillation,UMLS:C0004238,Atrial Fibrillation,4344544,Atrial Fibrillation,Drug,1,2
4,HP:0005110,Atrial fibrillation,UMLS:C0004238,Atrial Fibrillation,313217,Atrial fibrillation,Condition,1,2
5,HP:0005110,Atrial fibrillation,UMLS:C0004238,Atrial Fibrillation,45883018,Atrial fibrillation,Meas Value,1,2
6,HP:0005110,Atrial fibrillation,SNOMEDCT:266306001,(Atrial fibrillation) or (atrial flutter),313217,Atrial fibrillation,Condition,1,3
7,HP:0005110,Atrial fibrillation,ICD9CM:427.31,Atrial fibrillation,313217,Atrial fibrillation,Condition,1,3
8,HP:0005110,Atrial fibrillation,UMLS:C2585653,Persistent Atrial Fibrillation,42890784,Persistent atrial fibrillation,Condition,1,3
9,HP:0005110,Atrial fibrillation,UMLS:C2585653,Persistent Atrial Fibrillation,4232697,Persistent atrial fibrillation,Condition,1,3


### 2.3) Search for a concept by name and manually choose a concept ID

In [8]:
# Look up concepts by name in the chosen data set and show the matches.
# NOTE(review): min_count=1 presumably drops concepts that never occur in the data set — confirm
search = 'Atrial fibrillation'
df_search = find_concept(search, min_count=1, dataset_id=dataset_id)
display(df_search)

Unnamed: 0,concept_id,concept_name,domain_id,concept_class_id,vocabulary_id,concept_code,concept_count
0,313217,Atrial fibrillation,Condition,Clinical Finding,SNOMED,49436004,49452.0
1,4154290,Paroxysmal atrial fibrillation,Condition,Clinical Finding,SNOMED,282825002,12453.0
2,4141360,Chronic atrial fibrillation,Condition,Clinical Finding,SNOMED,426749004,3480.0
3,4232697,Persistent atrial fibrillation,Condition,Clinical Finding,SNOMED,440059007,2131.0
4,43528009,Additional linear or focal intracardiac catheter ablation of the left or right atrium for treatment of atrial fibrillation remaining after completion of pulmonary vein isolation (List separately in addition to code for primary procedure),Procedure,CPT4,CPT4,93657,515.0
5,2101903,Documentation of permanent or persistent or paroxysmal atrial fibrillation (STR),Observation,CPT4,CPT4,1060F,203.0
6,4108832,Atrial fibrillation and flutter,Condition,Clinical Finding,SNOMED,195080001,107.0
7,2108844,Patient receiving warfarin therapy for nonvalvular atrial fibrillation or atrial flutter (AFIB),Observation,CPT4,CPT4,4300F,39.0


In [9]:
# Choose the concept ID: take the first search result (row 0 of df_search).
# In the listing above the rows appear in descending concept_count order, so
# row 0 is the most frequently recorded match.
concept_id = df_search.iloc[0]['concept_id']

### 2.4) Start with a known OMOP concept ID
You can use [COHD's UI](http://cohd.io) to do a simple search that only includes concepts found in the COHD data, or you can use OHDSI's tools like [Atlas](http://www.ohdsi.org/web/atlas/#/home) (slower) or [Athena](http://athena.ohdsi.org/search-terms/terms) (faster) for a more powerful search of OMOP concepts.

In [10]:
# Set the OMOP concept ID directly. This assignment replaces any concept_id
# chosen in sections 2.1-2.3.
concept_id = 434902  # Autistic disorder of childhood onset

## 3) Two options for finding associated conditions

### 3.1) Using chi-square analysis to find conditions co-occurring at a rate different from expected 

In [11]:
# Chi-square association between the concept of interest and concepts in the
# Condition domain (no specific second concept is given).
# NOTE(review): the filter in section 4.1 reads an 'observed_count' column,
# which is not among the columns listed for this result — confirm before
# continuing with the chi-square table instead of the one from section 3.2.
df_association = chi_square(
    concept_id,
    concept_id_2=None,
    domain_id='Condition',
    dataset_id=dataset_id,
)

# Preview the first 100 rows
display(df_association.iloc[:100])

Unnamed: 0,dataset_id,concept_id_1,concept_id_2,concept_2_name,concept_2_domain,chi_square,p-value
0,1,434902,439703,Active infantile autism,Condition,298449.73889,0.0
1,1,434902,439780,Autistic disorder,Condition,222187.021627,0.0
2,1,434902,40486120,Delay in physiological development,Condition,35600.360156,0.0
3,1,434902,439776,Autism spectrum disorder,Condition,23935.664517,0.0
4,1,434902,4102986,Disorder of psychological development,Condition,21593.24752,0.0
5,1,434902,436373,Developmental speech disorder,Condition,20377.754949,0.0
6,1,434902,4047124,Expressive language disorder,Condition,14118.691324,0.0
7,1,434902,441277,Mixed receptive-expressive language disorder,Condition,13166.829474,0.0
8,1,434902,436077,Developmental delay,Condition,13012.560803,0.0
9,1,434902,436233,Delayed milestone,Condition,12490.660941,0.0


### 3.2) Use the observed-to-expected frequency ratio to find conditions occurring with the concept of interest more (positive ln_ratio) or less (negative ln_ratio) often than expected
Note: The 'relative frequency' method is a third option which uses a different calculation but produces concepts ranked in the same order

In [12]:
# Observed-to-expected frequency ratio between the concept of interest and
# concepts in the Condition domain (no specific second concept is given).
# This overwrites any df_association produced by an earlier cell.
df_association = obs_exp_ratio(
    concept_id,
    concept_id_2=None,
    domain_id='Condition',
    dataset_id=dataset_id,
)

# Preview the first 100 rows
display(df_association.iloc[:100])

Unnamed: 0,dataset_id,concept_id_1,concept_id_2,concept_2_name,concept_2_domain,observed_count,expected_count,ln_ratio
0,1,434902,439702,Residual infantile autism,Condition,13,0.033726,5.954437
1,1,434902,439703,Active infantile autism,Condition,1013,3.42459,5.68969
2,1,434902,439780,Autistic disorder,Condition,759,2.581441,5.683654
3,1,434902,435244,Developmental disorder,Condition,28,0.22765,4.812149
4,1,434902,439776,Autism spectrum disorder,Condition,205,1.729861,4.774969
5,1,434902,4178664,Anomaly of chromosome pair,Condition,14,0.13912,4.611479
6,1,434902,441277,Mixed receptive-expressive language disorder,Condition,134,1.339202,4.605766
7,1,434902,4102986,Disorder of psychological development,Condition,221,2.221697,4.599891
8,1,434902,40486120,Delay in physiological development,Condition,426,4.995657,4.44587
9,1,434902,4168553,Electroencephalogram abnormal,Condition,90,1.083446,4.419663


## 4) Filter the list of associated conditions

### 4.1) Exclude concept-pairs with low co-occurrence because these results may be heavily swayed by the Poisson randomization

In [13]:
# A concept pair is kept only if its observed co-occurrence count is strictly
# greater than this threshold
cooccurrence_threshold = 50
df_association = df_association.loc[
    df_association['observed_count'].gt(cooccurrence_threshold)
].reset_index(drop=True)
display(df_association)

Unnamed: 0,dataset_id,concept_id_1,concept_id_2,concept_2_name,concept_2_domain,observed_count,expected_count,ln_ratio
0,1,434902,439703,Active infantile autism,Condition,1013,3.42459,5.68969
1,1,434902,439780,Autistic disorder,Condition,759,2.581441,5.683654
2,1,434902,439776,Autism spectrum disorder,Condition,205,1.729861,4.774969
3,1,434902,441277,Mixed receptive-expressive language disorder,Condition,134,1.339202,4.605766
4,1,434902,4102986,Disorder of psychological development,Condition,221,2.221697,4.599891
5,1,434902,40486120,Delay in physiological development,Condition,426,4.995657,4.44587
6,1,434902,4168553,Electroencephalogram abnormal,Condition,90,1.083446,4.419663
7,1,434902,437092,Physiological development failure,Condition,186,2.727587,4.222329
8,1,434902,4148091,Developmental disorder of motor function,Condition,97,1.523289,4.153839
9,1,434902,4275359,Mental alertness - finding,Condition,95,1.52891,4.129322


### 4.2) Get rid of associated concepts that are ancestors or descendants of the concept of interest
Co-occurrences between concepts that are hierarchically related may not be of much interest, so remove them from the list

In [14]:
# Hierarchy look-ups are made against data set 3 (listed above as
# "5-year hierarchical") rather than the data set chosen for the analysis.
df_ancestors = concept_ancestors(concept_id, dataset_id=3)
display(df_ancestors)

df_descendants = concept_descendants(concept_id, dataset_id=3)
display(df_descendants)

Unnamed: 0,ancestor_concept_id,concept_name,domain_id,vocabulary_id,concept_class_id,concept_code,standard_concept,concept_count,max_levels_of_separation,min_levels_of_separation
0,434902,Autistic disorder of childhood onset,Condition,SNOMED,Clinical Finding,43614003,S,3914,0,0
1,439780,Autistic disorder,Condition,SNOMED,Clinical Finding,408856003,S,4598,1,1
2,36703485,Developmental disorders cognitive,Condition,MedDRA,HLT,10012561,C,4606,2,2
3,439776,Autism spectrum disorder,Condition,SNOMED,Clinical Finding,35919005,S,5591,2,2
4,36902271,Developmental disorders NEC,Condition,MedDRA,HLGT,10012562,C,5951,3,3
5,4102986,Disorder of psychological development,Condition,SNOMED,Clinical Finding,192562009,S,7225,3,3
6,4043545,Developmental mental disorder,Condition,SNOMED,Clinical Finding,129104009,S,18021,4,4
7,45771096,Neurodevelopmental disorder,Condition,SNOMED,Clinical Finding,700364009,S,28282,5,5
8,4008565,"Mental disorder usually first evident in infancy, childhood AND/OR adolescence",Condition,SNOMED,Clinical Finding,111476001,S,33612,5,5
9,36702245,Mental impairment disorders,Condition,MedDRA,HLGT,10057167,C,40493,3,3


Unnamed: 0,descendant_concept_id,concept_name,domain_id,vocabulary_id,concept_class_id,concept_code,standard_concept,concept_count,max_levels_of_separation,min_levels_of_separation
0,434902,Autistic disorder of childhood onset,Condition,SNOMED,Clinical Finding,43614003,S,3914,0,0
1,439703,Active infantile autism,Condition,SNOMED,Clinical Finding,191689008,S,2453,1,1
2,439702,Residual infantile autism,Condition,SNOMED,Clinical Finding,191690004,S,29,1,1


In [15]:
# IDs of every ancestor and descendant of the concept of interest. In the
# listings above, both tables also include the concept itself.
hierarchical_concepts = set(df_ancestors.ancestor_concept_id) | set(df_descendants.descendant_concept_id)

# Drop associations whose second concept is hierarchically related. The
# membership test is vectorized with Series.isin instead of building a boolean
# list with a Python-level loop over the column.
df_association = df_association[~df_association.concept_id_2.isin(hierarchical_concepts)].reset_index(drop=True)
display(df_association)

Unnamed: 0,dataset_id,concept_id_1,concept_id_2,concept_2_name,concept_2_domain,observed_count,expected_count,ln_ratio
0,1,434902,441277,Mixed receptive-expressive language disorder,Condition,134,1.339202,4.605766
1,1,434902,40486120,Delay in physiological development,Condition,426,4.995657,4.44587
2,1,434902,4168553,Electroencephalogram abnormal,Condition,90,1.083446,4.419663
3,1,434902,437092,Physiological development failure,Condition,186,2.727587,4.222329
4,1,434902,4148091,Developmental disorder of motor function,Condition,97,1.523289,4.153839
5,1,434902,4275359,Mental alertness - finding,Condition,95,1.52891,4.129322
6,1,434902,436373,Developmental speech disorder,Condition,347,5.736223,4.102524
7,1,434902,436233,Delayed milestone,Condition,219,3.721097,4.075053
8,1,434902,440389,Mental retardation,Condition,159,2.809091,4.036043
9,1,434902,434153,Congenital chromosomal disease,Condition,80,1.557015,3.939256


### 4.3) Keep only the strongest associations

In [16]:
# Keep the top 100 associations (the first 100 rows in the current order).
# An explicit copy is taken so that columns added to this frame later are
# written to an independent DataFrame rather than to a slice of the filtered
# table, which would trigger pandas' SettingWithCopyWarning.
df_association = df_association.iloc[0:100, :].copy()
display(df_association)

Unnamed: 0,dataset_id,concept_id_1,concept_id_2,concept_2_name,concept_2_domain,observed_count,expected_count,ln_ratio
0,1,434902,441277,Mixed receptive-expressive language disorder,Condition,134,1.339202,4.605766
1,1,434902,40486120,Delay in physiological development,Condition,426,4.995657,4.44587
2,1,434902,4168553,Electroencephalogram abnormal,Condition,90,1.083446,4.419663
3,1,434902,437092,Physiological development failure,Condition,186,2.727587,4.222329
4,1,434902,4148091,Developmental disorder of motor function,Condition,97,1.523289,4.153839
5,1,434902,4275359,Mental alertness - finding,Condition,95,1.52891,4.129322
6,1,434902,436373,Developmental speech disorder,Condition,347,5.736223,4.102524
7,1,434902,436233,Delayed milestone,Condition,219,3.721097,4.075053
8,1,434902,440389,Mental retardation,Condition,159,2.809091,4.036043
9,1,434902,434153,Congenital chromosomal disease,Condition,80,1.557015,3.939256


## 5) Optionally, attempt to convert conditions from OMOP back to external ontology
Note: This may take a little while (~1-2 sec / concept) and is not guaranteed to find mappings for all concepts.

In [17]:
target_ontology = 'HP'

# Work on a fresh DataFrame with a 0..n-1 index. reset_index returns a new
# object, so adding columns below does not write into a slice of another frame
# (SettingWithCopyWarning), and the label-based `.loc[i, ...]` access in the
# loop is guaranteed to line up with the positional loop counter.
df_association = df_association.reset_index(drop=True)
nrows = df_association.shape[0]

# Placeholder values for concepts that end up without a mapping
df_association['external_concept'] = 'N/A'
df_association['external_label'] = 'N/A'
df_association['distance'] = 'N/A'

for i in range(nrows):
    # Show some progress updates
    if i % 10 == 0:
        print('xref progress: {curr} / {total}'.format(curr=i, total=nrows))

    # Attempt to match each concept to the external ontology
    source_concept_id = df_association.loc[i, 'concept_id_2']
    df_xref = xref_from_omop(source_concept_id, mapping_targets=target_ontology, distance=3, local=True, recommend=True)
    if not df_xref.empty:
        # Use the first returned mapping, selected by position so that it does
        # not depend on df_xref carrying an index label of 0
        first_match = df_xref[['target_curie', 'target_label', 'total_distance']].iloc[0]
        df_association.loc[i, ['external_concept', 'external_label', 'distance']] = first_match.values

display(df_association)

xref progress: 0 / 100
xref progress: 10 / 100
xref progress: 20 / 100
xref progress: 30 / 100
xref progress: 40 / 100
xref progress: 50 / 100
xref progress: 60 / 100
xref progress: 70 / 100
xref progress: 80 / 100
xref progress: 90 / 100


Unnamed: 0,dataset_id,concept_id_1,concept_id_2,concept_2_name,concept_2_domain,observed_count,expected_count,ln_ratio,external_concept,external_label,distance
0,1,434902,441277,Mixed receptive-expressive language disorder,Condition,134,1.339202,4.605766,,,
1,1,434902,40486120,Delay in physiological development,Condition,426,4.995657,4.44587,,,
2,1,434902,4168553,Electroencephalogram abnormal,Condition,90,1.083446,4.419663,HP:0002353,Electroencephalogram abnormalities,1.0
3,1,434902,437092,Physiological development failure,Condition,186,2.727587,4.222329,,,
4,1,434902,4148091,Developmental disorder of motor function,Condition,97,1.523289,4.153839,,,
5,1,434902,4275359,Mental alertness - finding,Condition,95,1.52891,4.129322,,,
6,1,434902,436373,Developmental speech disorder,Condition,347,5.736223,4.102524,,,
7,1,434902,436233,Delayed milestone,Condition,219,3.721097,4.075053,,,
8,1,434902,440389,Mental retardation,Condition,159,2.809091,4.036043,HP:0001249,Nonprogressive mental retardation,1.0
9,1,434902,434153,Congenital chromosomal disease,Condition,80,1.557015,3.939256,,,
