# NCATS Translator Workflow 5, Modules 1-4 - Red Team (COHD)
## Gender-related conditions
This is a Red Team implementation of NCATS Translator Workflow 5, Modules 1-4 using COHD to find conditions more prevalent in women than in men and vice versa.

In [1]:
import pandas as pd
import numpy as np
from cohd_requests import *
from cohd_utilities import *

### Display settings (optional)

In [2]:
# Pandas display options: allow long text columns (up to 255 characters)
# and lift the cap on the number of rows rendered for a DataFrame
pd.options.display.max_colwidth = 255
pd.options.display.max_rows = None

# Wider notebook display
# `display` and `HTML` come from the public IPython.display module; importing
# them from IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

## 1) Using the 5-year non-hierarchical data set

In [3]:
# COHD data set to query; per the section heading, ID 3 is the 5-year non-hierarchical data set
dataset_id = 3

## 2) Define male and female cohorts

In [4]:
# Concept IDs that define the two cohorts compared throughout this notebook
concept_female = 8532  # concept ID used for the female cohort
concept_male = 8507  # concept ID used for the male cohort
# Domain of the concepts that will be associated with each gender
domain = 'Condition'  # This can be changed to 'Drug' or 'Procedure' to find concepts in those domains instead

## 3) Get conditions associated with each gender

In [5]:
# Query the observed/expected ratio of every concept in `domain` against each
# gender concept (concept_id_2=None leaves the second concept unrestricted).
# The female query runs first, then the male query.
df_association_female, df_association_male = (
    obs_exp_ratio(gender_concept, concept_id_2=None, domain_id=domain, dataset_id=dataset_id)
    for gender_concept in (concept_female, concept_male)
)

## 4) Filter the list of associated conditions

### 4.1) Exclude gender-specific conditions from list of associated conditions.
Many of the conditions enriched in each gender are simply conditions that do not occur in the opposite gender. Identify them by comparing each concept's single-concept prevalence with its gender co-occurrence count: if the observed co-occurrence is much lower than the co-occurrence expected under independence (or does not appear in the data at all), exclude that concept from the list.

In [6]:
# Fetch the single-concept frequencies of the two gender concepts
df_gender_prev = concept_frequency([concept_female, concept_male], dataset_id=dataset_id)

# Prevalence of each gender: concept_frequency of the first row matching its concept ID
prev_female = df_gender_prev.loc[df_gender_prev['concept_id'] == concept_female, 'concept_frequency'].iloc[0]
prev_male = df_gender_prev.loc[df_gender_prev['concept_id'] == concept_male, 'concept_frequency'].iloc[0]

# Single-concept prevalences for all concepts in the domain, indexed by concept ID for the join below
df_conditions = most_frequent_concepts(limit=1000000, dataset_id=dataset_id, domain_id=domain).set_index('concept_id')

# Re-index both association tables by the associated concept so they align with df_conditions
df_association_female = df_association_female.set_index('concept_id_2')
df_association_male = df_association_male.set_index('concept_id_2')

# Extract only the observed counts, named per gender so the columns do not collide in the join
df_association_female_join = df_association_female['observed_count'].to_frame('observed_count_female')
df_association_male_join = df_association_male['observed_count'].to_frame('observed_count_male')

# Left-join the per-gender co-occurrence counts onto the single-concept table;
# concepts without a co-occurrence row for a gender get a missing value in that column
df_joined = df_conditions.join([df_association_female_join, df_association_male_join], how='left')

# Flag (potentially) gender-specific concepts: those whose observed co-occurrence with a
# gender is far below the co-occurrence expected from the single-concept count.
# Co-occurrences <= 10 do not appear in the COHD data, so a missing count is also flagged.
def gender_specific(row):
    """Return a truthy value when the concept in `row` looks gender specific.

    `row` must provide 'concept_count', 'observed_count_female' and
    'observed_count_male'. The module-level `prev_female` and `prev_male`
    prevalences are read at call time.

    A concept is flagged when, for either gender (female checked first), the
    observed count is missing or is below concept_count * prevalence / 10.
    """
    threshold = 10  # observed count must reach at least 1/threshold of the expected count
    per_gender = (
        ('observed_count_female', prev_female),
        ('observed_count_male', prev_male),
    )
    for count_column, prevalence in per_gender:
        observed = row[count_column]
        if np.isnan(observed):
            return True
        if observed < row['concept_count'] * prevalence / threshold:
            return True
    return False
    
# Evaluate the flag row by row and keep a one-column frame of it for the joins below
df_joined['gender_specific'] = df_joined.apply(gender_specific, axis=1)
df_gender_specific = df_joined[['gender_specific']]

# Show the concepts detected as gender specific
display(df_joined[df_joined['gender_specific']])

# Remove gender-specific concepts from the associations: look up each associated
# concept's flag via a left join on the index, then keep only the unflagged rows.
# NOTE(review): a concept missing from df_joined would get a missing flag here - confirm
# that every associated concept also appears in df_conditions.
df_association_female_filtered = df_association_female[
    ~df_association_female.join(df_gender_specific, how='left')['gender_specific']
]
df_association_male_filtered = df_association_male[
    ~df_association_male.join(df_gender_specific, how='left')['gender_specific']
]

Unnamed: 0_level_0,concept_class_id,concept_count,concept_frequency,concept_name,dataset_id,domain_id,vocabulary_id,observed_count_female,observed_count_male,gender_specific
concept_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
4095793,Clinical Finding,151396,0.087418,Female genitalia finding,3,Condition,SNOMED,150676.0,1173.0,True
4180154,Clinical Finding,118427,0.068381,Female reproductive system disorder,3,Condition,SNOMED,117085.0,1023.0,True
37102309,HLGT,109563,0.063263,Vulvovaginal disorders (excl infections and inflammations),3,Condition,MedDRA,108330.0,865.0,True
37103738,HLT,107572,0.062114,Vulvovaginal disorders NEC,3,Condition,MedDRA,106568.0,869.0,True
443343,Clinical Finding,104781,0.060502,Disorder of female genital system,3,Condition,SNOMED,103900.0,816.0,True
200452,Clinical Finding,103102,0.059533,Disorder of female genital organs,3,Condition,SNOMED,102467.0,720.0,True
4090861,Clinical Finding,72916,0.042103,Male genitalia finding,3,Condition,SNOMED,1502.0,71575.0,True
4181194,Clinical Finding,72294,0.041744,Disorder of male reproductive system,3,Condition,SNOMED,1557.0,70949.0,True
196738,Clinical Finding,71303,0.041171,Disorder of male genital organ,3,Condition,SNOMED,1552.0,69964.0,True
4124486,Clinical Finding,70473,0.040692,Uterus finding,3,Condition,SNOMED,70836.0,199.0,True


## 5) Calculate the ratio of prevalence between genders
Add columns with ratio between genders for easier interpretation

In [7]:
# Pair each concept's female association with its male association; the inner join keeps
# only concepts present in both filtered tables, and overlapping columns get a gender suffix
df_sex = df_association_female_filtered.join(
    df_association_male_filtered[['concept_id_1', 'observed_count', 'expected_count', 'ln_ratio']],
    how='inner',
    lsuffix='_female',
    rsuffix='_male',
)

# Female-vs-male log ratio: subtracting the ln_ratios is the same as
# taking the log of the female ratio divided by the male ratio
df_sex['ln_ratio_female_vs_male'] = df_sex['ln_ratio_female'] - df_sex['ln_ratio_male']

# Calculate 99.9% confidence intervals from Poisson distribution, row by row;
# the helper's three outputs are expanded into the three new columns
df_sex[['ci_female', 'ci_male', 'significant']] = df_sex.apply(
    PoissonSignificance2,
    axis=1,
    result_type='expand',
    c_oc1='observed_count_female',
    c_oc2='observed_count_male',
    c_lr1='ln_ratio_female',
    c_lr2='ln_ratio_male',
    interval=0.999,
)

  return np.log(PoissonConfInt(rate, interval, iterations) * np.exp(ln_ratio) / rate)


### Female
Showing the top 1000 conditions more prevalent in females

In [8]:
# Significant rows with the largest female-vs-male log ratio first,
# i.e. the conditions most strongly skewed towards females (first 1000 rows)
display(
    df_sex.loc[df_sex['significant']]
    .sort_values('ln_ratio_female_vs_male', ascending=False)
    .head(1000)
)

Unnamed: 0_level_0,dataset_id,concept_id_1_female,concept_2_name,concept_2_domain,observed_count_female,expected_count_female,ln_ratio_female,concept_id_1_male,observed_count_male,expected_count_male,ln_ratio_male,ln_ratio_female_vs_male,ci_female,ci_male,significant
concept_id_2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
37303826,3,8532,Hypertrichoses,Condition,1745,967.29962,0.590002,8507,74,736.500285,-2.297845,2.887846,"[0.524886733056, 0.650052934118]","[-2.63161770886, -2.02719864391]",True
37103692,3,8532,Lactation disorders,Condition,656,366.495926,0.582173,8507,28,279.049375,-2.299184,2.881358,"[0.482891913131, 0.678037732661]","[-2.92333853538, -1.8937191182]",True
36102962,3,8532,Breast infections,Condition,434,247.356384,0.562214,8507,19,188.33673,-2.293792,2.856007,"[0.431969229328, 0.677444244172]","[-3.04100689871, -1.80424427156]",True
4128846,3,8532,Finding of structures of conception,Condition,16094,9459.679684,0.531408,8507,734,7202.584016,-2.283686,2.815094,"[0.510943730315, 0.551766041933]","[-2.37940508575, -2.19011138916]",True
37102294,3,8532,Breast disorders,Condition,51741,30860.545525,0.516772,8507,2395,23497.166853,-2.283497,2.800269,"[0.505049336572, 0.527862051492]","[-2.3371022256, -2.23381213829]",True
4022933,3,8532,Breast finding,Condition,51383,30748.213956,0.513475,8507,2425,23411.637788,-2.267402,2.780877,"[0.502182449588, 0.524796330907]","[-2.31945492979, -2.21478970358]",True
432375,3,8532,Triplet pregnancy,Condition,315,186.65195,0.523327,8507,15,142.116477,-2.248597,2.771924,"[0.369176305784, 0.659646155438]","[-3.16488751423, -1.73757117859]",True
37103689,3,8532,Breast disorders NEC,Condition,47156,28164.020544,0.515416,8507,2293,21444.037321,-2.235586,2.751002,"[0.503447943145, 0.526990918111]","[-2.29164154276, -2.18209239471]",True
4124325,3,8532,Lesion of vulva,Condition,3143,1883.539436,0.512025,8507,154,1434.123721,-2.231357,2.743382,"[0.465457936622, 0.556824130447]","[-2.4480680524, -2.04256262089]",True
40479565,3,8532,Carrier of cystic fibrosis gene mutation,Condition,5631,3428.949503,0.496033,8507,282,2610.796317,-2.225503,2.721537,"[0.461891774721, 0.529905052592]","[-2.39094700416, -2.08358537376]",True


### Male
Showing the top 1000 conditions more prevalent in males

In [9]:
# Significant rows with the smallest female-vs-male log ratio first,
# i.e. the conditions most strongly skewed towards males (first 1000 rows)
display(
    df_sex.loc[df_sex['significant']]
    .sort_values('ln_ratio_female_vs_male', ascending=True)
    .head(1000)
)

Unnamed: 0_level_0,dataset_id,concept_id_1_female,concept_2_name,concept_2_domain,observed_count_female,expected_count_female,ln_ratio_female,concept_id_1_male,observed_count_male,expected_count_male,ln_ratio_male,ln_ratio_female_vs_male,ci_female,ci_male,significant
concept_id_2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
43053699,3,8532,Cell marker analyses,Condition,432,4250.445016,-2.286353,8507,7089,3236.281603,0.784119,-3.070473,"[-2.41197959089, -2.17063007508]","[0.753466626275, 0.81468144822]",True
36903641,3,8532,Orgasmic disorders and disturbances,Condition,55,503.790066,-2.214826,8507,833,383.584899,0.775473,-2.990299,"[-2.6112417328, -1.89142630516]","[0.682440801707, 0.862784118489]",True
201792,3,8532,Nongonococcal urethritis,Condition,17,132.188159,-2.051013,8507,244,100.647839,0.885541,-2.936554,"[-2.80478481434, -1.51693052603]","[0.711382292877, 1.04087903771]",True
37102299,3,8532,Male reproductive tract infections and inflammations,Condition,593,4894.365877,-2.110646,8507,7974,3726.561851,0.7607,-2.871346,"[-2.22107798028, -2.00967921258]","[0.731303156614, 0.789866870674]",True
77673,3,8532,Sign or symptom of the urinary system,Condition,697,5516.728154,-2.068755,8507,9047,4200.42743,0.767247,-2.836002,"[-2.16823437534, -1.97566440406]","[0.739451789514, 0.794290869947]",True
199859,3,8532,Bilateral recurrent inguinal hernia,Condition,11,77.724368,-1.955274,8507,135,59.179202,0.824705,-2.779978,"[-3.25455653961, -1.35743655472]","[0.573390191633, 1.0192962693]",True
4167100,3,8532,Laceration of chest wall,Condition,10,66.377745,-1.892777,8507,121,50.539902,0.873027,-2.765804,"[-3.09674954858, -1.25092285808]","[0.620478421872, 1.0878721266]",True
197919,3,8532,Urethral stricture due to infection,Condition,8,48.790479,-1.808094,8507,89,37.148988,0.8737,-2.681793,"[-3.19438801814, -1.11494647646]","[0.559450740946, 1.12999559941]",True
195926,3,8532,Slowing of urinary stream,Condition,125,871.987986,-1.942462,8507,1387,663.930169,0.736721,-2.679183,"[-2.19606466748, -1.72735052907]","[0.666543402537, 0.803648737442]",True
4156664,3,8532,Deformity of femur,Condition,23,179.276645,-2.053436,8507,246,136.500933,0.589,-2.642436,"[-2.70402346625, -1.60541117758]","[0.411544722507, 0.739677846902]",True


## 6) Get drugs associated with each gender

In [11]:
# This step targets drugs, so query the 'Drug' domain explicitly. Passing `domain`
# (set to 'Condition' in step 2) would only re-fetch the condition associations.
df_association_female = obs_exp_ratio(concept_female, concept_id_2=None, domain_id='Drug', dataset_id=dataset_id)
df_association_male = obs_exp_ratio(concept_male, concept_id_2=None, domain_id='Drug', dataset_id=dataset_id)

## 7) Calculate the ratio of drug prevalence between genders

In [12]:
# Use the gender associations fetched in step 6 rather than the filtered tables left over
# from step 4, which would simply reproduce the step 5 result. The freshly fetched tables
# still carry concept_id_2 as a column, so index them by it to align the join.
# NOTE(review): the gender-specific filter from step 4.1 is not applied here because it was
# computed over the concepts of `domain` - confirm whether an equivalent filter is wanted.
df_step6_female = df_association_female.set_index('concept_id_2')
df_step6_male = df_association_male.set_index('concept_id_2')

# Join female and male associations on the associated concept; the inner join keeps only
# concepts present for both genders, and overlapping columns get a gender suffix
df_sex = df_step6_female.join(
    df_step6_male[['concept_id_1', 'observed_count', 'expected_count', 'ln_ratio']],
    how='inner',
    lsuffix='_female',
    rsuffix='_male',
)

# Female-vs-male log ratio: subtraction of ln_ratios is the same as division of the ratios
df_sex['ln_ratio_female_vs_male'] = (df_sex.ln_ratio_female - df_sex.ln_ratio_male)

# Calculate 99.9% confidence intervals from Poisson distribution
df_sex[['ci_female', 'ci_male', 'significant']] = \
    df_sex.apply(PoissonSignificance2, axis=1, result_type='expand', c_oc1='observed_count_female', c_oc2='observed_count_male',
                 c_lr1='ln_ratio_female', c_lr2='ln_ratio_male', interval=0.999)

  return np.log(PoissonConfInt(rate, interval, iterations) * np.exp(ln_ratio) / rate)


### Female
Showing the top 1000 drugs more prevalent in females

In [None]:
# Significant rows with the largest female-vs-male log ratio first,
# i.e. the concepts most strongly skewed towards females (first 1000 rows)
display(
    df_sex.loc[df_sex['significant']]
    .sort_values('ln_ratio_female_vs_male', ascending=False)
    .head(1000)
)

### Male
Showing the top 1000 drugs more prevalent in males

In [None]:
# Significant rows with the smallest female-vs-male log ratio first,
# i.e. the concepts most strongly skewed towards males (first 1000 rows)
display(
    df_sex.loc[df_sex['significant']]
    .sort_values('ln_ratio_female_vs_male', ascending=True)
    .head(1000)
)