## CQ1.7 - Gene-Gene Phenotype Similarity for FA Genes

Description: https://docs.google.com/document/d/10wvsoNLbbFOvHWVWRdYQdJ_PD0uO7SGGR-gIqCM8GWU/edit#

In [1]:
# Autogenerate the biolink_client package from the Monarch Swagger spec:
# curl --insecure -X POST -H "content-type:application/json" -d '{"swaggerUrl":"https://api.monarchinitiative.org/api/swagger.json"}' https://generator.swagger.io/api/gen/clients/python
# then rename the generated client to biolink_client

In [2]:
import os, sys
# Directory that holds the autogenerated ``biolink_client`` package.
# Set the BIOLINK_CLIENT_PATH environment variable to point at your own copy;
# when it is unset, the original hard-coded location is used.
BIOLINK_CLIENT_PATH = os.environ.get(
    "BIOLINK_CLIENT_PATH",
    "/home/gstupp/projects/NCATS-Tangerine/biolink_client",
)
# Put it first on the import path so that `import biolink_client` resolves there.
sys.path.insert(0, BIOLINK_CLIENT_PATH)

In [37]:
import biolink_client
from biolink_client.api_client import ApiClient
from biolink_client.rest import ApiException
import requests
from itertools import chain
from collections import Counter
import pandas as pd
from pprint import pprint
from tqdm import tqdm, tqdm_notebook
from collections import defaultdict

# Let pandas render up to 999 rows before truncating a displayed table.
pd.set_option("display.max_rows", 999)

# Service endpoints used throughout the notebook.
MONARCH_API = "https://api.monarchinitiative.org/api"
SIMSEARCH_API = "https://monarchinitiative.org/simsearch/phenotype"

# Location of the FA gene list (read below as tab-separated gene_id / symbol columns).
gene_list = "https://raw.githubusercontent.com/NCATS-Tangerine/cq-notebooks/master/FA_gene_sets/FA_4_all_genes.txt"

# Biolink client bound to the Monarch API, sending a plain-text Content-Type by default.
client = ApiClient(host=MONARCH_API)
client.set_default_header('Content-Type', 'text/plain')
api_instance = biolink_client.BioentityApi(client)

# Get the gene list from github and collect the distinct gene ids.
dataframe = pd.read_csv(gene_list, sep='\t', names=['gene_id', 'symbol'])
df = dataframe.set_index('symbol')
human_FA_genes = set(df['gene_id'])

In [12]:
def get_obj(obj_id):
    """Fetch the Monarch bioentity record for ``obj_id``.

    Issues a GET against the ``/api/bioentity/<obj_id>`` endpoint and returns
    the decoded JSON body.
    """
    endpoint = "https://api.monarchinitiative.org/api/bioentity/{}".format(obj_id)
    return requests.get(endpoint).json()
def get_taxon_from_gene(gene):
    """Return the taxon label found in the bioentity record of ``gene``."""
    record = get_obj(gene)
    taxon = record['taxon']
    return taxon['label']
def get_phenotype_from_gene(gene):
    """Return the phenotype ids listed for ``gene`` by monarchinitiative.org.

    Example endpoint:
    https://monarchinitiative.org/gene/NCBIGene%3A2176/phenotype_list.json
    """
    endpoint = "https://monarchinitiative.org/gene/{}/phenotype_list.json".format(gene)
    payload = requests.get(endpoint).json()
    phenotype_ids = []
    for entry in payload['phenotype_list']:
        phenotype_ids.append(entry['id'])
    return phenotype_ids
def get_phenotype_from_gene_verbose(gene):
    """Return ``(id, label)`` pairs for the phenotypes listed for ``gene``.

    Example endpoint:
    https://monarchinitiative.org/gene/NCBIGene%3A2176/phenotype_list.json
    """
    endpoint = "https://monarchinitiative.org/gene/{}/phenotype_list.json".format(gene)
    payload = requests.get(endpoint).json()
    pairs = []
    for entry in payload['phenotype_list']:
        pairs.append((entry['id'], entry['label']))
    return pairs
def query_orthologs(gene_id, taxon=None):
    """Query Monarch for the homologs of ``gene_id`` and return their ids.

    When ``taxon`` is truthy it is sent as the ``homolog_taxon`` query
    parameter; otherwise the request carries no query parameters. The result
    is the ``object.id`` of every entry in the response's ``associations``.
    """
    endpoint = "https://api.monarchinitiative.org/api/bioentity/gene/{}/homologs/".format(gene_id)
    # Only attach query parameters when a taxon filter was requested.
    extra = {'params': {'homolog_taxon': taxon}} if taxon else {}
    payload = requests.get(endpoint, **extra).json()
    homolog_ids = []
    for association in payload['associations']:
        homolog_ids.append(association['object']['id'])
    return homolog_ids
def get_phenotypes_from_disease(disease):
    """Return the decoded disease-to-phenotype response for ``disease``.

    The request asks for up to 1000 rows with ``fetch_objects=true``; the JSON
    body is returned as-is.
    """
    template = "https://api.monarchinitiative.org/api/bioentity/disease/{}/phenotypes/?fetch_objects=true&rows=1000"
    response = requests.get(template.format(disease))
    return response.json()
#query_orthologs('MGI:88276', taxon="NCBITaxon:9606")
#get_phenotype_from_gene("NCBIGene:2176")
#get_taxon_from_gene('NCBIGene:2176')

In [13]:
def get_phenotypically_similar_genes(phenotypes, taxon, return_all=False):
    """Search for genes in ``taxon`` that are phenotypically similar to ``phenotypes``.

    POSTs a form to the simsearch endpoint (``SIMSEARCH_API``).

    Args:
        phenotypes: iterable of phenotype id strings; they are joined with
            single spaces into the ``input_items`` form field.
        taxon: value sent as the ``target_species`` form field (this notebook
            passes NCBI taxonomy numbers such as 10090).
        return_all: when true, return the decoded JSON response unchanged.

    Returns:
        The full decoded response when ``return_all`` is set. Otherwise a list
        of ``(id, score, label)`` tuples built from the response's ``'b'``
        entries, or an empty list when the response has no ``'b'`` key.
    """
    headers = {
        # Only advertise encodings that requests can always decode. Brotli ('br')
        # is decoded only when an optional brotli package is installed, so
        # advertising it unconditionally could yield an undecodable body.
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-US,en;q=0.8',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
    }
    data = {'input_items': " ".join(phenotypes), "target_species": taxon}
    r = requests.post(SIMSEARCH_API, data=data, headers=headers)
    d = r.json()
    if return_all:
        return d
    # A response without a 'b' key is treated as "no matches".
    matches = d['b'] if 'b' in d else []
    return [(x['id'], x['score']['score'], x['label']) for x in matches]
#get_phenotypically_similar_genes(phenotypes, "10090")

In [14]:
# Step 1: Retrieve the set of phenotypes associated with Fanconi anemia (FA)
FA_DISEASE_ID = "DOID:13636"
phenotype_results = get_phenotypes_from_disease(FA_DISEASE_ID)

In [15]:
# Look at the results: order the associations by phenotype label, then print
# each one as "subject (id) -> relation -> phenotype (id)".
phenotype_results['associations'] = sorted(
    phenotype_results['associations'],
    key=lambda assoc: assoc['object']['label'],
)
for assoc in phenotype_results['associations']:
    subject, relation, obj = assoc['subject'], assoc['relation'], assoc['object']
    parts = [
        subject['label'], "({})".format(subject['id']),
        "->", relation['label'], "->",
        obj['label'], "({})".format(obj['id']),
    ]
    print(" ".join(parts))

Fanconi anemia (MONDO:0019391) -> has phenotype -> Abnormal aortic morphology (HP:0001679)
Fanconi anemia (MONDO:0019391) -> has phenotype -> Abnormal aortic valve morphology (HP:0001646)
Fanconi anemia (MONDO:0019391) -> has phenotype -> Abnormal carotid artery morphology (HP:0005344)
Fanconi anemia complementation group L (MONDO:0013566) -> has phenotype -> Abnormal facial shape (HP:0001999)
Fanconi anemia complementation group C (MONDO:0009213) -> has phenotype -> Abnormal heart morphology (HP:0001627)
Fanconi anemia complementation group O (MONDO:0013248) -> has phenotype -> Abnormal heart morphology (HP:0001627)
Fanconi anemia complementation group a (MONDO:0009215) -> has phenotype -> Abnormal heart morphology (HP:0001627)
Fanconi anemia complementation group D2 (MONDO:0009214) -> has phenotype -> Abnormal heart morphology (HP:0001627)
Fanconi anemia complementation group E (MONDO:0010953) -> has phenotype -> Abnormal heart morphology (HP:0001627)
Fanconi anemia (MONDO:0019391) -

In [16]:
# Take the 'objects' entry of the disease-phenotype response.
# (`phenotypes` is rebound later to the filtered list of phenotype ids.)
phenotypes = phenotype_results['objects']

In [18]:
# How many entries the 'objects' list contains.
len(phenotypes)

193

In [26]:
# Count, per phenotype label, how many associations (FA disease subtypes) mention it.
# There are 193 in total, so let's keep the ones that appear in 2 or more subtypes (57 of them).
c = Counter(assoc['object']['label'] for assoc in phenotype_results['associations'])
print(sum(1 for n_assoc in c.values() if n_assoc >= 2))
c.most_common()

57


[('Microcephaly', 13),
 ('Short stature', 12),
 ('Anemia', 11),
 ('Short thumb', 11),
 ('Absent thumb', 9),
 ('Bone marrow hypocellularity', 9),
 ('Microphthalmia', 8),
 ('Cafe-au-lait spot', 8),
 ('Thrombocytopenia', 8),
 ('Chromosomal breakage induced by crosslinking agents', 8),
 ('Neutropenia', 7),
 ('Cryptorchidism', 7),
 ('Pancytopenia', 7),
 ('Leukemia', 6),
 ('Hearing impairment', 6),
 ('Intellectual disability', 6),
 ('Absent radius', 5),
 ('Growth delay', 5),
 ('Abnormality of chromosome stability', 5),
 ('Renal agenesis', 5),
 ('Abnormality of skin pigmentation', 5),
 ('Strabismus', 5),
 ('Horseshoe kidney', 5),
 ('Abnormal heart morphology', 5),
 ('Anal atresia', 5),
 ('Reticulocytopenia', 4),
 ('Deficient excision of UV-induced pyrimidine dimers in DNA', 4),
 ('Complete duplication of thumb phalanx', 4),
 ('Bruising susceptibility', 4),
 ('Ectopic kidney', 4),
 ('Intrauterine growth retardation', 4),
 ('Prolonged G2 phase of cell cycle', 4),
 ('Duplicated collecting system

In [28]:
# Do the above-described filtering, this time counting phenotype ids instead of
# labels, and keep the ids that occur in at least 2 associations.
c = Counter(assoc['object']['id'] for assoc in phenotype_results['associations'])
phenotypes = [pheno_id for pheno_id, n_assoc in c.items() if n_assoc >= 2]
print(phenotypes)
print(len(phenotypes))

['HP:0001915', 'HP:0009943', 'HP:0002860', 'HP:0001000', 'HP:0002032', 'HP:0000286', 'HP:0000081', 'HP:0001875', 'HP:0009778', 'HP:0000470', 'HP:0000252', 'HP:0001873', 'HP:0005528', 'HP:0001172', 'HP:0001510', 'HP:0001263', 'HP:0000815', 'HP:0000028', 'HP:0003221', 'HP:0001249', 'HP:0003220', 'HP:0000369', 'HP:0000365', 'HP:0007565', 'HP:0001518', 'HP:0012745', 'HP:0000347', 'HP:0000978', 'HP:0000486', 'HP:0001511', 'HP:0003213', 'HP:0000175', 'HP:0009777', 'HP:0002023', 'HP:0002119', 'HP:0001876', 'HP:0001643', 'HP:0000086', 'HP:0000957', 'HP:0001627', 'HP:0002984', 'HP:0003974', 'HP:0000085', 'HP:0001508', 'HP:0001909', 'HP:0001903', 'HP:0002863', 'HP:0003214', 'HP:0004322', 'HP:0000104', 'HP:0000568', 'HP:0000316', 'HP:0002575', 'HP:0001896', 'HP:0001017', 'HP:0000238', 'HP:0004808']
57


In [29]:
# 2. Use this set to execute a single cross-species phenosim analysis to compare
# against pheno profiles of genes from 5 species (human, mouse, zebrafish, fly, worm)

# note: human is NOT queried here -- only the four non-human taxa below
taxids = [10090, 7955, 7227, 6239]
results = dict()
for taxid in taxids:
    response = get_phenotypically_similar_genes(phenotypes, taxid)
    # Turn each (id, score, label) tuple into a record keyed by name.
    results[taxid] = [
        dict(id=gene_id, score=score, symbol=label)
        for gene_id, score, label in response
    ]

In [30]:
# 4. Remove FA genes from results (i.e. trim inputs)
# There's no human in here, so will do this after we get orthologs

In [31]:
# 5. Take the top results as gene 'hits' (the protocol step says top 25; the
# following cell keeps the 10 best-scoring hits per organism).
# Preview: the first 10 entries returned for each taxon.
for taxid, hits in results.items():
    print(taxid)
    print(hits[:10])
# Scores of each organism are very different, so the top X is taken per organism.

7227
[{'id': 'FlyBase:FBgn0015218', 'symbol': 'eIF4E1', 'score': 44}, {'id': 'FlyBase:FBgn0003612', 'symbol': 'Su(var)2-10', 'score': 33}]
10090
[{'id': 'MGI:1922667', 'symbol': 'Rspo2', 'score': 68}, {'id': 'MGI:95729', 'symbol': 'Gli3', 'score': 67}, {'id': 'MGI:1330810', 'symbol': 'Trp63', 'score': 65}, {'id': 'MGI:2443471', 'symbol': 'Sp8', 'score': 65}, {'id': 'MGI:104327', 'symbol': 'Nog', 'score': 65}, {'id': 'MGI:2682064', 'symbol': 'Ift172', 'score': 65}, {'id': 'MGI:106184', 'symbol': 'Npm1', 'score': 65}, {'id': 'MGI:99484', 'symbol': 'Chuk', 'score': 64}, {'id': 'MGI:1924238', 'symbol': 'Rdh10', 'score': 64}, {'id': 'MGI:1342540', 'symbol': 'Ikzf1', 'score': 63}]
7955
[{'id': 'ZFIN:ZDB-GENE-030131-9685', 'symbol': 'ddx18', 'score': 60}, {'id': 'ZFIN:ZDB-GENE-041114-104', 'symbol': 'tbl3', 'score': 56}, {'id': 'ZFIN:ZDB-GENE-080405-1', 'symbol': 'tert', 'score': 56}, {'id': 'ZFIN:ZDB-GENE-040426-1947', 'symbol': 'chaf1b', 'score': 55}, {'id': 'ZFIN:ZDB-GENE-020424-3', 'symbo

In [32]:
# Keep only the 10 highest-scoring hits for each taxon (sorted by descending score).
results = {
    taxid: sorted(hits, key=lambda hit: hit['score'], reverse=True)[:10]
    for taxid, hits in results.items()
}

In [33]:
# Flatten the per-taxon hit lists into one list, preserving the dict's order.
results_list = list(chain.from_iterable(results.values()))
results_list

[{'id': 'ZFIN:ZDB-GENE-030131-9685', 'score': 60, 'symbol': 'ddx18'},
 {'id': 'ZFIN:ZDB-GENE-041114-104', 'score': 56, 'symbol': 'tbl3'},
 {'id': 'ZFIN:ZDB-GENE-080405-1', 'score': 56, 'symbol': 'tert'},
 {'id': 'ZFIN:ZDB-GENE-040426-1947', 'score': 55, 'symbol': 'chaf1b'},
 {'id': 'ZFIN:ZDB-GENE-020424-3', 'score': 55, 'symbol': 'lmna'},
 {'id': 'ZFIN:ZDB-GENE-990415-270', 'score': 54, 'symbol': 'tp53'},
 {'id': 'ZFIN:ZDB-GENE-990415-206', 'score': 54, 'symbol': 'pes'},
 {'id': 'ZFIN:ZDB-GENE-001103-1', 'score': 53, 'symbol': 'sox9a'},
 {'id': 'ZFIN:ZDB-GENE-040728-1', 'score': 53, 'symbol': 'ticrr'},
 {'id': 'ZFIN:ZDB-GENE-020910-1', 'score': 52, 'symbol': 'id2a'},
 {'id': 'MGI:1922667', 'score': 68, 'symbol': 'Rspo2'},
 {'id': 'MGI:95729', 'score': 67, 'symbol': 'Gli3'},
 {'id': 'MGI:1330810', 'score': 65, 'symbol': 'Trp63'},
 {'id': 'MGI:2443471', 'score': 65, 'symbol': 'Sp8'},
 {'id': 'MGI:104327', 'score': 65, 'symbol': 'Nog'},
 {'id': 'MGI:2682064', 'score': 65, 'symbol': 'Ift17

In [34]:
# 6. Retrieve human orthologs of any non-human gene hits
# (each hit record gains an 'orthologs' list; one request per hit)
HUMAN_TAXON = "NCBITaxon:9606"
for hit in tqdm_notebook(results_list):
    hit['orthologs'] = query_orthologs(hit['id'], HUMAN_TAXON)




In [35]:
# Rank all hits across organisms by score (highest first) and show them as a table.
# Note: this rebinds `df`, which until now held the FA gene list indexed by symbol.
results_list = sorted(results_list, key=lambda x: x['score'], reverse=True)
df = pd.DataFrame(results_list)
df

Unnamed: 0,id,orthologs,score,symbol
0,MGI:1922667,[HGNC:28583],68,Rspo2
1,MGI:95729,[HGNC:4319],67,Gli3
2,MGI:1330810,[HGNC:15979],65,Trp63
3,MGI:2443471,[HGNC:19196],65,Sp8
4,MGI:104327,[HGNC:7866],65,Nog
5,MGI:2682064,[HGNC:30391],65,Ift172
6,MGI:106184,[HGNC:7910],65,Npm1
7,MGI:99484,[HGNC:1974],64,Chuk
8,MGI:1924238,[HGNC:19975],64,Rdh10
9,MGI:1342540,[HGNC:13176],63,Ikzf1


In [39]:
# mark any rows where the ortholog is a FA gene
# Rebuild the table from the hit records (one row per hit, 'orthologs' holds a list).
df = pd.DataFrame(results_list)
# Drop hits for which no ortholog was found.
df = df[df.orthologs.map(len)>0]
# Expand each ortholog list into columns, then stack to long form so that every
# ortholog gets its own entry named 'ortholog'.
df1 = df.orthologs.apply(pd.Series).stack().rename('ortholog')
# Drop the inner index level (position within the list), keeping the hit's row index.
df1 = df1.to_frame().reset_index(1, drop=True)
df = df[['id', 'score', 'symbol']]
# Index-based join: a hit with several orthologs is repeated once per ortholog.
df = df.join(df1).reset_index(drop=True)
# NOTE(review): this is an exact-id match. The orthologs displayed earlier are
# HGNC: ids -- confirm the gene_id column of the FA gene list uses the same id
# namespace, otherwise 'fa' will be False for every row.
df['fa'] = df.ortholog.isin(human_FA_genes)

In [40]:
# Look up each ortholog's label; this issues one get_obj request per row.
df['ortholog_label'] = df.ortholog.map(lambda x: get_obj(x)['label'])

In [41]:
# Display the hits with their orthologs, FA flag and ortholog labels.
df

Unnamed: 0,id,score,symbol,ortholog,fa,ortholog_label
0,MGI:1922667,68,Rspo2,HGNC:28583,False,RSPO2
1,MGI:95729,67,Gli3,HGNC:4319,False,GLI3
2,MGI:1330810,65,Trp63,HGNC:15979,False,TP63
3,MGI:2443471,65,Sp8,HGNC:19196,False,SP8
4,MGI:104327,65,Nog,HGNC:7866,False,NOG
5,MGI:2682064,65,Ift172,HGNC:30391,False,IFT172
6,MGI:106184,65,Npm1,HGNC:7910,False,NPM1
7,MGI:99484,64,Chuk,HGNC:1974,False,CHUK
8,MGI:1924238,64,Rdh10,HGNC:19975,False,RDH10
9,MGI:1342540,63,Ikzf1,HGNC:13176,False,IKZF1


In [42]:
## Example of inspecting one gene result

In [46]:
# Take the top match (MGI:1922667, Rspo2) and get its phenotypes as (id, label) pairs.
mgi_1922667_pheno = get_phenotype_from_gene_verbose("MGI:1922667")
pprint(mgi_1922667_pheno)

[('HP:0003974', 'Absent radius'),
 ('HP:0000175', 'Cleft palate'),
 ('MP:0009898', 'maxillary shelf hypoplasia'),
 ('MP:0002257', 'abnormal arytenoid cartilage morphology'),
 ('MP:0000088', 'short mandible'),
 ('MP:0001928', 'abnormal ovulation'),
 ('MP:0005306', 'abnormal phalanx morphology'),
 ('MP:0004540', 'small maxilla'),
 ('MP:0009887', 'abnormal palatal shelf fusion at midline'),
 ('MP:0013933', "short Meckel's cartilage"),
 ('MP:0000527', 'abnormal kidney development'),
 ('HP:0000204', 'Cleft upper lip'),
 ('MP:0006279', 'abnormal limb development'),
 ('MP:0011087', 'neonatal lethality, complete penetrance'),
 ('MP:0008494', 'absence of all nails'),
 ('HP:0009803', 'Short phalanx of finger'),
 ('MP:0009885', 'abnormal palatal shelf elevation'),
 ('HP:0001159', 'Syndactyly'),
 ('HP:0000138', 'Ovarian cyst'),
 ('HP:0006426', 'Rudimentary to absent tibiae'),
 ('MP:0005270', 'abnormal zygomatic bone morphology'),
 ('HP:0011849', 'Abnormal bone ossification'),
 ('MP:0009888', 'pala

In [51]:
# Repeat the search for taxon 10090 (the MGI/mouse hits above): `d` is the full
# decoded response, `genes` the condensed (id, score, label) tuples.
# Note that this issues two separate requests.
d = get_phenotypically_similar_genes(phenotypes, "10090", return_all=True)
genes = get_phenotypically_similar_genes(phenotypes, "10090", return_all=False)

In [52]:
# look at the matching phenotypes for Rspo2
# (the first entry of the response's 'b' list)
match = d['b'][0]
(match['id'],match['label'])

('MGI:1922667', 'Rspo2')

In [54]:
# One example phenotype match: the first element of this hit's 'matches' list.
match['matches'][0]

{'a': {'IC': 8.776246271125785,
  'id': 'HP:0000978',
  'label': 'Bruising susceptibility'},
 'b': {'IC': 10.195197153340976,
  'id': 'MP:0001182',
  'label': 'lung hemorrhage'},
 'lcs': {'IC': 5.6458484075315125,
  'id': 'MP:0001634',
  'label': 'internal hemorrhage'}}

In [59]:
# FA phenotypes and Rspo2 are "phenotypically similar" because of these phenotypes in common.
# Each line shows the 'lcs' term and its id, followed in square brackets by the
# pair of phenotypes ('a' and 'b') that were determined to be similar to each other.
for m in match['matches']:
    common_term, side_a, side_b = m['lcs'], m['a'], m['b']
    similar_pair = "[{}, {}]".format(side_a['label'], side_b['label'])
    line_parts = [common_term['label'], "({})".format(common_term['id']), similar_pair]
    print(" ".join(line_parts))

internal hemorrhage (MP:0001634) [Bruising susceptibility, lung hemorrhage]
tissue development phenotype (GO:0009888PHENOTYPE) [Bone marrow hypocellularity, epithelial tube branching involved in lung morphogenesis phenotype]
Abnormality of the neck (HP:0000464) [Short neck, abnormal pharyngeal arch mesenchyme morphology]
micrognathia (MP:0002639) [Micrognathia, small mandibular condyloid process]
absent kidney (MP:0000520) [Renal agenesis, single kidney]
absent radius (MP:0000553) [Absent radius, absent radius]
thoracic segment blood vessel phenotype (UBERON:0003834PHENOTYPE) [Patent ductus arteriosus, abnormal lung vasculature morphology]
abnormal trachea morphology (MP:0002282) [Tracheoesophageal fistula, abnormal tracheal cartilage morphology]
abnormal ear morphology (MP:0002102) [Low-set ears, decreased tympanic ring size]
abnormal phalanx morphology (MP:0005306) [Complete duplication of thumb phalanx, Short phalanx of finger]
oligodactyly (MP:0000565) [Absent thumb, oligodactyly]
