## Orange Team CQ#1.7

### Query: 
What genes show high phenotypic similarity to the 11 Fanconi Anemia core complex genes (set FA-core)?

### Services:
BioLink API (Monarch) - https://api.monarchinitiative.org/api/

Simsearch - https://monarchinitiative.org/simsearch

### Approach:
Take all 27 human FA genes. For each gene, retrieve the phenotypically similar genes from mouse, zebrafish, worm, and fly. The output for each query is a list of genes, each with a similarity score. Sum the scores for each similar gene across all 27 FA genes. Take the top X (10) phenotypically similar genes from each organism, and then get the human orthologs of those genes.

### Author
Gstupp

In [1]:
# autogenerate biolink_client
# curl --insecure -X POST -H "content-type:application/json" -d '{"swaggerUrl":"https://api.monarchinitiative.org/api/swagger.json"}' https://generator.swagger.io/api/gen/clients/python
# and rename it to biolink_client

In [2]:
import os, sys
# Directory that contains the generated `biolink_client` package.
# Set the BIOLINK_CLIENT_PATH environment variable to point at your own copy;
# when it is unset, the original author's checkout location is used.
sys.path.insert(0, os.environ.get("BIOLINK_CLIENT_PATH", "/home/gstupp/projects/NCATS-Tangerine/biolink_client"))

In [19]:
import biolink_client
from biolink_client.api_client import ApiClient
from biolink_client.rest import ApiException
import requests
from itertools import chain
import pandas as pd
from pprint import pprint
from tqdm import tqdm, tqdm_notebook
from collections import defaultdict

# Let pandas display up to 999 rows before truncating a table.
pd.options.display.max_rows = 999

# Base URLs of the two Monarch services used in this notebook.
# NOTE(review): the helper functions in later cells hard-code the BioLink URL
# instead of reading MONARCH_API; only the client below uses the constant.
MONARCH_API = "https://api.monarchinitiative.org/api"
SIMSEARCH_API = "https://monarchinitiative.org/simsearch/phenotype"

# Tab-separated gene list (gene id, symbol) hosted on GitHub.
gene_list = "https://raw.githubusercontent.com/NCATS-Tangerine/cq-notebooks/master/FA_gene_sets/FA_4_all_genes.txt"

# Client objects from the generated biolink_client package.
# NOTE(review): api_instance is not referenced by any later cell visible here.
client = ApiClient(host=MONARCH_API)
client.set_default_header('Content-Type', 'text/plain')
api_instance = biolink_client.BioentityApi(client)

# Get the gene list from github
dataframe = pd.read_csv(gene_list, sep='\t', names=['gene_id', 'symbol'])
df = dataframe.set_index('symbol')
# De-duplicated gene ids; this set drives the similarity search loop below.
human_genes = set(df.gene_id)

In [32]:
# Target species for the similarity search; each value is sent as the
# `target_species` form field of the simsearch request.
# NOTE(review): presumably NCBI Taxonomy ids for mouse, zebrafish, fly and worm — confirm.
taxids = [10090, 7955, 7227, 6239]
# Identifier prefixes used to split the pooled hits by source database.
# NOTE(review): this list does not appear to be index-aligned with `taxids`
# (WormBase/FlyBase look swapped relative to the presumed taxid order), so do
# not zip the two lists together.
prefixes = ['MGI', 'ZFIN', 'WormBase', 'FlyBase']

In [5]:
def get_obj(obj_id):
    """Fetch the Monarch bioentity record for ``obj_id`` and return the decoded JSON."""
    endpoint = "https://api.monarchinitiative.org/api/bioentity/{}"
    return requests.get(endpoint.format(obj_id)).json()
def get_taxon_from_gene(gene):
    """Return the taxon label (e.g. 'Homo sapiens') stored in the record of ``gene``."""
    record = get_obj(gene)
    taxon = record['taxon']
    return taxon['label']
# Quick check of the helper on one gene id (performs a live API request).
get_taxon_from_gene('NCBIGene:2176')

'Homo sapiens'

In [None]:
def query_orthologs(gene_id, taxon=None):
    """Return the ids of the homologs Monarch reports for ``gene_id``.

    When ``taxon`` is truthy it is sent as the ``homolog_taxon`` query
    parameter; otherwise the request carries no query parameters. The result
    is the ``object`` id of every entry in the response's ``associations``.
    """
    endpoint = "https://api.monarchinitiative.org/api/bioentity/gene/{}/homologs/".format(gene_id)
    query = {'homolog_taxon': taxon} if taxon else None
    associations = requests.get(endpoint, params=query).json()['associations']
    return [assoc['object']['id'] for assoc in associations]
#query_orthologs('MGI:88276', taxon="NCBITaxon:9606")

In [68]:
def _fetch_phenotype_list(gene):
    """Download the phenotype list of ``gene`` and return its raw entries."""
    # e.g. https://monarchinitiative.org/gene/NCBIGene%3A2176/phenotype_list.json
    url = "https://monarchinitiative.org/gene/{}/phenotype_list.json"
    return requests.get(url.format(gene)).json()['phenotype_list']


def get_phenotype_from_gene(gene):
    """Return the phenotype ids listed for ``gene``, in response order."""
    return [entry['id'] for entry in _fetch_phenotype_list(gene)]


def get_phenotype_from_gene_verbose(gene):
    """Return ``(id, label)`` tuples for the phenotypes listed for ``gene``."""
    return [(entry['id'], entry['label']) for entry in _fetch_phenotype_list(gene)]
#get_phenotype_from_gene("NCBIGene:2176")

In [82]:
def get_phenotypically_similar_genes(phenotypes, taxon, return_all=False):
    """Run a Monarch simsearch for a phenotype profile against one species.

    Parameters
    ----------
    phenotypes : iterable of str
        Phenotype ids; they are joined with spaces into the ``input_items``
        form field.
    taxon : str or int
        Sent unchanged as the ``target_species`` form field.
    return_all : bool
        When true, return the decoded JSON response as-is.

    Returns
    -------
    dict or list
        The whole response when ``return_all`` is true. Otherwise a list of
        ``(id, score, label)`` tuples built from the response's ``'b'``
        entries, or an empty list when the response has no ``'b'`` key.
    """
    # Accept-Encoding is intentionally not set here. requests advertises the
    # encodings it is able to decode; forcing "br" lets the server answer with
    # brotli, which cannot be decoded (and so breaks r.json()) unless a brotli
    # package happens to be installed.
    headers = {
        'Accept-Language': 'en-US,en;q=0.8',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
    }
    data = {'input_items': " ".join(phenotypes), "target_species": taxon}
    r = requests.post(SIMSEARCH_API, data=data, headers=headers)
    d = r.json()
    if return_all:
        return d
    if 'b' not in d:
        # No result section in the response: report no similar genes.
        return []
    return [(x['id'], x['score']['score'], x['label']) for x in d['b']]
#get_phenotypically_similar_genes(phenotypes, "10090")

In [21]:
# human_genes = ["NCBIGene:2176"]
# Maps each human gene id to the pooled (id, score, label) hits from every
# target species, appended in `taxids` order.
gene_genes = defaultdict(list)
for gene in tqdm_notebook(human_genes):
    # The phenotype profile depends only on the gene, so it is fetched once
    # per gene instead of once per (species, gene) pair.
    phenotypes = get_phenotype_from_gene(gene)
    for taxid in tqdm_notebook(taxids, leave=False):
        gene_genes[gene].extend(get_phenotypically_similar_genes(phenotypes, taxid))

The installed widget Javascript is the wrong version.
The installed widget Javascript is the wrong version.
The installed widget Javascript is the wrong version.
The installed widget Javascript is the wrong version.
The installed widget Javascript is the wrong version.





In [126]:
# Total similarity score per model-organism gene, summed over all human query genes.
s = defaultdict(int)
# Label of each model-organism gene (the last one seen wins).
gene_label = dict()
for hits in gene_genes.values():
    for hit_id, hit_score, hit_label in hits:
        s[hit_id] += hit_score
        gene_label[hit_id] = hit_label

In [127]:
# Number of best-scoring genes kept per source database.
# NOTE: despite the name `top10`, the cut-off used here is 20.
TOP_N = 20

# prefix -> list of (gene id, summed score), best first.
top10 = dict()
for prefix in prefixes:
    prefix_scores = {gene_id: total for gene_id, total in s.items() if gene_id.startswith(prefix)}
    top10[prefix] = sorted(prefix_scores.items(), key=lambda item: item[1], reverse=True)[:TOP_N]
# Flatten the per-prefix rankings into one list of records. The loop variables
# deliberately avoid the name `s`, which holds the score totals.
ss = [{'gene': gene_id, 'score': total} for gene_id, total in chain(*top10.values())]
ss

[{'gene': 'MGI:88276', 'score': 1359},
 {'gene': 'MGI:1347466', 'score': 1341},
 {'gene': 'MGI:88039', 'score': 1332},
 {'gene': 'MGI:99851', 'score': 1321},
 {'gene': 'MGI:105373', 'score': 1314},
 {'gene': 'MGI:88064', 'score': 1310},
 {'gene': 'MGI:95729', 'score': 1308},
 {'gene': 'MGI:1330810', 'score': 1301},
 {'gene': 'MGI:98726', 'score': 1286},
 {'gene': 'MGI:88180', 'score': 1282},
 {'gene': 'MGI:98297', 'score': 1280},
 {'gene': 'MGI:107656', 'score': 1273},
 {'gene': 'MGI:1298393', 'score': 1270},
 {'gene': 'MGI:2148793', 'score': 1267},
 {'gene': 'MGI:1913761', 'score': 1266},
 {'gene': 'MGI:95523', 'score': 1263},
 {'gene': 'MGI:96677', 'score': 1263},
 {'gene': 'MGI:98834', 'score': 1259},
 {'gene': 'MGI:104327', 'score': 1258},
 {'gene': 'MGI:109583', 'score': 1253},
 {'gene': 'WormBase:WBGene00010556', 'score': 747},
 {'gene': 'WormBase:WBGene00004077', 'score': 731},
 {'gene': 'WormBase:WBGene00016837', 'score': 726},
 {'gene': 'WormBase:WBGene00004392', 'score': 705}

In [128]:
# Attach the human (NCBITaxon:9606) homolog ids to every ranked gene.
# The loop variable is not called `s`: rebinding `s` would overwrite the score
# totals read by the ranking cell, which could then no longer be re-run.
for entry in tqdm_notebook(ss):
    entry['orthologs'] = query_orthologs(entry['gene'], "NCBITaxon:9606")
ss

The installed widget Javascript is the wrong version.





[{'gene': 'MGI:88276', 'orthologs': ['NCBIGene:1499'], 'score': 1359},
 {'gene': 'MGI:1347466', 'orthologs': ['NCBIGene:2296'], 'score': 1341},
 {'gene': 'MGI:88039', 'orthologs': ['NCBIGene:324'], 'score': 1332},
 {'gene': 'MGI:99851', 'orthologs': ['NCBIGene:865'], 'score': 1321},
 {'gene': 'MGI:105373', 'orthologs': ['NCBIGene:5727'], 'score': 1314},
 {'gene': 'MGI:88064', 'orthologs': ['NCBIGene:367'], 'score': 1310},
 {'gene': 'MGI:95729', 'orthologs': ['NCBIGene:2737'], 'score': 1308},
 {'gene': 'MGI:1330810', 'orthologs': ['NCBIGene:8626'], 'score': 1301},
 {'gene': 'MGI:98726', 'orthologs': ['NCBIGene:7042'], 'score': 1286},
 {'gene': 'MGI:88180', 'orthologs': ['NCBIGene:652'], 'score': 1282},
 {'gene': 'MGI:98297', 'orthologs': ['NCBIGene:6469'], 'score': 1280},
 {'gene': 'MGI:107656', 'orthologs': ['NCBIGene:3458'], 'score': 1273},
 {'gene': 'MGI:1298393', 'orthologs': ['NCBIGene:9644'], 'score': 1270},
 {'gene': 'MGI:2148793', 'orthologs': ['NCBIGene:84634'], 'score': 1267},

In [129]:
# Look up a display label for every ranked gene and for each of its orthologs
# (one API request per id).
# The loop variable is not called `s`: rebinding `s` would overwrite the score
# totals read by the ranking cell, which could then no longer be re-run.
for entry in tqdm_notebook(ss):
    entry['label'] = get_obj(entry['gene'])['label']
    entry['ortholog_labels'] = [get_obj(ortholog)['label'] for ortholog in entry['orthologs']]

The installed widget Javascript is the wrong version.





## This is the output!!!

In [153]:
ss = sorted(ss, key=lambda x: x['score'], reverse=True)
# A gene without any human ortholog has nothing to print, and indexing [0] on
# its empty lists would raise IndexError, so such entries are skipped before
# taking the first 20.
with_orthologs = [x for x in ss if x['orthologs']]
print("\n".join(",".join([x['orthologs'][0], x['ortholog_labels'][0], str(x['score'])]) for x in with_orthologs[:20]))

NCBIGene:1499,CTNNB1,1359
NCBIGene:2296,FOXC1,1341
NCBIGene:324,APC,1332
NCBIGene:865,CBFB,1321
NCBIGene:5727,PTCH1,1314
NCBIGene:367,AR,1310
NCBIGene:2737,GLI3,1308
NCBIGene:8626,TP63,1301
NCBIGene:7042,TGFB2,1286
NCBIGene:652,BMP4,1282
NCBIGene:6469,SHH,1280
NCBIGene:3458,IFNG,1273
NCBIGene:9644,SH3PXD2A,1270
NCBIGene:84634,KISS1R,1267
NCBIGene:26097,CHTOP,1266
NCBIGene:2263,FGFR2,1263
NCBIGene:3815,KIT,1263
NCBIGene:7157,TP53,1259
NCBIGene:9241,NOG,1258
NCBIGene:5728,PTEN,1253


### Demo with one gene

In [155]:
## NOTE(review): this cell was labelled "FANCC", but elsewhere in this notebook
## NCBIGene:7042 is reported as TGFB2 and FANCC as NCBIGene:2176 — confirm which
## gene the demo is meant to use.
phenotypes = get_phenotype_from_gene_verbose("NCBIGene:7042")
phenotypes

[('HP:0009751', 'Aplasia of the pectoralis major muscle'),
 ('HP:0000965', 'Cutis marmorata'),
 ('HP:0000316', 'Hypertelorism'),
 ('HP:0000218', 'High palate'),
 ('HP:0002686', 'Prenatal maternal abnormality'),
 ('HP:0002097', 'Emphysema'),
 ('HP:0002647', 'Aortic dissection'),
 ('HP:0200021', 'Down-sloping shoulders'),
 ('HP:0002138', 'Subarachnoid hemorrhage'),
 ('HP:0012163', 'Carotid artery dilatation'),
 ('HP:0002631', 'Dilatation of ascending aorta'),
 ('HP:0001763', 'Pes planus'),
 ('HP:0000822', 'Hypertension'),
 ('HP:0001634', 'Mitral valve prolapse'),
 ('HP:0000767', 'Pectus excavatum'),
 ('HP:0011304', 'Broad thumb'),
 ('HP:0000766', 'Abnormality of the sternum'),
 ('HP:0002105', 'Hemoptysis'),
 ('HP:0006101', 'Finger syndactyly'),
 ('HP:0001166', 'Arachnodactyly'),
 ('HP:0002705', 'High, narrow palate'),
 ('HP:0011106', 'Hypovolemia'),
 ('HP:0001679', 'Abnormal aortic morphology'),
 ('HP:0001171', 'Split hand'),
 ('HP:0003468', 'Abnormal vertebral morphology'),
 ('HP:000977

In [163]:
# Run the same search against target species "10090" twice: once keeping the
# raw response (`d`) and once as parsed (id, score, label) tuples (`genes`).
phenotype_ids = [entry[0] for entry in phenotypes]
d = get_phenotypically_similar_genes(phenotype_ids, "10090", return_all=True)
genes = get_phenotypically_similar_genes(phenotype_ids, "10090", return_all=False)
genes

[('MGI:95489', 66, 'Fbn1'),
 ('MGI:1913761', 65, 'Chtop'),
 ('MGI:2446294', 64, 'Megf8'),
 ('MGI:109340', 63, 'Pitx2'),
 ('MGI:5560774', 63, 'b2b2736Clo'),
 ('MGI:96817', 63, 'Lox'),
 ('MGI:1928901', 62, 'Pdzk1'),
 ('MGI:1920563', 61, 'Rpgrip1l'),
 ('MGI:98726', 61, 'Tgfb2'),
 ('MGI:2154244', 61, 'Plxnd1'),
 ('MGI:106923', 61, 'Tll1'),
 ('MGI:95586', 61, 'Fst'),
 ('MGI:1919247', 61, 'Smg9'),
 ('MGI:107718', 61, 'Dnah5'),
 ('MGI:1891209', 60, 'Efemp2'),
 ('MGI:1347465', 60, 'Foxh1'),
 ('MGI:3050795', 60, 'Mkl2'),
 ('MGI:88452', 59, 'Col2a1'),
 ('MGI:5570107', 59, 'b2b2821Clo'),
 ('MGI:1927166', 59, 'Chst11'),
 ('MGI:5646601', 59, 'b2b3077Clo'),
 ('MGI:1920145', 59, 'Setd5'),
 ('MGI:109448', 58, 'Cfc1'),
 ('MGI:98715', 58, 'Ift88'),
 ('MGI:1298393', 58, 'Sh3pxd2a'),
 ('MGI:1922941', 58, 'Anks6'),
 ('MGI:1920942', 57, '2410089E03Rik'),
 ('MGI:97788', 57, 'Psph'),
 ('MGI:97712', 57, 'Prrx1'),
 ('MGI:97851', 57, 'Slc20a2'),
 ('MGI:96570', 57, 'Inhba'),
 ('MGI:97350', 57, 'Nkx2-5'),
 ('MGI:9

In [161]:
# First entry of the raw response's 'b' list; later cells inspect it further.
match = d['b'][0]
(match['id'],match['label'])

('MGI:95489', 'Fbn1')

In [162]:
# Peek at the first two phenotype pairings recorded for this hit.
match['matches'][:2]

[{'a': {'IC': 9.394417586875415,
   'id': 'HP:0004950',
   'label': 'Peripheral arterial stenosis'},
  'b': {'IC': 9.952015916687238,
   'id': 'MP:0006133',
   'label': 'calcified artery'},
  'lcs': {'IC': 5.409377205511062,
   'id': 'HP:0011004',
   'label': 'Abnormal systemic arterial morphology'}},
 {'a': {'IC': 10.006932303574196,
   'id': 'HP:0001199',
   'label': 'Triphalangeal thumb'},
  'b': {'IC': 7.882687770295942,
   'id': 'HP:0009803',
   'label': 'Short phalanx of finger'},
  'lcs': {'IC': 6.879123036616339,
   'id': 'MP:0005306',
   'label': 'abnormal phalanx morphology'}}]

In [124]:
# The query gene and the hit stored in `match` are "phenotypically similar"
# because of these shared terms: the 'lcs' entry of every pairing
# (presumably the least common subsumer of the two phenotypes — confirm).
# NOTE(review): this comment used to name "FANCC and Gli3"; which pair is shown
# depends on the gene queried in the preceding cells.
[(x['lcs']['id'],x['lcs']['label']) for x in match['matches']]

[('MP:0001126', 'abnormal ovary morphology'),
 ('MP:0001914', 'hemorrhage'),
 ('GO:0060348PHENOTYPE', 'bone development phenotype'),
 ('HP:0031093', 'Abnormal breast morphology'),
 ('MP:0000757', 'herniated abdominal wall'),
 ('UBERON:0012359PHENOTYPE', 'pedal digitopodium bone phenotype'),
 ('MP:0000559', 'abnormal femur morphology'),
 ('MP:0000520', 'absent kidney'),
 ('MP:0001216', 'abnormal epidermal layer morphology'),
 ('MP:0001891', 'hydroencephaly'),
 ('UBERON:0003296PHENOTYPE', 'gland of diencephalon phenotype'),
 ('HP:0008438', 'Vertebral arch anomaly'),
 ('MP:0002229', 'neurodegeneration'),
 ('MP:0004174', 'abnormal spine curvature'),
 ('MP:0003130', 'anal atresia'),
 ('MP:0000564', 'syndactyly'),
 ('MP:0000561', 'adactyly'),
 ('HP:0004349', 'Reduced bone mineral density'),
 ('MP:0009931', 'abnormal skin appearance'),
 ('MP:0009890', 'cleft secondary palate'),
 ('HP:0002648', 'Abnormality of calvarial morphology'),
 ('HP:0000164', 'Abnormality of the dentition'),
 ('MP:00010

In [89]:
# Map the hit in `match` back to its homologs in taxon NCBITaxon:9606.
human_orthologs = query_orthologs(match['id'], taxon="NCBITaxon:9606")
human_orthologs

['NCBIGene:2737']

In [164]:
# For every human query gene, print its id, its label, and any hit on MGI:98726.
for human_gene, hits in gene_genes.items():
    matching_hits = [hit for hit in hits if hit[0] == "MGI:98726"]
    gene_record = get_obj(human_gene)
    print(human_gene, gene_record['label'], matching_hits)

NCBIGene:5888 RAD51 [('MGI:98726', 62, 'Tgfb2')]
NCBIGene:80233 FAAP100 []
NCBIGene:29089 UBE2T [('MGI:98726', 63, 'Tgfb2')]
NCBIGene:55215 FANCI [('MGI:98726', 63, 'Tgfb2')]
NCBIGene:83990 BRIP1 [('MGI:98726', 60, 'Tgfb2')]
NCBIGene:2176 FANCC [('MGI:98726', 61, 'Tgfb2')]
NCBIGene:10459 MAD2L2 [('MGI:98726', 63, 'Tgfb2')]
NCBIGene:57697 FANCM [('MGI:98726', 63, 'Tgfb2')]
NCBIGene:7516 XRCC2 [('MGI:98726', 61, 'Tgfb2')]
NCBIGene:5889 RAD51C [('MGI:98726', 61, 'Tgfb2')]
NCBIGene:199990 FAAP20 []
NCBIGene:84464 SLX4 [('MGI:98726', 62, 'Tgfb2')]
NCBIGene:55120 FANCL [('MGI:98726', 63, 'Tgfb2')]
NCBIGene:201254 CENPX []
NCBIGene:675 BRCA2 [('MGI:98726', 57, 'Tgfb2')]
NCBIGene:378708 CENPS []
NCBIGene:55159 RFWD3 []
NCBIGene:91442 FAAP24 []
NCBIGene:2175 FANCA [('MGI:98726', 62, 'Tgfb2')]
NCBIGene:2072 ERCC4 [('MGI:98726', 58, 'Tgfb2')]
NCBIGene:2188 FANCF [('MGI:98726', 63, 'Tgfb2')]
NCBIGene:2178 FANCE [('MGI:98726', 61, 'Tgfb2')]
NCBIGene:2189 FANCG [('MGI:98726', 63, 'Tgfb2')]
NCBIGene:

In [119]:
## Version 2 : Get orthologs first
# Start from the phenotypes of MGI:88276 (listed above with the human ortholog
# NCBIGene:1499) and search target species "9606" instead.
# Note that this rebinds `phenotypes`, which earlier cells also use.
phenotypes = get_phenotype_from_gene("MGI:88276")
get_phenotypically_similar_genes(phenotypes, "9606")

[('OMIM:164210', 58, 'Goldenhar syndrome'),
 ('OMIM:256520', 57, 'Neu-Laxova syndrome 1'),
 ('OMIM:219000', 57, 'Fraser syndrome'),
 ('OMIM:214800', 57, 'CHARGE syndrome'),
 ('OMIM:268300', 56, 'Roberts syndrome'),
 ('OMIM:312870', 56, 'Simpson-Golabi-Behmel syndrome'),
 ('OMIM:613038', 56, 'Combined pituitary hormone deficiencies, genetic forms'),
 ('OMIM:264090', 56, 'Wiedemann-Rautenstrauch syndrome'),
 ('OMIM:194050', 56, 'Williams-Beuren syndrome'),
 ('OMIM:303600', 56, 'Coffin-Lowry syndrome'),
 ('OMIM:268400', 55, 'Rothmund-Thomson syndrome'),
 ('OMIM:600001',
  55,
  'Pancreatic hypoplasia-diabetes-congenital heart disease syndrome'),
 ('OMIM:261540', 55, 'Peters plus syndrome'),
 ('OMIM:218600', 55, 'Baller-Gerold syndrome'),
 ('OMIM:130650', 55, 'Beckwith-Wiedemann syndrome'),
 ('OMIM:230000', 55, 'fucosidosis'),
 ('OMIM:309400', 54, 'Menkes Disease'),
 ('OMIM:274000', 54, 'Radial aplasia-thrombocytopenia syndrome'),
 ('OMIM:610829', 54, 'holoprosencephaly 9'),
 ('OMIM:192350