#### What diseases have no known causative gene, but have been studied in model organisms?

This notebook finds diseases that have been studied in model organisms but have no known causative human gene.  To restrict the search to a subset of the disease ontology (for example, only nervous system diseases), change the `disease` variable accordingly.

Note: the first code cell takes about 15 minutes to run.

In [39]:
import requests

# Solr endpoint that is queried for associations.
# Downloading the 9606 gene-disease TSV and the model-disease files would be
# enough if every annotation were made to the most specific term; because model
# sources (MGI, for example) tend to annotate to parent terms, the closure
# lists in Solr are searched instead.
solr = 'https://solr-dev.monarchinitiative.org/solr/golr/select'

# Parent class whose closure is searched; change this to narrow the search
disease = 'DOID:4'

# Sources accepted as causal for human gene-to-disease associations
# (this leaves out gwas, ctd, and clinvar "likely pathogenic")
causal_source = [
    "https://data.monarchinitiative.org/ttl/omim.ttl",
    "https://data.monarchinitiative.org/ttl/orphanet.ttl",
    "https://data.monarchinitiative.org/ttl/hpoa.ttl",
]

# Copy paste function (need to put somewhere)
def get_solr_results(solr, params):
    """Yield every document matching a Solr query, fetching one page at a time.

    Args:
        solr: URL of the Solr select handler.
        params: Query parameters. Must contain integer 'rows' (page size) and
            'start' (offset of the first document to fetch). The dict is not
            modified, so it can be reused for further calls.

    Yields:
        Each entry of response['response']['docs'], page after page, until
        the offset reaches response['response']['numFound'].

    Raises:
        requests.HTTPError: if Solr answers with an error status code.
    """
    # Page through a private copy: advancing 'start' on the caller's dict would
    # leave it pointing past the end, so a second call would yield nothing.
    query = dict(params)

    # numFound is unknown until the first response arrives. Starting the bound
    # at start + rows guarantees one request for any positive page size, even
    # when the caller asks for a non-zero offset.
    result_count = query['start'] + query['rows']

    while query['start'] < result_count:
        solr_request = requests.get(solr, params=query)
        # Fail with a clear HTTP error instead of a confusing JSON/KeyError
        solr_request.raise_for_status()
        response = solr_request.json()
        result_count = response['response']['numFound']
        query['start'] += query['rows']
        for doc in response['response']['docs']:
            yield doc


def _closure_query_params(disease_id, subject_category, fields, human):
    """Build Solr params for associations whose object closure contains a disease.

    Args:
        disease_id: Identifier that must appear in the object_closure field.
        subject_category: Value required for subject_category ("gene" or "model").
        fields: Comma-separated field list passed to Solr as 'fl'.
        human: True keeps only subjects with taxon NCBITaxon:9606; False
            excludes them.

    Returns:
        A fresh parameter dict starting at offset 0 with a page size of 100.
    """
    taxon_filter = 'subject_taxon:"NCBITaxon:9606"'
    if not human:
        # A leading '-' negates the filter query
        taxon_filter = '-' + taxon_filter
    return {
        'wt': 'json',
        'rows': 100,
        'start': 0,
        'q': '*:*',
        'fl': fields,
        'fq': ['subject_category:"{}"'.format(subject_category),
               'object_closure:"{}"'.format(disease_id),
               taxon_filter
              ]
    }


# Human gene-to-disease associations; the relation and source fields are
# fetched as well so that causal associations can be picked out downstream
human_disease_params = _closure_query_params(
    disease, 'gene', 'relation, is_defined_by, object_closure', human=True)

# Filter out human models (cell lines)
model_params = _closure_query_params(
    disease, 'model', 'object_closure', human=False)

# Non-human gene-to-disease associations
non_human_disease = _closure_query_params(
    disease, 'gene', 'object_closure', human=False)

# Closure terms of diseases with a causal human gene association
human_disease_set = set()
# Closure terms of diseases annotated to a non-human model or gene
model_disease_set = set()

clinvar_source = 'https://data.monarchinitiative.org/ttl/clinvar.ttl'
# Only ClinVar associations with this relation are accepted as causal
# (presumably the "pathogenic" relation, as opposed to "likely pathogenic" -
# TODO confirm against the GENO ontology)
clinvar_causal_relation = 'GENO:0000840'

# The loops below never use `disease` as a loop variable, so the class
# configured at the top of the notebook is still intact afterwards.
for doc in get_solr_results(solr, human_disease_params):
    sources = doc.get('is_defined_by')
    if not sources:
        # No provenance recorded: cannot be classified as causal
        continue

    is_causal = (
        any(source in causal_source for source in sources)
        # .get(): a document without a relation is simply not a ClinVar match
        or (clinvar_source in sources
            and doc.get('relation') == clinvar_causal_relation)
    )

    if is_causal:
        human_disease_set.update(doc['object_closure'])

print("Finished fetching human g2d data")

for doc in get_solr_results(solr, model_params):
    model_disease_set.update(doc['object_closure'])

print("Finished fetching model")

for doc in get_solr_results(solr, non_human_disease):
    model_disease_set.update(doc['object_closure'])

print("Finished fetching all data")

# Terms reached from non-human models/genes but from no causal human gene
model_only_set = model_disease_set - human_disease_set
model_only_set

Finished fetching human g2d data
Finished fetching model
Finished fetching all data


{'OMIA:001089',
 'UMLS:C0751777',
 'MESH:D020233',
 'OMIA:001531',
 'OMIA:001401',
 'OMIM:120450',
 'OMIA:000405-9940',
 'MESH:C565274',
 'MESH:C563448',
 'OMIA:001937',
 'OMIA:000307',
 'OBO:NCIT_C4833',
 'OMIA:001429-9685',
 'OMIA:000621-9796',
 'UMLS:C4083212',
 'OMIA:001302-9031',
 'OBO:NCIT_C7348',
 'DOID:0050840',
 'OMIA:002096-9796',
 'OMIA:001581',
 'OMIA:001461',
 'MESH:C563184',
 'MESH:D012148',
 'OMIA:000899-9615',
 'OMIA:000214-89462',
 'UMLS:C0269106',
 'UMLS:C0432282',
 'MESH:D009362',
 'MESH:D012004',
 'DOID:14798',
 'OMIA:001314-9615',
 'UMLS:C0347284',
 'OMIA:001079-9913',
 'OMIA:000540-118797',
 'DOID:3525',
 'OMIA:001341-9796',
 'MESH:D003921',
 'OMIA:000666-9685',
 'HGNC:2883',
 'UMLS:C0347390',
 'OMIA:001672',
 'UMLS:CN074258',
 'OMIA:000366-9913',
 'OMIA:001279-9615',
 'OMIA:001721-32536',
 'MESH:D020139',
 'MESH:C566239',
 'MESH:D005236',
 'OMIA:001542',
 'OMIA:001772',
 'OMIA:001745-9825',
 'OMIA:001472-9615',
 'MESH:D010954',
 'DOID:9675',
 'OMIA:001335',
 'OMI

In [40]:
# Number of closure terms annotated to non-human models/genes that have no causal human gene association
len(model_only_set)

2340

In [41]:
# How many terms are left once identifiers starting with 'OMIA' are excluded?
sum(1 for term in model_only_set if not term.startswith('OMIA'))

1082

In [44]:
# However, this includes parent classes of each model to disease association
# Iterate and get all direct model to disease associations and output to file
# This also takes a really long time (15 minutes or so)
import copy

output_list = []
direct_disease_set = set()

# Template query for non-human gene/model associations. An exact-match filter
# on `object` is appended per disease below.
# NOTE: this rebinds `non_human_disease` to a query with different fields and
# filters than the one defined in the first cell.
non_human_disease = {
    'wt': 'json',
    'rows': 100,
    'start': 0,
    'q': '*:*',
    'fl': 'subject,subject_label,object,object_label,relation_label',
    'fq': ['subject_category:"gene" OR subject_category:"model"',
           '-subject_taxon:"NCBITaxon:9606"'
          ]
}

# The context manager closes the file even if a request fails part-way through
with open('./model-disease.tsv', 'w') as output_file:
    # Sorted so the row order of the TSV is the same on every run; the loop
    # variable is not called `disease`, so the configured class is not clobbered.
    for disease_id in sorted(model_only_set):
        # Deep copy so the shared 'fq' list of the template is never extended
        params = copy.deepcopy(non_human_disease)
        params['fq'].append('object:"{}"'.format(disease_id))
        for doc in get_solr_results(solr, params):
            direct_disease_set.add(doc['object'])
            # Labels and relation label are optional: a missing one is written
            # as an empty column instead of aborting the export
            line = "{}\t{}\t{}\t{}\t{}\n".format(
                doc['subject'],
                doc.get('subject_label', ''),
                doc['object'],
                doc.get('object_label', ''),
                doc.get('relation_label', ''))
            output_list.append(line)
            output_file.write(line)

# Count of direct disease annotations
len(direct_disease_set)

965

In [46]:
# Directly annotated diseases whose identifier does not start with 'OMIA'
sum(1 for term in direct_disease_set if not term.startswith('OMIA'))

235