In [25]:
import requests

SOLR_URL = 'https://solr.monarchinitiative.org/solr/golr/select'

def get_causal_gene_phenotype_assocs():
    """Yield gene association documents from the Monarch Solr index.

    Pages through ``SOLR_URL`` 1000 rows at a time, restricted by filter
    queries on ``object_closure`` (``UPHENO:0001001`` or ``DOID:4``),
    ``subject_category`` (``gene``) and ``subject_taxon``
    (``NCBITaxon:9606``).  A document is yielded only when all of the
    following hold:

    * its ``relation`` is not ``GENO:0000841`` (likely pathogenic);
    * it has an ``is_defined_by`` field;
    * that field either names at least one of the causal sources
      (clinvar, omim, orphanet) or is exactly the single hpoa source.

    Yields:
        dict: one Solr document, carrying whichever of the requested
        fields (``subject``, ``subject_label``, ``object``,
        ``object_label``, ``relation``, ``is_defined_by``) are present.

    Raises:
        requests.HTTPError: if Solr answers with a 4xx/5xx status.
    """
    print("Fetching causal human gene phenotype and disease associations")
    filters = ['object_closure:"{0}" OR object_closure:"{1}"'.format("UPHENO:0001001", "DOID:4"),
               'subject_category:"gene"',
               'subject_taxon: "{0}"'.format('NCBITaxon:9606')]
    params = {
        'wt': 'json',
        'rows': 1000,
        'start': 0,
        'q': '*:*',
        'fq': filters,
        'fl': 'subject, subject_label, object, object_label, relation, is_defined_by'
    }

    # NOTE(review): the clinvar entry uses http:// while the others use
    # https://; kept as-is on the assumption that it matches the indexed
    # data -- confirm against the index.
    causal_source = ("http://data.monarchinitiative.org/ttl/clinvar.ttl",
                     "https://data.monarchinitiative.org/ttl/omim.ttl",
                     "https://data.monarchinitiative.org/ttl/orphanet.ttl")
    hpoa_only = ['https://data.monarchinitiative.org/ttl/hpoa.ttl']

    # Seed the bound with one page so the first request is always made; it
    # is replaced by the real total reported by Solr after every request.
    result_count = params['rows']
    while params['start'] < result_count:
        solr_request = requests.get(SOLR_URL, params=params)
        # Fail loudly on an HTTP error instead of trying to parse an error
        # page as JSON.
        solr_request.raise_for_status()
        response = solr_request.json()
        result_count = response['response']['numFound']

        for doc in response['response']['docs']:
            # Filter out likely pathogenic
            if doc.get('relation') == 'GENO:0000841':
                continue

            # Documents without provenance are never reported.
            sources = doc.get('is_defined_by')
            if sources is None:
                continue

            # Keep documents backed by a causal source, or by hpoa alone.
            has_causal_source = any(source in causal_source for source in sources)
            if not has_causal_source and sources != hpoa_only:
                continue

            yield doc

        params['start'] += params['rows']

# Dump every retained association as a tab-separated row and collect the
# distinct gene identifiers seen along the way.
gene_set = set()
# The context manager guarantees the file is flushed and closed, even if
# fetching fails part-way through.  The encoding is explicit so labels with
# non-ASCII characters are written the same way on every platform.
with open('g2pd-associations', 'w', encoding='utf-8') as outfile:
    for doc in get_causal_gene_phenotype_assocs():
        gene_set.add(doc['subject'])
        # Reduce each source URL to its bare name by stripping the https/http
        # prefix and the .ttl suffix.
        def_by = [
            defined.replace('https://data.monarchinitiative.org/ttl/', '')
                   .replace('http://data.monarchinitiative.org/ttl/', '')
                   .replace('.ttl', '')
            for defined in doc['is_defined_by']
        ]

        outfile.write("{}\t{}\t{}\t{}\t{}\n".format(
            doc['subject'],
            doc['subject_label'],
            doc['object'],
            doc['object_label'],
            ', '.join(def_by)
        ))

# Notebook cell result: number of distinct genes with a retained association.
len(gene_set)

Fetching causal human gene phenotype and disease associations


4382