In [25]:
import requests
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Solr select endpoints queried by this notebook.
GOLR_URL = 'https://solr.monarchinitiative.org/solr/golr/select'
SEARCH_URL = 'https://solr.monarchinitiative.org/solr/search/select'

###
# What are all the human genes with data, either directly or through
# ortholog-phenotype associations?

# Set of all human genes in monarch
human_genes = set()

# Set of all genes with ortholog-phenotype associations
ortho_pheno_genes = set()

# Set of all human genes with data (minus homology)
human_genes_data = set()

# Get all human genes
# Schema yaml is here:
# https://github.com/monarch-initiative/monarch-app/blob/master/conf/golr-views/monarch-search-config.yaml

filters = [
    'taxon:"{0}"'.format("NCBITaxon:9606"),
    'category:"gene"',
]

search_params = {
    'wt': 'json',
    'q': '*:*',
    'fq': filters,
    'fl': 'id',
    'rows': 1000,
    'start': 0
}

# Seed the expected total with one page so the loop body runs at least once;
# the real total (numFound) replaces it after the first response.
resultCount = search_params['rows']

# Page through the result set, 'rows' documents at a time.
while search_params['start'] < resultCount:
    solr_request = requests.get(SEARCH_URL, params=search_params)
    # Surface HTTP failures here instead of as a JSON/KeyError further down.
    solr_request.raise_for_status()
    response = solr_request.json()
    resultCount = response['response']['numFound']

    human_genes.update(doc['id'] for doc in response['response']['docs'])

    search_params['start'] += search_params['rows']

print(len(human_genes))



39675


In [26]:

# Get all human genes with data in golr
# schema here:
# https://github.com/monarch-initiative/monarch-app/blob/master/conf/golr-views/oban-config.yaml

filters = [
    'subject_taxon:"{0}"'.format("NCBITaxon:9606"),
    'subject_category:"gene"',
    # Negated filter: drop documents whose relation closure contains
    # RO:HOM0000001 (presumably the homology relation, matching the
    # "minus homology" intent of human_genes_data -- confirm).
    '-relation_closure:"RO:HOM0000001"'
]

golr_params = {
    'wt': 'json',
    'q': '*:*',
    'fq': filters,
    'fl': 'subject',
    'rows': 1000,
    'start': 0
}

# Seed the expected total with one page so the loop body runs at least once;
# the real total (numFound) replaces it after the first response.
resultCount = golr_params['rows']

# Page through the result set, 'rows' documents at a time.
while golr_params['start'] < resultCount:
    solr_request = requests.get(GOLR_URL, params=golr_params)
    # Surface HTTP failures here instead of as a JSON/KeyError further down.
    solr_request.raise_for_status()
    response = solr_request.json()
    resultCount = response['response']['numFound']

    # Many documents share a subject; the set keeps each gene once.
    human_genes_data.update(doc['subject'] for doc in response['response']['docs'])

    # Progress message every 100000 documents (i.e. every 100 pages).
    if golr_params['start'] % 100000 == 0:
        logger.info("Processed {} documents".format(golr_params['start']))

    golr_params['start'] += golr_params['rows']

print(len(human_genes_data))

INFO:__main__:Processed 0 documents
INFO:__main__:Processed 10000 documents
INFO:__main__:Processed 20000 documents
INFO:__main__:Processed 30000 documents
INFO:__main__:Processed 40000 documents
INFO:__main__:Processed 50000 documents
INFO:__main__:Processed 60000 documents
INFO:__main__:Processed 70000 documents
INFO:__main__:Processed 80000 documents
INFO:__main__:Processed 90000 documents
INFO:__main__:Processed 100000 documents
INFO:__main__:Processed 110000 documents
INFO:__main__:Processed 120000 documents
INFO:__main__:Processed 130000 documents
INFO:__main__:Processed 140000 documents
INFO:__main__:Processed 150000 documents
INFO:__main__:Processed 160000 documents
INFO:__main__:Processed 170000 documents
INFO:__main__:Processed 180000 documents
INFO:__main__:Processed 190000 documents
INFO:__main__:Processed 200000 documents
INFO:__main__:Processed 210000 documents
INFO:__main__:Processed 220000 documents
INFO:__main__:Processed 230000 documents
INFO:__main__:Processed 240000

28613


In [30]:
# Get all genes with ortholog phenotype associations
# No taxon filter here: every gene-to-phenotype document is scanned and the
# genes listed in its subject_ortholog_closure field are collected.

filters = [
    'subject_category:"gene"',
    'object_category:"phenotype"'
]

golr_params = {
    'wt': 'json',
    'q': '*:*',
    'fq': filters,
    'fl': 'subject_ortholog_closure',
    'rows': 1000,
    'start': 0
}

# Seed the expected total with one page so the loop body runs at least once;
# the real total (numFound) replaces it after the first response.
resultCount = golr_params['rows']

# Page through the result set, 'rows' documents at a time.
while golr_params['start'] < resultCount:
    solr_request = requests.get(GOLR_URL, params=golr_params)
    # Surface HTTP failures here instead of as a JSON/KeyError further down.
    solr_request.raise_for_status()
    response = solr_request.json()
    resultCount = response['response']['numFound']

    for doc in response['response']['docs']:
        # Documents without the field contribute nothing.
        ortho_pheno_genes.update(doc.get('subject_ortholog_closure', ()))

    # Progress message every 100000 documents (i.e. every 100 pages).
    if golr_params['start'] % 100000 == 0:
        logger.info("Processed {} documents".format(golr_params['start']))

    golr_params['start'] += golr_params['rows']

print(len(ortho_pheno_genes))


INFO:__main__:Processed 0 documents
INFO:__main__:Processed 100000 documents
INFO:__main__:Processed 200000 documents
INFO:__main__:Processed 300000 documents
INFO:__main__:Processed 400000 documents
INFO:__main__:Processed 500000 documents
INFO:__main__:Processed 600000 documents
INFO:__main__:Processed 700000 documents


209594


In [35]:
### We want the intersection of all human_genes and ortho_pheno_genes
### Union of the above and human_genes_data

# Human genes that appear in some ortholog closure of a gene-phenotype association.
human_with_op = ortho_pheno_genes.intersection(human_genes)

# Human genes with either direct data or ortholog-phenotype data.
human_genes_w_data = human_genes_data.union(human_with_op)


print(len(human_genes_w_data))

# NOTE(review): absolute, user-specific path. The shell step at the end of the
# notebook reads gene_list.txt relative to its working directory -- confirm
# both refer to the same file.
# The with-block guarantees the file is flushed and closed once writing ends.
with open("/home/kshefchek/gene_list.txt", 'w') as file:
    # Sorted so the file content is reproducible; set iteration order is not.
    for gene in sorted(human_genes_w_data):
        file.write("{}\n".format(gene))


28653


In [None]:
#!/bin/bash
# Keep only the lines of gene_list.txt that start with "NCBIGene" and write
# them to genes_only_entrez.txt (overwriting any existing file).
# NOTE(review): to run this as a notebook cell, the %%bash cell magic must be
# the first line of the cell; as written it is a standalone shell script.
# NOTE(review): gene_list.txt is resolved against the current directory, while
# the Python cell writes /home/kshefchek/gene_list.txt -- confirm the paths match.
# A plain anchored literal needs no Perl-regex mode, so the GNU-only -P flag
# is omitted for portability.

grep '^NCBIGene' gene_list.txt >genes_only_entrez.txt