# Count publications in the IDR

This notebook uses the IDR OMERO.web API to fetch the top-level study metadata for all published studies, then fetches PubMed metadata for each study that has a PubMed ID. Studies that only have a publication DOI are looked up using Crossref instead.

This is used to list the journals that appear in the IDR and the number of studies associated with each journal.

In [1]:
from collections import Counter, defaultdict
import requests
from time import sleep
from xml.dom.minidom import parseString

# Root URL of the IDR server; the IDR request URLs below are built from it
IDR_BASE_URL = 'https://idr.openmicroscopy.org'
# Webclient index page URL. NOTE(review): not used by any code visible in this notebook
INDEX_PAGE = f'{IDR_BASE_URL}/webclient/?experimenter=-1'
# Template for the webclient map-annotation query; filled in with `base`, `type` and `id`
MAP_URL = '{base}/webclient/api/annotations/?type=map&{type}={id}'
# Template for an NCBI E-utilities efetch request returning XML; `pmids` is a comma-separated list of IDs
PUBMED_BATCH_URL = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id={pmids}&retmode=xml'
# Template for the Crossref works endpoint; filled in with a single `doi`
CROSSREF_URL = 'https://api.crossref.org/works/{doi}'

Get all studies (projects and screens)

In [2]:
def _fetch_containers(kind):
    """Return the decoded JSON listing of the IDR containers of one kind.

    Args:
        kind: Plural container name used in the JSON API path
            ('screens' or 'projects').

    Returns:
        The decoded JSON response, a dict with 'meta' and 'data' entries.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.get(f'{IDR_BASE_URL}/api/v0/m/{kind}/')
    # Surface HTTP errors here rather than as an opaque JSON/KeyError further down
    response.raise_for_status()
    return response.json()


screens = _fetch_containers('screens')
projects = _fetch_containers('projects')
print(f"Found {screens['meta']['totalCount']} screens {projects['meta']['totalCount']} projects")
# Only one page is requested, so stop if either listing has more items than the page limit
assert (screens['meta']['totalCount'] <= screens['meta']['limit']), 'Paging required'
assert (projects['meta']['totalCount'] <= projects['meta']['limit']), 'Paging required'

# Keys are the singular container types, later used as the query parameter name in MAP_URL
studies = {
    'screen': screens['data'],
    'project': projects['data'],
}

Found 61 screens 58 projects


Get the study map annotations; we're interested in the PubMed IDs and publication DOIs.

In [3]:
class PublicationIds:
    """Publication identifiers recorded for a single study.

    Both identifiers are keyword-only and default to None, so the class can be
    used as a zero-argument factory (for example with ``defaultdict``).

    Attributes:
        pubmed: PubMed ID of the publication, or None when not set.
        doi: DOI of the publication, or None when not set.
    """

    def __init__(self, *, pubmed=None, doi=None):
        self.pubmed = pubmed
        self.doi = doi

    def __repr__(self):
        # Parentheses must balance so the repr reads like a constructor call
        return f'PublicationIds(pubmed={self.pubmed}, doi={self.doi})'

# Namespace of the map annotation that is read for each study
STUDY_INFO_NS = 'idr.openmicroscopy.org/study/info'
# (PublicationIds attribute, map-annotation key) for each identifier that is collected
ID_FIELDS = (
    ('pubmed', 'PubMed ID'),
    ('doi', 'Publication DOI'),
)

# Study name -> PublicationIds. An entry is only created once an identifier has
# been found, so studies without any identifier can be detected via `names`.
study_pubids = defaultdict(PublicationIds)
# Every study name that had a study-info annotation, with or without identifiers
names = set()

for study_type, containers in studies.items():
    for container in containers:
        url = MAP_URL.format(base=IDR_BASE_URL, type=study_type, id=container['@id'])
        response = requests.get(url)
        # Surface HTTP errors here rather than as an opaque JSON/KeyError further down
        response.raise_for_status()
        for annotation in response.json()['annotations']:
            if annotation['ns'] != STUDY_INFO_NS:
                continue
            values = dict(annotation['values'])
            # The part of the container name before the first '/' is used as the study name
            name = container['Name'].split('/')[0]
            names.add(name)
            for attr, key in ID_FIELDS:
                # Only the first whitespace-separated token is kept; a missing or
                # blank value is treated as "no identifier" instead of raising.
                tokens = (values.get(key) or '').split()
                if not tokens:
                    continue
                identifier = tokens[0]
                known = getattr(study_pubids[name], attr)
                if known:
                    # Containers sharing a study name must agree on the identifier
                    assert known == identifier, (
                        f'Conflicting {key} for {name}: {known} != {identifier}')
                else:
                    setattr(study_pubids[name], attr, identifier)
            # Only the first study-info annotation of a container is used
            break

In [4]:
# Walk the studies in name order so both lists come out in a stable sequence
ordered_ids = [study_pubids[study] for study in sorted(study_pubids)]
# A PubMed ID takes precedence; the DOI is only used when no PubMed ID is set
pmids = [ids.pubmed for ids in ordered_ids if ids.pubmed]
dois = [ids.doi for ids in ordered_ids if not ids.pubmed]

print(f'Looking up {len(pmids)} Pubmed IDs {len(dois)} DOIs')

# Study names that were seen but for which no identifier entry was ever created
missing = [study for study in sorted(names) if study not in study_pubids]
if missing:
    print('\nMissing Pubmed ID and DOI:\n\t' + '\n\t'.join(missing))

Looking up 64 Pubmed IDs 1 DOIs

Missing Pubmed ID and DOI:
	idr0011-ledesmafernandez-dad4
	idr0018-neff-histopathology
	idr0085-walsh-mfhrem
	idr0086-miron-micrographs
	idr0089-fischl-coldtemp
	idr0091-julou-lacinduction
	idr0092-ostrop-organoid
	idr0094-ellinger-sarscov2


Now fetch the Pubmed metadata using a single batch query

In [5]:
# All PubMed IDs go into one efetch request as a comma-separated list
batch_url = PUBMED_BATCH_URL.format(pmids=','.join(pmids))
r = requests.get(batch_url)
r.raise_for_status()

rxml = parseString(r.text)
journalxml = rxml.getElementsByTagName('Journal')
# Exactly one <Journal> element is expected per requested ID
assert len(journalxml) == len(pmids)

# Collect the text of the first <Title> element found under each <Journal>
journal_titles = []
for journal in journalxml:
    title_element = journal.getElementsByTagName('Title')[0]
    journal_titles.append(title_element.firstChild.data)

If a study has a DOI but not a PubMed ID, look up the journal using Crossref

In [6]:
# Journal title for each DOI-only study, in the same order as `dois`
doi_journal_titles = []
for doi in dois:
    # Wait one second before each request (presumably to avoid hammering the Crossref API)
    sleep(1)
    r = requests.get(CROSSREF_URL.format(doi=doi))
    r.raise_for_status()
    crossref = r.json()
    # 'container-title' is a list; its first entry is taken as the journal title
    container_titles = crossref['message']['container-title']
    doi_journal_titles.append(container_titles[0])

Finally list the journals and the number of studies

In [7]:
def _by_count_then_title(entry):
    """Sort key: higher study counts first, ties broken by journal title."""
    title, count = entry
    return (-count, title)


# Tally the journal titles from both the PubMed and the Crossref lookups
journal_counts = Counter(journal_titles + doi_journal_titles)
journals = sorted(journal_counts.items(), key=_by_count_then_title)

# Right-align the title in 40 columns, followed by the count in 3 columns
for title, n_studies in journals:
    print(f'{title:>40} {n_studies:3}')

             The Journal of cell biology   7
                   Nature communications   6
                                   eLife   6
               Molecular systems biology   5
                                PloS one   4
                Science (New York, N.Y.)   4
                                  Nature   3
                          Nature methods   3
                         Scientific data   3
                                    Cell   2
                            Cell systems   2
                     Nature cell biology   2
                      Scientific reports   2
                        BMC cell biology   1
                        Cancer discovery   1
                            Cell reports   1
                    Current biology : CB   1
        Development (Cambridge, England)   1
                      Developmental cell   1
                             GigaScience   1
                      Journal of anatomy   1
                   Journal of proteomics   1
          