In [1]:
import json
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time

In [7]:
# Read the preprocessed preprint records from disk into a DataFrame.
data = pd.read_json(path_or_buf='preprints_prep.json')

In [5]:
# Keep only the rows whose preprint category is neuroscience, then show the
# resulting (rows, columns) shape as the cell output.
neuro_data = data.loc[data['preprint_category'].eq('neuroscience')]
neuro_data.shape

(7621, 11)

In [6]:
# Re-index the frame by the lowercased published DOI and give every row an
# empty 'journal' value (the published_doi column itself is kept).
neuro_data = (
    neuro_data
    .set_index(neuro_data['published_doi'].str.lower())
    .assign(journal='')
)

# Finding journal info per paper
### Use crossref API to get journal info
NB: Not all papers are registered with Crossref, but the vast majority appear to be.

Useful reference: https://github.com/CrossRef/rest-api-doc#filter-names

1) Split data into sets of n_per_req (entries per request)

2) Send one query per set to the Crossref API: concatenate the DOIs of the n_per_req papers into the filter, and request only the DOI and container-title (journal name) fields in the results

3) Match each returned container-title to its DOI to fill in the journal column

4) Save

In [None]:
# Go through all DOIs in neuro_data and fetch each paper's journal name
# (Crossref "container-title") from the Crossref REST API, sending n_per_req
# DOIs per request, then save the annotated frame to CSV.
# Relies on neuro_data being indexed by lowercased DOI and having a 'journal' column.
crossref_url = 'https://api.crossref.org/works' # defined here so the cell runs on a fresh kernel
n_per_req = 100 # number of entries per request
# request parameters shared by every batch; 'filter' is set per batch below
query = {'rows':n_per_req,
         'select':'DOI,container-title',
         'mailto':'m.frid@protonmail.com'}
n_papers = neuro_data.shape[0]
# loop through articles, one batch of n_per_req at a time
# (stopping at n_papers avoids a trailing batch with no DOIs and an empty filter)
for x in range(0, n_papers, n_per_req):
    t = time.time()
    # build the filter string from this batch's DOIs; papers without a DOI are skipped
    batch_dois = neuro_data.published_doi.iloc[x:x+n_per_req].dropna()
    if batch_dois.empty:
        continue
    query['filter'] = ','.join('doi:' + doi for doi in batch_dois)

    # access crossref api; raise on an HTTP error instead of parsing an error body
    response = requests.get(crossref_url, params=query)
    response.raise_for_status()
    r = response.json()
    print(f"{x} of {n_papers}. {time.time()-t}s elapsed for query. {r['message']['total-results']} results found")
    t = time.time()
    # iterate over the items actually returned rather than counting up to
    # 'total-results', which is not guaranteed to equal len(items)
    for item in r['message']['items']:
        ind = item['DOI'].lower() # the frame's index is lowercased
        titles = item.get('container-title') or [] # field may be absent or empty
        # only fill rows that exist: .loc with an unknown label would append a new row
        if titles and ind in neuro_data.index:
            neuro_data.loc[ind,'journal'] = titles[0]
    print(f"{time.time()-t}s elapsed for pandas bookkeepping")
#save df
neuro_data.to_csv('neuro_journals_doi_2.csv')

Alternative (original) way of accessing journal info: one Crossref request per DOI

In [None]:
# use crossref api to get journal title from DOI
# ****very slow bc it gets one entry per request****
# journals starts out blank for every paper and shares neuro_data's index; the
# Series itself is named 'journal_name' so the saved CSV gets distinct column names
journals = pd.Series('', index=neuro_data.index, name='journal_name')
crossref_url = 'https://api.crossref.org/works'
# 'filter' is a placeholder here and is overwritten for every DOI in the loop
query = {'filter':'doi:10.3389/fnsys.2016.00088','rows':1,'select':'DOI,container-title','mailto':'m.frid@protonmail.com'}
n_papers = journals.shape[0]
# iterate over the DOI values directly (Series.iteritems() no longer exists in pandas 2.x)
for x, doi in enumerate(neuro_data.published_doi):
    if pd.isna(doi):
        continue # no DOI to look up; journal stays ''
    print(str(x) + ' out of ' + str(n_papers) + ': ' + doi)
    query['filter'] = 'doi:' + doi
    response = requests.get(crossref_url, params=query)
    response.raise_for_status() # raise on an HTTP error instead of parsing an error body
    r = response.json()
    items = r['message']['items']
    # 'container-title' may be absent or an empty list for some records
    titles = items[0].get('container-title') if items else None
    if titles:
        # positional write: row x of journals corresponds to row x of neuro_data
        journals.iloc[x] = titles[0]

pd.concat([journals,neuro_data.published_doi],axis=1).to_csv('neuro_journals_doi_2.csv')