# Crossref

In [1]:
import requests
import re
import pandas as pd
import json

def get_items(json_data):
    
    rows = []
    
    for item in json_data:
        data = dict()
        date = ''
        date_parts = item['published']['date-parts'][0] 
        for i,part in enumerate(date_parts):
            date += f'{part}'
            if i < len(date_parts)-1:
                date += '-'
        data['date'] = date
        
        if 'author' in item:
            authors = []
            for author in item['author']:
                author_name = ''
                author_name += author.get('given','')
                author_name += ' ' + author.get('family','')
                authors.append(author_name.strip())
            if len(authors)>0:
                data['author'] = authors
        data['doi'] = item['DOI']
        data['publisher'] = item['publisher']
        
        if 'isbn-type' in item:
            isbns = []
            for isbn in item['isbn-type']:
                isbns.append( {'value':isbn['value'],'type':isbn['type']} )
            if len(isbns)>0:
                data['isbn'] = isbns
        if 'subject' in item:
            subjects = []
            for subject in item['subject']:
                subjects.append(subject)
            if len(subjects)>0:
                data['subject'] = subjects
        if 'abstract' in item:
            data['abstract'] = item['abstract']     
        if 'container-title' in item:
            data['journal'] = item['container-title']

        data['type']=item['type']
        title = item['title'][0]
        if 'subtitle' in item:
            title += ': ' + item['subtitle'][0]
        data['title']=title
        if 'link' in item:
            links = []
            for url in item['link']:
                links.append(url['URL'])
            if len(links)>0:
                data['link']=links
        rows.append(data)
        
    return rows

In [2]:
results = []

query_nr = 0

query = 'loneliness'
cursor = '*'
rows = 500
publ_type = 'journal-article'
start_date = '2004-01-01'
end_date = '2024-01-01'

headers = {'User-Agent':'mailto:p.a.f.verhaar@hum.leidenuniv.nl'}

url = f'https://api.crossref.org/works?query=%22{query}%22&cursor={cursor}&rows={rows}'

if re.search(r'\d',start_date) and re.search(r'\d',end_date):
    url += f'&filter=type:{publ_type},from-pub-date:{start_date},until-pub-date:{end_date}'

print(url)
    
response = requests.get(url,headers=headers)
print(response.status_code)
json_data = response.json()

print( f"{json_data['message']['total-results']} results. ")
nr_items = len(json_data['message']['items'])

query_nr += 1
print(f"{query_nr}: items {(query_nr*rows)-rows}-{query_nr*rows}")

   
data = get_items(json_data['message']['items'])
results.extend(data)

next_cursor = ''
if 'next-cursor' in json_data['message']:
    next_cursor = json_data['message']['next-cursor']

while re.search( r'\w+' , next_cursor) and nr_items>0:

    url = f'https://api.crossref.org/works?query=%22{query}%22&cursor={next_cursor}&rows={rows}'

    if re.search(r'\d',start_date) and re.search(r'\d',end_date):
        url += f'&filter=type:{publ_type},from-pub-date:{start_date},until-pub-date:{end_date}'

        print(url)
    response = requests.get(url,headers=headers)
    json_data = response.json()
    query_nr += 1
    print(f"{query_nr}: items {(query_nr*rows)-rows}-{query_nr*rows}")
    
    nr_items = len(json_data['message']['items'])

    data = get_items(json_data['message']['items'])
    results.extend(data)

    next_cursor = ''
    if 'next-cursor' in json_data['message']:
        next_cursor = json_data['message']['next-cursor']

https://api.crossref.org/works?query=%22loneliness%22&cursor=*&rows=500&filter=type:journal-article,from-pub-date:2004-01-01,until-pub-date:2024-01-01
200
9091 results. 
1: items 0-500
https://api.crossref.org/works?query=%22loneliness%22&cursor=DnF1ZXJ5VGhlbkZldGNoBgAAAAAZKUWbFmJnOVJRVU1xUzlLa2ZpTkltazIzencAAAAAGid9bhZPY3FES3VMU1R5R3JIWHlwQUZBcktnAAAAABoa2nMWTUpsaGN0RGFRbS1yN0ZYWTJ3MG5pUQAAAAAW3Si1FmZPZjNDVWhLUnh1RDFtUWM4WFIzUHcAAAAAGmB9-BZHaHl0clJVU1NhaUZBSUY2TG81YzRRAAAAABj8zpkWaUpOMms5em5SUmVMR2JjT2VGdEFtdw==&rows=500&filter=type:journal-article,from-pub-date:2004-01-01,until-pub-date:2024-01-01
2: items 500-1000
https://api.crossref.org/works?query=%22loneliness%22&cursor=DnF1ZXJ5VGhlbkZldGNoBgAAAAAZKUWbFmJnOVJRVU1xUzlLa2ZpTkltazIzencAAAAAGid9bhZPY3FES3VMU1R5R3JIWHlwQUZBcktnAAAAABoa2nMWTUpsaGN0RGFRbS1yN0ZYWTJ3MG5pUQAAAAAW3Si1FmZPZjNDVWhLUnh1RDFtUWM4WFIzUHcAAAAAGmB9-BZHaHl0clJVU1NhaUZBSUY2TG81YzRRAAAAABj8zpkWaUpOMms5em5SUmVMR2JjT2VGdEFtdw==&rows=500&filter=type:journal-article,from

20: items 9500-10000


In [3]:

with open(f'articles_{query}.json', 'w') as f:
    json.dump(results, f)
