In [None]:
# -*- coding: utf-8 -*-
"""
last updated Jun 10 2019
@author: ShebleAdmin
query crossref with a list of bibliographic entries, use when bibliographic data has a range of formats / is irregular

scores from crossref seem to work pretty well as estimate of likelihood of match,
even with data that is somewhat rough. use of quotes within publications
"""


import re
import requests
import json
import pandas as pd
import time
try:
    from StringIO import StringIO
except ImportError:
    from io import StringIO

In [None]:
# review: show the version of the installed requests library
requests.__version__

In [None]:
# NOTE: the results are written to "my_directory/filename_to_write_data_to.csv"
# by the last cell of this notebook; keep that path in step with `fp` below.
# (This line used to be a stray, unterminated string fragment that made the
# whole cell a SyntaxError.)

#### Set fp variable to your folder with file to be processed ####
#fp = "/<path_to_file_with_jumble of references>/"
fp = "my_directory/"

# add file name to file path
examples = fp + "file_name of file with input data to be processed.csv"

In [None]:
### identify user - let crossref know who is using their service
# These HTTP headers are sent with every request made in the query loop;
# replace both placeholder values before running.
headers = dict([
    ('User-Agent', 'user-agent-value'),  # update with value for User-Agent
    ('From', 'me@example.org'),          # update to include contact email
])

In [None]:
def build_query_url(citation):
    """Build a Crossref ``/works`` query URL from one free-text reference.

    The reference is cleaned up before it is placed in the URL:

    1. a leading row number (up to 4 digits, optional period, up to 3
       spaces) is removed -- the same prefix the query loop captures as
       ``cite_no``;
    2. every character that is not whitespace, a digit, an ASCII letter,
       ``-`` or ``/`` is replaced by a space (this also removes characters
       such as ``&``, ``#`` and ``"`` that would break the query string);
    3. very short whitespace-delimited tokens (initials and the like) are
       dropped;
    4. digit-hyphen-digit sequences (e.g. inside page ranges) are split;
    5. whitespace is collapsed and trimmed.

    Parameters
    ----------
    citation : str
        One bibliographic reference, optionally preceded by a row number.

    Returns
    -------
    str
        URL of the form
        ``https://api.crossref.org/works?query="<cleaned text>"&rows=1``.
        ``rows=1`` limits the answer to the best match (``rows=0`` would
        return only a summary of the search results).
    """
    query_url = "https://api.crossref.org/works?query="
    # {0,4} matches the 1-4 digit reference numbers the query loop extracts.
    citation = re.sub(r'^\d{0,4}\.? {0,3}', ' ', citation)
    # '-' is placed last in the class so it is unambiguously a literal.
    citation = re.sub(r'[^\s\da-zA-Z/-]', ' ', citation)
    citation = re.sub(r'\s\D{1,2}\s?\D?\s', ' ', citation)
    citation = re.sub(r'\d-\d', ' ', citation)
    # Collapse whitespace last, after every substitution that can introduce
    # spaces; split/join also trims both ends (str.strip() returns a new
    # string, so calling it without re-binding had no effect).
    citation = ' '.join(citation.split())
    return query_url + '"' + citation + '"' + '&rows=1'

''' 
data from specific json fields retrieved from crossref is parsed and output to a flat file via pandas. 

'''

# THIS ONE WORKS BEST AT THE MOMENT think about doing something else with the affiliation data (but there was essentially none in some of my data)
def _author_affiliation(item):
    """Summarise the affiliations of one Crossref author record as text.

    Returns 'NA' when the record has no affiliation carrying a 'name', the
    affiliation name when there is exactly one affiliation, and
    '<count> (multiple affiliations)' otherwise.
    """
    affiliations = item.get('affiliation') or []
    names = [aff['name'] for aff in affiliations
             if isinstance(aff, dict) and 'name' in aff]
    if not names:
        return u'NA'
    if len(affiliations) == 1:
        return names[0]
    return str(len(affiliations)) + " (multiple affiliations)"


def construct_author(author_item):
    """Flatten a Crossref 'author' list into a single string.

    Each author becomes ``family, given, sequence: affiliation`` and the
    authors are joined with ``'; '`` (no trailing separator, matching
    construct_subject). Missing parts are replaced by the placeholders
    ``no_family_name``, ``no_given_name`` and ``NA``.

    Every author is included. The previous version stopped one short of the
    end of the list and so always dropped the last author (a single-author
    work produced an empty string); it also tested ``'affiliation'[0] in
    item`` -- i.e. the key ``'a'`` -- so affiliations were never read.

    Parameters
    ----------
    author_item : list of dict
        The 'author' value of a Crossref work item.

    Returns
    -------
    str
        The flattened author string; empty when the list is empty.
    """
    entries = []
    for item in author_item:
        family = str(item['family']) if 'family' in item else "no_family_name"
        given = str(item['given']) if 'given' in item else "no_given_name"
        sequence = str(item['sequence']) if 'sequence' in item else u'NA'
        entries.append(
            family + ', ' + given + ', ' + sequence + ': ' + _author_affiliation(item)
        )
    return '; '.join(entries)
    
    
           

def construct_subject(subject_item):
    """Join a list of subject strings into one '; '-separated string.

    No separator follows the last subject; an empty list gives ''.
    """
    return '; '.join(subject_item)

    
# cite_no is used to retain the row number that precedes the bibliographic reference item
def _first_or_na(values):
    """Return the first element of a non-empty list/tuple, otherwise 'NA'."""
    if isinstance(values, (list, tuple)) and values:
        return values[0]
    return u'NA'


def extract_json_fields(data, cite_no):
    """Flatten the first work item of a Crossref response into one row.

    Parameters
    ----------
    data : dict
        Parsed JSON of a Crossref ``/works`` response; the work items are
        read from ``data['message']['items']``.
    cite_no : str
        Row number that preceded the reference in the input data; it is
        passed through unchanged as the first element of the row.

    Returns
    -------
    list
        ``[cite_no, subject, author, year, title, container_title, volume,
        issue, page, doi, alternative_id, score]``. Any field missing from
        the response is ``'NA'``; when the response holds no items at all,
        every field after ``cite_no`` is ``'NA'``.
    """
    message = data.get('message') if isinstance(data, dict) else None
    items = message.get('items') if isinstance(message, dict) else None
    if not items:
        # No match returned: keep the row (and its position) with NA values.
        return [cite_no] + [u'NA'] * 11
    item = items[0]

    doi = item.get('DOI', u'NA')

    # 'issued' can be absent, and its date-parts can be empty or [[None]].
    issued = item.get('issued') or {}
    year_value = _first_or_na(_first_or_na(issued.get('date-parts')))
    year = str(year_value) if year_value is not None else u'NA'

    # number of subjects may range from 0 to many
    subject = construct_subject(item['subject']) if 'subject' in item else u'NA'
    author = construct_author(item['author']) if 'author' in item else u'NA'

    score = item.get('score', u'NA')
    volume = item.get('volume', u'NA')
    issue = item.get('issue', u'NA')
    page = item.get('page', u'NA')

    # These fields are lists in the response; take the first entry if any.
    title = _first_or_na(item.get('title'))
    alternative_id = _first_or_na(item.get('alternative-id'))
    # The key is 'container-title' (hyphen). The earlier membership test
    # used 'container_title', which never matched, so this was always 'NA'.
    container_title = _first_or_na(item.get('container-title'))

    # order retrieved & extracted data for each reference
    return [cite_no, subject, author, year, title, container_title,
            volume, issue, page, doi, alternative_id, score]

In [None]:
# Read the comma-separated input file; the first line is the header row.
# `error_bad_lines` / `warn_bad_lines` were removed in pandas 2.0 and `verbose`
# is deprecated. Malformed lines still raise an error, which is the default
# and is what error_bad_lines=True asked for.
df = pd.read_csv(examples, sep=',', header=0, quotechar='"')


In [None]:
# preview the first rows of the input data
df.head()

In [None]:
# One reference string per input row, taken from the 'citestring' column.
citations = df.loc[:, 'citestring'].tolist()

In [None]:
# Two fresh accumulators for the query loop: `rows` collects the returned
# data rows and `errors` collects error messages.
rows, errors = [], []

In [None]:
# Query Crossref once per citation and collect one output row per input row.
# Exactly one row is appended for every citation, including failures, so that
# `rows` stays aligned with `df` for the column-wise concat further down.
# Re-create `rows` and `errors` before re-running this cell, otherwise the
# results are appended a second time.
#
# The input rows looked like this in pandas (number, period, reference):
#     0	1. Clearing the Air: A systematic review on th...
#     1	2. E-Cigarette Presentation by the American As...
for item in citations:
    response = None
    cite_no = "unknown"
    failure = None

    if not isinstance(item, str):
        # e.g. NaN from an empty cell in the input file
        failure = 'citation is not text: {!r}'.format(item)
    else:
        # str.strip() returns a new string, so the result must be re-bound
        item = item.strip(' ')
        # keep the number that precedes the reference in the input data
        m = re.match(r'(?P<number>\d{1,4})\.? {0,3}', item)
        if m:
            cite_no = m.group('number')
        query_item = build_query_url(item)
        try:
            response = requests.get(query_item, headers=headers, timeout=30)
            response.raise_for_status()
            data = json.loads(response.text)
            data_extract = extract_json_fields(data, cite_no)
        except requests.RequestException as exc:
            failure = 'request failed: {}'.format(exc)
        except ValueError as exc:
            failure = 'response was not valid JSON: {}'.format(exc)
        except (KeyError, IndexError, TypeError) as exc:
            failure = 'unexpected response structure: {!r}'.format(exc)

    if failure is None:
        rows.append(data_extract)
    else:
        status = response.status_code if response is not None else 'no response'
        errors.append("cite_no: {}, error code: {}, {}".format(cite_no, status, failure))
        print(errors[-1])
        # Placeholder with the same 12 fields as a successful row.
        rows.append([cite_no] + [u'NA'] * 11)

    # sleeping between api calls... length of sleep could be shortened
    time.sleep(0.2)

In [None]:
# check: count of rows of retrieved data (the query loop appends one row per
# citation, so this is expected to equal len(citations))
len(rows)

In [None]:
# view the first ten collected rows
rows[0:10]

In [None]:
# fields extracted for the most recently processed citation (last value
# assigned to data_extract inside the query loop)
data_extract

In [None]:
# show the parsed JSON of the most recently retrieved Crossref response,
# to see its format
data

In [None]:
# all_data

In [None]:
# Turn the collected rows into a table. The labels follow the order in which
# extract_json_fields returns its fields.
df_refs = pd.DataFrame(rows)
df_refs.columns = [
    'ref_no',
    'subject',
    'author: affiliation',
    'year',
    'title',
    'journal',
    'volume',
    'issue',
    'page',
    'doi',
    'alternative_id',
    'score',
]
# review the data
df_refs.head()

In [None]:
# Load the most recently parsed Crossref response (the `data` dict left over
# from the query loop) into a DataFrame and print it to inspect its structure.
data_df = pd.DataFrame(data) 
print(data_df)

In [None]:
# Place the retrieved Crossref fields next to the original input columns.
# axis=1 concatenates column-wise and aligns on the index, so this relies on
# `rows` holding one entry per input row, in input order.
df_out = pd.concat([ df, df_refs], axis=1)
# spot-check the last few rows: input text vs. matched reference
df_out[['citestring', 'ref_no', 'author: affiliation', 'title']].tail()

In [None]:
# Write the combined table as tab-separated text; missing values are written
# as 'NA' and the index is stored in a column labelled 'row_no'.
# NOTE(review): the separator is a tab but the file name ends in .csv --
# confirm which one is intended before others consume this file.
df_out.to_csv("my_directory/filename_to_write_data_to.csv", sep='\t', header=True, index_label='row_no', na_rep = 'NA')