In [64]:
import csv
import datetime
import json
import os
import sys
import urllib.parse

import requests
from requests.exceptions import ReadTimeout, ConnectTimeout

# Prefix that convert_doi_to_ci() prepends to every citation identifier it builds.
CROSSREF_CODE = '020'
# CSV file backing lookup_dic; it is read through its 'c' and 'code' columns.
LOOKUP_PATH = 'lookup.csv'
# find_work() only accepts a Crossref match whose score is strictly above this value.
MIN_SCORE = 75
# Crossref endpoint prefixes, keyed by the query_type argument of get_data().
# The (URL-encoded) query text is appended directly to the selected prefix.
crossref_api = {
    'free_text' : 'http://api.crossref.org/works?rows=1&query=',
    'doi' : 'http://api.crossref.org/works/'
}

# In-memory character -> code table, filled by init_lookup_dic() and update_lookup().
lookup_dic = {}

In [None]:
#build a bib citation with all the available info inside the reference object 
def build_bibc(obj):
    """Build a free-text bibliographic citation from a reference object.

    If the reference carries an 'unstructured' citation, that text is returned
    unchanged. Otherwise the known fields that are present are concatenated in
    a fixed order, each one followed by a single space (so a non-empty result
    ends with a space).

    Args:
        obj: mapping describing one reference; field values must be strings.

    Returns:
        The citation text, or an empty string when no known field is present.
    """
    if 'unstructured' in obj:
        return obj['unstructured']

    # Order matters: it defines how the citation text reads.
    ordered_fields = (
        'author',
        'year',
        'article-title',
        'volume-title',
        'journal-title',
        'volume',
        'first-page',
        'last-page',
    )
    return "".join(obj[field] + " " for field in ordered_fields if field in obj)

In [None]:
            

#call crossref with the corresponding crossref_api[query_type] and the query_text 
def get_data(query_text, is_json = True, query_type = "free_text", timeout = 30):
    """Call the Crossref API and return the response body.

    The request URL is crossref_api[query_type] followed by the URL-encoded
    query_text; it is printed before the request is sent.

    Args:
        query_text: text of the query (free text or a DOI, depending on query_type).
        is_json: when True the body is decoded as JSON, otherwise it is returned as text.
        query_type: key of crossref_api selecting the endpoint.
        timeout: seconds to wait for the server, passed to requests.get. Without
            a timeout requests waits indefinitely, so the timeout handlers
            below could never run.

    Returns:
        The decoded JSON (or raw text) when the server answers with HTTP 200.
        In every other case a list of error message strings is returned, so
        callers must check the type of the result before indexing it.

    Raises:
        KeyError: if query_type is not a key of crossref_api.
    """
    api_url = crossref_api[query_type]
    errors = []
    try:
        api_call = api_url + urllib.parse.quote_plus(query_text)
        print(api_call)
        response = requests.get(api_call, timeout=timeout)
        if response.status_code == 200:
            if is_json:
                return json.loads(response.text)
            return response.text
        errors.append("HTTP error on data retrieving (HTTP status code: %s)." % str(response.status_code))

    except ReadTimeout as e:
        errors.append("A timeout error happened when reading results from the API when retrieving data. %s" % e)
    except ConnectTimeout as e:
        errors.append("A timeout error happened when connecting to the API when retrieving data. %s" % e)
    except Exception:
        # Best effort: any other failure (network error, invalid JSON, ...) is
        # reported through the returned error list instead of being raised.
        errors.append("A generic error happened when trying to use the API when retrieving data. %s" % sys.exc_info()[0])

    return errors
  

In [None]:
#generate the publication-date of a given crossref work object
def gen_pubdate(obj):
    """Return the publication date of a Crossref work object as 'YYYY-MM-DD'.

    The date is read from obj['issued']['date-parts'][0], a list holding
    [year], [year, month] or [year, month, day]. A missing month or day
    defaults to 1.

    Args:
        obj: the work object (a mapping).

    Returns:
        The formatted date string, or -1 when the object holds no usable date
        (missing keys, empty or over-long date parts, null or non-integer
        parts, or values that do not form a valid calendar date).
    """
    try:
        date_parts = obj['issued']['date-parts'][0]
    except (KeyError, IndexError, TypeError):
        return -1

    if not isinstance(date_parts, (list, tuple)) or len(date_parts) > 3:
        return -1

    # Pad the missing components with 1: [2012, 3] -> (2012, 3, 1).
    year, month, day = (list(date_parts) + [1, 1, 1])[:3]

    # Year 1 is the placeholder for "no year supplied": there is no date to build.
    if year == 1:
        return -1

    try:
        return datetime.date(year, month, day).strftime('%Y-%m-%d')
    except (TypeError, ValueError):
        # Null/non-integer parts or an impossible date (e.g. month 13).
        return -1
 

In [None]:
       
# Given a textual input (query_txt), call Crossref and retrieve the work object of
# the best-scoring result, provided its score is higher than MIN_SCORE
def find_work(query_txt):
    """Look up a free-text citation on Crossref and return the best match.

    Args:
        query_txt: the citation text sent to Crossref through get_data().

    Returns:
        The work object (a dict) of the first result when its score is higher
        than MIN_SCORE and it has a DOI.
        The string "low score" when the first result has a score that is not
        higher than MIN_SCORE.
        -1 when the request failed, returned no result, or the first result
        has no score or no DOI.
    """
    res = get_data(query_txt)

    # get_data() hands back a list of error strings when the request fails,
    # so the response must be checked before it is indexed.
    if not isinstance(res, dict):
        return -1
    message = res.get('message')
    items = message.get('items') if isinstance(message, dict) else None
    if not items:
        return -1

    # Only one row is requested, so the first item is the best-scoring one.
    work_item = items[0]

    if "score" not in work_item:
        return -1
    if work_item["score"] > MIN_SCORE:
        # A match without a DOI is of no use to the callers.
        return work_item if "DOI" in work_item else -1
    return "low score"
    

In [127]:
    
#given a crossref object get all the COCI data needed and generate the csv files
def process_cr_obj(obj):
    """Collect the data needed for one Crossref object (work or reference).

    The object is first resolved to a DOI (through a Crossref free-text lookup
    when it has none), then its publication date is computed; when the object
    itself carries no date the full work is fetched from Crossref by DOI.

    Generating the csv files is not implemented yet, so the function currently
    returns None in every case.

    Args:
        obj: mapping describing a work or a reference.
    """
    #check if obj have a DOI if not call crossref
    if "DOI" not in obj :
        query_text = build_bibc(obj)
        obj = find_work(query_text)
        # find_work() returns -1 or "low score" when there is no acceptable
        # match: nothing can be processed without a DOI.
        if not isinstance(obj, dict):
            return

    #the doi of my object
    obj_doi = obj["DOI"]

    #check if obj have a publication date, in case it's NO
    #call crossref with its DOI
    pub_date = gen_pubdate(obj)
    if pub_date == -1 :
        res = get_data(obj_doi, query_type = "doi")
        # get_data() returns a list of error strings on failure, and the
        # Crossref response for a single work wraps the work in 'message'.
        # Keep the current object unless a proper work came back.
        if isinstance(res, dict) and isinstance(res.get('message'), dict):
            obj = res['message']
            pub_date = gen_pubdate(obj)

    #at this point I have the object and all the data needed
    #Build all the csv files
    

In [65]:
#init the lookup_dic by the contents of its corresponding csv
def init_lookup_dic():
    """Fill lookup_dic from the CSV file at LOOKUP_PATH.

    The file must start with a header row naming the 'c' (character) and
    'code' columns. Codes are stored as read from the file, i.e. as strings.

    Raises:
        FileNotFoundError: if the lookup file does not exist.
        KeyError: if a row lacks the 'c' or 'code' column.
    """
    # newline='' lets the csv module handle line endings itself, so a quoted
    # newline character stored as a lookup key is read back correctly.
    with open(LOOKUP_PATH, 'r', newline='') as lookupcsv:
        for row in csv.DictReader(lookupcsv):
            lookup_dic[row['c']] = row['code']
            
#write a new lookup entry inside the csv, (Char,Code)
def write_new_lookup_val(c,code):
    """Append one (character, code) row to the lookup CSV at LOOKUP_PATH.

    The file is opened in text append mode: csv.writer needs a text file in
    Python 3, and write mode would erase every previously stored entry. When
    the file is missing or empty, the 'c','code' header row that
    init_lookup_dic() relies on is written first.

    Args:
        c: the character to register.
        code: the code assigned to that character.
    """
    needs_header = not os.path.exists(LOOKUP_PATH) or os.path.getsize(LOOKUP_PATH) == 0
    with open(LOOKUP_PATH, 'a', newline='') as lookupcsv:
        lookupwriter = csv.writer(lookupcsv)
        if needs_header:
            lookupwriter.writerow(['c', 'code'])
        lookupwriter.writerow([c,code])


In [62]:
#update lookup dictionary and update its corresponding csv
def update_lookup(c):
    """Register a character that has no code yet.

    The new code is the current number of entries in lookup_dic. The entry is
    added to the in-memory table and written to the lookup csv through
    write_new_lookup_val().

    Args:
        c: the character to register.
    """
    # TODO (from the original author): the code should be defined following
    # "the 9 rule"; for now the current table size is used.
    next_code = len(lookup_dic)
    lookup_dic[c] = next_code
    write_new_lookup_val(c, next_code)
    
#convert a crossref doi into a citation identifier     
def convert_doi_to_ci(doi_str):
    """Convert a Crossref DOI into a citation identifier.

    Args:
        doi_str: the DOI to convert.

    Returns:
        CROSSREF_CODE followed by the lookup encoding of doi_str.
    """
    encoded_doi = match_str_to_lookup(doi_str)
    return CROSSREF_CODE + encoded_doi
   
#convert a given string into its corresponding ci format
#using the lookup file
def match_str_to_lookup(str_val):
    """Encode a string by replacing each character with its lookup code.

    A character that is not yet in lookup_dic is first registered through
    update_lookup(), which also persists it.

    Args:
        str_val: the string to encode.

    Returns:
        The concatenation of the codes of all characters, in order.
    """
    codes = []
    for char in str_val:
        if char not in lookup_dic:
            update_lookup(char)
        codes.append(str(lookup_dic[char]))
    return "".join(codes)
            
    

In [126]:
#Some Tests

# Send a free-text citation to Crossref through find_work() and read the DOI of the match
query_txt = "Constantin, A., Peroni, S., Pettifer, S., Shotton, D., Vitali, F. (in press). The Document Components Ontology (DoCO). To appear in Semantic Web – Interoperability, Usability, Applicability. Amsterdam, The Netherlands: IOS Press."
# The expected DOI is: 10.3233/SW-150177
# Note: find_work() can also return -1 or "low score"; the ['DOI'] lookup below fails in that case
find_work(query_txt)['DOI']


http://api.crossref.org/works?rows=1&query=Constantin%2C+A.%2C+Peroni%2C+S.%2C+Pettifer%2C+S.%2C+Shotton%2C+D.%2C+Vitali%2C+F.+%28in+press%29.+The+Document+Components+Ontology+%28DoCO%29.+To+appear+in+Semantic+Web+%E2%80%93+Interoperability%2C+Usability%2C+Applicability.+Amsterdam%2C+The+Netherlands%3A+IOS+Press.


'10.3233/sw-150177'

In [130]:
# A sample of work references: the first entry carries a DOI, the second one does not
reference = {"reference": [
          {
            "issue": "18",
            "key": "10.3233/SW-150177_ref1",
            "doi-asserted-by": "crossref",
            "first-page": "i568",
            "DOI": "10.1093/bioinformatics/btq383",
            "article-title": "Utopia documents: linking scholarly literature with research data",
            "volume": "26",
            "author": "Attwood",
            "year": "2010",
            "journal-title": "Bioinformatics"
          },
          {
            "issue": "6",
            "key": "10.3233/SW-150177_ref5",
            "first-page": "515",
            "article-title": "The collections ontology: creating and handling collections in OWL 2 DL frameworks",
            "volume": "5",
            "author": "Ciccarese",
            "year": "2014",
            "journal-title": "Semantic Web – Interoperability, Usability, Applicability"
          },
        ]
}
# Build the citation text of the first reference and look it up on Crossref;
# the DOI found should match the "DOI" field of that reference
first_ref = reference['reference'][0];
query_text = build_bibc(first_ref)
find_work(query_text)['DOI']

http://api.crossref.org/works?rows=1&query=Attwood+2010+Utopia+documents%3A+linking+scholarly+literature+with+research+data+Bioinformatics+26+i568+


'10.1093/bioinformatics/btq383'

In [66]:
# Test the date formatting: same '%Y-%m-%d' pattern that gen_pubdate() uses
date_val = datetime.date(2012, 3, 1)
print(date_val.strftime('%Y-%m-%d'))

2012-03-01


In [69]:
# Load the lookup table from LOOKUP_PATH, then encode a DOI as a citation identifier.
# Characters missing from the table get registered (and written to the csv) on the fly.
init_lookup_dic()
convert_doi_to_ci('10.3233/sw-150177') 

'0200100370302030336283263010500010707'