In [1]:
from requests.exceptions import ReadTimeout, ConnectTimeout
import json
import requests
import sys
import urllib.parse
import datetime
import csv
from time import sleep
from duration import (
    to_iso8601,
    to_seconds,
    to_timedelta,
    to_tuple,
)

# Settings used when calling the Crossref REST API.
conf = {
    "email": "ivan.heibi2@unibo.it",
    "key": None,
    "useragent": "opencitations",
    "postfix": "00000"
}


# Prefix prepended to every citation identifier built from a DOI.
CROSSREF_CODE = '020'
# CSV files read/written by the functions below; the "%s" placeholders
# are filled with a file id.
LOOKUP_CSVPATH = 'lookup.csv'
DATA_CSVPATH = "data/d-%s.csv"
PROV_CSVPATH = 'prov/p-%s.csv'
INDEX_CI_CSVPATH = 'index/ci.csv'
INDEX_DATE_CSVPATH = 'index/date.csv'
INDEX_NODOI_CSVPATH = 'index/nodoi.csv'

# Threshold at which the data/prov file id is advanced, plus the running
# counter and the current file id.
MAX_DATA_ENTRIES = 1000000
datacsv_counter = 0
file_id = 0

# Minimum Crossref score a free-text match must exceed to be accepted.
MIN_SCORE = 75
# URL templates; "%s" receives the URL-encoded query text or DOI.
# The DOI route addresses a single work as /works/<doi>; the contact
# e-mail is passed as a regular query parameter.
crossref_api = {
    'free_text' : 'https://api.crossref.org/works?rows=1&query=%s&mailto='+conf["email"],
    'doi' : 'https://api.crossref.org/works/%s?mailto='+conf["email"]
}

# In-memory state: last lookup code used, character -> code table and indexes.
lookup_code = 0
lookup_dic = {}
ci_index_dic = {}
date_index_dic = {}
nodoi_index_dic = {}

In [2]:
############### Methods to write on CSV files

#write on a csv_path file a given rows (a list of values)
#def write_row_on_csv(csv_path, new_row, csvid = None, quoting_val = csv.QUOTE_NONE):
#    if csvid != None: 
#        csv_path = csv_path%(csvid)
#    with open(csv_path, 'a', newline='') as csvfile:
#        csvwriter = csv.writer(csvfile, quoting= quoting_val)
#        csvwriter.writerow(new_row)

#write on a csv_path file a given block_txt 
def write_txtblock_on_csv(csv_path, block_txt, csvid = None):
    """Append a ready-made block of text to a CSV file.

    csv_path  -- target path; when csvid is given it is used as a
                 %-template and csvid is interpolated into it.
    block_txt -- text appended verbatim (no quoting, no newline added).
    csvid     -- optional value substituted into csv_path.
    """
    target_path = csv_path % (csvid) if csvid is not None else csv_path
    with open(target_path, 'a', newline='') as out_file:
        out_file.write(block_txt)

#write on a csv_path file a given rows (a list of values)
def write_rows_on_csv(csv_path, row_lis, csvid = None, quoting_flag= True):
    """Append several rows to a CSV file in a single write.

    csv_path     -- target path; when csvid is given it is used as a
                    %-template and csvid is interpolated into it.
    row_lis      -- iterable of rows, each row being a sequence of values.
    csvid        -- optional value substituted into csv_path.
    quoting_flag -- when True every field is wrapped in double quotes.

    Fields are converted with str() so non-string values (e.g. integer
    codes) no longer raise TypeError. In quoted mode an embedded double
    quote is doubled, so the row stays parseable by a CSV reader.
    """
    lines = []
    for row in row_lis:
        fields = [str(field) for field in row]
        if quoting_flag:
            fields = ['"' + field.replace('"', '""') + '"' for field in fields]
        # an empty row still yields an (empty) line, as before
        lines.append(",".join(fields) + "\n")
    block_txt = "".join(lines)

    if csvid is not None:
        csv_path = csv_path%(csvid)
    with open(csv_path, 'a', newline='') as csvfile:
        csvfile.write(block_txt)

In [3]:
###############  Convert CrossRef DOI and update the lookup csv

#init the lookup_dic by the contents of its corresponding csv
def init_lookup_dic():
    """Load LOOKUP_CSVPATH into lookup_dic and restore lookup_code.

    The CSV is read through csv.DictReader using its 'c' (character) and
    'code' columns. lookup_code is set to the highest numeric code found:
    codes are not contiguous (calc_next_lookup_code jumps from 89 to 900),
    so the number of entries cannot be used to recover the last code.

    A newline is appended to the file only when it does not already end
    with one, so repeated calls no longer pile up blank lines.
    """
    global lookup_code
    highest_code = -1
    with open(LOOKUP_CSVPATH, 'r', newline='') as lookupcsv:
        for row in csv.DictReader(lookupcsv):
            lookup_dic[row['c']] = row['code']
            try:
                highest_code = max(highest_code, int(row['code']))
            except (TypeError, ValueError):
                # non-numeric code: keep the entry, ignore it for the maximum
                continue
    # fall back to the previous rule when no numeric code could be read
    lookup_code = highest_code if highest_code >= 0 else len(lookup_dic) - 1

    # make sure appended rows start on a fresh line
    with open(LOOKUP_CSVPATH, 'rb') as rawcsv:
        rawcsv.seek(0, 2)  # 2 = seek relative to the end of the file
        if rawcsv.tell() > 0:
            rawcsv.seek(-1, 2)
            needs_newline = rawcsv.read(1) not in (b"\n", b"\r")
        else:
            needs_newline = False
    if needs_newline:
        write_txtblock_on_csv(LOOKUP_CSVPATH, "\n")

#update lookup dictionary and update its corresponding csv
def update_lookup(c):
    """Assign the next lookup code to character c and persist the pair.

    The code comes from calc_next_lookup_code(); the pair is stored in
    lookup_dic and appended to LOOKUP_CSVPATH as a quoted CSV row.
    A double quote in c is doubled so the row stays valid CSV and can be
    read back by init_lookup_dic.
    """
    global lookup_dic
    #define the code following the 9 rule ...
    calc_next_lookup_code()
    code = lookup_code
    lookup_dic[c] = code
    #add it on the csv
    escaped_c = c.replace('"', '""')
    write_txtblock_on_csv(LOOKUP_CSVPATH, '"%s","%s"\n'%(escaped_c,code))

def calc_next_lookup_code():
    """Advance the global lookup_code to the next code.

    The successor is lookup_code + 1, multiplied by 10 when the current
    code ends in 89 (e.g. 89 -> 900, 989 -> 9900).
    """
    global lookup_code
    successor = lookup_code + 1
    if lookup_code % 100 == 89:
        successor *= 10
    lookup_code = successor
    
#convert a crossref doi into a citation identifier     
def convert_doi_to_ci(doi_str):
    """Return the citation identifier of a DOI.

    The identifier is CROSSREF_CODE followed by the lookup encoding of
    doi_str produced by match_str_to_lookup.
    """
    encoded_doi = match_str_to_lookup(doi_str)
    return CROSSREF_CODE + encoded_doi
   
#convert a given string into its corresponding ci format
#using the lookup table (characters not yet in the table are added to it)
def match_str_to_lookup(str_val):
    """Encode str_val character by character through lookup_dic.

    A character missing from lookup_dic is first registered with
    update_lookup; the codes are concatenated in input order.
    """
    codes = []
    for char in str_val:
        if char not in lookup_dic:
            update_lookup(char)
        codes.append(str(lookup_dic[char]))
    return "".join(codes)


In [4]:
#build a bib citation with all the available info inside the reference object 
def build_bibc(obj):
    """Build a bibliographic citation string from a reference object.

    When the reference carries an 'unstructured' entry, that text is
    returned as is. Otherwise the available fields are concatenated in a
    fixed order, each followed by a single space. All values are expected
    to be strings already.
    """
    if 'unstructured' in obj:
        return obj['unstructured']

    ordered_keys = (
        'author',
        'year',
        'article-title',
        'volume-title',
        'journal-title',
        'volume',
        'first-page',
        'last-page',
    )
    pieces = []
    for key in ordered_keys:
        if key in obj:
            pieces.append(obj[key] + " ")
    return "".join(pieces)

In [5]:
#call crossref with the corresponding crossref_api[query_type] and the query_text 
def get_data(query_text, is_json = True, query_type = "free_text", num_iterations= 1, sleep_time= 60,req_timeout= None):
    """Call Crossref and return the response, retrying on failure.

    query_text     -- text (or DOI) URL-encoded into the crossref_api template.
    is_json        -- when True the body is parsed as JSON, otherwise the
                      raw text is returned.
    query_type     -- key of crossref_api selecting the URL template.
    num_iterations -- maximum number of attempts.
    sleep_time     -- seconds to wait between two attempts.
    req_timeout    -- timeout handed to requests.get.

    Returns the parsed/raw body of the first HTTP 200 response, or
    {"errors": <text>} accumulating the failure of every attempt.
    """
    # the URL does not change between attempts, so build it once
    api_call = crossref_api[query_type] % (urllib.parse.quote_plus(query_text))
    errors = ""
    for attempt in range(num_iterations):
        print(api_call)
        try:
            response = requests.get(api_call, headers={"User-Agent": conf["useragent"]}, timeout= req_timeout)
            if response.status_code == 200:
                return json.loads(response.text) if is_json else response.text
            errors = errors + "HTTP error on data retrieving (HTTP status code: %s). " % str(response.status_code)
        except Exception as e:
            # network failures and unparsable bodies are both retried
            errors = errors + "Exception: %s " % e

        # wait before the next attempt only; sleeping after the last one
        # would just delay the error result
        if attempt < num_iterations - 1:
            sleep(sleep_time)

    #if the method arrives here, we got some errors
    return {"errors": errors}
             

In [6]:
#generate the publication-date of a given crossref work object
def build_pubdate(obj):
    """Return the publication date of a Crossref work object as a string.

    The date is read from obj['issued']['date-parts'][0], a list of up to
    three parts [year, month, day]; missing parts default to 1.
    Returns 'YYYY' when both month and day are 1, 'YYYY-MM-DD' otherwise,
    and -1 when no usable date is available. That covers missing keys,
    an empty list, null parts such as [[None]], more than three parts,
    or an invalid calendar date.
    """
    if 'issued' not in obj or 'date-parts' not in obj['issued']:
        return -1

    try:
        obj_date = obj['issued']['date-parts'][0]

        #listdate[year,month,day]
        listdate = [1,1,1]
        for i in range(0,len(obj_date)):
            listdate[i] = obj_date[i]

        # year 1 is the "no year given" placeholder
        if listdate[0] == 1:
            return -1
        date_val = datetime.date(listdate[0], listdate[1], listdate[2])
    except (IndexError, TypeError, ValueError):
        # IndexError: no/too many parts; TypeError: null or non-integer
        # parts; ValueError: parts out of range for a calendar date
        return -1

    # NOTE(review): an explicit January 1st is indistinguishable from a
    # year-only date here and is therefore reported as 'YYYY'.
    if (listdate[1] == 1) and (listdate[2] == 1):
        # zero-padded explicitly: strftime('%Y') is not padded everywhere
        return "%04d" % date_val.year
    return date_val.isoformat()
 

In [7]:
# given a textual input (query_txt), call crossref and retrieves the work object of 
# the best scoring result in case the score is higher than MIN_SCORE
def find_work(query_txt):
    """Look up a free-text citation on Crossref and return the best match.

    Calls get_data (3 attempts, 60 s timeout) and inspects the first
    returned item. Possible results:
      - the work item (dict) when its score exceeds MIN_SCORE and it has a DOI;
      - -1 when the score is high enough but the item has no DOI;
      - "low score" when the score does not exceed MIN_SCORE;
      - the whole response when the first item carries no score;
      - an {"errors": ...} dict when the call failed or returned no item.
    """
    #call cross ref
    res = get_data(query_txt, num_iterations=3, req_timeout= 60)
    if "errors" in res:
        return res

    # an empty (or missing) result list used to raise IndexError
    items = res.get('message', {}).get('items')
    if not items:
        return {"errors": "No Crossref result for the given query. "}

    #crossref first and only result with higher score
    work_item = items[0]
    if "score" not in work_item:
        return res
    if work_item["score"] > MIN_SCORE:
        #check if the work got a DOI
        return work_item if "DOI" in work_item else -1
    return "low score"

In [8]:
def process_list_items(obj):
    """Process every work of a Crossref list response and write the CSVs.

    For each item in obj['message']['items'] process_item is called:
      - None (item skipped by process_item) writes nothing;
      - a result with "errors" appends "<citing_ci>,<errors>" to the index;
      - otherwise the data and prov blocks are appended to the files of
        the current file_id and "<citing_ci>,ok" is appended to the index.
    Every index entry is terminated by a newline so entries do not run
    into each other.
    """
    for item in obj['message']['items']:
        csvdata = process_item(item)
        if csvdata is None:
            # process_item returns nothing for items it does not handle
            continue
        if "errors" in csvdata:
            #write the errors
            # NOTE(review): the error text is written unquoted; a comma
            # inside it adds columns to the index row.
            write_txtblock_on_csv(INDEX_CI_CSVPATH, csvdata["citing_ci"]+","+csvdata['errors']+"\n")
        else:
            write_txtblock_on_csv(DATA_CSVPATH, csvdata["data"], csvid = str(file_id))
            write_txtblock_on_csv(PROV_CSVPATH, csvdata["prov"], csvid = str(file_id))
            write_txtblock_on_csv(INDEX_CI_CSVPATH, csvdata["citing_ci"]+","+"ok"+"\n")
        
#given a Crossref work object, collect all the COCI data needed; returns an object with an "errors" entry in case something went wrong
def _pubdate_to_date(date_str):
    """Turn a build_pubdate() string ('YYYY' or 'YYYY-MM-DD') into a datetime.date."""
    parts = [int(part) for part in date_str.split("-")]
    # missing month/day default to 1, as build_pubdate does
    parts += [1] * (3 - len(parts))
    return datetime.date(parts[0], parts[1], parts[2])


def _reference_error(ref_entry_attr):
    """Return the error text of an unusable process_ref_entry() result, else None."""
    if not isinstance(ref_entry_attr, dict):
        return "Unresolved reference (%s). " % (ref_entry_attr,)
    if "errors" in ref_entry_attr:
        return ref_entry_attr["errors"]
    if any(key not in ref_entry_attr for key in ('cited_doi', 'cited_ci', 'cited_date')):
        return "Incomplete reference data. "
    if ref_entry_attr['cited_date'] == -1:
        return "No publication date for the cited work. "
    return None


def process_item(obj):
    """Collect the citation data of one Crossref work object.

    Returns None when the work has no DOI, was already processed (its
    citation identifier is in ci_index_dic) or has no publication date.
    Returns {"errors": ..., "citing_ci": ...} as soon as one reference
    cannot be resolved. Otherwise returns a dict with:
      "oci"       -- identifier of the last citation built (None when the
                     work has no references);
      "citing_ci" -- citation identifier of the citing work;
      "data"      -- one "oci,citing_doi,cited_doi,citing_date,timespan"
                     line per reference;
      "prov"      -- one "oci,crossref,<source url>,citing_date" line per
                     reference.
    Also advances datacsv_counter and, once it reaches MAX_DATA_ENTRIES,
    resets it and increments file_id.
    """
    global datacsv_counter
    global file_id

    if "DOI" not in obj:
        return None

    citing_doi = obj["DOI"].lower()
    citing_ci = convert_doi_to_ci(citing_doi)
    citing_date = build_pubdate(obj)

    #only the first time this item is elaborated
    if citing_ci in ci_index_dic:
        return None
    ci_index_dic[citing_ci] = 1

    #check if obj has a date
    if citing_date == -1:
        return None

    # dates are handled as strings elsewhere; real date objects are needed
    # to compute the time span between citing and cited work
    citing_day = _pubdate_to_date(citing_date)
    # fill the URL template instead of appending the DOI after its "%s"
    source_url = crossref_api['doi'] % urllib.parse.quote_plus(citing_doi)

    data_rows = []
    prov_rows = []
    oci = None
    #iterate through all references (a work may have none)
    for ref_item in obj.get('reference', []):
        ref_entry_attr = process_ref_entry(ref_item)
        ref_error = _reference_error(ref_entry_attr)
        if ref_error is not None:
            #stop and report the errors for the whole item
            return {"errors": ref_error, "citing_ci": citing_ci}

        oci = citing_ci+"-"+ref_entry_attr['cited_ci']
        # NOTE(review): the span is negative when the cited work is dated
        # after the citing one; confirm to_iso8601 accepts that.
        timespan = to_iso8601(citing_day - _pubdate_to_date(ref_entry_attr['cited_date']))

        # one line per reference; previously only the last one survived
        data_rows.append(oci+","+citing_doi+","+ref_entry_attr['cited_doi']+","+citing_date+","+timespan+"\n")
        prov_rows.append(oci+","+"crossref"+","+source_url+","+citing_date+"\n")

    #update files identifiers
    # NOTE(review): the counter advances once per processed work, not once
    # per written row - confirm this matches MAX_DATA_ENTRIES' intent.
    datacsv_counter += 1
    if datacsv_counter >= MAX_DATA_ENTRIES:
        datacsv_counter = 0
        file_id += 1

    return {
        "oci": oci,
        "citing_ci": citing_ci,
        "data": "".join(data_rows),
        "prov": "".join(prov_rows)
    }
            
#given a reference entry, returns its DOI, CI, and publication date
#in case one of these attributes cannot be found, the returned object contains an "errors" string instead
def process_ref_entry(obj):
    """Resolve a reference entry into its DOI, citation identifier and date.

    When the entry has no DOI, Crossref is queried through find_work with
    the citation text built by build_bibc. When the resolved object
    carries no publication date, the work is fetched by DOI via get_data.

    Returns {'cited_doi', 'cited_ci', 'cited_date'} on success, otherwise
    a dict with an "errors" string (no match, no DOI, failed request or no
    publication date). The DOI is lower-cased, as process_item does for
    the citing DOI, so the same DOI always maps to the same identifier.
    """
    #check if obj have a DOI if not call crossref
    if "DOI" not in obj:
        query_text = build_bibc(obj)
        obj = find_work(query_text)

    # find_work may answer with -1 or "low score" instead of a dict
    if not isinstance(obj, dict):
        return {"errors": "No Crossref match with a DOI for the reference (%s). " % (obj,)}
    if "DOI" not in obj:
        if "errors" in obj:
            return obj
        return {"errors": "No DOI found for the reference. "}

    cited_doi = obj["DOI"].lower()
    cited_ci = convert_doi_to_ci(cited_doi)

    #the date is already available only if find_work has been called before
    creation_date = build_pubdate(obj)
    if creation_date == -1:
        work = get_data(cited_doi, query_type = "doi", num_iterations=3, req_timeout=60)
        if "errors" in work:
            return work
        creation_date = build_pubdate(work.get('message', {}))

    if creation_date == -1:
        return {"errors": "No publication date found for DOI %s. " % cited_doi}

    # returned whether the date came from the resolved object or the DOI call
    return {'cited_doi': cited_doi, 'cited_ci': cited_ci, 'cited_date':creation_date }

In [9]:
################## SOME TESTS

## test the DOI converter and the lookup update
# requires the file at LOOKUP_CSVPATH to exist (the recorded run of this
# cell failed with FileNotFoundError because it was missing)
init_lookup_dic()
#write_txtblock_on_csv(LOOKUP_CSVPATH, '"ò#","68"\n')
# characters missing from the lookup table are added to it by this call
match_str_to_lookup("10.11/ç§1")

FileNotFoundError: [Errno 2] No such file or directory: 'lookup.csv'

In [115]:
#Make a general textual query and retrieve the DOI of the best match
#(performs a live call to the Crossref API)
query_txt = "Constantin, A., Peroni, S., Pettifer, S., Shotton, D., Vitali, F. (in press). The Document Components Ontology (DoCO). To appear in Semantic Web – Interoperability, Usability, Applicability. Amsterdam, The Netherlands: IOS Press."
#The DOI is: 10.3233/SW-150177
find_work(query_txt)['DOI']

https://api.crossref.org/works?rows=1&query=Constantin%2C+A.%2C+Peroni%2C+S.%2C+Pettifer%2C+S.%2C+Shotton%2C+D.%2C+Vitali%2C+F.+%28in+press%29.+The+Document+Components+Ontology+%28DoCO%29.+To+appear+in+Semantic+Web+%E2%80%93+Interoperability%2C+Usability%2C+Applicability.+Amsterdam%2C+The+Netherlands%3A+IOS+Press.&mailto=ivan.heibi2@unibo.it


'10.3233/sw-150177'

In [100]:
#A sample of work references: the first entry has a DOI, the second one does not
reference = {"reference": [
          {
            "issue": "18",
            "key": "10.3233/SW-150177_ref1",
            "doi-asserted-by": "crossref",
            "first-page": "i568",
            "DOI": "10.1093/bioinformatics/btq383",
            "article-title": "Utopia documents: linking scholarly literature with research data",
            "volume": "26",
            "author": "Attwood",
            "year": "2010",
            "journal-title": "Bioinformatics"
          },
          {
            "issue": "6",
            "key": "10.3233/SW-150177_ref5",
            "first-page": "515",
            "article-title": "The collections ontology: creating and handling collections in OWL 2 DL frameworks",
            "volume": "5",
            "author": "Ciccarese",
            "year": "2014",
            "journal-title": "Semantic Web – Interoperability, Usability, Applicability"
          },
        ]
}
#resolve the entry that already carries a DOI
first_ref = reference['reference'][0];
process_ref_entry(first_ref)

https://api.crossref.org/works/10.1093%2Fbioinformatics%2Fbtq383


{'cited_ci': '0200100370100090336111824182315242722102918122836112926030803',
 'cited_date': '2010-09-07',
 'cited_doi': '10.1093/bioinformatics/btq383'}

In [None]:
#test the date


In [None]:
#load the lookup table, then convert a DOI into its citation identifier
init_lookup_dic()
convert_doi_to_ci('10.3233/sw-150177') 

In [None]:
#compute the ISO 8601 duration between two sample dates
#(date_val is also bound by the chained assignments below)
t1 = date_val = datetime.date(2012, 3, 1)
t2 = date_val = datetime.date(2013, 3, 1)
#the difference of two dates is a datetime.timedelta
time = t2 - t1
to_iso8601(time)

In [None]:
#re-binds PROV_CSVPATH to the same template used in the configuration cell
PROV_CSVPATH = 'prov/p-%s.csv'
#fill the %s placeholder with a file id
print(PROV_CSVPATH%(1))