In [132]:
from requests.exceptions import ReadTimeout, ConnectTimeout
import os
import json
import requests
import sys
import urllib.parse
import datetime
import csv
from time import sleep
from duration import (
    to_iso8601,
    to_seconds,
    to_timedelta,
    to_tuple,
)

# Settings used when contacting Crossref: "email" is appended to the request
# URLs as the mailto parameter and "useragent" is sent as the User-Agent header.
conf = {
    "email": "ivan.heibi2@unibo.it",
    "key": None,
    "useragent": "coci-process",
    "postfix": "00000"
}


# Prefix put in front of every citation identifier (see convert_doi_to_ci)
CROSSREF_CODE = '020'
# CSV files read and written by the process; the paths containing %s are
# completed with the id of the current output file
LOOKUP_CSVPATH = 'lookup.csv'
DATA_CSVPATH = "data/d-%s.csv"
PROV_CSVPATH = 'prov/p-%s.csv'
INDEX_PROCESSED_CSVPATH = 'index/processed.csv'
INDEX_ERRORS_CSVPATH = 'index/error.csv'
INDEX_DATE_CSVPATH = 'index/date.csv'
INDEX_NODOI_CSVPATH = 'index/nodoi.csv'
INDEX_FILE_CSVPATH = 'index/file.csv'

# Root directory walked to find the input .json files
INPUT_DATA_PATH = '.'

# Number of written items after which a new data/prov file pair is started
MAX_DATA_ENTRIES = 5
datacsv_counter = 0
file_id = 0

# A Crossref search result is accepted only when its score is above this value
MIN_SCORE = 75
# URL templates: %s is replaced with the url-encoded query text ('free_text')
# or with the url-encoded DOI ('doi')
crossref_api = {
    'free_text' : 'https://api.crossref.org/works?rows=1&query=%s&mailto='+conf["email"],
    # a single work is addressed as /works/{doi}; mailto belongs to the query string
    'doi' : 'https://api.crossref.org/works/%s?mailto='+conf["email"]
}

# last lookup code handed out
lookup_code = 0

#dictionaries 
lookup_dic = {}
processed_dic = {}
date_dic = {}

In [101]:
############### Methods to write on CSV files

#write on a csv_path file a given rows (a list of values)
#def write_row_on_csv(csv_path, new_row, csvid = None, quoting_val = csv.QUOTE_NONE):
#    if csvid != None: 
#        csv_path = csv_path%(csvid)
#    with open(csv_path, 'a', newline='') as csvfile:
#        csvwriter = csv.writer(csvfile, quoting= quoting_val)
#        csvwriter.writerow(new_row)

#create new file with header
def init_csv(csv_path,header):
    """Create (or truncate) the file at csv_path and write header into it.

    The header text is written as given: no line terminator is appended.
    """
    with open(csv_path, mode='w') as new_file:
        new_file.write(header)

#write on a csv_path file a given block_txt 
def write_txtblock_on_csv(csv_path, block_txt, csvid = None):
    """Append block_txt, unchanged, to a file.

    csv_path -- path of the target file; when csvid is given it is a
                %-template that is completed with csvid.
    block_txt -- text to append.
    csvid -- optional value interpolated into csv_path.
    """
    target_path = csv_path if csvid is None else csv_path % (csvid)
    with open(target_path, mode='a', newline='') as out_file:
        out_file.write(block_txt)

#write on a csv_path file a given rows (a list of values)
def write_rows_on_csv(csv_path, row_lis, csvid = None, quoting_flag= True):
    """Append a list of rows to a csv file, one line per row.

    csv_path -- path of the target file; when csvid is given it is a
                %-template that is completed with csvid.
    row_lis -- iterable of rows, each row being an iterable of field values.
    csvid -- optional value interpolated into csv_path.
    quoting_flag -- when true every field is wrapped in double quotes.

    Fields are converted with str(), so non-string values are accepted, and
    double quotes inside a quoted field are doubled so the field stays a
    single csv value. Every row is terminated by "\n".
    """
    def _render(field):
        text = str(field)
        if quoting_flag:
            text = '"' + text.replace('"', '""') + '"'
        return text

    # join instead of repeated concatenation: linear in the size of the output
    block_txt = "".join(
        ",".join(_render(field) for field in row) + "\n"
        for row in row_lis
    )

    if csvid is not None:
        csv_path = csv_path % (csvid)
    with open(csv_path, 'a', newline='') as csvfile:
        csvfile.write(block_txt)

In [102]:

#init the lookup_dic by the contents of its corresponding csv
def init_lookup_dic():
    """Load the lookup csv (columns 'c' and 'code') into lookup_dic.

    Also sets the module-level lookup_code to the last code in use. Codes are
    not consecutive (calc_next_lookup_code jumps from 89 to 900), so the last
    code is the highest numeric code stored, not the number of entries.
    When no numeric code is available lookup_code is set to -1, so that the
    next generated code is 0.
    """
    global lookup_code
    # newline='' is the mode the csv module asks for when reading files
    with open(LOOKUP_CSVPATH, 'r', newline='') as lookupcsv:
        for row in csv.DictReader(lookupcsv):
            lookup_dic[row['c']] = row['code']

    numeric_codes = [int(code) for code in lookup_dic.values() if str(code).isdigit()]
    lookup_code = max(numeric_codes) if numeric_codes else -1

#update lookup dictionary and update its corresponding csv
def update_lookup(c):
    """Assign the next lookup code to the character c and persist it.

    The code is produced by calc_next_lookup_code and stored in lookup_dic as
    a string of at least two digits, the same type as the codes loaded from
    the csv, so that string comparisons against stored codes behave the same
    for loaded and freshly created entries. The pair is appended to the
    lookup csv as a quoted row.
    """
    #define the code following the 9 rule ...
    calc_next_lookup_code()
    code = str(lookup_code).zfill(2)
    lookup_dic[c] = code
    #add it on the csv; a double quote used as key is doubled to keep the row valid
    write_txtblock_on_csv(LOOKUP_CSVPATH, '\n"%s","%s"'%(c.replace('"', '""'),code))

def update_date(dateObj, ci_key):
    """Remember the date object of ci_key and append it to the date index.

    dateObj -- dict with the keys "str_val" and "format".
    ci_key -- citation identifier the date belongs to.

    The appended line is "<ci_key>,<str_val>,<format>" preceded by a newline.
    """
    date_dic[ci_key] = dateObj
    line_parts = ["\n", ci_key, ",", dateObj["str_val"], ",", str(dateObj["format"])]
    write_txtblock_on_csv(INDEX_DATE_CSVPATH, "".join(line_parts))
    
def init_date_dic():
    """Load the date index csv (columns 'id', 'value', 'format') into date_dic.

    Each entry becomes {"str_val": value, "format": format}. The "no date"
    marker is written to the file as the text -1; it is converted back to the
    integer -1 here, because the rest of the code recognises a missing date
    by comparing "format" with the integer -1.
    """
    with open(INDEX_DATE_CSVPATH, 'r', newline='') as csvfile:
        for row in csv.DictReader(csvfile):
            date_format = row['format']
            if date_format == '-1':
                date_format = -1
            date_dic[row['id']] = {"str_val": row['value'], "format": date_format}
            
def update_processed(ci_key):
    """Flag ci_key as processed, in memory and in the processed index file.

    The identifier is stored in processed_dic with the value 1 and appended
    to the index file on a new line.
    """
    processed_dic[ci_key] = 1
    index_line = "".join(("\n", ci_key))
    write_txtblock_on_csv(INDEX_PROCESSED_CSVPATH, index_line)
    
def init_processed_dic():
    """Fill processed_dic from the processed index csv.

    Every value found in the 'id' column becomes a key of processed_dic with
    the value 1.
    """
    with open(INDEX_PROCESSED_CSVPATH,'r') as index_file:
        processed_dic.update((entry['id'], 1) for entry in csv.DictReader(index_file))
            
def update_nodoi(citing, cited, text):
    """Append a row to the no-DOI index.

    citing -- identifier of the citing work.
    cited -- identifier of the cited work.
    text -- free text that was used to look the cited work up; it is written
            as a quoted csv field.

    Double quotes inside text are doubled, otherwise a quote in the text
    would close the field early and corrupt the row.
    """
    quoted_text = str(text).replace('"', '""')
    write_txtblock_on_csv(INDEX_NODOI_CSVPATH, '\n%s,%s,"%s"'%(citing,cited,quoted_text))

In [103]:
###############  Convert CrossRef DOI to CI
def calc_next_lookup_code():
    """Advance the module-level lookup_code to the next code.

    The next code is lookup_code + 1, except when the current code ends in 89
    (lookup_code % 100 == 89): in that case the incremented value is also
    multiplied by 10, so 89 is followed by 900 and 989 by 9900.
    """
    global lookup_code
    successor = lookup_code + 1
    lookup_code = successor * 10 if lookup_code % 100 == 89 else successor
    
#convert a crossref doi into a citation identifier     
def convert_doi_to_ci(doi_str):
    """Return the citation identifier of a DOI.

    The identifier is CROSSREF_CODE followed by the encoding that
    match_str_to_lookup produces for doi_str.
    """
    prefix = CROSSREF_CODE
    encoded_doi = match_str_to_lookup(doi_str)
    return prefix + encoded_doi
   
#convert a given string into its corresponding ci format
#using the lookup file
def match_str_to_lookup(str_val):
    """Encode str_val, without its first three characters, through lookup_dic.

    Each remaining character is replaced by the string form of its code in
    lookup_dic. A character that has no code yet is first registered with
    update_lookup. Returns the concatenation of the codes.
    """
    codes = []
    for char in str_val[3:]:
        if char not in lookup_dic:
            update_lookup(char)
        codes.append(str(lookup_dic[char]))
    return "".join(codes)

def reverse_ci_to_doi(str_val):
    """Decode a citation identifier back into a DOI.

    The first three characters of str_val (the prefix) are dropped and the
    rest is split into lookup codes, each of which is mapped back to its
    character. Returns "10." followed by the decoded characters.

    A code is two digits long, preceded by one extra '9' for every jump made
    by calc_next_lookup_code (89 is followed by 900, 989 by 9900), so leading
    '9's are consumed together with the two digits that follow them.
    Codes are compared by their string form, so codes held as integers in
    lookup_dic are matched too. Codes that are not in lookup_dic are skipped;
    if two characters share a code only the first one found is used.
    """
    encoded = str_val[3:]

    # invert the table once instead of scanning it for every code
    char_by_code = {}
    for char, code in lookup_dic.items():
        char_by_code.setdefault(str(code), char)

    decoded = []
    i = 0
    while i < len(encoded):
        j = i
        while j < len(encoded) and encoded[j] == '9':
            j += 1
        char = char_by_code.get(encoded[i:j + 2])
        if char is not None:
            decoded.append(char)
        i = j + 2
    return "10." + "".join(decoded)

In [104]:
#build a bib citation with all the available info inside the reference object 
def build_bibc(obj):
    """Build the textual citation of a reference object.

    When the object has an 'unstructured' value, that value is returned as it
    is. Otherwise the values of the known bibliographic fields that are
    present are concatenated in a fixed order, each one followed by a space
    (all the values are expected to be strings already).
    """
    if 'unstructured' in obj:
        return obj['unstructured']

    ordered_fields = (
        'author',
        'year',
        'article-title',
        'volume-title',
        'journal-title',
        'volume',
        'first-page',
        'last-page',
    )
    return "".join(obj[field] + " " for field in ordered_fields if field in obj)

In [105]:
#call crossref with the corresponding crossref_api[query_type] and the query_text 
def get_data(query_text, is_json = True, query_type = "free_text", num_iterations= 1, sleep_time= 60,req_timeout= None):
    """Call Crossref and return the response, retrying on failure.

    query_text -- text (or DOI) to send; it is url-encoded with quote_plus.
    is_json -- when true the response body is parsed as JSON, otherwise the
               raw text is returned.
    query_type -- key of crossref_api selecting the URL template.
    num_iterations -- maximum number of attempts.
    sleep_time -- seconds to wait between two attempts.
    req_timeout -- timeout passed to requests.get.

    Returns the parsed JSON (or the text) of the first response with HTTP
    status 200. When every attempt fails, returns {"errors": <text>} where
    the text collects the HTTP status codes and exceptions met.
    """
    # the URL is the same for every attempt, so it is built only once
    api_call = crossref_api[query_type] % (urllib.parse.quote_plus(query_text))
    errors = ""
    for attempt in range(num_iterations):
        try:
            response = requests.get(api_call, headers={"User-Agent": conf["useragent"]}, timeout= req_timeout)
            if response.status_code == 200:
                return json.loads(response.text) if is_json else response.text
            errors = errors + "HTTP error on data retrieving (HTTP status code: %s). " % str(response.status_code)
        except Exception as e:
            # best effort: any failure (network, timeout, invalid JSON) is recorded and retried
            errors = errors + "Exception: %s " % e

        #wait only when another attempt follows: sleeping after the last one just delays the caller
        if attempt < num_iterations - 1:
            sleep(sleep_time)

    #if the method arrives here, we got some errors
    return {"errors": errors}
             

In [106]:
#generate the publication-date of a given crossref work object
def build_pubdate(obj, ci):
    """Return the publication date of a Crossref work object.

    obj -- work object; the date is read from obj['issued']['date-parts'][0],
           a list of up to three parts [year, month, day].
    ci -- citation identifier of the work; when it is already in date_dic the
          stored date object is returned without looking at obj.

    Returns {"str_val": <formatted date>, "format": <strftime format>}.
    Missing or non-numeric parts default to 1. The format is '%Y' when both
    month and day are 1, '%Y-%m-%d' otherwise. When no usable date is found
    (no date parts, year missing or equal to 1, or values that do not form a
    valid calendar date) the result is {"str_val": "", "format": -1}.
    """
    if ci in date_dic:
        return date_dic[ci]

    no_date = {"str_val":"","format":-1}

    try:
        date_parts = obj['issued']['date-parts'][0]
    except (KeyError, IndexError, TypeError):
        return no_date
    if not isinstance(date_parts, (list, tuple)):
        return no_date

    #listdate[year,month,day]
    listdate = [1,1,1]
    for i, part in enumerate(date_parts[:3]):
        try:
            listdate[i] = int(part)
        except (TypeError, ValueError):
            #part missing (None) or not a number: keep the default
            pass

    #a year still equal to 1 means that no year was given
    if listdate[0] == 1:
        return no_date

    try:
        date_val = datetime.date(listdate[0], listdate[1], listdate[2])
    except (ValueError, OverflowError):
        #e.g. month 13 or day 40: not a calendar date
        return no_date

    #e.g: 2016/1/1 is reported with the year only
    dformat = '%Y' if (listdate[1] == 1 and listdate[2] == 1) else '%Y-%m-%d'
    return {"str_val": date_val.strftime(dformat), "format": dformat}
 

In [107]:
# given a textual input (query_txt), call crossref and retrieves the work object of 
# the best scoring result in case the score is higher than MIN_SCORE
def find_work(query_txt):
    """Search Crossref for query_txt and return the best matching work.

    Returns:
    - the work object of the first result, when it has a DOI and a score
      higher than MIN_SCORE;
    - -1 when there is no result, the response has an unexpected shape, or
      the first result has no DOI, no score or a score that is too low;
    - the {"errors": ...} object produced by get_data when the call failed.
    """
    #call cross ref
    res = get_data(query_txt, num_iterations=2, req_timeout= 60)
    if "errors" in res:
        return res

    try:
        #crossref first and only result with higher score
        work_item = res['message']['items'][0]
    except (IndexError, KeyError, TypeError):
        return -1

    if not isinstance(work_item, dict):
        return -1
    score = work_item.get("score")
    #a result without a score cannot be trusted: treat it as "not found"
    #instead of handing the whole response back to the caller
    if isinstance(score, (int, float)) and score > MIN_SCORE and "DOI" in work_item:
        return work_item
    return -1

In [108]:
def process_list_items(obj, obj_file_id):
    """Process every item of a Crossref response and write the results.

    obj -- parsed response; the items are read from obj['message']['items'].
    obj_file_id -- identifier of the input file, appended to the file index
                   once all the items have been handled.

    For each item process_item is called:
    - -1 (already processed): nothing is written;
    - a result with "errors": a row is appended to the error index;
    - otherwise: its "data" and "prov" text blocks are appended to the
      current data/prov files. After MAX_DATA_ENTRIES written items a new
      pair of files is started.
    The citation identifier of the item, when the result carries one, is
    appended to the processed index.
    """
    global datacsv_counter
    global file_id

    for item in obj['message']['items']:
        ##process the item
        csvdata = process_item(item)

        #already processed in a previous run
        if csvdata == -1:
            continue

        #an error result is not guaranteed to carry the identifier of the item
        citing_ci = csvdata.get("citing_ci", "")

        if "errors" in csvdata:
            #write the errors
            write_txtblock_on_csv(INDEX_ERRORS_CSVPATH, '\n%s,"%s"'%(citing_ci,csvdata['errors']))
        else:
            #update files identifiers
            datacsv_counter += 1
            if datacsv_counter >= MAX_DATA_ENTRIES:
                datacsv_counter = 0
                file_id += 1
                init_csv(DATA_CSVPATH%str(file_id),'oci,citing,cited,creation,timestamp')
                init_csv(PROV_CSVPATH%str(file_id),'oci,agent,source,datetime')

            if csvdata["data"] != "":
                write_txtblock_on_csv(DATA_CSVPATH, csvdata["data"], csvid = str(file_id))
            if csvdata["prov"] != "":
                write_txtblock_on_csv(PROV_CSVPATH, csvdata["prov"], csvid = str(file_id))

        #add item to processed
        if citing_ci != "":
            write_txtblock_on_csv(INDEX_PROCESSED_CSVPATH, "\n%s"%(citing_ci))

    write_txtblock_on_csv(INDEX_FILE_CSVPATH, "\n%s"%(str(obj_file_id)))
        
#given a crossref object, gets all the COCI data needed; returns an object with errors in case something went wrong
#returns -1 in case the object has already been processed
def process_item(obj):
    """Build the COCI data and provenance rows of a Crossref work.

    obj -- work object; it needs a "DOI" and a "reference" list.

    Returns:
    - {"citing_ci", "data", "prov"}: the identifier of the work and the text
      blocks (one line per resolved reference, each preceded by a newline)
      for the data and provenance files;
    - {"errors", "citing_ci"} when the work has no DOI or no reference list,
      or when a reference could not be resolved because of an error;
      "citing_ci" is "" when the work has no DOI;
    - -1 when the work has already been processed.

    Side effects: the date of the work is stored with update_date, the work
    is marked as processed before its references are examined, and
    references found through a text search are logged with update_nodoi.
    """
    if ("DOI" not in obj) or ("reference" not in obj):
        #always report the identifier key, so callers can log the error uniformly
        citing_ci = convert_doi_to_ci(obj["DOI"].lower()) if "DOI" in obj else ""
        return {"errors": "entry without a DOI or Ref-List", "citing_ci": citing_ci}

    print("Processing:"+obj["DOI"])
    citing_doi = obj["DOI"].lower()
    citing_ci = convert_doi_to_ci(citing_doi)
    citing_date = build_pubdate(obj,citing_ci)

    #update dates
    update_date(citing_date, citing_ci)

    #in case this is not the first time i am elaborating this item
    if citing_ci in processed_dic:
        return -1
    update_processed(citing_ci)

    data_txtblock = ""
    prov_txtblock = ""

    #iterate through all references
    for ref_item in obj['reference']:
        ref_entry_attr = process_ref_entry(ref_item)

        #-1 (or no result at all) means that the reference could not be resolved
        if not isinstance(ref_entry_attr, dict):
            continue

        #we have errors: stop and return them
        if "errors" in ref_entry_attr:
            return {"errors": ref_entry_attr["errors"], "citing_ci": citing_ci}

        #in case It was a No-DOI
        if ref_entry_attr["nodoi_text"] != -1:
            update_nodoi(citing_ci, ref_entry_attr['cited_ci'], ref_entry_attr["nodoi_text"])

        #create all other data needed
        oci = citing_ci+"-"+ref_entry_attr['cited_ci']
        cited_date = ref_entry_attr['cited_date']

        timestamp = ""
        #the "no date" marker is -1; it reads back as the text "-1" from the date index
        both_dated = all(date_obj["format"] not in (-1, "-1") for date_obj in (citing_date, cited_date))
        if both_dated:
            citing_dt = datetime.datetime.strptime(citing_date["str_val"], citing_date["format"])
            cited_dt = datetime.datetime.strptime(cited_date["str_val"], cited_date["format"])
            timestamp = to_iso8601(citing_dt - cited_dt)

        data_txtblock = data_txtblock +"\n"+ oci+","+citing_doi+","+ref_entry_attr['cited_doi']+","+citing_date["str_val"]+","+timestamp

        timenow = str(datetime.datetime.now().replace(microsecond=0))
        #the api entry is a template: fill it with the DOI instead of appending the DOI after its %s
        source_url = crossref_api['doi'] % citing_doi
        prov_txtblock = prov_txtblock +"\n"+ oci+","+conf["useragent"]+","+source_url+","+timenow

    return {
        "citing_ci": citing_ci,
        "data": data_txtblock,
        "prov": prov_txtblock
    }
            
#given a reference entry returns its DOI, CI, and Publication-Date
#in case one of these attributes is not present, the method returns -1
def process_ref_entry(obj):
    """Resolve a reference entry to its DOI, citation identifier and date.

    obj -- reference object. When it has no "DOI", a textual citation is
           built with build_bibc and searched on Crossref with find_work.

    Returns:
    - {'cited_doi', 'cited_ci', 'cited_date', 'nodoi_text'} when a DOI is
      available; 'nodoi_text' is the text used for the search, or -1 when
      the reference already had a DOI;
    - the {"errors": ...} object when the Crossref search failed;
    - -1 when the reference cannot be resolved to a DOI.

    The DOI is lowercased, as done for the citing DOI in process_item, so
    that the same DOI always produces the same citation identifier.
    The resolved date is stored with update_date.
    """
    nodoi_text = -1

    #check if obj have a DOI if not call crossref
    if "DOI" not in obj:
        query_text = build_bibc(obj)
        #nothing to search for: do not spend a request on an empty query
        if not str(query_text).strip():
            return -1
        obj = find_work(query_text)
        if not isinstance(obj, dict):
            return -1
        nodoi_text = query_text

    if "errors" in obj:
        return obj

    #explicit "not found" instead of falling off the end of the function
    if "DOI" not in obj:
        return -1

    cited_doi = obj["DOI"].lower()
    cited_ci = convert_doi_to_ci(cited_doi)

    #check if obj have a publication date,
    #first case is true only if find_work has been called before
    cited_date = build_pubdate(obj,cited_ci)

    #in case i don't have a date, try look at it again
    if cited_date["format"] == -1:
        work = get_data(cited_doi, query_type = "doi", num_iterations=3, req_timeout=60)
        #best effort: when the lookup fails the reference is kept without a date
        if "errors" not in work and "message" in work:
            cited_date = build_pubdate(work['message'],cited_ci)

    #update dates
    update_date(cited_date, cited_ci)

    return {'cited_doi': cited_doi, 'cited_ci': cited_ci, 'cited_date':cited_date, 'nodoi_text':nodoi_text }



In [131]:
#iterate all the input data and process the json files
for subdir, dirs, files in os.walk(INPUT_DATA_PATH):
    for file in files:
        if file.lower().endswith('.json'):
            #open through a context manager so the file handle is closed as soon as the document is parsed
            with open(os.path.join(subdir, file)) as json_file:
                data = json.load(json_file)