***
Welcome to the CV parsing notebook! This notebook sends a single citation to CrossRef and if there's a match it parses the citation and gets open access information from Sherpa/RoMEO
***

Habanero is the Python library for the CrossRef API will need to be installed locally first before running notebook. More info about habanero: https://pypi.org/project/habanero/

In [1]:
import sys
!{sys.executable} -m pip install habanero
!{sys.executable} -m pip install pandas
from habanero import Crossref
import csv #for handling output
from IPython.display import display
import urllib.request, json
import json
import urllib.parse
from xml.etree import ElementTree as ET

You should consider upgrading via the '/usr/local/Cellar/jupyterlab/2.2.8/libexec/bin/python3.8 -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/usr/local/Cellar/jupyterlab/2.2.8/libexec/bin/python3.8 -m pip install --upgrade pip' command.[0m


In [2]:
cr = Crossref()

Should get author info (Last name, First initial) for comparison

In [3]:
oauthorLast = input("(Enter last name):")

(Enter last name):Kaplan


In [4]:
oauthorFirst = input("(Enter first name):")

(Enter first name):Stephen


In [5]:
print(oauthorLast, oauthorFirst)

Kaplan Stephen


In [6]:
oauthor = oauthorLast + ", " + oauthorFirst

In [7]:
print(oauthor)

Kaplan, Stephen


In [8]:
inputFile = input("Input file name path (ex: ./CV/dbcompareKaplanStephen.txt)):")

Input file name path (ex: ./CV/dbcompareKaplanStephen.txt)):./CV/ShortSKaplanCV.txt


In [9]:
print(inputFile)

./CV/ShortSKaplanCV.txt


In [10]:
outputFile = "output" + oauthorLast + oauthorFirst + ".txt" 

In [11]:
print(outputFile)

outputKaplanStephen.txt


***
(Optional) Run this cell to see raw results from CrossRef, these results are parsed in the "cv_parse()" function!
***

In [12]:
#Optional to see individual citation results

x = cr.works(query = "A.Basu, R. Kaplan and S. Kaplan (2014) Creating supportive environments to foster reasonableness and achieve sustainable well-being. In T. J. Hämäläinen and J. Michaelson (Eds.) Well-being and beyond: Broadening the public and policy discourse.  (Pp. 182-217) Cheltenham, UK:  Edward Elgar Publishing, New Horizons in Management Series.", limit = 1)
print(x['message'])

{'facets': {}, 'total-results': 14983852, 'items': [{'indexed': {'date-parts': [[2020, 7, 29]], 'date-time': '2020-07-29T18:22:20Z', 'timestamp': 1596046940498}, 'reference-count': 0, 'publisher': 'Edward Elgar Publishing', 'isbn-type': [{'value': '9781783472901', 'type': 'electronic'}], 'content-domain': {'domain': [], 'crossmark-restriction': False}, 'DOI': '10.4337/9781783472901.00015', 'type': 'other', 'created': {'date-parts': [[2014, 6, 9]], 'date-time': '2014-06-09T10:18:28Z', 'timestamp': 1402309108000}, 'page': '182-218', 'source': 'Crossref', 'is-referenced-by-count': 2, 'title': ['Creating supportive environments to foster reasonableness and achieve sustainable well-being'], 'prefix': '10.4337', 'author': [{'given': 'Avik', 'family': 'Basu', 'sequence': 'first', 'affiliation': []}, {'given': 'Rachel', 'family': 'Kaplan', 'sequence': 'additional', 'affiliation': []}, {'given': 'Stephen', 'family': 'Kaplan', 'sequence': 'additional', 'affiliation': []}], 'member': '2918', 'con

***
Parse the results of the CrossRef query!
***

In [13]:
def cv_parse(l, r):

    cv_dict = dict() #creates an empty "dict" that will be populated with all the fields and values from CrossRef and S/R
    

    for z in x['message']['items']: # Goes through every field in "items" --> what's returned from CrossRef see Cell above!
        cv_dict['Citation'] = l.rstrip() # Gets the citation and strips the last character ='.\n'
        
        score = z.get('score', None)
        cv_dict['Score'] = score # Gets the relevance score from CrossRef 
        if score > 73.0: # and continues to grab other data if the score is greater than 73.0
        
            authors = z.get('author',None) # Gets the contents of the author field
            if authors != None: # If there are author names present continue
                authors = "" # resets the authors field to blank

                for a in z['author']: # walks through the author field in "items"
                    famname = a.get('family',None) # gets last name
                    if famname == None:
                        famname = "NoName"
                    givname = a.get('given',None) 
                    if givname == None:
                        givname = "NoName"
                    fullname = famname + ", " + givname + "; "
                    #print(fullname)
                    authors = authors + fullname
                if oauthor in authors:
                    cv_dict['Author'] = authors.rstrip('; ')
                else:
                    cv_dict['Author'] = "NotAMatch" # include "This is a book!"
            else:
                cv_dict['Author'] = "No Authors Listed"

            ppubdate = z.get('published-print',None)
            opubdate = z.get('published-online',None)
            if ppubdate != None:
                if len(ppubdate['date-parts'][0]) == 3:
                    cv_dict['Date'] = str(ppubdate['date-parts'][0][0]) + "-" + str(ppubdate['date-parts'][0][1]) + "-" + str(ppubdate['date-parts'][0][2])
                else:
                    cv_dict['Date'] = str(ppubdate['date-parts'][0][0])
            elif opubdate != None:

                if len(opubdate['date-parts'][0]) == 3:
                    cv_dict['Date'] = str(opubdate['date-parts'][0][0]) + "-" + str(opubdate['date-parts'][0][1]) + "-" + str(opubdate['date-parts'][0][2])
                else:
                    cv_dict['Date'] = str(opubdate['date-parts'][0][0])
            else:
                cv_dict['Date'] = None

            title = z.get('title', None)
            if title != None:
                cv_dict['Title'] = title[0].rstrip('.\n')
            else:
                cv_dict['Title'] = None

             # Capture 'subtitle' as well -- In which column should we put this? Should there be a subtitle column?   

            pubtype = z.get('type',None)
            cv_dict['Type'] = pubtype

            publisher = z.get('publisher',None)
            cv_dict['Publisher'] = publisher

            jname = z.get('container-title',None)
            if jname != None:
                cv_dict['Source'] = jname[0]    
            else:
                cv_dict['Source'] = None


            #cv_dict['Editor'] = editor

            editor = z.get('editor',None)
            if editor != None:
                editors = ""

                for a in z['editor']:
                    famname = a.get('family',None) 
                    if famname == None:
                        famname = "NoName"
                    givname = a.get('given',None) 
                    if givname == None:
                        givname = "NoName"
                    fullname = famname + ", " + givname + "; "
                    #print(fullname)
                    editors = editors + fullname
                    cv_dict['Editor'] = editors

            else:
                cv_dict['Editor'] = None

            volume = z.get('volume',None)
            cv_dict['Journal_Volume'] = str(volume)

            #Ask about issue?
            issue = z.get('issue',None)
            cv_dict['Journal_Issue'] = str(issue)

            page = z.get('page',None)
            if page != None:
                pageSplit = page.split("-")
                cv_dict['Start_page'] = str(pageSplit[0])
                if len(pageSplit) == 1:
                    cv_dict['End_page'] = None
                else:
                    cv_dict['End_page'] = int(pageSplit[1])
            else:
                cv_dict['Start_page'] = None
                cv_dict['End_page'] = None

            doi = z.get('DOI',None)
            cv_dict['DOI'] = doi

            issn = z.get('issn-type',None)
            if issn != None:
                cv_dict['ISSN'] = issn[0]['value'] + ", " + issn[0]['type']
            else:
                cv_dict['ISSN'] = None
            
            # Sherpa Romeo section
            if issn != None:
                srdict = dict()
                condlist = list()
                artcondict = dict()

                #url = 'http://www.sherpa.ac.uk/romeo/api29.php?issn=0001-6918'
                #url = 'http://www.sherpa.ac.uk/romeo/api29.php?issn=' + issn[0]['value'] + '&ak=uAZ3ggWwyjM'
                #http://www.sherpa.ac.uk/romeo/api29.php?issn=1386-145X&ak=uAZ3ggWwyjM
                #print(url)
                
                url = 'https://v2.sherpa.ac.uk/cgi/retrieve_by_id?item-type=publication&api-key=B606E5B4-1D49-11EB-8371-EBE6B22EDBEE&format=Json&identifier=' + issn[0]['value'] 
                f = urllib.request.urlopen(url)
                data = json.loads(f.read().decode())



                
                if data['items'] != []:
                    #print("not empty")

                    for j in data['items']:    
                        for p in j['publisher_policy']:       
                            for c in p['permitted_oa']:  
                                condlist = []
                                if ['conditions']: # get ALL conditions make this a list then append as needed
                                    #print(type(['conditions']))
                                    for item in ['conditions']:
                                        conditions = c.get(item, None)
                                        condlist.append(conditions)
                                        #print(conditions)

                                        #make condlist a dictionary with article version as the key and condition list the value.
                                else:
                                    continue
                                artVersion = c.get(['article_version'][0], None)
                                #print(artVersion)

                                if artVersion[0] in artcondict:
                                    artcondict[artVersion[0]] = [artcondict[artVersion[0]], condlist]

                                else: 
                                    artcondict[artVersion[0]] = condlist
                            #condlist.append("article_version: " + artVersion[0] + " conditions: " + conditions)
                            #print(condlist)
            
                        for p in j['issns']:
                            issn = p['issn']
                        for p in j['title']:
                            title = p['title']
                        for p in j['publishers']:
                            for pn in p['publisher']['name']:
                                pubname = (pn['name'])
                                
                    srdict['journalInfo'] = "%s | %s | %s" % (title, issn, pubname)
                    #print(srdict)
                    cv_dict['SHERPA-RoMEOconditions'] = artcondict
                    cv_dict['SHERPA-RoMEOversion'] = srdict['journalInfo']

                else:
                    srdict['journalInfo'] = 'Journal not Found' 

                #print(condlist)
                #print(issn)
                #print(title)
                #print(pubname)


                #cv_dict['SHERPA-RoMEO'] = url

        else:
            break
        #writer.writerow(cv_dict)   
    return(cv_dict)

***
This is the main part of the notebook
    - Opens the input file
    - Writes out the field names (top row of the tab-delimited output file)
    - Calls the parsing function (in the cell above)
    - Writes out each row of data in the output file
***

In [15]:

with open(inputFile, "r", newline="\n",  encoding='utf-8') as input_file, open(outputFile, "w", newline="\n", encoding='utf-8') as output_file:
    #rline = input_file.readline()
    fieldnames = ['Score', 'Filename', 'Citation', 'Author', 'ORCID', 'Date', 'Title', 'OtherTitle', 'Type', 'Peer_Reviewed', 'Source', 'Editor', 'Publisher', 'Journal_Volume', 'Journal_Issue', 'Start_page', 'End_page', 'Chapter_Number_dc.relation.haspart', 'DOI', 'identifier.uri', 'HathiTrust', 'Fulcrum', 'ISBN', 'ISSN', 'SHERPA-RoMEOversion', 'SHERPA-RoMEOconditions', 'Date_to_lift_embargo_dc.date.available', 'Request_copy_name', 'Request_copy_email', 'CC_License_dc.rights.none', 'rights.uri', 'Subject', 'Description', 'hlbtoplevel', 'hlbsecondlevel', 'Affiliationum', 'Affiliationumcampus', 'Notes', 'josejose']
    writer = csv.DictWriter(output_file, delimiter = '\t', fieldnames=fieldnames)
    writer.writeheader()
    #writer.writerow(['Citation', 'Author', 'Year', 'Month-Day', 'Title', 'Type', 'Editor', 'Publisher', 'JournalName', 'JournalVolume', 'JournalIssue', 'Startpage', 'Endpage', 'DOI', 'issn', 'SHERPA-RoMEO', 'Notes']) # replace with real field names
    row = 0
    for line in input_file: # walks through the input CV text file line by line
        #print(line)
        # check for "" in citation use \" to get around this issue or check UTF-8
        x = cr.works(query = line, limit = 1) # Actually gets the results from CrossRef and stores them in 'x'
        #print(x['message'])
        cvd = cv_parse(line, x) # call function to parse citations and the citation and the variable of 'x' 
        #print(cvd)   
        writer.writerow(cvd) # results of the parsing function are printed each result is one row
        row = row + 1 # rows are counted so you know the progress of the parser
        print(row)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52


***
(Optional)
Run this only if you want to quickly check your output file.
***

In [17]:
#import sys
#!{sys.executable} -m pip install pandas # this does not need to be re-run if already installed in first cell.
import pandas
#f = open('output_file', 'w')
df = pandas.read_csv(outputFile, sep='\t')
pandas.set_option('display.max_rows', 500)
pandas.set_option('display.max_colwidth', None)
pandas.set_option('display.max_columns', 500)
display(df)

Unnamed: 0,Score,Filename,Citation,Author,ORCID,Date,Title,OtherTitle,Type,Peer_Reviewed,Source,Editor,Publisher,Journal_Volume,Journal_Issue,Start_page,End_page,Chapter_Number_dc.relation.haspart,DOI,identifier.uri,HathiTrust,Fulcrum,ISBN,ISSN,SHERPA-RoMEOversion,SHERPA-RoMEOconditions,Date_to_lift_embargo_dc.date.available,Request_copy_name,Request_copy_email,CC_License_dc.rights.none,rights.uri,Subject,Description,hlbtoplevel,hlbsecondlevel,Affiliationum,Affiliationumcampus,Notes,josejose
0,95.30456,,"Kaplan, S. Kaplan, R. and E. L. Walker. (1960) Individual differences in learning as a function of shock level. Journal of Experimental Psychology, 60, 404 407.","Kaplan, Rachel; Kaplan, Stephen; Walker, Edward L.",,1960,Individual differences in learning as a function of shock level,,journal-article,,Journal of Experimental Psychology,,American Psychological Association (APA),60.0,6.0,404,407.0,,10.1037/h0044325,,,,,"0022-1015, print",Journal of Experimental Psychology | 0022-1015 | American Psychological Association,"{'submitted': [['Must be labeled with date and accompanied with statement that paper has not (yet) been published', 'Set statement must accompany deposit', 'Must link to publisher version with DOI (when available)']], 'accepted': [[['After acceptance', 'Publisher copyright and source must be acknowledged', 'Must link to publisher version with DOI', 'Set statement must accompany deposit']], [['Publisher submits on authors behalf, upon completion of submission form', 'Publisher copyright and source must be acknowledged', 'Must link to publisher version with DOI']]], 'published': [['Published source must be acknowledged with citation']]}",,,,,,,,,,,,,
1,31.896053,,"S. Kaplan. (1962) Arousal and perseveration: A theoretical model. Doctoral dissertation, University of Michigan.",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,97.690254,,"S. Kaplan and R. Kaplan. (1962) Skin resistance recording in the unrestrained rat. Science, 138, 1403 1404.",NotAMatch,,1962-12-28,Skin Resistance Recording in the Unrestrained Rat,,journal-article,,Science,,American Association for the Advancement of Science (AAAS),138.0,3548.0,1403,1404.0,,10.1126/science.138.3548.1403,,,,,"0036-8075, print",Science | 1095-9203 | American Association for the Advancement of Science,"{'submitted': [['May be considered prior publication, contact journal editor for clarification']], 'accepted': [[['Published source must be acknowledged with DOI', 'Set statement must accompany post-print (see policy)', 'Must link to publisher version', 'Can not be deposited until publication by AAAS']], [['Must state on submission Funding agency requirements', 'Published source must be acknowledged with DOI', 'Set statement must accompany post-print (see policy)', 'Must link to publisher version']]]}",,,,,,,,,,,,,
3,108.35461,,"L. J. Kleinsmith and S. Kaplan. (1963) Paired associate learning as a function of arousal and interpolated interval. Journal of Experimental Psychology, 65, 190 193.","Kleinsmith, Lewis J.; Kaplan, Stephen",,1963,Paired-associate learning as a function of arousal and interpolated interval,,journal-article,,Journal of Experimental Psychology,,American Psychological Association (APA),65.0,2.0,190,193.0,,10.1037/h0040288,,,,,"0022-1015, print",Journal of Experimental Psychology | 0022-1015 | American Psychological Association,"{'submitted': [['Must be labeled with date and accompanied with statement that paper has not (yet) been published', 'Set statement must accompany deposit', 'Must link to publisher version with DOI (when available)']], 'accepted': [[['After acceptance', 'Publisher copyright and source must be acknowledged', 'Must link to publisher version with DOI', 'Set statement must accompany deposit']], [['Publisher submits on authors behalf, upon completion of submission form', 'Publisher copyright and source must be acknowledged', 'Must link to publisher version with DOI']]], 'published': [['Published source must be acknowledged with citation']]}",,,,,,,,,,,,,
4,82.32727,,"L. J. Kleinsmith, S. Kaplan and R. D. Tarte. (1963) The relationship of arousal and short and long term verbal recall. Canadian Journal of Psychology, 17, 393 397.","Kleinsmith, Lewis J.; Kaplan, Stephen; Trate, Robert D.",,1963,The relationship of arousal to short- and long-term verbal recall,,journal-article,,Canadian Journal of Psychology/Revue canadienne de psychologie,,American Psychological Association (APA),17.0,4.0,393,397.0,,10.1037/h0083278,,,,,"0008-4255, print",,,,,,,,,,,,,,,
5,98.81151,,"S. Kaplan. (1963) Neither watered down nor tidied up. Review of Brown, et al.: New directions in psychology. Contemporary Psychology, 67, 124 126.",NotAMatch,,1963,Neither Watered Down Nor Tidied Up,,journal-article,,Contemporary Psychology: A Journal of Reviews,,Portico,8.0,11.0,"430, 432",,,10.1037/007093,,,,,"0010-7549, print",,,,,,,,,,,,,,,
6,125.192184,,"L. J. Kleinsmith and S. Kaplan. (1964) The interaction of arousal and recall interval in nonsense syllable paired associate learning. Journal of Experimental Psychology, 67, 124 126.","Kleinsmith, Lewis J.; Kaplan, Stephen",,1964,Interaction of arousal and recall interval in nonsense syllable paired-associate learning,,journal-article,,Journal of Experimental Psychology,,American Psychological Association (APA),67.0,2.0,124,126.0,,10.1037/h0045203,,,,,"0022-1015, print",Journal of Experimental Psychology | 0022-1015 | American Psychological Association,"{'submitted': [['Must be labeled with date and accompanied with statement that paper has not (yet) been published', 'Set statement must accompany deposit', 'Must link to publisher version with DOI (when available)']], 'accepted': [[['After acceptance', 'Publisher copyright and source must be acknowledged', 'Must link to publisher version with DOI', 'Set statement must accompany deposit']], [['Publisher submits on authors behalf, upon completion of submission form', 'Publisher copyright and source must be acknowledged', 'Must link to publisher version with DOI']]], 'published': [['Published source must be acknowledged with citation']]}",,,,,,,,,,,,,
7,61.92447,,"S. Kaplan. (1964) Searching behavior in undergraduates. Psychology in the Schools,",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
8,40.049854,,"1:403 405. (Reprinted in W. R. Hatch and A. L. Richards [Eds.] Approach to independent study. U.S. Department of HEW, New Dimensions in Higher Education #13, 1965.)",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
9,35.79404,,"S. Kaplan, S. L. Mann and R. Kaplan. (1964) Honors housing opportunity at the University of Michigan. Superior Student, 7, 14 .",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


To see individual citation results:

SHERPA-RoMEO API, just plug in an ISSN

In [18]:

import urllib.request, json 
import json
with urllib.request.urlopen("https://v2.sherpa.ac.uk/cgi/retrieve_by_id?item-type=publication&api-key=B606E5B4-1D49-11EB-8371-EBE6B22EDBEE&format=Json&identifier=0199-0039") as url:
    data = json.loads(url.read().decode())
    #print(data)
   
srdict = dict()
condlist = list()
artcondict = dict()

    
# Return: Journal title, ISSN, Publisher, and conditions
# ISSN: j['issns'][0]['issn']
# Journal title: j['title'][0]['title']
# Publisher: j['publishers'][0]['name'][0]['name']
# Conditions: j['publisher_policy'][0]['permitted_oa'][0]['conditions']

# write code for if issn not found
# update Sherpa Romeo code in above function

if data['items'] != []:
    #print("not empty")
    
    for j in data['items']:    
        for p in j['publisher_policy']:       
            for c in p['permitted_oa']:  
                condlist = []
                if ['conditions']: # get ALL conditions make this a list then append as needed
                    #print(type(['conditions']))
                    for item in ['conditions']:
                        conditions = c.get(item, None)
                        condlist.append(conditions)
                        #print(conditions)

                        #make condlist a dictionary with article version as the key and condition list the value.
                else:
                    continue
                artVersion = c.get(['article_version'][0], None)
                #print(artVersion)
                
                if artVersion[0] in artcondict:
                    artcondict[artVersion[0]] = [artcondict[artVersion[0]], condlist]
                    
                else: 
                    artcondict[artVersion[0]] = condlist
            #condlist.append("article_version: " + artVersion[0] + " conditions: " + conditions)
            #print(condlist)
            
        for p in j['issns']:
            issn = p['issn']
        for p in j['title']:
            title = p['title']
        for p in j['publishers']:
            for pn in p['publisher']['name']:
                pubname = (pn['name'])
                
else:
    print("no data")

print(artcondict)
#print(issn)
#print(title)
#print(pubname)

srdict['journalInfo'] = "%s | %s | %s" % (title, issn, pubname)
print(srdict)
#cv_dict['SHERPA-RoMEOconditions'] = condlist
#cv_dict['SHERPA-RoMEOversion'] = srdict['journalInfo']


{'submitted': [['Published source must be acknowledged', 'Must link to publisher version with DOI']], 'accepted': [[['Published source must be acknowledged', 'Must link to publisher version with DOI', 'Post-prints are subject to Springer Nature re-use terms', 'Set statement to accompany deposit (see policy)']], [['Published source must be acknowledged', 'Must link to publisher version with DOI', 'Post-prints are subject to Springer Nature re-use terms']]], 'published': [None]}
{'journalInfo': 'Population and Environment | 1573-7810 | Springer'}
