Experimenting with the CORE API
----

# Setup

In [1]:
from dotenv import load_dotenv
from os import getenv
import requests
import pandas as pd
import numpy as np
import json

# Load variables from a local .env file into the process environment and
# report whether load_dotenv() signalled success.
if load_dotenv():
    print("Loaded environment variables")
else:
    print("Failed to load environment variables")

# API key used to build the Authorization header of every CORE request.
CORE_API_KEY = getenv('CORE_API_KEY')
if not CORE_API_KEY:
    # Without a key the requests below would be sent as "Bearer None";
    # flag it here rather than leaving it to surface as an opaque API error.
    print("Warning: CORE_API_KEY is not set; CORE API requests will be sent without a valid key")

Loaded environment variables


[API reference](https://api.core.ac.uk/docs/v3)

In [38]:
def query_works(query_text, **query_kwargs):
    """Search CORE works and return the decoded JSON response.

    Sends a POST request to the CORE v3 ``search/works`` endpoint using the
    module-level ``CORE_API_KEY`` as a bearer token, and prints the remaining
    rate limit when the response reports one.

    Parameters
    ----------
    query_text : str
        Query string sent as the ``q`` field of the request body.
    **query_kwargs
        Extra fields merged into the request body (this notebook passes
        ``limit``).

    Returns
    -------
    The parsed JSON body of the response.

    Raises
    ------
    RuntimeError
        If the response status code is not 200.
    """
    url = 'https://api.core.ac.uk/v3/search/works'
    headers = {"Authorization": f"Bearer {CORE_API_KEY}"}
    query = {
        'q': query_text,
        **query_kwargs
    }
    res = requests.post(url, data=json.dumps(query), headers=headers)

    # The rate-limit header may be absent (for instance on some error
    # responses). Look it up defensively so a missing header cannot raise
    # KeyError and hide the status-code check below.
    remaining_header = res.headers.get('X-RateLimit-Remaining')
    if remaining_header is not None:
        remaining = int(remaining_header)
        print("Remaining limit: ", remaining, type(remaining))

    if res.status_code != 200:
        raise RuntimeError(f'API returns code {res.status_code}')
    return json.loads(res.text)

In [9]:
def query_works_json(query_text, **query_kwargs):
    """Search CORE works, letting ``requests`` encode and decode the JSON.

    The request body holds ``query_text`` under ``q`` plus any extra keyword
    arguments. Returns the decoded response body; raises ``RuntimeError`` when
    the status code is anything other than 200.
    """
    payload = {'q': query_text}
    payload.update(query_kwargs)

    response = requests.post(
        'https://api.core.ac.uk/v3/search/works',
        json=payload,
        headers={"Authorization": f"Bearer {CORE_API_KEY}"},
    )
    if response.status_code == 200:
        return response.json()
    raise RuntimeError(f'API returns code {response.status_code}')

# Checking basic syntax

In [4]:
# Query for sociology / anthropology / political science terms, restricted to
# yearPublished 2022 and to records that have an abstract field.
raw = query_works(
    query_text='(sociology OR anthropology OR (political science)) AND yearPublished:2022 AND _exists_:abstract',
    limit=250,
)
# Top-level keys of the response
print(raw.keys())

dict_keys(['totalHits', 'limit', 'offset', 'scrollId', 'results', 'tooks', 'esTook'])


In [5]:
# Total number of matches reported by the API, and the limit echoed in the response
print(f"{raw['totalHits']} {raw['limit']}")

110274 250


In [103]:
# Keep the list of returned work records and display the last one as a sample.
results = raw['results']
results[-1]

{'acceptedDate': None,
 'arxivId': None,
 'authors': [{'name': 'Magee, Caroline Alexandra'}],
 'citationCount': None,
 'contributors': [],
 'outputs': ['https://api.core.ac.uk/v3/outputs/519836734'],
 'createdDate': '2022-06-10T08:06:13',
 'dataProviders': [{'id': 3615,
   'name': '',
   'url': 'https://api.core.ac.uk/v3/data-providers/3615',
   'logo': 'https://api.core.ac.uk/data-providers/3615/logo'}],
 'depositedDate': None,
 'abstract': 'This thesis applies social science research, historical and cultural analysis, and legislative review to understand the relationship between Texas’s school-based sexuality education policies and public health and, in so doing, lay the groundwork for policy change that will better promote sexual and reproductive health, health equity, and health justice in Texas. The thesis first situates Texas state-level sexuality education policy within a larger US political, historical, and legal context, highlighting the key roles of race, class, gender, and s

In [128]:
def _length_or_missing(text):
    """Return ``len(text)``, or ``None`` when the field itself is ``None``."""
    return None if text is None else len(text)


# One row per work: lengths of the text fields plus a few descriptive fields.
# Text fields are not guaranteed to be strings (another record in this
# notebook has 'abstract': None), so a missing field is recorded as a missing
# length instead of raising TypeError.
df = pd.DataFrame([
    {
        'title': result['title'],
        'len_fullText': _length_or_missing(result['fullText']),
        'len_abstract': _length_or_missing(result['abstract']),
        'fieldOfStudy': result['fieldOfStudy'],
        'documentType': result['documentType'],
        'publisher': result['publisher']
    } for result in results])

# documentType doesn't have much useful information in these results
print(df.documentType.value_counts(dropna=False))

# many of these documents seem to be theses, so they are quite long: median is ~75,000
# but shortest are <500
print(df.len_fullText.describe())
print(df.len_abstract.describe())

documentType
    248
Name: count, dtype: int64
count    2.480000e+02
mean     2.456155e+05
std      3.316800e+05
min      4.260000e+02
25%      4.823925e+04
50%      7.610800e+04
75%      3.608900e+05
max      1.567517e+06
Name: len_fullText, dtype: float64
count     248.000000
mean     1274.326613
std       888.760695
min        12.000000
25%       903.250000
50%      1196.000000
75%      1487.250000
max      9824.000000
Name: len_abstract, dtype: float64


In [123]:
# The abstract could be located in the full text in order to strip it out, but
# NB small differences between the two fields can prevent an exact match -- in
# this example, line breaks included in the full text. Compare the whole
# abstract with just its first and last 100 characters.
sample_text, sample_abstract = results[-1]['fullText'], results[-1]['abstract']
for fragment in (sample_abstract, sample_abstract[:100], sample_abstract[-100:]):
    print(sample_text.find(fragment))

-1
1327
3542


In [124]:
# One way to select documents may be to filter by abstract length: below ~300
# characters is probably not interesting.
# The long ones are probably books and theses, which are also interesting --
# but multi-thousand-word texts are perhaps still not ideal.
df.sort_values(by='len_fullText')

Unnamed: 0,title,len_fullText,len_abstract,fieldOfStudy,documentType,publisher
39,Dynamics of Difference: Inequality and Transfo...,426,1103,,,Routeldge
116,Private security in Africa,2830,909,,,'Bloomsbury Academic'
81,Book Review: \u3cem\u3eFat Matters: From Socio...,5837,140,,,Open PRAIRIE: Open Public Research Access Inst...
2,Dr. Kevin C. Casey Motl named dean of Ouachi...,6038,232,,,Scholarly Commons @ Ouachita
1,Entangled Legalities Beyond the State,6391,327,,,'Cambridge University Press (CUP)'
...,...,...,...,...,...,...
183,Social and Political Issues on Sustainable Dev...,1213316,979,,,'Informa UK Limited'
221,Dynamics in the History of Religions between A...,1278558,424,,,'Brill'
238,Irony as a Method of Neoclassical Sociology : ...,1313335,9824,,,Åbo Akademis förlag - Åbo Akademi University P...
135,La sociologia di Luciano Cavalli,1442746,2086,,,'Firenze University Press'


In [125]:
# How many of the sampled works come from each publisher
df['publisher'].value_counts()

publisher
'Informa UK Limited'                                29
                                                    19
'Lifescience Global'                                14
'Springer Science and Business Media LLC'           12
'SAGE Publications'                                  8
                                                    ..
Routeldge                                            1
'School of Advanced Study'                           1
Encompass                                            1
London School of Economics and Political Science     1
UNI ScholarWorks                                     1
Name: count, Length: 119, dtype: int64

In [126]:
# There are some odd results: e.g. this record is a press release, not a
# research output.
results[2]

{'acceptedDate': None,
 'arxivId': None,
 'authors': [{'name': 'Lawson, Felley'}, {'name': 'Ouachita News Bureau,'}],
 'citationCount': None,
 'contributors': [],
 'outputs': ['https://api.core.ac.uk/v3/outputs/534809425'],
 'createdDate': '2022-10-10T21:50:37',
 'dataProviders': [{'id': 3629,
   'name': '',
   'url': 'https://api.core.ac.uk/v3/data-providers/3629',
   'logo': 'https://api.core.ac.uk/data-providers/3629/logo'}],
 'depositedDate': None,
 'abstract': 'Dr. Kevin C. “Casey” Motl has been named dean of the William H. Sutton School of Social Sciences at Ouachita Baptist University. The Sutton School houses Ouachita’s departments of History, Political Science, Psychology and Sociology',
 'documentType': '',
 'doi': None,
 'downloadUrl': 'https://core.ac.uk/download/534809425.pdf',
 'fieldOfStudy': None,
 'fullText': 'Ouachita Baptist University Scholarly Commons @ Ouachita Press Releases Office of Communication 6-27-2022 Dr. Kevin C. "Casey" Motl named dean of Ouachita\'s Sut

In [131]:
# A book review: the abstract is only the citation of the reviewed book, so it
# is not actually informative.
results[81]

{'acceptedDate': None,
 'arxivId': None,
 'authors': [{'name': 'Hansen, Brittney'}],
 'citationCount': None,
 'contributors': [],
 'outputs': ['https://api.core.ac.uk/v3/outputs/519731483'],
 'createdDate': '2022-06-08T17:37:00',
 'dataProviders': [{'id': 2573,
   'name': '',
   'url': 'https://api.core.ac.uk/v3/data-providers/2573',
   'logo': 'https://api.core.ac.uk/data-providers/2573/logo'}],
 'depositedDate': None,
 'abstract': 'Tsichlia, Gina and Alexandra M. Johnstone (Eds.). Fat Matters: From Sociology to Science. M & K Update Ltd., 2010, 144 pp., $45.00 paperback',
 'documentType': '',
 'doi': None,
 'downloadUrl': 'https://core.ac.uk/download/519731483.pdf',
 'fieldOfStudy': None,
 'fullText': 'Great Plains Sociologist Volume 21 Issue 1 Article 4 2010 Book Review: Fat Matters: From Sociology to Science Edited by Gina Tsichlia and Alexandra M. Johnstone Brittney Hansen University of North Dakota Follow this and additional works at: https://openprairie.sdstate.edu/greatplainsso

In [132]:
# Results whose full text is too short often have problems: e.g. the full text
# of this record is just garbage.
results[116]

{'acceptedDate': None,
 'arxivId': None,
 'authors': [],
 'citationCount': None,
 'contributors': [],
 'outputs': ['https://api.core.ac.uk/v3/outputs/548537633'],
 'createdDate': '2022-12-10T02:00:32',
 'dataProviders': [{'id': 4636,
   'name': '',
   'url': 'https://api.core.ac.uk/v3/data-providers/4636',
   'logo': 'https://api.core.ac.uk/data-providers/4636/logo'}],
 'depositedDate': None,
 'abstract': 'Across Africa, growing economic inequality, instability and urbanization have led to the rapid spread of private security providers. While these PSPs have already had a significant impact on African societies, their impact has so far received little in the way of comprehensive analysis. Drawing on a wide range of disciplinary approaches, and encompassing anthropology, sociology and political science, Private Security in Africa offers unique insight into the lives and experiences of security providers and those affected by them, as well as into the fragile state context which has allo

# Trying to limit to journal articles

In [7]:
# Try to restrict results to journal articles by requiring a journals field
# or an accepted date.
articles_raw = query_works(
    query_text="_exists_:journals OR _exists_:acceptedDate",
    limit=500,
)
articles_results = articles_raw['results']
print(articles_raw['totalHits'])

14383786


In [13]:
# Pull the fields of interest out of every article, one list per field.
journals_list = [article['journals'] for article in articles_results]
accepted_date_list = [article['acceptedDate'] for article in articles_results]
doi_list = [article['doi'] for article in articles_results]

In [39]:
# How many journal entries does each work carry?
journal_counts = pd.Series([len(entry) for entry in journals_list]).value_counts()
print('length of journals list:', journal_counts, sep='\n')

# Which keys appear in the first journal entry of each work that has one?
first_journal_keys = pd.Series(
    [entry[0].keys() for entry in journals_list if len(entry) > 0]
).value_counts()
print('\nkeys of journalist list:', first_journal_keys, sep='\n')

# First journal entry per work; works with no journal entry get an all-None row.
journals_df = pd.DataFrame([
    entry[0] if len(entry) > 0 else {'title': None, 'identifiers': None}
    for entry in journals_list
])
missing_per_column = journals_df.isna().sum()
print('\nmissing values', missing_per_column, sep='\n')

length of journals list:
1    481
0     15
Name: count, dtype: int64

keys of journalist list:
(title, identifiers)    481
Name: count, dtype: int64

missing values
title          394
identifiers     15
dtype: int64


In [46]:
# Summary statistics of the acceptedDate values collected above
accepted_dates = pd.Series(accepted_date_list)
print('Accepted date\n')
print(accepted_dates.describe())

Accepted date

count                     496
unique                    396
top       2010-01-01T00:00:00
freq                        7
dtype: object


In [55]:
# Show the first article whose journal identifiers are missing.
missing_identifier_positions = np.argwhere(journals_df.identifiers.isna())
first_missing_position = missing_identifier_positions[0, 0]
articles_results[first_missing_position]

{'acceptedDate': '2008-02-07T00:00:00',
 'arxivId': None,
 'authors': [{'name': 'Andrews, D. L.'}, {'name': 'Bradshaw, D.S.'}],
 'citationCount': None,
 'contributors': ['Powers, PE'],
 'outputs': ['https://api.core.ac.uk/v3/outputs/193164484',
  'https://api.core.ac.uk/v3/outputs/2783113'],
 'createdDate': '2012-05-02T18:04:13',
 'dataProviders': [{'id': 4786,
   'name': '',
   'url': 'https://api.core.ac.uk/v3/data-providers/4786',
   'logo': 'https://api.core.ac.uk/data-providers/4786/logo'},
  {'id': 124,
   'name': '',
   'url': 'https://api.core.ac.uk/v3/data-providers/124',
   'logo': 'https://api.core.ac.uk/data-providers/124/logo'}],
 'depositedDate': '2008-02-07T00:00:00',
 'abstract': 'The possibility of influencing resonance energy transfer through the input of off-resonant pulses of laser radiation is the subject of recent research. Attention is now focused on systems in which resonance energy transfer is designedly precluded by geometric configuration. Here, through an op

# Limiting to particular journals

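DOI prefixes do not always uniquely identify journals. For example, AJS (ISSN 0002-9602) uses the prefix `10.1086`, which also appears on articles from other journals: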

In [71]:
# Fetch the Crossref journal record for ISSN 0002-9602 and decode its JSON body.
ajs_metadata_response = requests.get('https://api.crossref.org/journals/0002-9602/')
ajs_metadata = json.loads(ajs_metadata_response.text)

In [72]:
# Only the last expression of a cell is displayed, so the top-level keys on
# the first line are evaluated but not shown; the output lists the keys of
# the 'message' payload.
ajs_metadata.keys()
ajs_metadata['message'].keys()

dict_keys(['last-status-check-time', 'counts', 'breakdowns', 'publisher', 'coverage', 'title', 'subjects', 'coverage-type', 'flags', 'ISSN', 'issn-type'])

In [73]:
# Fetch works from Crossref for the same ISSN (rows=10) and decode the JSON body.
ajs_works_response = requests.get('https://api.crossref.org/journals/0002-9602/works/?rows=10')
ajs_works = json.loads(ajs_works_response.text)

In [83]:
# Step-by-step exploration of the response structure. Only the last
# expression of a cell is displayed, so of these four lines just the list of
# DOIs appears in the output.
ajs_works.keys()
ajs_works['message'].keys()
ajs_works['message']['items'][0]['DOI']
[item['DOI'] for item in ajs_works['message']['items']]

['10.1086/229722',
 '10.1086/224050',
 '10.1086/211605',
 '10.1086/227449',
 '10.1086/344856',
 '10.1086/222711',
 '10.1086/218764',
 '10.1086/228842',
 '10.1086/217470',
 '10.1086/600025']

In [5]:
# Search CORE using the bare DOI prefix as the query text.
ajs_raw = query_works('10.1086', limit=100)

In [6]:
ajs_results = ajs_raw['results']
print('hits: ', ajs_raw['totalHits'])

# Share of the returned sample whose DOI starts with the prefix
# (records without a DOI count as non-matching).
doi_series = pd.Series([entry['doi'] for entry in ajs_results])
prefix_match = doi_series.str.startswith('10.1086', na=False)
print('share of sample where doi prefix matches: ', prefix_match.mean())

# (DOI, journals) pairs for the matching records, to see which journals
# share the prefix.
[
    (entry['doi'], entry['journals'])
    for entry in ajs_results
    if entry['doi'] and entry['doi'].startswith('10.1086')
]

hits:  250330
share of sample where doi prefix matches:  0.4444444444444444


[('10.1086/506971',
  [{'title': 'The American Naturalist',
    'identifiers': ['issn:0003-0147', '0003-0147']}]),
 ('10.1086/379866', [{'title': None, 'identifiers': ['0021-9398']}]),
 ('10.1086/202005', [{'title': None, 'identifiers': ['0011-3204']}]),
 ('10.1086/427638', [{'title': None, 'identifiers': ['0021-9398']}]),
 ('10.1086/660830', []),
 ('10.1086/202312', []),
 ('10.1086/378137', []),
 ('10.1086/286062', []),
 ('10.1086/670161', [{'title': None, 'identifiers': ['0026-8232']}]),
 ('10.1086/507497.',
  [{'title': 'American Art', 'identifiers': ['1073-9300', 'issn:1073-9300']}]),
 ('10.1086/378551', []),
 ('10.1086/669393.', []),
 ('10.1086/687213', []),
 ('10.1086/696619',
  [{'title': 'The Journal of Politics',
    'identifiers': ['1468-2508', 'issn:1468-2508']}]),
 ('10.1086/644533', []),
 ('10.1086/503776', [{'title': None, 'identifiers': ['0022-1899']}]),
 ('10.1086/676525', [{'title': None, 'identifiers': ['0003-0147']}]),
 ('10.1086/656472', [{'title': None, 'identifier

In [10]:
# Search by ISSN (the two quoted identifiers) together with an acceptedDate
# requirement.
# NOTE(review): there is no explicit AND/OR between the ISSN group and the
# _exists_ clause, so the API's default operator applies -- confirm that this
# is the intended combination.
issn_query = query_works(
    query_text='("0002-9602" OR "1537-5390") _exists_:acceptedDate',
    limit=100,
)

In [17]:
issn_results = issn_query['results']
print('Raw hits: ', issn_query['totalHits'])

# Share of the returned records that have no DOI
issn_dois = pd.Series([entry['doi'] for entry in issn_results])
print('Share missing DOI: ', issn_dois.isna().mean())

# Journal entry of each record that has exactly one; every other record gets
# an all-None placeholder row.
issn_journals = pd.DataFrame([
    entry['journals'][0] if len(entry['journals']) == 1
    else {'title': None, 'identifiers': None}
    for entry in issn_results
])
issn_journals['identifiers'].value_counts()

Raw hits:  83
Share missing DOI:  0.0


identifiers
[0002-9602]                                               55
[0002-9602, issn:0002-9602]                               11
[0002-9602, 1537-5390, issn:1537-5390, issn:0002-9602]     5
[1699-2407]                                                1
[2302-7525]                                                1
[issn:0161-956X, 0161-956x]                                1
[1079-0268]                                                1
[issn:1441-3523, 1441-3523]                                1
[0047-2891]                                                1
[0718-8358, issn:0718-8358]                                1
[1556-5068]                                                1
[0001-4273]                                                1
[0100-1965]                                                1
Name: count, dtype: int64

In [20]:
# Display one full record from the ISSN search as a sample.
issn_results[1]

{'acceptedDate': '2016-07-01T00:00:00',
 'arxivId': None,
 'authors': [{'name': 'Holland, J'}],
 'citationCount': None,
 'contributors': [''],
 'outputs': ['https://api.core.ac.uk/v3/outputs/209830079',
  'https://api.core.ac.uk/v3/outputs/375509433',
  'https://api.core.ac.uk/v3/outputs/46664736'],
 'createdDate': '2016-11-07T23:24:18',
 'dataProviders': [{'id': 4786,
   'name': '',
   'url': 'https://api.core.ac.uk/v3/data-providers/4786',
   'logo': 'https://api.core.ac.uk/data-providers/4786/logo'},
  {'id': 18201,
   'name': '',
   'url': 'https://api.core.ac.uk/v3/data-providers/18201',
   'logo': 'https://api.core.ac.uk/data-providers/18201/logo'},
  {'id': 140,
   'name': '',
   'url': 'https://api.core.ac.uk/v3/data-providers/140',
   'logo': 'https://api.core.ac.uk/data-providers/140/logo'}],
 'depositedDate': '2016-07-01T00:00:00',
 'abstract': None,
 'documentType': 'research',
 'doi': '10.1086/686047',
 'downloadUrl': 'https://core.ac.uk/download/46664736.pdf',
 'fieldOfSt

In [53]:
# One row per ISSN-search result. 'issn' is the string form of the first
# journal's identifier list with the surrounding brackets stripped, or None
# when the record has no journal entry.
issn_df = pd.DataFrame([
    dict(
        issn=(
            None if len(entry['journals']) == 0
            else str(entry['journals'][0]['identifiers'])[1:-1]
        ),
        fullText=entry['fullText'],
        references=entry['references'],
        abstract=entry['abstract'],
        accepted=entry['acceptedDate'],
    )
    for entry in issn_results
])

In [57]:
# Rows whose identifier string mentions the ISSN (rows without identifiers
# count as non-matching).
matches_ajs = issn_df['issn'].str.contains('0002-9602', na=False)
print(matches_ajs.value_counts())

# Among the matching rows, how often does the ISSN also occur in the full text?
matched_full_text = issn_df[matches_ajs]['fullText']
print("share with ISSN in full text: ", matched_full_text.str.contains('0002-9602').mean())

# Matching rows that also have an abstract
issn_df[issn_df['abstract'].notna() & matches_ajs]

issn
True     71
False    11
Name: count, dtype: int64
share with ISSN in full text:  1.0


Unnamed: 0,issn,fullText,references,abstract,accepted
2,'0002-9602',AJS Volume 111 Number 3 (November 2005): 797–8...,"[{'id': 18769633, 'title': 'A Longitudinal Ana...","In Suicide, Durkheim found that involvement in...",2006-01-25T00:00:00
3,"'0002-9602', '1537-5390', 'issn:1537-5390', 'i...",The Future of Weak Ties1\n“The Strength of Wea...,[],“The Strength of Weak Ties” (Granovetter 1973)...,2016-05-12T00:00:00
4,'0002-9602',Early Adolescent Self-Esteem as a Function of ...,[],Self-esteem is postulated to be a function of ...,2002-07-26T00:00:00
6,'0002-9602',Gifts as Economic Signals and Social Symbols\n...,"[{'id': 1451102, 'title': 'A Theory of Primiti...","Gift-giving has often puzzled economists, espe...",2008-01-04T00:00:00
7,'0002-9602',Cornell University ILR School \nDigitalCommons...,[],This article develops a theory that explains h...,2002-09-30T00:00:00
9,'0002-9602',How cultural capital emerged in Gilded Age Ame...,[],This article uses a new database of subscriber...,2018-05-01T00:00:00
10,'0002-9602',1412 AJS Volume 110 Number 5 (March 2005): 141...,"[{'id': 18769441, 'title': 'A Model of Destina...",Immigrants’ destination-language proficiency h...,2005-05-27T00:00:00
11,"'0002-9602', 'issn:0002-9602'",Durham Research Online\nDeposited in DRO:\n23 ...,[],This article examines the effects of crowding ...,2001-03-01T00:00:00
12,'0002-9602',Boston University\nOpenBU http://open.bu.edu\n...,[],Literature on the nonprofit sector focuses on ...,2007-03-29T00:00:00
13,"'0002-9602', 'issn:0002-9602'",Durham Research Online\nDeposited in DRO:\n23 ...,[],"Organizational theories, especially ecological...",2001-01-01T00:00:00


In [66]:
# Ask for a large page of records that have an accepted date, full text and
# an abstract. The scroll / offset request fields are not passed here.
test = query_works(
    query_text='_exists_:acceptedDate _exists_:fullText _exists_:abstract',
    limit=3000,
)
# Compare the reported total with the number of records actually returned
# (the recorded run returned fewer records than the requested limit).
print(test['totalHits'])
print(len(test['results']))

Remaining limit:  1 <class 'int'>
1894267
2759


In [71]:
# Check that the 'journals' field of the first record is a plain list
type(test['results'][0]['journals']) is list

True