## 1. Importing packages, modules and creating an access token for the API

In [None]:
# Import the necessary packages and modules
import requests
from bs4 import BeautifulSoup
import csv
from datetime import date, timedelta
import re
import pandas as pd
from tqdm.notebook import tqdm      # To track progress of a long for-loop when executing a notebook cell

Communicating with the Zenodo REST API requires an individual secret access token which can be obtained free of charge after registering on Zenodo.

Creating a personal access token
- [Register](https://zenodo.org/signup) for a Zenodo account if you don’t already have one.
- Go to your [Applications](https://zenodo.org/account/settings/applications/), to [create a new token](https://zenodo.org/account/settings/applications/tokens/new/).

In order to reproduce the results, please save your access token in a file called 'zenodo-access-token.txt' in the same folder as this notebook. This file is read in the cell below.

In [None]:
# Read your individual secret Zenodo access token from file.
# The `with` block closes the file even if reading fails, and strip() removes
# surrounding whitespace (e.g. the trailing newline many editors append), which
# would otherwise be sent to the API as part of the token.
with open('zenodo-access-token.txt', 'r') as text_file:
    ACCESS_TOKEN = text_file.read().strip()

## 2. Testing the Zenodo query and the received response

This section tests the Zenodo response on one date before we start building a dataset for a date range. These cells are commented out. Please uncomment if you wish to experiment with your own queries.



In [None]:
# # Test the request on one dated query only
# hum_ss = '(humanit OR architectur OR environment OR geograph OR achaelog OR \
#         economic OR econometric OR business OR management OR law OR politic OR \
#         social OR sociolog OR anthropolog OR development OR education OR sport OR leisure \
#         OR touris OR language OR linguistic OR history OR classics OR philosoph \
#         OR theolog OR religio OR art OR design OR music OR drama OR dance OR film \
#         OR screen OR communicat OR cultur OR media OR library)'
# publication_date = 'publication_date:2022-05-29'
# resource_type = 'resource_type.type:dataset'
# access_right = 'access_right:open'
# query = hum_ss + ' AND ' + publication_date + ' AND ' + resource_type + ' AND ' + access_right
# r = requests.get('https://zenodo.org/api/records',
#                         params={'q': query,
#                                 'status': 'published',
#                                 'sort': '-mostrecent',
#                                 'size': 500,
#                                 'access_token': ACCESS_TOKEN})
# d = r.json()            # The output is a dictionary
# print(type(r.json()))
# r.json()

In [None]:
# # Extract the number of hits if the search has returned any
# not_empty = d['aggregations']['access_right']['buckets']
# if not not_empty:
#     print('List is empty')
# else:    
#     doc_count = d['aggregations']['access_right']['buckets'][0]['doc_count']
#     print(doc_count)

In [None]:
# # Build a list of data for hits on that date
# hits = []
# for i in range(doc_count):
#     doi = d['hits']['hits'][i]['links']['doi']
#     html = d['hits']['hits'][i]['links']['html']
#     publication_date = d['hits']['hits'][i]['metadata']['publication_date']
#     downloads = d['hits']['hits'][i]['stats']['downloads']
#     views = d['hits']['hits'][i]['stats']['views']
#     hit = [doi, html, publication_date, downloads, views]
#     hits.append(hit)
# print(hits[0])
# print(len(hits))

In [None]:
# # Convert dictionary to dataframe and check the output
# df = pd.DataFrame(hits, columns = ['doi', 'html', 'publication_date', 'downloads', 'views'])
# # Convert date string in 'publication_date' column to datetime.date, NOT timestamp data
# df['publication_date'] = pd.to_datetime(df['publication_date'], format='%Y-%m-%d').apply(lambda x: x.date())
# heute = date.today()        # Get today's date in datetime.date format
# # Add a column showing the number of days since publication
# df['days_since_publication'] = heute - df['publication_date']       
# print(df.head(5))
# df.info()                   # Note that data in 'publication_date' is in datetime.date

## 3. Building a dataset of Zenodo datasets in the Humanities and Social Sciences
### 3.1 Create a date range

In [None]:
# Now is the time to create a loop for a range of dates
# Create function for date range
def daterange(start_date, end_date):
    """Yield consecutive days from start_date (inclusive) to end_date (exclusive).

    Nothing is yielded when end_date is not later than start_date.
    """
    span_in_days = (end_date - start_date).days
    offset = 0
    while offset < span_in_days:
        yield start_date + timedelta(days=offset)
        offset += 1

# One 'publication_date:YYYY-MM-DD' filter in Zenodo search format per day of the range
start_date = date(2019, 9, 29)              # Inclusive start date - the date of the first JOHD paper
end_date = date(2022, 6, 5)                 # Exclusive end date - the cut-off date is 4 June 2022
date_list = [
    'publication_date:' + day.strftime("%Y-%m-%d")
    for day in daterange(start_date, end_date)
]

# The cut-off date, 4 June 2022, as a datetime.date object
cut_off_date = date(2022, 6, 4)

### 3.2 The Zenodo query

The Zenodo query parameter 'q' allows simple and phrase search. In the 'hum_ss' block we created a list of stemmed expressions corresponding to the [Units of Assessment](https://www.ref.ac.uk/panels/units-of-assessment/) in Panels C (Social Sciences) and D (Humanities) of the UK's Research Excellence Framework 2021 to restrict the disciplinary focus of the search. We added the expression 'humanit' for 'humanities'. We also removed the expression 'international' (corresponding to UoA 19 'Politics and International Studies'), which resulted in many hits outside the humanities and social sciences. The search is limited to datasets deposited on Zenodo and made publicly accessible ('open').

In [None]:
# Build one Zenodo search string per day by combining the disciplinary keywords
# with that day's date filter and the resource-type and access-right filters.
# NOTE(review): 'achaelog' looks like a typo for 'archaeolog'; the string is left
# unchanged here so that this edit does not alter the query itself.
hum_ss = '(humanit OR architectur OR environment OR geograph OR achaelog OR \
        economic OR econometric OR business OR management OR law OR politic OR \
        social OR sociolog OR anthropolog OR development OR education OR sport OR leisure \
        OR touris OR language OR linguistic OR history OR classics OR philosoph \
        OR theolog OR religio OR art OR design OR music OR drama OR dance OR film \
        OR screen OR communicat OR cultur OR media OR library)'
resource_type = 'resource_type.type:dataset'
access_right = 'access_right:open'
# The iteration variable is deliberately not called `date`: that would rebind the
# `date` class imported from datetime to a string, so re-running the date-range
# cell (which calls date(2019, 9, 29)) would fail with "'str' object is not callable".
query_list = [
    hum_ss + ' AND ' + date_filter + ' AND ' + resource_type + ' AND ' + access_right
    for date_filter in date_list
]

In [None]:
# Send every daily query to the Zenodo records endpoint and collect one row
# [doi, html, publication_date, downloads, views] per record, skipping any DOI
# that an earlier query has already returned.
hits = []
doi_list = []           # DOIs in the order they were first seen
seen_dois = set()       # Same DOIs as doi_list, kept as a set for constant-time membership tests
for query in tqdm(query_list):
    response = requests.get('https://zenodo.org/api/records',
                            params={'q': query,
                                    'status': 'published',
                                    'sort': '-mostrecent',
                                    'size': 500,
                                    'access_token': ACCESS_TOKEN})
    # Fail with an explicit HTTP error on a 4xx/5xx response instead of failing
    # later while parsing a body that is not a search result.
    response.raise_for_status()
    payload = response.json()           # Transform the response to a dictionary
    buckets = payload['aggregations']['access_right']['buckets']
    if not buckets:
        continue                        # No records matched this date
    doc_count = buckets[0]['doc_count']
    records = payload['hits']['hits']
    # doc_count is the number of matching records reported by the aggregation,
    # while `records` holds only what this response actually returned. Iterating
    # over `records` avoids the IndexError that indexing up to doc_count raises
    # when fewer records are returned; the shortfall is reported instead.
    if doc_count > len(records):
        print('Warning: only', len(records), 'of', doc_count,
              'records were returned for query:', query)
    for record in records:
        doi = record['links']['doi']
        if doi in seen_dois:
            continue                    # Already collected from an earlier query
        seen_dois.add(doi)
        doi_list.append(doi)
        hits.append([doi,
                     record['links']['html'],
                     record['metadata']['publication_date'],
                     record['stats']['downloads'],
                     record['stats']['views']])

In [None]:
# Turn the collected list of hit rows into a dataframe
column_names = ['doi', 'html', 'publication_date', 'downloads', 'views']
df = pd.DataFrame(hits, columns=column_names)

# Parse the 'publication_date' strings and store them as datetime.date objects, NOT timestamps
parsed_dates = pd.to_datetime(df['publication_date'], format='%Y-%m-%d')
df['publication_date'] = parsed_dates.apply(lambda stamp: stamp.date())

# Number of days between publication and the cut-off date
elapsed = cut_off_date - df['publication_date']
df['days_since_publication'] = elapsed.dt.days

# Check the output
print(df.head(5))           # Peek at the dataframe
df.info()                   # 'publication_date' now holds datetime.date objects

In [None]:
# Write the dataframe to disk twice: once as CSV and once as JSON
csv_path = 'zenodo_humss_datasets.csv'
json_path = 'zenodo_humss_datasets.json'
df.to_csv(csv_path)
df.to_json(json_path)