# Search for molecular dynamics datasets in Zenodo

In [1]:
from IPython.display import JSON
import pandas as pd
import requests

Create a token access here: <https://zenodo.org/account/settings/applications/tokens/new/>  
and store it in the file `access_token.txt`

In [2]:
ACCESS_TOKEN = open("access_token.txt", "r").read().strip()

Test token with basic query

In [3]:
r = requests.get('https://zenodo.org/api/deposit/depositions',
                 params={'access_token': ACCESS_TOKEN})
r.status_code

200

In [4]:
def search_zenodo(page=1, hits_per_page=10, year=2016):
    response = requests.get("https://zenodo.org/api/records",
                            params={"q": ("(title:'molecular dynamics' OR description:'molecular dynamics')"
                                         f" AND publication_date:[{year}-01-01 TO {year}-12-31]"
                                          " AND access_right:open"),
                                    "type": "dataset",
                                    "size": hits_per_page,
                                    "page": page,
                                    "status": "published",
                                    "access_token": ACCESS_TOKEN})
    return response.json()

### Test query

In [5]:
resp_json = search_zenodo(hits_per_page=10, year=2019)
JSON(resp_json)

<IPython.core.display.JSON object>

In [6]:
total_hits = resp_json["hits"]["total"]
print(f'Number of hits: {total_hits}')

Number of hits: 436


### Anatomy of a record in json

In [7]:
response = requests.get("https://zenodo.org/api/records/53887",
                        params={"access_token": ACCESS_TOKEN})
resp_json = response.json()
JSON(resp_json)

<IPython.core.display.JSON object>

In [8]:
def extract_records(response_json):
    records = []
    files = []
    for hit in response_json["hits"]["hits"]:
        record = {}
        record["id"] = hit["id"]
        record["conceptid"] = hit["conceptrecid"]
        record["date_created"] = hit["created"]
        record["date_updated"] = hit["updated"]
        record["title"] = hit["metadata"]["title"]
        record["description"] = hit["metadata"]["description"]
        record["access_right"] = hit["metadata"]["access_right"]
        if record["access_right"] != "open":
            continue
        record["license"] = hit["metadata"]["license"]["id"]
        records.append(record)
        for file_in in hit["files"]:
            file_dict = {"record_id": record["id"],
                         "name": file_in["key"],
                         "type": file_in["type"],
                         "size": file_in["size"]}
            files.append(file_dict)
    return records, files

## Search records

There is a strong limit of 10 000 hits per query.

On 2017-10-20, 12062 MD-related records have been published in Zenodo. One cannot retrieve them all.

In [9]:
zenodo_records = []
zenodo_files = []
max_hits_per_record = 10_000
max_hits_per_page = 100
for year in range(2010, 2022):
    resp_json = search_zenodo(hits_per_page=1, year=year)
    total_hits = resp_json["hits"]["total"]
    page_max = total_hits//max_hits_per_page + 1
    for page in range(1, page_max+1):
        resp_json = search_zenodo(page=page, hits_per_page=max_hits_per_page, year=year)
        records_tmp, files_tmp = extract_records(resp_json)
        zenodo_records += records_tmp
        zenodo_files += files_tmp
        print(f"year {year} -- page {page} / {page_max} ({len(records_tmp)})")
        if (page * max_hits_per_page >= max_hits_per_record):
            print("Max records per query reached!")
            break

year 2010 -- page 1 / 1 (1)
year 2011 -- page 1 / 1 (0)
year 2012 -- page 1 / 1 (3)
year 2013 -- page 1 / 1 (3)
year 2014 -- page 1 / 1 (22)
year 2015 -- page 1 / 1 (53)
year 2016 -- page 1 / 1 (69)
year 2017 -- page 1 / 123 (100)
year 2017 -- page 2 / 123 (100)
year 2017 -- page 3 / 123 (100)
year 2017 -- page 4 / 123 (100)
year 2017 -- page 5 / 123 (100)
year 2017 -- page 6 / 123 (100)
year 2017 -- page 7 / 123 (100)
year 2017 -- page 8 / 123 (100)
year 2017 -- page 9 / 123 (100)
year 2017 -- page 10 / 123 (100)
year 2017 -- page 11 / 123 (100)
year 2017 -- page 12 / 123 (100)
year 2017 -- page 13 / 123 (100)
year 2017 -- page 14 / 123 (100)
year 2017 -- page 15 / 123 (100)
year 2017 -- page 16 / 123 (100)
year 2017 -- page 17 / 123 (100)
year 2017 -- page 18 / 123 (100)
year 2017 -- page 19 / 123 (100)
year 2017 -- page 20 / 123 (100)
year 2017 -- page 21 / 123 (100)
year 2017 -- page 22 / 123 (100)
year 2017 -- page 23 / 123 (100)
year 2017 -- page 24 / 123 (100)
year 2017 -- page 

## Store data in csv

In [10]:
print(f"Number of Zenodo records found: {len(zenodo_records)}")
print(f"Number of files found: {len(zenodo_files)}")

Number of Zenodo records found: 12764
Number of files found: 61108


In [11]:
records_df = pd.DataFrame(zenodo_records).set_index("id")
records_df.to_csv("zenodo_records.csv")
print(records_df.shape)
records_df.head()

(12764, 7)


Unnamed: 0_level_0,conceptid,date_created,date_updated,title,description,access_right,license
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
3889686,3889685,2020-06-11T10:51:06.773153+00:00,2020-06-26T09:07:44.597779+00:00,Understanding ENSO dynamics through the explor...,<p>The palaeoclimate record shows that signifi...,open,CC-BY-4.0
46596,596093,2016-02-26T10:45:20+00:00,2020-01-24T19:24:50.388374+00:00,Modeling of the bacterial molecular chaperone ...,"<p>These scripts demonstrate the use of IMP, M...",open,LGPL-2.1
3631074,3631073,2020-01-30T08:28:12.103446+00:00,2020-01-30T19:20:47.062665+00:00,Towards deciphering dynamic changes and evolut...,<p><strong>Objectives :</strong></p>\n\n<p>Exa...,open,CC-BY-4.0
901814,901813,2017-09-21T08:14:31.591951+00:00,2020-01-24T19:23:29.719156+00:00,"Supplementary material 1 from: Gruber S, Hadda...",Giemsa-stained metaphases I. a. Aparasphenodon...,open,CC-BY-4.0
51635,633598,2016-05-22T21:27:58+00:00,2020-12-31T18:02:39.471208+00:00,Molecular dynamics simulation trajectory of a ...,<p><strong>System:&nbsp;</strong>DMPC (dimyris...,open,CC-BY-4.0


In [12]:
files_df = pd.DataFrame(zenodo_files).set_index("record_id")
files_df.to_csv("zenodo_files.csv")
print(files_df.shape)
files_df.head()

(61108, 3)


Unnamed: 0_level_0,name,type,size
record_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3889686,psl_0ka.nc,nc,172081168
3889686,psl_1ka.nc,nc,172081168
3889686,psl_2ka.nc,nc,172081168
3889686,psl_3ka.nc,nc,172081168
3889686,psl_4ka.nc,nc,172081168
