# Search for molecular dynamics datasets in Zenodo

In [1]:
from IPython.display import JSON
import pandas as pd
import requests

Create an access token here: <https://zenodo.org/account/settings/applications/tokens/new/>  
and store it in the file `access_token.txt`. This file is ignored by git.

In [2]:
# Read the personal Zenodo API token from the local file `access_token.txt`.
# The context manager guarantees the file handle is closed once the token is read;
# surrounding whitespace (e.g. a trailing newline) is stripped.
with open("access_token.txt", "r") as token_file:
    ACCESS_TOKEN = token_file.read().strip()

Test the token with a basic query.

In [3]:
# Query the depositions endpoint with the token; the cell shows the HTTP status code.
r = requests.get(
    "https://zenodo.org/api/deposit/depositions",
    params={"access_token": ACCESS_TOKEN},
)
r.status_code

200

In [4]:
def search_zenodo(page=1, hits_per_page=10, year=2016, timeout=30):
    """Search Zenodo for open molecular dynamics datasets published in a given year.

    The query matches records whose title or description contains both
    "molecular" and "dynamics", restricted to open-access datasets.

    Parameters
    ----------
    page : int
        Page of the result set to fetch (sent as the `page` parameter).
    hits_per_page : int
        Number of hits requested per page (sent as the `size` parameter).
    year : int
        Publication year; the query spans `year`-01-01 to `year`-12-31.
    timeout : float
        Seconds to wait for the server before `requests` gives up.

    Returns
    -------
    dict
        Decoded JSON body of the response.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    """
    query = (
        "(title:(+molecular +dynamics) OR description:(+molecular +dynamics))"
        f" AND publication_date:[{year}-01-01 TO {year}-12-31]"
        " AND access_right:open"
    )
    response = requests.get(
        "https://zenodo.org/api/records",
        params={
            "q": query,
            "type": "dataset",
            "size": hits_per_page,
            "page": page,
            "status": "published",
            "access_token": ACCESS_TOKEN,
        },
        timeout=timeout,
    )
    # Fail here with a clear HTTP error instead of a KeyError later on,
    # when the caller tries to read "hits" from an error payload.
    response.raise_for_status()
    return response.json()

### Test query

In [5]:
# Fetch up to 100 hits for 2017 and display the raw answer with IPython's JSON viewer.
resp_json = search_zenodo(year=2017, hits_per_page=100)
JSON(resp_json)

<IPython.core.display.JSON object>

In [6]:
# The number of matches for the query is read from hits -> total in the response.
hits_section = resp_json["hits"]
total_hits = hits_section["total"]
print(f'Number of hits: {total_hits}')

Number of hits: 25


### Anatomy of a record in JSON

In [7]:
# Download one specific record (id 53887) to inspect the layout of its JSON document.
response = requests.get(
    "https://zenodo.org/api/records/53887",
    params={"access_token": ACCESS_TOKEN},
)
resp_json = response.json()
JSON(resp_json)

<IPython.core.display.JSON object>

In [8]:
def extract_records(response_json):
    """Flatten a Zenodo search response into record and file dictionaries.

    Only hits whose `access_right` is "open" are kept; all others are skipped
    together with their files.

    Parameters
    ----------
    response_json : dict
        Decoded search response; hits are read from
        `response_json["hits"]["hits"]`.

    Returns
    -------
    tuple of (list of dict, list of dict)
        `records` has one dict per kept hit with the keys id, conceptid,
        date_created, date_updated, title, description, access_right and
        license. `files` has one dict per file of a kept hit with the keys
        record_id, name, type and size.
    """
    records = []
    files = []
    for hit in response_json["hits"]["hits"]:
        metadata = hit["metadata"]
        # Filter first so that non-open hits are never parsed any further.
        if metadata["access_right"] != "open":
            continue
        record = {
            "id": hit["id"],
            "conceptid": hit["conceptrecid"],
            "date_created": hit["created"],
            "date_updated": hit["updated"],
            "title": metadata["title"],
            # Description and license are treated as optional: a hit lacking
            # them yields None instead of aborting the whole harvest.
            "description": metadata.get("description"),
            "access_right": metadata["access_right"],
            "license": (metadata.get("license") or {}).get("id"),
        }
        records.append(record)
        # A hit without a file listing simply contributes no file rows.
        for file_in in hit.get("files") or []:
            files.append(
                {
                    "record_id": record["id"],
                    "name": file_in["key"],
                    "type": file_in["type"],
                    "size": file_in["size"],
                }
            )
    return records, files

## Search records

There is a hard limit of 10 000 hits per query.

In [9]:
# Harvest all matching records year by year, paging through each year's results.
zenodo_records = []
zenodo_files = []
max_hits_per_record = 10_000  # upper bound on hits reachable through a single query
max_hits_per_page = 100
for year in range(2010, 2022):
    # Cheap probe (a single hit) whose only purpose is to learn the total for this year.
    resp_json = search_zenodo(hits_per_page=1, year=year)
    total_hits = resp_json["hits"]["total"]
    # Ceiling division: the former `total // size + 1` asked for one extra, empty
    # page whenever the total was an exact multiple of the page size.
    # At least one page is always fetched so that every year produces a log line.
    page_max = max(1, -(-total_hits // max_hits_per_page))
    for page in range(1, page_max + 1):
        resp_json = search_zenodo(page=page, hits_per_page=max_hits_per_page, year=year)
        records_tmp, files_tmp = extract_records(resp_json)
        zenodo_records.extend(records_tmp)
        zenodo_files.extend(files_tmp)
        print(f"year {year} -- page {page} / {page_max} ({len(records_tmp)})")
        # Stop paging once the per-query ceiling has been consumed.
        if page * max_hits_per_page >= max_hits_per_record:
            print("Max hits per query reached!")
            break

year 2010 -- page 1 / 1 (0)
year 2011 -- page 1 / 1 (0)
year 2012 -- page 1 / 1 (0)
year 2013 -- page 1 / 1 (1)
year 2014 -- page 1 / 1 (3)
year 2015 -- page 1 / 1 (11)
year 2016 -- page 1 / 1 (18)
year 2017 -- page 1 / 1 (25)
year 2018 -- page 1 / 1 (33)
year 2019 -- page 1 / 1 (53)
year 2020 -- page 1 / 2 (100)
year 2020 -- page 2 / 2 (32)
year 2021 -- page 1 / 1 (28)


## Store data in csv

In [10]:
# Report how many records and files were harvested, one count per line.
print(
    f"Number of Zenodo records found: {len(zenodo_records)}",
    f"Number of files found: {len(zenodo_files)}",
    sep="\n",
)

Number of Zenodo records found: 304
Number of files found: 2857


In [11]:
# Build the records table indexed by record id, save it as CSV and preview it.
records_df = (
    pd.DataFrame(zenodo_records)
    .set_index("id")
)
records_df.to_csv("zenodo_records.csv")
print(records_df.shape)
records_df.head()

(304, 7)


Unnamed: 0_level_0,conceptid,date_created,date_updated,title,description,access_right,license
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
51635,633598,2016-05-22T21:27:58+00:00,2020-12-31T18:02:39.471208+00:00,Molecular dynamics simulation trajectory of a ...,<p><strong>System:&nbsp;</strong>DMPC (dimyris...,open,CC-BY-4.0
8431,642967,2015-05-07T17:26:58+00:00,2020-01-24T19:21:43.576503+00:00,Molecular dynamics derived side chain order pa...,<p>This dataset contains tab-delimited text fi...,open,CC0-1.0
13393,605455,2015-01-06T13:40:44+00:00,2020-01-24T19:24:42.558489+00:00,"POPC @ 298K, Model by Kukol",<p>Files&nbsp;required&nbsp;for a simulation o...,open,CC0-1.0
573033,800331,2017-05-09T06:05:34.703109+00:00,2020-01-24T19:23:25.053804+00:00,Long-Term Single Cell Analysis of S. pombe on ...,<p>Although <em>Schyzosaccharomyces pombe</em>...,open,CC-BY-4.0
20561,611511,2015-07-16T20:53:30+00:00,2020-01-24T19:22:24.254587+00:00,"Simulation trajectories for the article ""Molec...",<p>Simulation trajectories for the article &qu...,open,CC0-1.0


In [12]:
# Build the files table indexed by the owning record id, save it as CSV and preview it.
files_df = (
    pd.DataFrame(zenodo_files)
    .set_index("record_id")
)
files_df.to_csv("zenodo_files.csv")
print(files_df.shape)
files_df.head()

(2857, 3)


Unnamed: 0_level_0,name,type,size
record_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
51635,ffgmx.atp,atp,2633
51635,dmpc128W.mdp,mdp,935
51635,ffgmx.itp,itp,169
51635,dmpc128W_1ns.tpr,tpr,3351488
51635,ffgmxbon.itp,itp,30883
