# Retrouver dans Zenodo les jeux de données de son établissement

In [1]:
import requests, json

In [None]:
# a gist example : https://gist.github.com/slint/d30fc0f415300876facbbeb8a0989ab2

L'API de Zenodo nécessite une clé d'accès. cf. https://developers.zenodo.org/#quickstart-upload

In [2]:
# Read the Zenodo API token from a local JSON file (entry "ZENODO_KEY").
with open("../hide/personnal-keys.json") as key_file:
    personal_keys = json.load(key_file)
ACCESS_TOKEN = personal_keys["ZENODO_KEY"]

## Définir la recherche sur le champ affiliation

cf. https://help.zenodo.org/guides/search/

In [3]:
# Search expression applied to the affiliation field: a term starting with
# "univ" combined with either "diderot" or "descartes" (see the Zenodo search
# guide for the query syntax).
aff_query = "(univ* AND (diderot OR descartes))"

In [4]:
# Query the Zenodo records API for datasets whose creator affiliation matches
# `aff_query`, most recent first.
r = requests.get(
    "https://zenodo.org/api/records",
    params={
        "q": "creators.affiliation:" + aff_query,
        "type": "dataset",
        "size": 300,
        "sort": "mostrecent",
        # Documented parameter name is `all_versions`; "false" keeps only the
        # latest version of each record.
        "all_versions": "false",
        # Was misspelled "access_tpoken", so the token was never sent.
        "access_token": ACCESS_TOKEN,
    },
    # requests has no default timeout; avoid hanging the notebook forever.
    timeout=30,
)
# Fail here on an HTTP error instead of later with a KeyError on res["hits"].
r.raise_for_status()
res = r.json()

Se familiariser avec la structure de la réponse

In [None]:
# Dump the raw API response to a JSON file
with open("zenodo_raw.json", 'w') as raw_file:
    raw_file.write(json.dumps(res))

In [None]:
# Pretty-print the "hits" part of the response as JSON
hits_json = json.dumps(res["hits"], indent=1)
print(hits_json)

In [5]:
# Number of datasets found
nb_found = res["hits"]["total"]
print(nb_found)

70


In [None]:
# Pretty-print the content of a single hit (the first one)
single_hit = res["hits"]["hits"][0]
print(json.dumps(single_hit, indent=2))

In [6]:
# Show the keys available on a hit
all_hits = res["hits"]["hits"]
first_hit = all_hits[0]
first_hit.keys()

dict_keys(['conceptdoi', 'conceptrecid', 'created', 'doi', 'files', 'id', 'links', 'metadata', 'owners', 'revision', 'stats', 'updated'])

## Déduire les informations à récupérer
``` 
id, l'identifiant zenodo, pour déduire le lien https://zenodo.org/record/5005868
metadata/publication_date : date du dépôt
metadata/title : titre du jeu de données
metadata/license : la licence apposée
metadata/keywords
metadata/doi
```

## Récupérer les données

In [7]:
# Collect the fields of interest for every hit, in a dict keyed by the hit's
# position in the response.
data = {}

for idx, item in enumerate(res["hits"]["hits"]):
    meta = item["metadata"]

    # Fields always written (value is None when absent from the metadata)
    record = {
        "id": item["id"],
        "title": meta.get("title"),
        "publication_date": meta.get("publication_date"),
        "doi": meta.get("doi"),
    }

    # License: keep only its identifier, and only when a license is present
    license_info = meta.get("license")
    if license_info:
        record["license"] = license_info["id"]

    # Keywords: flattened into one comma-separated string when present
    keywords = meta.get("keywords")
    if keywords:
        record["keywords"] = ",".join(keywords)

    data[idx] = record

In [9]:
# Pretty-print one extracted record as a sanity check
sample_record = data[3]
print(json.dumps(sample_record, indent=1))

{
 "id": 5450933,
 "title": "Decline in telomere length with increasing age across non\u2010human vertebrates: a meta\u2010analysis",
 "publication_date": "2021-09-07",
 "doi": "10.5061/dryad.000000048",
 "license": "CC0-1.0"
}


## Transform json to df and visualise

In [10]:
import pandas as pd

In [11]:
# Build a frame from the dict of records: one column per record, one row per field
df = pd.DataFrame(data=data)

In [12]:
# Peek at two randomly chosen rows
df.sample(n=2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,60,61,62,63,64,65,66,67,68,69
publication_date,2021-10-27,2021-10-04,2021-09-17,2021-09-07,2019-02-02,2014-06-03,2019-06-14,2016-02-11,2016-02-19,2019-01-04,...,2015-02-14,2015-01-27,2015-01-19,2015-01-27,2015-01-26,2015-01-19,2015-01-27,2015-01-19,2015-01-23,2015-01-27
title,A global inventory of solar photovoltaic gener...,Energetic return on investment determines over...,Effect of water on sandstone's fracture toughn...,Decline in telomere length with increasing age...,Data from: Evolution of a dominant natural iso...,Data from: Postglacial climate changes and ris...,Data from: Risks of 23 specific malformations ...,Data from: Reduced mate availability leads to ...,Data from: Should the WHO growth charts be use...,Data from: Ultrasonic cavitation induces necro...,...,"Dataset from ""Matthieu Delescluse and Christop...",MD simulation trajectory and related files for...,MD simulation trajectory for POPC/50% Chol bil...,MD simulation trajectory and related files for...,MD simulation trajectory and related files for...,MD simulation trajectory for POPC/20% Chol bil...,MD simulation trajectory and related files for...,MD simulation trajectory and related files for...,Data set from Pouzat and Chaffiol (2009) Journ...,Tetrode recording from the antennal lobe of a ...


In [13]:
# Swap rows and columns (one row per dataset), then sort by deposit date,
# most recent first.
# Rebuilt from `data` instead of transposing `df` in place: re-running the
# cell would otherwise flip the frame back and make the sort fail with a
# KeyError on "publication_date".
df = pd.DataFrame(data).T.sort_values(by="publication_date", ascending=False)

In [14]:
# Show the first four rows
df.head(4)

Unnamed: 0,id,title,publication_date,doi,license,keywords
0,5005868,A global inventory of solar photovoltaic gener...,2021-10-27,10.5281/zenodo.5005868,CC-BY-4.0,"photovoltaic solar,remote sensing,geospatial d..."
1,5547311,Energetic return on investment determines over...,2021-10-04,10.5281/zenodo.5547311,CC-BY-4.0,"soil,microorganisms,microbial activity,organic..."
2,5513285,Effect of water on sandstone's fracture toughn...,2021-09-17,10.1016/j.ijrmms.2021.104916,CC-BY-4.0,
3,5450933,Decline in telomere length with increasing age...,2021-09-07,10.5061/dryad.000000048,CC0-1.0,


In [None]:
# Write the table to CSV without the index column
df.to_csv(path_or_buf="data_from_zenodo.csv", index=False)

----
## Vigilance

* la recherche dans l'affiliation se fait pour tous les auteurs (comme le wos) donc du bruit peut être présent en sortie

* l'API renvoie les résultats avec un système de pagination. Vérifier à la fin de la réponse si `links` contient une URL `next` : si oui, il reste des résultats à récupérer