In [11]:
# Shell call to the `jt` command with `-t l` (theme selection).
# NOTE(review): the recorded output rejects `l` as a theme name and lists the accepted
# ones (chesterish, grade3, gruvboxd, gruvboxl, ...) — TODO confirm which theme was meant.
!jt -t l

Didn't recognize theme name: l
Available Themes: 
   chesterish
   grade3
   gruvboxd
   gruvboxl
   monokai
   oceans16
   onedork
   solarizedd
   solarizedl


In [1]:
import requests, json, pandas as pd, numpy as np

## Get data from HAL

In [32]:
# HAL search API request URL:
#   - q    : full-text search for the English / French wordings of the CC BY statement
#            ("applied a CC BY public copyright", "appliqué une licence CC BY"),
#            each with and without a hyphen in "CC-BY"
#   - rows : at most 5000 documents, sorted by submission date, newest first
#   - fl   : the fields returned for each document
# The literal is split over several lines with `\` continuations. `.strip()` removes the
# newline that follows the opening quotes and the one that precedes the closing quotes,
# so the URL handed to `requests` carries no stray whitespace (the trailing newline
# would otherwise end up inside the last `fl` field name).
hal_req = """
https://api.archives-ouvertes.fr/search/?q=fulltext_t:\
(("applied a CC BY public copyright") OR ("applied a CC-BY public copyright")\
OR ("appliqué une licence CC BY") OR ("appliqué une licence CC-BY") )\
&rows=5000&sort=submittedDate_tdate desc&\
fl=halId_s,doiId_s,docType_s,primaryDomain_s,journalIssn_s,submittedDate_s,anrProjectAcronym_s,journalPublisher_s,publisher_s
""".strip()

In [61]:
## query HAL and decode the JSON answer
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() reports an HTTP error here rather than as an obscure
# JSON / KeyError further down.
req_raw = requests.get(hal_req, timeout=60)
req_raw.raise_for_status()
req_json = req_raw.json()
print("total of publications", req_json["response"]["numFound"])

# The request caps the number of returned rows: say so if the answer was truncated.
if req_json["response"]["numFound"] > len(req_json["response"]["docs"]):
    print("warning: only", len(req_json["response"]["docs"]), "documents were returned")

total of publications 54


In [62]:
## build the working table: one row per document returned by HAL
req_data = req_json["response"]["docs"]
df = pd.DataFrame(data=req_data)
# peek at one random row
df.sample(n=1)

Unnamed: 0,primaryDomain_s,journalPublisher_s,journalIssn_s,halId_s,docType_s,doiId_s,submittedDate_s,anrProjectAcronym_s
18,sdv,Public Library of Science,1935-2727,hal-03648068,ART,10.1371/journal.pntd.0010256,2022-04-21 10:58:03,[FutureHealthSEA]


In [63]:
## number of records per HAL document type
df.docType_s.value_counts()

ART          48
UNDEFINED     5
COMM          1
Name: docType_s, dtype: int64

## Enrich with Unpaywall

In [64]:
def get_upw(doi, email="m@larri.eu", timeout=30):
    """Look up a DOI in Unpaywall and return ``[issns, vor_is_oa]``.

    Parameters
    ----------
    doi : str
        DOI to look up. Any value that is not a non-empty string (``None``, the
        ``NaN`` of a record without DOI, ``""``) is treated as "no DOI": no request
        is sent and ``[None, False]`` is returned.
    email : str, optional
        Address sent in the ``email`` query parameter of the request.
    timeout : float, optional
        Seconds to wait for the API before ``requests`` gives up.

    Returns
    -------
    list
        ``[issns, vor_is_oa]``: ``issns`` is the ``journal_issns`` value of the
        Unpaywall record (``None`` when absent); ``vor_is_oa`` is ``True`` only when
        the record's ``best_oa_location`` has ``version == "publishedVersion"``.

    Raises
    ------
    requests.HTTPError
        For an HTTP error status other than 404 (a 404 yields ``[None, False]``).
    """
    # memo https://api.unpaywall.org/v2/10.1101/2021.05.24.445406?email=<address>

    # Records without a DOI: skip the network round trip entirely.
    if not isinstance(doi, str) or not doi.strip():
        return [None, False]

    req = requests.get(
        f"https://api.unpaywall.org/v2/{doi.strip()}?email={email}",
        timeout=timeout,
    )
    # A 404 carries no usable record; any other HTTP error is reported
    # instead of being silently read as "not open access".
    if req.status_code == 404:
        return [None, False]
    req.raise_for_status()
    res = req.json()

    # get issns
    issns = res.get("journal_issns")

    # deduce if vor is oa: the best OA location must be the published version
    best_loc = res.get("best_oa_location") or {}
    vor_is_oa = best_loc.get("version") == "publishedVersion"

    return [issns, vor_is_oa]

In [65]:
## quick manual check of get_upw
# memo: 10.1017/S135577181900030X is a closed-access DOI
get_upw(doi="10.1017/S135577181900030X")

['1355-7718,1469-8153', False]

In [66]:
## add the two Unpaywall columns (ISSNs, "published version is OA" flag) to the table
# get_upw is called once per row with that row's DOI; result_type="expand" spreads
# the returned two-item list over two columns.
upw_columns = ["upw_issns", "upw_vor_is_oa"]
upw_values = df.apply(
    lambda record: get_upw(record["doiId_s"]),
    axis=1,
    result_type="expand",
)
df[upw_columns] = upw_values

## Rework the table for HTML display

In [67]:
## shorten submittedDate: keep the first 10 characters (the date), drop the time part
df["submittedDate_s"] = df["submittedDate_s"].str[:10]
df.sample(n=2)

Unnamed: 0,primaryDomain_s,journalPublisher_s,journalIssn_s,halId_s,docType_s,doiId_s,submittedDate_s,anrProjectAcronym_s,upw_issns,upw_vor_is_oa
3,phys.meca.mefl,Cambridge University Press (CUP),0022-1120,hal-03559481,ART,10.1017/jfm.2022.658,2022-09-25,,"0022-1120,1469-7645",True
32,sdv.mp.par,Nature Publishing Group,2045-2322,pasteur-03491954,ART,10.1038/s41598-021-99152-9,2021-12-18,,2045-2322,True


In [69]:
## lighten anrProject (True if the record has a value, else False)
# Guarded so that re-running the cell is harmless: once the column holds booleans,
# .notna() would turn every value, False included, into True.
if df["anrProjectAcronym_s"].dtype != bool:
    df["anrProjectAcronym_s"] = df["anrProjectAcronym_s"].notna()

In [42]:
## simplified document categories, keyed by HAL docType code
doctype_matcher = {
    "ART": "article",
    "UNDEFINED": "preprint",
    "COMM": "proceedings",
    # the three remaining codes all collapse into "book"
    **dict.fromkeys(("OUV", "COUV", "DOUV"), "book"),
}

In [92]:
def change_doctype(hal_doctype) :
    """Map a HAL docType code to its simplified category.

    Codes listed in ``doctype_matcher`` are translated to their category; a value
    that already is one of those categories is returned unchanged, so applying the
    function a second time to an already converted column does not collapse
    everything to "other". Any other value gives "other".
    """
    simplified = doctype_matcher.get(hal_doctype)
    if simplified is not None :
        return simplified
    # already converted (e.g. the cell was run twice): keep the label as is
    if hal_doctype in set(doctype_matcher.values()) :
        return hal_doctype
    return "other"
    

# replace each HAL docType code by its simplified category
df["docType_s"] = df["docType_s"].apply(change_doctype)

In [93]:
## number of records per simplified category
df["docType_s"].value_counts()

article        48
preprint        5
proceedings     1
Name: docType_s, dtype: int64

In [94]:
## list the columns currently in the table (checked before dropping some of them)
df.columns

Index(['primaryDomain_s', 'journalPublisher_s', 'journalIssn_s', 'halId_s',
       'docType_s', 'doiId_s', 'submittedDate_s', 'anrProjectAcronym_s',
       'upw_issns', 'upw_vor_is_oa'],
      dtype='object')

In [95]:
## remove the columns that are not exported
# /!\ drop() raises a KeyError when one of these columns is missing: check df.columns first
unused_columns = ["primaryDomain_s", "journalIssn_s", "doiId_s", "upw_issns"]
df_export = df.drop(columns=unused_columns)

In [98]:
## list the columns kept for the export
df_export.columns

Index(['journalPublisher_s', 'halId_s', 'docType_s', 'submittedDate_s',
       'anrProjectAcronym_s', 'upw_vor_is_oa'],
      dtype='object')

In [99]:
## put the exported columns in display order
# NOTE: the selection is made on df (not df_export); it rebinds df_export to these six columns.
export_order = [
    "halId_s",
    "docType_s",
    "journalPublisher_s",
    "upw_vor_is_oa",
    "anrProjectAcronym_s",
    "submittedDate_s",
]
df_export = df[export_order]

In [101]:
## give the exported columns reader-friendly headers
export_headers = {
    "halId_s": "HAL URI",
    "docType_s": "genre",
    "journalPublisher_s": "publisher",
    "upw_vor_is_oa": "publisher vers.\nis oa",  # header contains a line break
    "anrProjectAcronym_s": "ANR project",
    "submittedDate_s": "deposit date",
}
df_export = df_export.rename(columns=export_headers)

In [102]:
## write the export table as CSV in the parent directory, without the index column
df_export.to_csv(path_or_buf="../fr-publications-rrs-applied.csv", index=False)