In [None]:
# Shell call to the `jt` command (presumably the jupyterthemes CLI) with `-t l`;
# no later cell in view depends on it.
# NOTE(review): `l` does not look like a theme name — confirm the intended value.
!jt -t l

In [1]:
import json
from urllib.parse import quote

import numpy as np
import pandas as pd
import requests

## Get data from HAL

In [2]:
## HAL search API query: documents whose full text contains the
## "applied a CC BY public copyright" sentence (English or French wording),
## most recent deposits first, restricted to the fields listed in `fl`.
## Built from adjacent string literals so that the URL carries no stray
## leading/trailing newline and every OR operator is surrounded by spaces.
hal_req = (
    "https://api.archives-ouvertes.fr/search/?q=fulltext_t:"
    '(("applied a CC BY public copyright") OR ("applied a CC-BY public copyright")'
    ' OR ("appliqué une licence CC BY") OR ("appliqué une licence CC-BY") )'
    "&rows=5000&sort=submittedDate_tdate desc&"
    "fl=halId_s,doiId_s,docType_s,primaryDomain_s,journalIssn_s,submittedDate_s,"
    "europeanProjectAcronym_s,anrProjectAcronym_s,journalPublisher_s,publisher_s"
)

In [3]:
## query HAL
# timeout: do not hang the notebook forever on a stalled connection
# raise_for_status: stop here on an HTTP error rather than failing later
# on a payload that has no "response" key
req_raw = requests.get(hal_req, timeout=60)
req_raw.raise_for_status()
req_json = req_raw.json()
print("total of publications", req_json["response"]["numFound"])

total of publications 59


In [4]:
## build a dataframe from the documents returned by HAL (one row per document)
req_data = req_json["response"]["docs"]
df = pd.DataFrame(data=req_data)
# display one random row as a quick look at the columns
df.sample(n=1)

Unnamed: 0,primaryDomain_s,journalPublisher_s,journalIssn_s,halId_s,docType_s,doiId_s,submittedDate_s,europeanProjectAcronym_s,anrProjectAcronym_s
37,sdv.mp.par,Nature Publishing Group,2045-2322,pasteur-03491954,ART,10.1038/s41598-021-99152-9,2021-12-18 10:12:26,,


In [5]:
## number of documents per HAL document type
df.loc[:, "docType_s"].value_counts()

ART          53
UNDEFINED     5
COMM          1
Name: docType_s, dtype: int64

## Enrich with Unpaywall

In [6]:
def get_upw(doi, email="maxence.larrieu@u-paris.fr"):
    """Look up a DOI in Unpaywall and return ``[issns, vor_is_oa]``.

    Parameters
    ----------
    doi : str
        DOI to look up. Anything that is not a non-empty string (None, or the
        NaN pandas uses for a missing value) is answered without any request.
    email : str, optional
        Value sent as the ``email`` query parameter of the Unpaywall call.

    Returns
    -------
    list
        ``[issns, vor_is_oa]`` where ``issns`` is the ``journal_issns`` value
        of the Unpaywall record (None when absent) and ``vor_is_oa`` is True
        only when the record's ``best_oa_location`` has ``version`` equal to
        ``"publishedVersion"``.

    Raises
    ------
    requests.HTTPError
        For any error status other than 404.
    """
    # memo https://api.unpaywall.org/v2/10.1101/2021.05.24.445406?email=maxence.larrieu@u-paris.fr

    # no DOI: nothing to look up, same answer as for an unknown DOI
    if not isinstance(doi, str) or not doi.strip():
        return [None, False]

    # escape characters such as "#" or "?" that would otherwise cut the URL path
    doi_path = quote(doi.strip(), safe="/")
    req = requests.get(
        f"https://api.unpaywall.org/v2/{doi_path}",
        params={"email": email},
        timeout=30,
    )
    # NOTE(review): 404 is assumed to be Unpaywall's answer for a DOI it does
    # not know — confirm against the API documentation
    if req.status_code == 404:
        return [None, False]
    req.raise_for_status()
    res = req.json()

    # get issns
    issns = res.get("journal_issns")

    # deduce if vor is oa ("or {}" covers both a missing key and a null value)
    best_loc = res.get("best_oa_location") or {}
    vor_is_oa = best_loc.get("version") == "publishedVersion"

    return [issns, vor_is_oa]

In [7]:
## quick manual check
# memo: DOI that the recorded output reports as not open access
closed_doi = "10.1017/S135577181900030X"
get_upw(closed_doi)

['1355-7718,1469-8153', False]

In [8]:
## add upw_issns & upw_vor_is_oa to the table (one Unpaywall call per row)
upw_columns = ["upw_issns", "upw_vor_is_oa"]
upw_values = df.apply(
    lambda row: get_upw(row["doiId_s"]),
    axis=1,
    result_type="expand",  # spread the returned 2-item list over two columns
)
df[upw_columns] = upw_values

# Rework the table to display it in HTML

In [9]:
## shorten submittedDate: keep the first 10 characters (date without the time)
df["submittedDate_s"] = df["submittedDate_s"].str[:10]
df.sample(n=2)

Unnamed: 0,primaryDomain_s,journalPublisher_s,journalIssn_s,halId_s,docType_s,doiId_s,submittedDate_s,europeanProjectAcronym_s,anrProjectAcronym_s,upw_issns,upw_vor_is_oa
57,info.info-bi,,,hal-03240376,UNDEFINED,10.1101/2021.05.24.445406,2021-05-28,,,,True
11,sdv.bbm.bs,National Academy of Sciences,0027-8424,hal-03779985,ART,10.1073/pnas.2200468119,2022-09-18,,,"0027-8424,1091-6490",True


In [10]:
## lighten anr and european project (yes if data, else no)
# - a column absent from the frame counts as "no data" for every row
#   (NOTE(review): HAL seems to leave out fields that no document carries,
#   e.g. publisher_s is requested above but is not among the columns — confirm)
# - a column that already holds booleans is left untouched, so re-running
#   this cell does not turn every row into True
for project_col in ("europeanProjectAcronym_s", "anrProjectAcronym_s"):
    if project_col not in df.columns:
        df[project_col] = False
    elif not pd.api.types.is_bool_dtype(df[project_col]):
        df[project_col] = df[project_col].notna()

In [16]:
## use one column to know if the publication comes from a funded project
# both inputs are the boolean flags computed on `df` in the previous cell;
# the cell used to read them from `dftest`, a frame that no cell of this
# notebook defines (NameError on a fresh kernel)
df["funded_project"] = df["europeanProjectAcronym_s"] | df["anrProjectAcronym_s"]

In [17]:
## simplified labels for the HAL document type codes
# the three book-like codes share a single label
doctype_matcher = {
    "ART": "article",
    "UNDEFINED": "preprint",
    "COMM": "proceedings",
    **dict.fromkeys(("OUV", "COUV", "DOUV"), "book"),
}

In [18]:
def change_doctype(hal_doctype, matcher=None):
    """Return the simplified label for a HAL document type code.

    Parameters
    ----------
    hal_doctype : hashable
        HAL document type code, e.g. ``"ART"``.
    matcher : dict, optional
        Mapping from code to label. Defaults to the notebook-level
        ``doctype_matcher``.

    Returns
    -------
    The label found in ``matcher``, or ``"other"`` for any code that is not
    one of its keys.
    """
    if matcher is None:
        matcher = doctype_matcher
    # single lookup with a fallback instead of a membership test then an index
    return matcher.get(hal_doctype, "other")

In [19]:
## replace HAL codes by the simplified labels
# values that already are simplified labels are kept as they are, so that
# re-running this cell does not turn every row into "other"
simplified_labels = set(doctype_matcher.values()) | {"other"}
df["docType_s"] = df["docType_s"].map(
    lambda code: code if code in simplified_labels else change_doctype(code)
)

In [20]:
## number of documents per simplified document type
df["docType_s"].value_counts()

article        53
preprint        5
proceedings     1
Name: docType_s, dtype: int64

In [21]:
## list the columns of the working table before building the export
df.columns

Index(['primaryDomain_s', 'journalPublisher_s', 'journalIssn_s', 'halId_s',
       'docType_s', 'doiId_s', 'submittedDate_s', 'europeanProjectAcronym_s',
       'anrProjectAcronym_s', 'upw_issns', 'upw_vor_is_oa', 'funded_project'],
      dtype='object')

In [26]:
## remove the columns that are not exported
# errors="ignore": a listed column that is absent from the table is skipped
# instead of raising KeyError, so no manual check of the columns is needed
df_export = df.drop(
    columns=[
        "primaryDomain_s",
        "journalIssn_s",
        "doiId_s",
        "upw_issns",
        "anrProjectAcronym_s",
        "europeanProjectAcronym_s",
    ],
    errors="ignore",
)

In [27]:
## check which columns remain in the export table
df_export.columns

Index(['journalPublisher_s', 'halId_s', 'docType_s', 'submittedDate_s',
       'upw_vor_is_oa', 'funded_project'],
      dtype='object')

In [28]:
## columns of the export, in display order (selected from the full table `df`)
export_columns = [
    "halId_s",
    "docType_s",
    "journalPublisher_s",
    "upw_vor_is_oa",
    "funded_project",
    "submittedDate_s",
]
df_export = df.loc[:, export_columns]

In [31]:
## give the exported columns reader-friendly headers
export_headers = {
    "halId_s": "HAL URI",
    "docType_s": "genre",
    "journalPublisher_s": "publisher",
    "upw_vor_is_oa": "publisher vers.\nis oa",  # header contains a line break
    "funded_project": "European/ANR project",
    "submittedDate_s": "deposit date",
}
df_export = df_export.rename(columns=export_headers)

In [32]:
## write the export table as CSV, without the index column
export_path = "../fr-publications-rrs-applied.csv"
df_export.to_csv(export_path, index=False)