In [2]:
!jt -t l

Didn't recognize theme name: l
Available Themes: 
   chesterish
   grade3
   gruvboxd
   gruvboxl
   monokai
   oceans16
   onedork
   solarizedd
   solarizedl


In [3]:
import requests, json, pandas as pd, numpy as np

## Get metadata from HAL

In [4]:
# HAL search API query: full-text match on four English/French "CC BY" licence
# phrases, sorted by submission date (descending), limited to 5000 rows and to
# the metadata fields listed in `fl`.
# Built from adjacent string literals so the URL carries no leading/trailing
# newline and keeps a space before every OR operator.
hal_req = (
    "https://api.archives-ouvertes.fr/search/?q=fulltext_t:"
    '(("applied a CC BY public copyright") OR ("applied a CC-BY public copyright") '
    'OR ("appliqué une licence CC BY") OR ("appliqué une licence CC-BY") )'
    "&rows=5000&sort=submittedDate_tdate desc"
    "&fl=halId_s,doiId_s,docType_s,primaryDomain_s,journalIssn_s,submittedDate_s,"
    "europeanProjectAcronym_s,anrProjectAcronym_s,journalPublisher_s,publisher_s"
)

In [5]:
# Query the HAL API. The timeout keeps a stalled connection from hanging the
# notebook, and raise_for_status() stops on an HTTP error instead of failing
# later on a missing "response" key.
req_raw = requests.get(hal_req, timeout=60)
req_raw.raise_for_status()
req_json = req_raw.json()
print("total of publications", req_json["response"]["numFound"])

total of publications 99


In [6]:
## turn the list of HAL records into a DataFrame and preview one random row
req_data = req_json["response"]["docs"]
df = pd.DataFrame(data=req_data)
df.sample(n=1)

Unnamed: 0,primaryDomain_s,journalPublisher_s,journalIssn_s,anrProjectAcronym_s,halId_s,docType_s,doiId_s,submittedDate_s,europeanProjectAcronym_s,publisher_s
80,sdv,BioMed Central,1756-3305,[FutureHealthSEA],hal-03472238,ART,10.1186/s13071-021-05042-4,2021-12-09 11:26:17,,


In [7]:
## number of records per HAL document type
df.docType_s.value_counts()

ART            80
UNDEFINED      13
COMM            4
PROCEEDINGS     1
THESE           1
Name: docType_s, dtype: int64

## Enrich with Unpaywall

In [8]:
def get_upw(doi, email="maxence.larrieu@u-paris.fr", timeout=30):
    """
    Look up a DOI on the Unpaywall API.

    Example call:
    https://api.unpaywall.org/v2/10.1101/2021.05.24.445406?email=maxence.larrieu@u-paris.fr

    Parameters
    ----------
    doi : str
        DOI to look up. Anything that is not a non-blank string (None, NaN, "")
        is treated as "no DOI" and no request is sent.
    email : str, optional
        Address passed in the ``email`` query parameter of the request.
    timeout : float, optional
        Seconds to wait for the HTTP response before requests raises.

    Returns
    -------
    list
        ``[issns, vor_is_oa]``: the ``journal_issns`` value of the response
        (None when absent), and True only when ``best_oa_location`` is present
        and its ``version`` equals ``"publishedVersion"``.
    """
    # records without a DOI cannot be resolved: skip the HTTP round trip
    if not isinstance(doi, str) or not doi.strip():
        return [None, False]

    req = requests.get(f"https://api.unpaywall.org/v2/{doi}?email={email}", timeout=timeout)
    res = req.json()

    # get issns
    issns = res.get("journal_issns")

    # deduce if the version of record (publisher version) is open access
    best_loc = res.get("best_oa_location") or {}
    vor_is_oa = best_loc.get("version") == "publishedVersion"

    return [issns, vor_is_oa]

In [9]:
## quick manual check of get_upw
# memo: this DOI is a closed-access one
closed_doi = "10.1017/S135577181900030X"
get_upw(closed_doi)

['1355-7718,1469-8153', False]

In [10]:
## query Unpaywall for each row's DOI and store the two returned values
## as the new columns upw_issns and upw_vor_is_oa
upw_columns = ["upw_issns", "upw_vor_is_oa"]
df[upw_columns] = df.apply(
    lambda row: get_upw(row["doiId_s"]),
    axis="columns",
    result_type="expand",
)

# Reorder dataframe columns to prepare HTML display

In [11]:
## keep only the date part of submittedDate_s (drop the time of day)
### a date such as 2022-12-12 is 10 characters long
df["submittedDate_s"] = df["submittedDate_s"].str[:10]
df.sample(n=2)

Unnamed: 0,primaryDomain_s,journalPublisher_s,journalIssn_s,anrProjectAcronym_s,halId_s,docType_s,doiId_s,submittedDate_s,europeanProjectAcronym_s,publisher_s,upw_issns,upw_vor_is_oa
59,sdv,Nature Publishing Group,2041-1723,,hal-03684839,ART,10.1038/s41467-021-25649-6,2022-06-01,,,2041-1723,True
65,sdv.bbm.bc,Wiley-VCH Verlag,0265-9247,[DEFineNMD],pasteur-03606359,ART,10.1002/bies.202100296,2022-03-11,,,"0265-9247,1521-1878",True


In [12]:
## make european & anr project columns boolean (True if data else False)
for project_col in ("europeanProjectAcronym_s", "anrProjectAcronym_s"):
    if project_col not in df.columns:
        # the column only exists when at least one returned record carries the
        # field; without it no record is linked to such a project
        df[project_col] = False
    elif df[project_col].dtype != bool:
        # already-boolean columns are left alone so that re-running this cell
        # does not turn every row into True (notna() of a bool is always True)
        df[project_col] = df[project_col].notna()

In [13]:
def deduce_funders(row):
    """
    Label a record with its funding source.

    Reads the boolean flags ``europeanProjectAcronym_s`` and
    ``anrProjectAcronym_s`` from ``row`` and returns "Europe & ANR" when both
    are set, "Europe" or "ANR" when only one is, and None when neither is.
    """
    has_europe = row["europeanProjectAcronym_s"]
    has_anr = row["anrProjectAcronym_s"]

    if has_europe & has_anr:
        return "Europe & ANR"
    if has_europe:
        return "Europe"
    if has_anr:
        return "ANR"
    return None

In [14]:
## label every record with its funder(s)
df["funded_project"] = df.apply(deduce_funders, axis="columns")

In [15]:
## HAL document type code -> simplified label
## (codes not listed here are handled by the caller of this mapping)
doctype_matcher = {
    "ART": "article",
    "UNDEFINED": "preprint",
    "PROCEEDINGS": "proceedings",
    "OUV": "book",
    "COUV": "book",
    "ISSUE": "issue",
}

In [16]:
def change_doctype(hal_doctype):
    """
    Map a HAL document type code to its simplified label.

    Returns the label stored in ``doctype_matcher`` for ``hal_doctype``,
    or "other" when the code has no entry there.
    """
    return doctype_matcher.get(hal_doctype, "other")

In [17]:
## replace the HAL doctype codes by their simplified label
## NOTE: run once per fresh df; the labels are not keys of doctype_matcher,
## so a second pass would map every value to "other"
df["docType_s"] = df["docType_s"].apply(change_doctype)

In [18]:
## number of records per simplified document type
df["docType_s"].value_counts()

article        80
preprint       13
other           5
proceedings     1
Name: docType_s, dtype: int64

In [19]:
## list the columns currently in df
df.columns

Index(['primaryDomain_s', 'journalPublisher_s', 'journalIssn_s',
       'anrProjectAcronym_s', 'halId_s', 'docType_s', 'doiId_s',
       'submittedDate_s', 'europeanProjectAcronym_s', 'publisher_s',
       'upw_issns', 'upw_vor_is_oa', 'funded_project'],
      dtype='object')

In [20]:
## drop the columns that are not exported
# /!\ every name listed must exist in df, otherwise drop raises a KeyError /!\
columns_to_drop = [
    "primaryDomain_s",
    "journalIssn_s",
    "doiId_s",
    "upw_issns",
    "anrProjectAcronym_s",
    "europeanProjectAcronym_s",
]
df_export = df.drop(columns=columns_to_drop)

In [21]:
## list the columns kept in df_export
df_export.columns

Index(['journalPublisher_s', 'halId_s', 'docType_s', 'submittedDate_s',
       'publisher_s', 'upw_vor_is_oa', 'funded_project'],
      dtype='object')

In [22]:
## pick the export columns, in display order
## (selected from df itself, so this assignment replaces any earlier df_export)
export_columns = [
    "halId_s",
    "docType_s",
    "journalPublisher_s",
    "upw_vor_is_oa",
    "funded_project",
    "submittedDate_s",
]
df_export = df[export_columns]

In [23]:
## give the export columns human-readable headers
export_headers = {
    "halId_s": "HAL URI",
    "docType_s": "genre",
    "journalPublisher_s": "publisher",
    "upw_vor_is_oa": "publisher vers.\nis OA",
    "funded_project": "European/ANR project",
    "submittedDate_s": "deposit date",
}
df_export = df_export.rename(columns=export_headers)

In [24]:
## write the export table as CSV, without the index column
df_export.to_csv(path_or_buf="../fr-publications-rrs-applied.csv", index=False)