In [1]:
import os
import re
import time
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
webpage = "https://www.anvur.it/attivita/classificazione-delle-riviste/classificazione-delle-riviste-ai-fini-dellabilitazione-scientifica-nazionale/elenchi-di-riviste-scientifiche-e-di-classe-a/"

In [3]:
headers = {
    'User-Agent': 'Anvur Journal Acquisition',
    "Mail": "luigi.palumbo@unitus.it"
}

In [4]:
# Load the list of downloaded files if it exist
if "anvur_journals.csv" in os.listdir("data"):
    current_anvur_df = pd.read_csv("data/anvur_journals.csv")
    current_links = [
        item.get("link")
        for item in
        current_anvur_df.to_dict(orient="records")
    ]
else:
    current_links = []

In [5]:
with requests.get(webpage, headers=headers) as res:
    response = BeautifulSoup(res.text, "html.parser")

In [6]:
link_list = [
    {
        "area": item.get_text(strip=True),
        "filename": item.get("href").split("/")[-1],
        "link": item.get("href")
    }
    for item in
    response.find_all("a", {"href": re.compile("pdf$")})
]

In [7]:
link_classe_a = [
    item for item
    in link_list
    if item.get("area").endswith("classe A")
]

In [8]:
link_scientifici = [
    item for item
    in link_list
    if item not in link_classe_a
]

In [9]:
link_df = pd.DataFrame(link_list)

In [10]:
link_df.to_csv("data/anvur_journals.csv", index=False)

In [11]:
for item in link_list:
    time.sleep(2)
    # Only download files not yet downloaded
    if item.get("link") not in current_links:
        with requests.get(item.get("link"), headers=headers) as res:
            with open("data/{}".format(item.get("filename")), "wb") as f:
                f.write(res.content)

## Download ranking from Scimago

In [12]:
scimago_link = "https://www.scimagojr.com/journalrank.php?out=xls"

In [13]:
with requests.get(scimago_link, headers=headers) as res:
    with open("data/scimago.csv", "wb") as f:
        f.write(res.content)

In [14]:
scimago_df = pd.read_csv("data/scimago.csv", sep=";")

Categories and Areas to dummies

In [28]:
scimago_df["Categories"].head(10).str.get_dummies(sep="; ")

Unnamed: 0,Artificial Intelligence (Q1),"Biochemistry, Genetics and Molecular Biology (miscellaneous) (Q1)",Biomaterials (Q1),Cancer Research (Q1),Cell Biology (Q1),Drug Discovery (Q1),Economics and Econometrics (Q1),"Electronic, Optical and Magnetic Materials (Q1)",Energy (miscellaneous) (Q1),Hematology (Q1),Human-Computer Interaction (Q1),Materials Chemistry (Q1),Medicine (miscellaneous) (Q1),Molecular Biology (Q1),Oncology (Q1),Pharmacology (Q1),Software (Q1),"Surfaces, Coatings and Films (Q1)"
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0
1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0
2,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0
5,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0
7,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
8,0,0,1,0,0,0,0,1,1,0,0,1,0,0,0,0,0,1
9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0


In [16]:
scimago_df["Areas"].head(10).str.get_dummies(sep=", ")

Unnamed: 0,Biochemistry,Computer Science,Econometrics and Finance,Economics,Energy; Materials Science,Genetics and Molecular Biology,Genetics and Molecular Biology; Medicine,Medicine,Medicine; Pharmacology,Toxicology and Pharmaceutics
0,0,0,0,0,0,0,0,1,0,0
1,0,1,0,0,0,0,0,0,0,0
2,1,0,0,0,0,1,0,0,0,0
3,0,0,1,1,0,0,0,0,0,0
4,1,0,0,0,0,0,1,0,0,0
5,1,0,0,0,0,1,0,0,0,0
6,0,0,0,0,0,0,0,0,1,1
7,0,0,1,1,0,0,0,0,0,0
8,0,0,0,0,1,0,0,0,0,0
9,0,0,0,0,0,0,0,1,0,0


In [17]:
scimago_df[["Title","Issn"]].head(10)

Unnamed: 0,Title,Issn
0,Ca-A Cancer Journal for Clinicians,"15424863, 00079235"
1,Foundations and Trends in Machine Learning,"19358245, 19358237"
2,Nature Reviews Molecular Cell Biology,"14710072, 14710080"
3,Quarterly Journal of Economics,"00335533, 15314650"
4,Nature Reviews Cancer,"1474175X, 14741768"
5,Cell,"00928674, 10974172"
6,Nature Reviews Drug Discovery,"14741784, 14741776"
7,American Economic Review,"19447981, 00028282"
8,Nature Reviews Materials,20588437
9,Nature Reviews Clinical Oncology,"17594782, 17594774"


In [18]:
scimago_df["Issn"].head(10).str.split(", ", expand=True)

Unnamed: 0,0,1
0,15424863,79235.0
1,19358245,19358237.0
2,14710072,14710080.0
3,00335533,15314650.0
4,1474175X,14741768.0
5,00928674,10974172.0
6,14741784,14741776.0
7,19447981,28282.0
8,20588437,
9,17594782,17594774.0


In [30]:
test_df = scimago_df.head(20).copy()
test_df["Issn"] = test_df["Issn"].str.split(", ")
test_df =test_df.explode("Issn")
#test_df

## Classe A

In [12]:
from ctypes.util import find_library

find_library("gs")

'libgs.so.10'

In [16]:
import camelot

In [17]:
classe_a_test = camelot.read_pdf("data/{}".format(link_classe_a[0].get("filename")), pages="1-end")

In [21]:
classe_a_test[0].df

Unnamed: 0,0,1,2,3,4,5,6
0,TITOLO,ISSN,08/C1,08/D1,08/E1,08/E2,08/F1
1,2G,1136-9647,A,A,A,A,A
2,ABITARE,0001-3218,A,A,A,A,A
3,ABITARE LA TERRA,1592-8608,A,A,A,A,A
4,ACM JOURNAL ON COMPUTING AND CULTURAL HERITAGE,1556-4673,,,A(2017),,
5,ACOUSTICS,2624-599X,A(2018),,,,
6,ADVANCED MATERIALS RESEARCH,1022-6680,‡(2020),‡(2020),‡(2020),‡(2020),‡(2020)
7,ADVANCED MATERIALS RESEARCH,1662-8985,‡(2020),‡(2020),‡(2020),‡(2020),‡(2020)
8,AGATHON,2464-9309,A(2017),A(2017),A(2017),A(2017),
9,AGATHÓN,2532-683X,A(2017),A(2017),A(2017),A(2017),
