In [76]:
import os
import re
import time
from functools import reduce
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tinydb import TinyDB, Query
from tinydb.table import Document

In [2]:
webpage = "https://www.anvur.it/attivita/classificazione-delle-riviste/classificazione-delle-riviste-ai-fini-dellabilitazione-scientifica-nazionale/elenchi-di-riviste-scientifiche-e-di-classe-a/"

In [3]:
headers = {
    'User-Agent': 'Anvur Journal Acquisition',
    "Mail": "luigi.palumbo@unitus.it"
}

In [4]:
# Load the list of downloaded files if it exist
if "anvur_journals.csv" in os.listdir("data"):
    current_anvur_df = pd.read_csv(os.path.join("data", "anvur_journals.csv"))
    current_links = [
        item.get("link")
        for item in
        current_anvur_df.to_dict(orient="records")
    ]
else:
    current_links = []

In [5]:
with requests.get(webpage, headers=headers) as res:
    response = BeautifulSoup(res.text, "html.parser")

In [6]:
link_list = [
    {
        "area": item.get_text(strip=True),
        "filename": item.get("href").split("/")[-1],
        "link": item.get("href")
    }
    for item in
    response.find_all("a", {"href": re.compile("pdf$")})
]

In [7]:
link_classe_a = [
    item for item
    in link_list
    if item.get("area").endswith("classe A")
]

In [8]:
link_scientifici = [
    item for item
    in link_list
    if item not in link_classe_a
]

In [9]:
link_df = pd.DataFrame(link_list)

In [10]:
link_df.to_csv(os.path.join("data","anvur_journals.csv"), index=False)

In [11]:
for item in link_list:
    time.sleep(2)
    # Only download files not yet downloaded
    if item.get("link") not in current_links:
        with requests.get(item.get("link"), headers=headers) as res:
            with open(os.path.join("data", item.get("filename")), "wb") as f:
                f.write(res.content)

## Download ranking from Scimago

In [12]:
scimago_link = "https://www.scimagojr.com/journalrank.php?out=xls"

In [13]:
with requests.get(scimago_link, headers=headers) as res:
    with open(os.path.join("data","scimago.csv"), "wb") as f:
        f.write(res.content)

In [120]:
scimago_df = pd.read_csv(os.path.join("data","scimago.csv"), sep=";")

Categories and Areas to dummies

In [112]:
scimago_df["Categories"].head(10).str.get_dummies(sep="; ")

Unnamed: 0,Artificial Intelligence (Q1),"Biochemistry, Genetics and Molecular Biology (miscellaneous) (Q1)",Biomaterials (Q1),Cancer Research (Q1),Cell Biology (Q1),Drug Discovery (Q1),Economics and Econometrics (Q1),"Electronic, Optical and Magnetic Materials (Q1)",Energy (miscellaneous) (Q1),Hematology (Q1),Human-Computer Interaction (Q1),Materials Chemistry (Q1),Medicine (miscellaneous) (Q1),Molecular Biology (Q1),Oncology (Q1),Pharmacology (Q1),Software (Q1),"Surfaces, Coatings and Films (Q1)"
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0
1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0
2,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0
5,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0
7,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
8,0,0,1,0,0,0,0,1,1,0,0,1,0,0,0,0,0,1
9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0


In [119]:
scimago_df["Areas"].head(10).str.get_dummies(sep="; ").replace({0:None})

Unnamed: 0,"Biochemistry, Genetics and Molecular Biology",Computer Science,"Economics, Econometrics and Finance",Energy,Materials Science,Medicine,"Pharmacology, Toxicology and Pharmaceutics"
0,,,,,,1.0,
1,,1.0,,,,,
2,1.0,,,,,,
3,,,1.0,,,,
4,1.0,,,,,1.0,
5,1.0,,,,,,
6,,,,,,1.0,1.0
7,,,1.0,,,,
8,,,,1.0,1.0,,
9,,,,,,1.0,


In [17]:
scimago_df[["Title","Issn"]].head(10)

Unnamed: 0,Title,Issn
0,Ca-A Cancer Journal for Clinicians,"15424863, 00079235"
1,Foundations and Trends in Machine Learning,"19358245, 19358237"
2,Nature Reviews Molecular Cell Biology,"14710072, 14710080"
3,Quarterly Journal of Economics,"00335533, 15314650"
4,Nature Reviews Cancer,"1474175X, 14741768"
5,Cell,"00928674, 10974172"
6,Nature Reviews Drug Discovery,"14741784, 14741776"
7,American Economic Review,"19447981, 00028282"
8,Nature Reviews Materials,20588437
9,Nature Reviews Clinical Oncology,"17594782, 17594774"


In [18]:
scimago_df["Issn"].head(10).str.split(", ", expand=True)

Unnamed: 0,0,1
0,15424863,79235.0
1,19358245,19358237.0
2,14710072,14710080.0
3,00335533,15314650.0
4,1474175X,14741768.0
5,00928674,10974172.0
6,14741784,14741776.0
7,19447981,28282.0
8,20588437,
9,17594782,17594774.0


In [30]:
test_df = scimago_df.head(20).copy()
test_df["Issn"] = test_df["Issn"].str.split(", ")
test_df =test_df.explode("Issn")
#test_df

In [121]:
areas_df = scimago_df["Areas"].str.get_dummies(sep="; ").replace({0:None})

In [122]:
scimago_df = scimago_df.join(areas_df)

In [123]:
scimago_df = scimago_df.rename(columns={"Issn":"ISSN"})

In [127]:
scimago_df["Link"] = scimago_df.apply(lambda row: "https://www.scimagojr.com/journalsearch.php?q={}&tip=sid".format(row["Sourceid"]), axis=1)

In [128]:
[{k:v for k,v in journ.items() if v is not None} for journ in scimago_df.head().to_dict(orient="records")]

[{'Rank': 1,
  'Sourceid': 28773,
  'Title': 'Ca-A Cancer Journal for Clinicians',
  'Type': 'journal',
  'ISSN': '15424863, 00079235',
  'SJR': '106,094',
  'SJR Best Quartile': 'Q1',
  'H index': 211,
  'Total Docs. (2023)': 49,
  'Total Docs. (3years)': 124,
  'Total Refs.': 4844,
  'Total Cites (3years)': 35427,
  'Citable Docs. (3years)': 89,
  'Cites / Doc. (2years)': '381,89',
  'Ref. / Doc.': '98,86',
  '%Female': '43,95',
  'Overton': 2,
  'SDG': 35,
  'Country': 'United States',
  'Region': 'Northern America',
  'Publisher': 'Wiley-Blackwell',
  'Coverage': '1950-2023',
  'Categories': 'Hematology (Q1); Oncology (Q1)',
  'Areas': 'Medicine',
  'Medicine': 1,
  'Link': 'https://www.scimagojr.com/journalsearch.php?q=28773&tip=sid'},
 {'Rank': 2,
  'Sourceid': 19300156903,
  'Title': 'Foundations and Trends in Machine Learning',
  'Type': 'journal',
  'ISSN': '19358245, 19358237',
  'SJR': '37,044',
  'SJR Best Quartile': 'Q1',
  'H index': 39,
  'Total Docs. (2023)': 3,
  'Tota

## Parse Anvur PDF

In [2]:
import camelot

  from cryptography.hazmat.primitives.ciphers.algorithms import AES, ARC4


In [27]:
current_anvur_df = pd.read_csv(os.path.join("data","anvur_journals.csv"))

docs_classe_a = [
    item
    for item in
    current_anvur_df.to_dict(orient="records")
    if item.get("area").endswith("classe A")
]

docs_scientifici = [
    item for item in
    current_anvur_df.to_dict(orient="records")
    if item not in docs_classe_a
]

Classe A

In [28]:
classe_a_list = []

for item in docs_classe_a:
    table_list = camelot.read_pdf(os.path.join("data",item.get("filename")), pages="1-end")
    # Move first line as header
    for i, _ in enumerate(table_list):
        table_list[i].df.columns = table_list[i].df.iloc[0].to_list()
        table_list[i].df = table_list[i].df[1:]
    # Concatenate all dataframes
    table_list = pd.concat([item.df for item in table_list])
    # Remove hypens from ISSN
    table_list["ISSN"] = table_list["ISSN"].str.replace("-", "")
    classe_a_list.append(table_list.copy(deep=True))

In [33]:
# Combine all titles and ISSN
classe_a_journals = pd.concat([df[["TITOLO", "ISSN"]].copy() for df in classe_a_list])

In [35]:
classe_a_journals = classe_a_journals.drop_duplicates()

In [40]:
# Combine classe A markings

# Drop Title column
classe_a_list = [df.drop(columns=["TITOLO"]) for df in classe_a_list]

classe_a_df = reduce(lambda  left,right: pd.merge(left,right,on=['ISSN'],how='outer'), classe_a_list)

In [59]:
classe_a_list = [
    {k:v for k,v in elem.items() if pd.notna(v) and v != ""} 
    for elem in classe_a_df.to_dict(orient="records")
    ]

Scientific

In [62]:
scientific_list = []

for item in docs_scientifici:
    table_list = camelot.read_pdf(os.path.join("data",item.get("filename")), pages="1-end")
    # Move first line as header
    for i, _ in enumerate(table_list):
        table_list[i].df.columns = table_list[i].df.iloc[0].to_list()
        table_list[i].df = table_list[i].df[1:]
    # Concatenate all dataframes
    table_list = pd.concat([item.df for item in table_list])
    # Remove hypens from ISSN
    table_list["ISSN"] = table_list["ISSN"].str.replace("-", "")
    scientific_list.append(table_list.copy(deep=True))

In [63]:
# Combine all titles and ISSN
scientific_journals = pd.concat([df[["TITOLO", "ISSN"]].copy() for df in scientific_list])
scientific_journals = scientific_journals.drop_duplicates()

In [64]:
# Drop Title column
scientific_list = [df.drop(columns=["TITOLO"]) for df in scientific_list]

scientific_df = reduce(lambda  left,right: pd.merge(left,right,on=['ISSN'],how='outer'), scientific_list)

In [65]:
scientific_list = [
    {k:v for k,v in elem.items() if pd.notna(v) and v != ""} 
    for elem in scientific_df.to_dict(orient="records")
    ]

Combine the list of journals

In [97]:
anvur_journals = pd.concat([classe_a_journals,scientific_journals]).drop_duplicates(subset="ISSN").sort_values(by="TITOLO")

Save journal lists

In [129]:
# Save data
db = TinyDB('db.json')
# Drop previous tables
db.drop_table('anvur')
db.drop_table('classea')
db.drop_table('scientific')

In [106]:
db = TinyDB('db.json')
# Drop previous tables
db.drop_tables()

anvur = db.table('anvur')
classea = db.table('classea')
scientific = db.table('scientific')

In [107]:
_ = anvur.insert_multiple(anvur_journals.to_dict(orient="records"))

In [108]:
_ = classea.insert_multiple(classe_a_list)

In [109]:
_ = scientific.insert_multiple(scientific_list)

In [110]:
# Test search
Journal = Query()

anvur.search(Journal.TITOLO.search("ECONOMETR+"))

[{'TITOLO': 'ADVANCES IN ECONOMETRICS', 'ISSN': '07319053'},
 {'TITOLO': 'APPLIED ECONOMETRICS AND INTERNATIONAL DEVELOPMENT',
  'ISSN': '15784487'},
 {'TITOLO': 'ASIAN-AFRICAN JOURNAL OF ECONOMICS AND ECONOMETRICS',
  'ISSN': '09723986'},
 {'TITOLO': 'CENTRAL EUROPEAN JOURNAL OF ECONOMIC MODELLING AND ECONOMETRICS',
  'ISSN': '20800886'},
 {'TITOLO': 'ECONOMETRIC REVIEWS', 'ISSN': '07474938'},
 {'TITOLO': 'ECONOMETRIC REVIEWS', 'ISSN': '15324168'},
 {'TITOLO': 'ECONOMETRIC THEORY', 'ISSN': '14694360'},
 {'TITOLO': 'ECONOMETRIC THEORY', 'ISSN': '02664666'},
 {'TITOLO': 'ECONOMETRICA', 'ISSN': '14680262'},
 {'TITOLO': 'ECONOMETRICA', 'ISSN': '00129682'},
 {'TITOLO': 'ECONOMETRICS', 'ISSN': '22251146'},
 {'TITOLO': 'ECONOMETRICS AND STATISTICS', 'ISSN': '24523062'},
 {'TITOLO': 'ECONOMETRICS JOURNAL', 'ISSN': '13684221'},
 {'TITOLO': 'ECONOMETRICS JOURNAL ONLINE', 'ISSN': '1368423X'},
 {'TITOLO': 'INTERNATIONAL JOURNAL OF APPLIED ECONOMETRICS AND QUANTITATIVE STUDIES',
  'ISSN': '1698415

In [105]:
if "db.json" in os.listdir("."):
    os.remove("db.json")

In [130]:

from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi

uri = "mongodb+srv://anvuradmin:WqCi43hT4PnKQC6j@anvur.2ow2j.mongodb.net/?retryWrites=true&w=majority&appName=anvur"

# Create a new client and connect to the server
client = MongoClient(uri, server_api=ServerApi('1'))

# Send a ping to confirm a successful connection
try:
    client.admin.command('ping')
    print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as e:
    print(e)

Pinged your deployment. You successfully connected to MongoDB!


In [131]:
journals_coll = client["anvur"]["journal"]
_ = journals_coll.insert_many(anvur_journals.to_dict(orient="records"))

InsertManyResult([ObjectId('66e6adae066f9a8d23e36e99'), ObjectId('66e6adae066f9a8d23e36e9a'), ObjectId('66e6adae066f9a8d23e36e9b'), ObjectId('66e6adae066f9a8d23e36e9c'), ObjectId('66e6adae066f9a8d23e36e9d'), ObjectId('66e6adae066f9a8d23e36e9e'), ObjectId('66e6adae066f9a8d23e36e9f'), ObjectId('66e6adae066f9a8d23e36ea0'), ObjectId('66e6adae066f9a8d23e36ea1'), ObjectId('66e6adae066f9a8d23e36ea2'), ObjectId('66e6adae066f9a8d23e36ea3'), ObjectId('66e6adae066f9a8d23e36ea4'), ObjectId('66e6adae066f9a8d23e36ea5'), ObjectId('66e6adae066f9a8d23e36ea6'), ObjectId('66e6adae066f9a8d23e36ea7'), ObjectId('66e6adae066f9a8d23e36ea8'), ObjectId('66e6adae066f9a8d23e36ea9'), ObjectId('66e6adae066f9a8d23e36eaa'), ObjectId('66e6adae066f9a8d23e36eab'), ObjectId('66e6adae066f9a8d23e36eac'), ObjectId('66e6adae066f9a8d23e36ead'), ObjectId('66e6adae066f9a8d23e36eae'), ObjectId('66e6adae066f9a8d23e36eaf'), ObjectId('66e6adae066f9a8d23e36eb0'), ObjectId('66e6adae066f9a8d23e36eb1'), ObjectId('66e6adae066f9a8d23e36e

In [132]:
classea_coll = client["anvur"]["classea"]
_ = classea_coll.insert_many(classe_a_list)

In [133]:
scientific_coll = client["anvur"]["scientific"]
_ = scientific_coll.insert_many(scientific_list)

In [135]:
db.tables()

{'anvur', 'classea', 'scientific'}

In [137]:
test_df = pd.read_csv("https://www.scimagojr.com/journalrank.php?out=xls", sep=";")

In [138]:
test_df.head()

Unnamed: 0,Rank,Sourceid,Title,Type,Issn,SJR,SJR Best Quartile,H index,Total Docs. (2023),Total Docs. (3years),...,Ref. / Doc.,%Female,Overton,SDG,Country,Region,Publisher,Coverage,Categories,Areas
0,1,28773,Ca-A Cancer Journal for Clinicians,journal,"15424863, 00079235",106094,Q1,211,49,124,...,9886,4395,2,35,United States,Northern America,Wiley-Blackwell,1950-2023,Hematology (Q1); Oncology (Q1),Medicine
1,2,19300156903,Foundations and Trends in Machine Learning,journal,"19358245, 19358237",37044,Q1,39,3,13,...,29900,2778,0,0,United States,Northern America,Now Publishers Inc,2008-2023,Artificial Intelligence (Q1); Human-Computer I...,Computer Science
2,3,20315,Nature Reviews Molecular Cell Biology,journal,"14710072, 14710080",35910,Q1,508,123,336,...,9319,2941,1,20,United Kingdom,Western Europe,Nature Publishing Group,2000-2023,Cell Biology (Q1); Molecular Biology (Q1),"Biochemistry, Genetics and Molecular Biology"
3,4,29431,Quarterly Journal of Economics,journal,"00335533, 15314650",30448,Q1,306,47,136,...,7755,2667,35,22,United Kingdom,Western Europe,Oxford University Press,1886-2023,Economics and Econometrics (Q1),"Economics, Econometrics and Finance"
4,5,12464,Nature Reviews Cancer,journal,"1474175X, 14741768",26837,Q1,505,105,304,...,10290,4433,1,59,United Kingdom,Western Europe,Nature Publishing Group,2001-2023,Cancer Research (Q1); Oncology (Q1),"Biochemistry, Genetics and Molecular Biology; ..."
