In [None]:
import json
from tqdm import tqdm
import pycountry
import plotly.express as px

from utils import get_year, get_month, get_category
from utils import get_submitter_info, get_country_from_domain

In [None]:
# Collect the year, categories and submitter of every article whose date
# (as reported by get_year / get_month on its "versions" field) falls on or
# after the cut-off below.
years = []
categories = []
submitters = []

# Earliest (year, month) to keep.
START_YEAR_MONTH = (2020, 10)

# The file is read line by line, one JSON document per line.
with open('data/arxiv.json', encoding='utf-8') as f:
    for line in f:
        data = json.loads(line)
        year = get_year(data["versions"])
        month = get_month(data["versions"])
        # Compare year and month as a pair. Testing them independently
        # (`year >= 2020 and month >= 10`) would discard January-September
        # of every year after 2020.
        if (year, month) >= START_YEAR_MONTH:
            categories.append(data["categories"])
            submitters.append(data["submitter"])
            years.append(year)

print("Number of articles:", len(years))

In [None]:
# Map every submitter to a country derived from the domain of their e-mail
# address. `countries` ends up parallel to `submitters`; entries are None when
# no country could be determined.
#
# Two caches avoid repeating work: one keyed by submitter, one keyed by domain.
already_seen_submitters = {}
already_seen_domains = {}

countries = []
for submitter in tqdm(submitters):
    country = None
    try:
        if submitter in already_seen_submitters:
            country = already_seen_submitters[submitter]
        else:
            submitter_info = get_submitter_info(submitter)
            domain = submitter_info.email.split("@")[1]
            if domain in already_seen_domains:
                country = already_seen_domains[domain]
            else:
                country = get_country_from_domain(domain)
                already_seen_domains[domain] = country
            # Remember the submitter whether the domain was a cache hit or a
            # fresh lookup, so repeat submitters skip get_submitter_info.
            already_seen_submitters[submitter] = country
    except Exception:
        # Best effort: any failure while resolving this submitter (missing
        # info, address without "@", failed domain lookup, ...) leaves the
        # country unknown. `Exception` rather than a bare `except` so that
        # KeyboardInterrupt can still stop this long-running loop.
        country = None
    countries.append(country)

In [None]:
# Step 1: count articles per resolved country name, so the pycountry lookup
# below runs once per distinct name instead of once per article.
country_name_counts = {}
for c in countries:
    if c is not None:
        country_name_counts[c] = country_name_counts.get(c, 0) + 1

# Step 2: translate each distinct name to an ISO 3166-1 alpha-3 code and
# accumulate the counts per code (several names may map to the same code).
countries_num_table = {}
unmatched_countries = []
for country_name, num_articles in country_name_counts.items():
    try:
        country_data = pycountry.countries.search_fuzzy(country_name)
    except LookupError:
        # search_fuzzy raises LookupError when nothing matches; skip the name
        # instead of aborting the whole cell.
        unmatched_countries.append(country_name)
        continue
    country_code = country_data[0].alpha_3
    countries_num_table[country_code] = (
        countries_num_table.get(country_code, 0) + num_articles
    )

if unmatched_countries:
    print("Countries not recognised by pycountry:", unmatched_countries)

# Step 3: build a column-oriented table with one row per country known to
# pycountry, using 0 for countries without any article.
table = {"Country": [], "Articles": []}

for c in pycountry.countries:
    country_code = c.alpha_3
    table["Country"].append(country_code)
    table["Articles"].append(countries_num_table.get(country_code, 0))

In [None]:
# Choropleth map coloured by the "Articles" column of `table`; the "Country"
# column (alpha-3 codes) is used both to locate each region and as hover title.
choropleth_options = {
    "locations": "Country",
    "color": "Articles",
    "hover_name": "Country",
    "color_continuous_scale": "Peach",
}
fig = px.choropleth(data_frame=table, **choropleth_options)

fig.show()