In [None]:
from pathlib import Path
import pycountry
import plotly.express as px
from tqdm import tqdm

In [None]:
# Collect every country entry found in the *.txt files under data/countries.
countries_dir = Path("data/countries")
countries = []

# Sort the paths so the order of `countries` is reproducible between runs;
# glob() yields files in a filesystem-dependent order.
for p in sorted(countries_dir.glob("*.txt")):
    # Explicit encoding so the result does not depend on the platform default
    # (country names may contain non-ASCII characters).
    # NOTE(review): assumes the files are UTF-8 encoded -- confirm.
    with open(p, encoding="utf-8") as f:
        for line in f:
            entry = line.strip()
            # Drop blank lines (an empty string is not a country name and would
            # otherwise be fed to the country lookup later) and the literal
            # "None" marker that the original code already filtered out.
            if entry and entry != "None":
                countries.append(entry)

print("Collected data:", len(countries))

In [None]:
# Count how many collected entries map to each ISO alpha-3 country code.
countries_num_table = {}
# Names that need rewriting before the lookup.
name_map = {"Macedonia, the Former Yugoslav Republic of": "Macedonia"}
# Raw country string -> resolved alpha-3 code. The same country string occurs
# many times in `countries`, so each distinct string is resolved only once
# instead of repeating the fuzzy search for every duplicate.
country_code_cache = {}

for c in tqdm(countries, desc="Collecting countries codes"):
    country_code = country_code_cache.get(c)
    if country_code is None:
        country_name = name_map.get(c, c)
        # The first search result is taken as the match.
        # NOTE(review): per the pycountry docs search_fuzzy raises LookupError
        # when nothing matches; that is left to propagate so unknown names are
        # not silently dropped from the counts.
        country_data = pycountry.countries.search_fuzzy(country_name)
        country_code = country_data[0].alpha_3
        country_code_cache[c] = country_code
    countries_num_table[country_code] = countries_num_table.get(country_code, 0) + 1

In [None]:
def get_num_articles_label(n):
    """Return the bucket label for an article count.

    Buckets have an inclusive lower bound and an exclusive upper bound:

        n < 50          -> "< 50"
        50 <= n < 100   -> "50-100"
        100 <= n < 200  -> "100-200"
        200 <= n < 500  -> "200-500"
        n >= 500        -> "> 500"

    Args:
        n: Number of articles.

    Returns:
        The label string of the bucket that contains ``n``.
    """
    # Each check only needs the upper bound: the lower bound is implied by the
    # earlier checks having failed.
    if n < 50:
        return "< 50"
    if n < 100:
        return "50-100"
    if n < 200:
        return "100-200"
    if n < 500:
        return "200-500"
    # Includes n == 500, which previously matched no branch and returned None.
    return "> 500"

# Column-oriented table for plotting: one entry per country code, paired with
# the bucket label of its article count. Both columns follow the iteration
# order of countries_num_table, so the rows stay aligned.
table = {
    "Country": list(countries_num_table),
    "# Articles": [
        get_num_articles_label(num_articles)
        for num_articles in countries_num_table.values()
    ],
}

In [None]:
# Bucket labels in the order they should be listed, lowest count first.
article_buckets = ["< 50", "50-100", "100-200", "200-500", "> 500"]
# Discrete colours passed alongside the bucket order above.
bucket_colors = ["#fff4d6", "#fcbe7c", "#ff6600", "#c20000", "#5d00fc"]

# World map coloured by the article-count bucket of each country.
fig = px.choropleth(
    data_frame=table,
    locations="Country",
    color="# Articles",
    hover_name="Country",
    title="Number of articles submitted to arXiv (2020/08 - 2020/10)",
    category_orders={"# Articles": article_buckets},
    color_discrete_sequence=bucket_colors,
)

# Hide the frame drawn around the map.
fig.update_layout(geo=dict(showframe=False))

# Save a PDF copy, then render the figure in the notebook.
fig.write_image("data/countries.pdf")
fig.show()