In [1]:
import pandas as pd

In [2]:
# Country population table
# via https://www.worldometers.info/world-population/population-by-country/
tables = pd.read_html(
    "SciHubData/Population-by-Country-2022-Worldometer.html"
)
# read_html returns a list of DataFrames (one per <table>); the first one is used
fullcdata = tables[0]
fullcdata

Unnamed: 0,#,Country (or dependency),Population (2020),Yearly Change,Net Change,Density (P/Km²),Land Area (Km²),Migrants (net),Fert. Rate,Med. Age,Urban Pop %,World Share
0,1,China,1439323776,0.39 %,5540090,153,9388211,-348399.0,1.7,38,61 %,18.47 %
1,2,India,1380004385,0.99 %,13586631,464,2973190,-532687.0,2.2,28,35 %,17.70 %
2,3,United States,331002651,0.59 %,1937734,36,9147420,954806.0,1.8,38,83 %,4.25 %
3,4,Indonesia,273523615,1.07 %,2898047,151,1811570,-98955.0,2.3,30,56 %,3.51 %
4,5,Pakistan,220892340,2.00 %,4327022,287,770880,-233379.0,3.6,23,35 %,2.83 %
...,...,...,...,...,...,...,...,...,...,...,...,...
230,231,Montserrat,4992,0.06 %,3,50,100,,N.A.,N.A.,10 %,0.00 %
231,232,Falkland Islands,3480,3.05 %,103,0,12170,,N.A.,N.A.,66 %,0.00 %
232,233,Niue,1626,0.68 %,11,6,260,,N.A.,N.A.,46 %,0.00 %
233,234,Tokelau,1357,1.27 %,17,136,10,,N.A.,N.A.,0 %,0.00 %


In [36]:
# keep only the country-name and population columns, under shorter names
short_names = {
    "Country (or dependency)": "country",
    "Population (2020)": "population",
}
cdata = fullcdata[list(short_names)].rename(columns=short_names)
cdata

Unnamed: 0,country,population
0,China,1439323776
1,India,1380004385
2,United States,331002651
3,Indonesia,273523615
4,Pakistan,220892340
...,...,...
230,Montserrat,4992
231,Falkland Islands,3480
232,Niue,1626
233,Tokelau,1357


In [1]:
# Last month's Sci-Hub downloads by country
# via https://sci-hub.se/datasets/country%20downloads%20per%20month/2022-02-14.tab
# via https://twitter.com/ringo_ring/status/1493407802957746176
# The file is tab-separated with no header row, so column names are supplied
# here; rows with a missing value in either column are dropped.
allscidata = pd.read_csv(
    "SciHubData/sci-hub-stats--2022-02-14.tab",
    sep="\t",
    header=None,
    names=["country", "articles"],
).dropna()
# .loc slices by index *label* and includes both endpoints: this keeps the rows
# whose original labels lie in 0..199 (fewer than 200 rows if dropna removed any)
scidata = allscidata.loc[0:199]
scidata

NameError: name 'pd' is not defined

## Join the two data tables

We want to join the Sci-Hub download stats with the country population data using `merge`.

In [43]:
# inner join on the country name: only countries present in both tables are kept,
# then any row with a missing value is dropped
df = scidata.merge(cdata, how="inner", on="country").dropna()
df

Unnamed: 0,country,articles,population
0,China,23712335,1439323776
1,United States,8919666,331002651
2,France,4265975,65273511
3,Brazil,2210009,212559417
4,India,1609609,1380004385
...,...,...,...
184,Grenada,162,112523
185,U.S. Virgin Islands,147,104425
186,Cayman Islands,96,65722
187,Sint Maarten,95,42876


In [46]:
# compute the number-of-articles-per-capita score and order countries by it,
# highest first
df2 = (
    df
    .assign(artperpop=df["articles"] / df["population"])
    .sort_values("artperpop", ascending=False)
)

# add the article-per-capita rank (1 = most articles per capita).
# method="min" gives tied countries the same whole-number rank; the default
# "average" method would assign mid-point ranks (e.g. 3.5) that astype(int)
# silently truncates.
df2["relrank"] = (
    df2["artperpop"].rank(method="min", ascending=False).astype(int)
)

# show top 20
df2[["relrank", "country", "artperpop", "articles", "population"]] \
  .set_index("relrank").head(20)

Unnamed: 0_level_0,country,artperpop,articles,population
relrank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Seychelles,3.093018,304189,98347
2,Singapore,0.098986,579102,5850342
3,Luxembourg,0.077327,48405,625978
4,France,0.065355,4265975,65273511
5,Macao,0.034244,22236,649335
6,United States,0.026947,8919666,331002651
7,Greece,0.025539,266193,10423054
8,Slovenia,0.024099,50101,2078938
9,Iceland,0.02318,7910,341243
10,Portugal,0.022685,231308,10196709


Seychelles is killing it! Singapore, Luxembourg, France, and Macao round out the top five.

According to [this tweet](https://twitter.com/science_surf/status/1493507065607827458) the high numbers for the Seychelles, Luxembourg, Singapore, and Macao are due to many users of TOR and VPNs using these locations as exit nodes.