In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [2]:
# Summary table with year, month, country, publisher, num_articles and
# total_words columns.
summary_stats_df = pd.read_csv("../data/summary_stats.csv")

# Normalise the country code: rows tagged "UK" are relabelled "GB";
# every other value is left untouched.
summary_stats_df["country"] = summary_stats_df["country"].replace({"UK": "GB"})

summary_stats_df.head()

Unnamed: 0,year,month,country,publisher,num_articles,total_words
0,2010,1,AU,3AW,1,1505
1,2010,1,AU,ABC Message Stick,2,7284
2,2010,1,AU,ABC Online,143,78050
3,2010,1,AU,ABC Regional Online,55,21602
4,2010,1,AU,Architecture and Design,14,4981


In [3]:
# Total articles and words per (country, publisher), summed over every
# year/month row of the summary table.
by_publisher = (
    summary_stats_df
    .groupby(["country", "publisher"], as_index=False)[["num_articles", "total_words"]]
    .sum()
    .rename(columns={
        "num_articles": "publisher_tot_articles",
        "total_words": "publisher_tot_words",
    })
)

# Bin edges and their labels; with pd.cut's default right-closed intervals
# the edges read (0, 1], (1, 2], (2, 3], (3, 5], ... (1000, inf).
bins = [0, 1, 2, 3, 5, 10, 25, 100, 500, 1000, np.inf]
labels = ["1", "2", "3", "4-5", "6-10", "11-25", "26-100", "101-500", "501-1000", "1001+"]

# Tag each publisher with the size bucket its article total falls into.
by_publisher["publisher_bin"] = pd.cut(
    by_publisher["publisher_tot_articles"],
    bins=bins,
    labels=labels,
)
by_publisher.head()

Unnamed: 0,country,publisher,publisher_tot_articles,publisher_tot_words,publisher_bin
0,AU,'Bourne this way,48,14834,26-100
1,AU,10 News First,37,6240,26-100
2,AU,10 daily,221,103872,101-500
3,AU,123Jump.com,3,778,3
4,AU,2DayFM,15,3771,11-25


In [4]:
# Attach the per-publisher totals to the summary rows, keeping only the
# publishers that landed in the largest ("1001+") size bucket.
big_publishers = summary_stats_df.merge(
    by_publisher.loc[by_publisher["publisher_bin"] == "1001+"],
    on=["country", "publisher"],
    how="right",
)

big_publishers.head()

Unnamed: 0,year,month,country,publisher,num_articles,total_words,publisher_tot_articles,publisher_tot_words,publisher_bin
0,2016,11,AU,9Honey,14,5941,3798,1839552,1001+
1,2016,12,AU,9Honey,52,18680,3798,1839552,1001+
2,2016,11,AU,9Honey,14,5941,3798,1839552,1001+
3,2016,12,AU,9Honey,56,20555,3798,1839552,1001+
4,2017,1,AU,9Honey,75,31747,3798,1839552,1001+


In [13]:
# Ceiling on the sampling target computed for each country; that target is
# then spread across the country's publishers separately for every year.
CAP = 10_000

def get_num_samples(x, num_samples):
    """Return how many articles to draw for a single publisher row.

    Args:
        x: Mapping-like row (e.g. a DataFrame row) providing ``"pct"`` and
            ``"total_articles"``.
        num_samples: Overall sampling target that ``pct`` is applied to.

    Returns:
        ``num_samples * pct`` rounded with ``np.around`` and converted to
        ``int``, or ``x["total_articles"]`` itself when the rounded figure
        would exceed it.
    """
    share, available = x["pct"], x["total_articles"]
    wanted = int(np.around(num_samples * share))

    # Never ask for more articles than the publisher actually has.
    return available if wanted > available else wanted

# Years strictly before this one form the baseline from which each country's
# sampling target is derived.
BASELINE_END_YEAR = 2016

sampled_counts = []
for country, country_df in big_publishers.groupby("country"):

    # Total articles per year for this country.
    articles_by_year = country_df.groupby("year")["num_articles"].sum()

    baseline_median = articles_by_year[articles_by_year.index < BASELINE_END_YEAR].median()
    if pd.isna(baseline_median):
        # With no baseline years the median is NaN, which would otherwise
        # only surface later as an opaque "cannot convert float NaN to
        # integer" error inside get_num_samples.
        raise ValueError(
            f"No articles before {BASELINE_END_YEAR} for country {country!r}; "
            "cannot derive a sampling target."
        )

    # Sampling target for every year of this country: the median
    # baseline-year volume, limited to CAP.
    num_to_sample = min(CAP, baseline_median)

    for year, year_df in country_df.groupby("year"):
        # Articles per publisher within this country/year.
        publishers = (
            year_df.groupby("publisher")["num_articles"].sum()
            .rename("total_articles")
            .reset_index()
        )
        publishers["country"] = country
        publishers["year"] = year

        # Each publisher's share of the year's articles; the target is
        # allocated proportionally to this share.
        publishers["pct"] = publishers["total_articles"] / publishers["total_articles"].sum()

        publishers["sampled_articles"] = publishers.apply(
            get_num_samples, axis=1, num_samples=num_to_sample
        )

        sampled_counts.append(publishers[[
            "country", "year", "publisher", "total_articles", "sampled_articles"
        ]])

# One row per (country, year, publisher) with available and sampled counts.
big_publisher_sampling = pd.concat(sampled_counts)
big_publisher_sampling.to_csv("sampling_amounts.csv", index=False)
big_publisher_sampling.head()

Unnamed: 0,country,year,publisher,total_articles,sampled_articles
0,AU,2010,ABC Local,338,248
1,AU,2010,ABC Online,2743,2010
2,AU,2010,AdNews,7,5
3,AU,2010,Architecture and Design,147,108
4,AU,2010,Australian Personal Computer,263,193


In [14]:
# Grand totals of available vs. sampled articles, shown with thousands
# separators.
print(
    big_publisher_sampling[["total_articles", "sampled_articles"]]
    .sum()
    .apply("{:,d}".format)
)

# Same two totals broken down per year (rendered as the cell's output).
print("\nBy Year:")
big_publisher_sampling.groupby("year")[["total_articles", "sampled_articles"]].sum()

total_articles      17,336,805
sampled_articles     1,464,959
dtype: object

By Year:


Unnamed: 0_level_0,total_articles,sampled_articles
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2010,258826,84139
2011,347212,109477
2012,448073,133394
2013,510685,141804
2014,576154,142360
2015,734857,142358
2016,3006000,142357
2017,3064616,142295
2018,2575893,142342
2019,2956593,142332
