In [None]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import glob
import shutil
import random
import pathlib
from datetime import datetime

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from tqdm import tqdm
import slugify

import utils

In [3]:
# Load the summary statistics (one row per year / month / country / publisher).
summary_stats_df = pd.read_csv("data/summary_stats.csv")

# Recode the country label "UK" as "GB"; every other value is left as it is.
summary_stats_df["country"] = summary_stats_df["country"].replace({"UK": "GB"})
summary_stats_df.head()

Unnamed: 0,year,month,country,publisher,num_articles,total_words
0,2010,1,AU,3AW,1,1505
1,2010,1,AU,ABC Message Stick,2,7284
2,2010,1,AU,ABC Online,143,78050
3,2010,1,AU,ABC Regional Online,55,21602
4,2010,1,AU,Architecture and Design,14,4981


In [4]:
# Total number of articles and words per (country, publisher), summed over
# every year and month present in the summary statistics.
by_publisher = (
    summary_stats_df
    .groupby(["country", "publisher"])[["num_articles", "total_words"]]
    .sum()
    .reset_index()
    .rename(columns={
        "num_articles": "publisher_tot_articles",
        "total_words": "publisher_tot_words",
    })
)

# A publisher counts as "big" when it has more than 1000 articles in total.
by_publisher["is_big_publisher"] = by_publisher["publisher_tot_articles"] > 1000
by_publisher.head()

Unnamed: 0,country,publisher,publisher_tot_articles,publisher_tot_words,is_big_publisher
0,AU,'Bourne this way,48,14834,False
1,AU,10 News First,37,6240,False
2,AU,10 daily,221,103872,False
3,AU,123Jump.com,3,778,False
4,AU,2DayFM,15,3771,False


In [5]:
# Keep only the summary rows that belong to big publishers: a right join
# against the flagged publisher totals, which also appends those totals.
big_publishers = pd.merge(
    summary_stats_df,
    by_publisher[by_publisher["is_big_publisher"]],
    on=["country", "publisher"],
    how="right",
)

big_publishers.head()

Unnamed: 0,year,month,country,publisher,num_articles,total_words,publisher_tot_articles,publisher_tot_words,is_big_publisher
0,2016,11,AU,9Honey,14,5941,3798,1839552,True
1,2016,12,AU,9Honey,52,18680,3798,1839552,True
2,2016,11,AU,9Honey,14,5941,3798,1839552,True
3,2016,12,AU,9Honey,56,20555,3798,1839552,True
4,2017,1,AU,9Honey,75,31747,3798,1839552,True


In [6]:
# Upper bound on the number of articles targeted per country and year; it is
# applied through np.min when num_to_sample is computed in the loop below.
CAP = 10000

def get_num_samples(x, num_samples):
    """Return how many articles to sample for one publisher row.

    Parameters
    ----------
    x : mapping-like (e.g. a DataFrame row)
        Must provide ``x["pct"]`` (the publisher's share of the articles) and
        ``x["total_articles"]`` (the number of articles available).
    num_samples : number
        Overall sampling target that ``pct`` is applied to.

    Returns
    -------
    ``num_samples * pct`` rounded with ``np.around``, limited to
    ``total_articles``, and never less than 1.
    """
    share, available = x["pct"], x["total_articles"]
    target = int(np.around(num_samples * share))

    # Never ask for more articles than the publisher actually has.
    capped = available if target > available else target

    # Every publisher contributes at least one article.
    if capped > 0:
        return capped
    return 1

# Work out, for every country / year / publisher, how many articles to sample.
sampled_counts = []
for country, country_df in big_publishers.groupby("country"):

    # Collapse the months: total number of articles per year for this country.
    articles_by_year = country_df.groupby("year")["num_articles"].sum()

    # The yearly target is the median of the yearly totals for the years
    # before 2016, and never more than CAP.
    num_to_sample = articles_by_year[articles_by_year.index < 2016].median()
    num_to_sample = np.min([CAP, num_to_sample])

    # Split that target across the publishers of each year, in proportion to
    # each publisher's share of the year's articles.
    for year, year_df in country_df.groupby("year"):
        publishers = (
            year_df.groupby("publisher")["num_articles"].sum().reset_index()
            .assign(
                country=country,
                year=year,
                pct=lambda df: df["num_articles"] / df["num_articles"].sum(),
                total_articles=lambda df: df["num_articles"],
            )
        )

        publishers["sampled_articles"] = publishers.apply(
            get_num_samples, axis=1, num_samples=num_to_sample
        )

        sampled_counts.append(
            publishers[
                ["country", "year", "publisher", "total_articles", "sampled_articles"]
            ].copy()
        )

# Stack the per-country / per-year tables and persist the sampling plan.
big_publisher_sampling = pd.concat(sampled_counts)
big_publisher_sampling.to_csv("misc/sampling_amounts.csv", index=False)
big_publisher_sampling.head()

Unnamed: 0,country,year,publisher,total_articles,sampled_articles
0,AU,2010,ABC Local,338,248
1,AU,2010,ABC Online,2743,2010
2,AU,2010,AdNews,7,5
3,AU,2010,Architecture and Design,147,108
4,AU,2010,Australian Personal Computer,263,193


In [7]:
# Grand totals of available vs. sampled articles, with thousands separators.
print(
    big_publisher_sampling[["total_articles", "sampled_articles"]]
    .sum()
    .map("{:,d}".format)
)

print("\nBy Year:")
# The same two totals, broken down by year (displayed as the cell output).
big_publisher_sampling.groupby("year")[["total_articles", "sampled_articles"]].sum()

total_articles      17,336,805
sampled_articles     1,465,631
dtype: object

By Year:


Unnamed: 0_level_0,total_articles,sampled_articles
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2010,258826,84177
2011,347212,109529
2012,448073,133451
2013,510685,141856
2014,576154,142436
2015,734857,142400
2016,3006000,142405
2017,3064616,142362
2018,2575893,142433
2019,2956593,142388


## Sampling

In [8]:
def get_all_file_paths(country: str, publisher: str, year: int, slug=True):
    """List the ``.txt`` files stored for one country / publisher / year.

    The folder searched is
    ``utils.CLEAN_FULL_DATA_FOLDER/<country>/<publisher>/<year>``; when
    ``slug`` is true the publisher name is first passed through
    ``slugify.slugify``.

    Returns
    -------
    list[str] or None
        Each matching path reduced to its last four components. ``None`` when
        the folder does not exist; in that case the folder path is printed.
    """
    publisher_dir = slugify.slugify(publisher) if slug else publisher
    folder_path = os.path.join(
        utils.CLEAN_FULL_DATA_FOLDER, country, publisher_dir, str(year)
    )

    if os.path.exists(folder_path):
        relative_paths = []
        for matched in glob.glob(os.path.join(folder_path, "*.txt")):
            # Keep only the trailing four path components.
            trailing_parts = pathlib.Path(matched).parts[-4:]
            relative_paths.append(os.path.join(*trailing_parts))
        return relative_paths

    # Report the missing folder so it can be inspected later.
    print(folder_path)
    return None

def sample_files(text_files, num_articles):
    """Pick ``num_articles`` distinct entries from ``text_files`` at random.

    Parameters
    ----------
    text_files : list
        Candidate file paths.
    num_articles : int
        Number of files wanted.

    Returns
    -------
    list
        ``text_files`` itself when more files are requested than exist,
        otherwise a new list of ``num_articles`` entries drawn without
        replacement, so no file is selected twice.
    """
    # int() so that integer-like values (e.g. numpy integers coming out of a
    # DataFrame row) are accepted by random.sample.
    k = int(num_articles)
    if k > len(text_files):
        return text_files

    # random.sample draws WITHOUT replacement; random.choices draws with
    # replacement and would return the same file more than once.
    return random.sample(text_files, k)

def get_articles_to_sample(to_sample):
    """Choose the article files to sample for one country / publisher / year.

    Parameters
    ----------
    to_sample : row-like object
        Must expose ``country``, ``publisher``, ``year`` and
        ``sampled_articles`` as attributes.

    Returns
    -------
    tuple
        ``(country, publisher, year, articles_found, sampled_paths)``.
        ``articles_found`` is the number of files found, or ``None`` when the
        folder does not exist. ``sampled_paths`` is the list of chosen file
        paths, or ``[None]`` when there was nothing to sample from.
    """
    text_file_paths = get_all_file_paths(
        to_sample.country, to_sample.publisher, to_sample.year, slug=True
    )

    if not text_file_paths:
        # get_all_file_paths returns None for a missing folder; calling len()
        # on it would raise TypeError, so the count is derived explicitly.
        articles_found = None if text_file_paths is None else len(text_file_paths)
        sampled_text_file_paths = [None]
    else:
        articles_found = len(text_file_paths)
        sampled_text_file_paths = sample_files(text_file_paths, to_sample.sampled_articles)

    return (
        to_sample.country,
        to_sample.publisher,
        to_sample.year,
        articles_found,
        sampled_text_file_paths,
    )

def copy_sampled_file(file_path):
    """Copy one sampled file from the full data folder to the clean data folder.

    ``file_path`` is joined onto ``utils.CLEAN_FULL_DATA_FOLDER`` to form the
    source and onto ``utils.CLEAN_DATA_FOLDER`` to form the destination, so
    the same sub-path is used under both roots. Missing destination
    directories are created.
    """
    from_path = pathlib.Path(os.path.join(utils.CLEAN_FULL_DATA_FOLDER, file_path))
    to_path = pathlib.Path(os.path.join(utils.CLEAN_DATA_FOLDER, file_path))

    # exist_ok=True replaces the separate exists() check, so the call cannot
    # fail if the directory appears between the check and the creation.
    to_path.parent.mkdir(parents=True, exist_ok=True)

    shutil.copyfile(from_path, to_path)

def get_id_from_path(file_path):
    """Extract the integer id that prefixes a file name.

    The id is the part of the file name before the first underscore.
    Missing values (anything ``pd.isna`` reports as NA) yield ``np.nan``.
    """
    if pd.isna(file_path):
        return np.nan

    file_name = pathlib.Path(file_path).parts[-1]
    id_token = file_name.partition("_")[0]
    return int(id_token)

def get_month_from_path(file_path):
    """Extract the month number from the date that ends a file name.

    The text after the last underscore, minus its final four characters
    (the extension), is parsed as ``%d-%m-%y``. Missing values (anything
    ``pd.isna`` reports as NA) yield ``np.nan``.
    """
    if pd.isna(file_path):
        return np.nan

    file_name = pathlib.Path(file_path).parts[-1]
    # Last underscore-separated token, with the 4-character extension removed.
    date_token = file_name.rpartition("_")[2][:-4]
    parsed = datetime.strptime(date_token, "%d-%m-%y")
    return parsed.month

# Register progress_apply on pandas objects.
tqdm.pandas()

# One tuple per sampling row, expanded to one row per sampled file path, then
# annotated with the id and month parsed from each file name.
to_sample_df = (
    pd.DataFrame(
        big_publisher_sampling.progress_apply(get_articles_to_sample, axis=1).to_list(),
        columns=["country", "publisher", "year", "articles_found", "to_sample_file_paths"],
    )
    .explode("to_sample_file_paths")
    .assign(
        id=lambda df: df["to_sample_file_paths"].apply(get_id_from_path),
        month=lambda df: df["to_sample_file_paths"].apply(get_month_from_path),
    )
)

  from pandas import Panel
100%|████████████████████████████████████████████████████████████████████████████| 14529/14529 [12:05<00:00, 20.02it/s]


In [20]:
# Inspect the sampling table (country, year, publisher, total vs. sampled articles).
big_publisher_sampling

Unnamed: 0,country,year,publisher,total_articles,sampled_articles
0,AU,2010,ABC Local,338,248
1,AU,2010,ABC Online,2743,2010
2,AU,2010,AdNews,7,5
3,AU,2010,Architecture and Design,147,108
4,AU,2010,Australian Personal Computer,263,193
...,...,...,...,...,...
60,ZA,2020,timeslive.co.za,2724,211
61,ZA,2020,weetracker.com,662,51
62,ZA,2020,wheels24.co.za,2323,180
63,ZA,2020,za.ign.com,2319,179


In [34]:
# Keep the identifying columns plus the sampled path (any other column, such
# as "month", is dropped here), then recompute the article id from the path.
to_sample_df = to_sample_df.loc[
    :, ["country", "year", "publisher", "articles_found", "to_sample_file_paths"]
].copy()
to_sample_df["id"] = to_sample_df["to_sample_file_paths"].apply(get_id_from_path)

# Attach the sampled files to the sampling plan and write the result to disk.
pd.merge(
    big_publisher_sampling,
    to_sample_df,
    how="left",
    on=["country", "publisher", "year"],
).to_csv("data/sampled_articles.csv", index=False)

In [None]:
%%time
# Copy every sampled file; rows without a path (nothing found) are skipped.
_ = to_sample_df["to_sample_file_paths"].dropna().progress_apply(copy_sampled_file)

### Number of paths where the directory wasn't found.

In [35]:
# Number of rows for which no file path is available.
int(to_sample_df["to_sample_file_paths"].isna().sum())

27674

#### Two reasons for this. 
#### 1. The folder wasn't created because none of the files in the folder were selected when subsampling. <br> (i.e. our 'clean' folder is the output of running Clean Text Files- SAMPLE.ipynb with MAX_SIZE = 50). <br> For example, none of the articles from year 2010 of the publisher skysports-com were selected, so the folder wasn't created. This won't be a problem on your side since you are using all the data. 
#### 2. Most of the US publishers don't have a 2019/2020 year folder. I remember you saying that there was a problem in how some of the US raw files were formatted. I'm not sure if you solved this problem but I think this is the reason why there are so many US 2019/2020 directories not found. 

In [36]:
# Rows for which no file path is available.
to_sample_df.loc[to_sample_df["to_sample_file_paths"].isna()]

Unnamed: 0,country,year,publisher,articles_found,to_sample_file_paths,id
0,AU,2010,ABC Local,,,
2263,AU,2010,AdNews,,,
2265,AU,2010,AdNews,,,
2367,AU,2010,Architecture and Design,,,
2568,AU,2010,Ballarat Courier,,,
...,...,...,...,...,...,...
1511934,ZA,2020,Reuters,,,
1514118,ZA,2020,channel24.co.za,,,
1515955,ZA,2020,fin24.com,,,
1516052,ZA,2020,goal.com,,,
