In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import glob
import shutil
import random
import pathlib
from datetime import datetime

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from tqdm import tqdm
import slugify

import utils

In [3]:
summary_stats_df = pd.read_csv("data/summary_stats.csv")

summary_stats_df["country"] = summary_stats_df["country"].apply(lambda x: "GB" if x == "UK" else x)
summary_stats_df.head()

Unnamed: 0,year,month,country,publisher,num_articles,total_words
0,2010,1,AU,3AW,1,1505
1,2010,1,AU,ABC Message Stick,2,7284
2,2010,1,AU,ABC Online,143,78050
3,2010,1,AU,ABC Regional Online,55,21602
4,2010,1,AU,Architecture and Design,14,4981


In [4]:
by_publisher = summary_stats_df.groupby(["country", "publisher"])
by_publisher = by_publisher[["num_articles", "total_words"]]
by_publisher = by_publisher.sum().reset_index().rename(columns={
    "num_articles": "publisher_tot_articles", 
    "total_words": "publisher_tot_words"
})

by_publisher["is_big_publisher"] = by_publisher.publisher_tot_articles.apply(lambda x: x > 1000)
by_publisher.head()

Unnamed: 0,country,publisher,publisher_tot_articles,publisher_tot_words,is_big_publisher
0,AU,'Bourne this way,48,14834,False
1,AU,10 News First,37,6240,False
2,AU,10 daily,221,103872,False
3,AU,123Jump.com,3,778,False
4,AU,2DayFM,15,3771,False


In [5]:
big_publishers = summary_stats_df.merge(
    by_publisher.query("is_big_publisher"), 
    how="right", on=["country", "publisher"]
)

big_publishers.head()

Unnamed: 0,year,month,country,publisher,num_articles,total_words,publisher_tot_articles,publisher_tot_words,is_big_publisher
0,2016,11,AU,9Honey,14,5941,3798,1839552,True
1,2016,12,AU,9Honey,52,18680,3798,1839552,True
2,2016,11,AU,9Honey,14,5941,3798,1839552,True
3,2016,12,AU,9Honey,56,20555,3798,1839552,True
4,2017,1,AU,9Honey,75,31747,3798,1839552,True


In [6]:
CAP = 10000

def get_num_samples(x, num_samples):
    pct, max_ = x["pct"], x["total_articles"]
    n = int(np.around(num_samples * pct))
    if n > max_:
        n = max_
        
    return n if n > 0 else 1

sampled_counts = []
for country, country_df in big_publishers.groupby("country"):
    
    # Get rid of month
    # For each contry group by years. Articles per year. 
    articles_by_year = country_df.groupby("year")["num_articles"].sum()
    
    # Median from the number of articles that are of the year 2015 and below. 
    num_to_sample = articles_by_year[articles_by_year.index < 2016].median()
    num_to_sample = np.min([CAP, num_to_sample])
    
    # Going through each year at a time per country. 
    for year, year_df in country_df.groupby("year"):
        publishers = year_df.groupby("publisher")["num_articles"].sum().reset_index()
        publishers["country"] = country
        publishers["year"] = year
        
        publishers["pct"] = publishers.num_articles / publishers.num_articles.sum()
        publishers["total_articles"] = publishers.num_articles
        
        publishers["sampled_articles"] = publishers.apply(
            get_num_samples, axis=1, num_samples=num_to_sample
        )
    
        sampled_counts.append(publishers[[
            "country", "year", "publisher", "total_articles", "sampled_articles"
         ]].copy())
        
big_publisher_sampling = pd.concat(sampled_counts)
big_publisher_sampling.to_csv("misc/sampling_amounts.csv", index=False)
big_publisher_sampling.head()

Unnamed: 0,country,year,publisher,total_articles,sampled_articles
0,AU,2010,ABC Local,338,248
1,AU,2010,ABC Online,2743,2010
2,AU,2010,AdNews,7,5
3,AU,2010,Architecture and Design,147,108
4,AU,2010,Australian Personal Computer,263,193


In [7]:
print(big_publisher_sampling[["total_articles", "sampled_articles"]].sum().apply(lambda x: f"{x:,d}"))

print("\nBy Year:")
big_publisher_sampling.groupby("year")[["total_articles", "sampled_articles"]].sum()

total_articles      17,336,805
sampled_articles     1,465,631
dtype: object

By Year:


Unnamed: 0_level_0,total_articles,sampled_articles
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2010,258826,84177
2011,347212,109529
2012,448073,133451
2013,510685,141856
2014,576154,142436
2015,734857,142400
2016,3006000,142405
2017,3064616,142362
2018,2575893,142433
2019,2956593,142388


## Sampling

In [9]:
CLEAN_PATH = r"D:\Data\peace-speech-project\clean_full"

def article_is_empty(text_file):
    return os.stat(text_file).st_size == 0

def simplify_path(text_file):
    return os.path.join(*pathlib.Path(text_file).parts[-4:])

def split_valid_and_empty_files(text_files):
    valid, empty = [], []
    for text_file in text_files:
        (valid, empty)[article_is_empty(text_file)].append(simplify_path(text_file))
        
    return valid, empty

def get_all_file_paths(country: str, publisher: str, year: int, slug=True):
    if slug:
        publisher = slugify.slugify(publisher)
        
    folder_path = os.path.join(CLEAN_PATH, country, publisher, str(year))
    if not os.path.exists(folder_path):
        print(folder_path)
        return None
    
    file_path = os.path.join(folder_path, "*.txt")
    text_files = glob.glob(file_path)
    return split_valid_and_empty_files(text_files)

def sample_files(text_files, num_articles):
    if num_articles > len(text_files):
        return text_files
    
    return np.random.choice(text_files, size=num_articles, replace=False)

def get_articles_to_sample(to_sample):
    valid_files, empty_files = get_all_file_paths(to_sample.country, to_sample.publisher, to_sample.year, slug=True)
    
    if not valid_files:
        sampled_text_file_paths = []
    else:
        sampled_text_file_paths = sample_files(valid_files, to_sample.sampled_articles)
    
    return (to_sample.country, to_sample.publisher, to_sample.year, len(valid_files), len(empty_files), sampled_text_file_paths)

def copy_sampled_file(file_path):
    from_path = pathlib.Path(os.path.join(CLEAN_PATH, file_path))
    to_path = pathlib.Path(os.path.join(utils.CLEAN_DATA_FOLDER, file_path))

    if not to_path.parent.exists():
        to_path.parent.mkdir(parents=True)

    shutil.copyfile(from_path, to_path)

def get_id_from_path(file_path):
    if not pd.isna(file_path):
        file_name = pathlib.Path(file_path).parts[-1]
        return int(file_name.split("_")[0])

    return np.nan

def get_month_from_path(file_path):
    if not pd.isna(file_path):
        file_name = pathlib.Path(file_path).parts[-1]
        date_str = file_name.split("_")[-1][:-4]
        return datetime.strptime(date_str, "%d-%m-%y").month

    return np.nan

In [10]:
bp_sample = big_publisher_sampling.copy() #big_publisher_sampling[big_publisher_sampling.country == "AU"].copy()

tqdm.pandas()
to_sample_df = pd.DataFrame(
    bp_sample.progress_apply(get_articles_to_sample, axis=1).to_list(), 
    columns=["country", "publisher", "year", "valid_articles_found", "empty_articles_found", "to_sample_file_path"]
).explode("to_sample_file_path")

to_sample_df["id"] = to_sample_df.to_sample_file_path.apply(get_id_from_path)
to_sample_df["month"] = to_sample_df.to_sample_file_path.apply(get_month_from_path)

  from pandas import Panel
100%|████████████████████████████████████████████████████████████████████████████| 14529/14529 [51:35<00:00,  4.69it/s]


In [11]:
bp_sample.rename(columns={"sampled_articles": "num_articles_to_sample"}, inplace=True)

sampled_articles = bp_sample.merge(
    to_sample_df, 
    on=["country", "publisher", "year"], 
    how="left"
)

sampled_articles["total_articles_found"] = sampled_articles.valid_articles_found + sampled_articles.empty_articles_found

sampled_articles.to_csv("data/sampled_articles.csv", index=False)
sampled_articles.head()

Unnamed: 0,country,year,publisher,total_articles,num_articles_to_sample,valid_articles_found,empty_articles_found,to_sample_file_path,id,month,total_articles_found
0,AU,2010,ABC Local,338,248,338,0,AU\abc-local\2010\1706268_AU_09-11-10.txt,1706268.0,11.0,338
1,AU,2010,ABC Local,338,248,338,0,AU\abc-local\2010\1520721_AU_14-06-10.txt,1520721.0,6.0,338
2,AU,2010,ABC Local,338,248,338,0,AU\abc-local\2010\1492348_AU_20-05-10.txt,1492348.0,5.0,338
3,AU,2010,ABC Local,338,248,338,0,AU\abc-local\2010\1617875_AU_01-09-10.txt,1617875.0,9.0,338
4,AU,2010,ABC Local,338,248,338,0,AU\abc-local\2010\1641111_AU_20-09-10.txt,1641111.0,9.0,338


In [12]:
repeat_ids_check = sampled_articles.groupby(["country", "publisher", "year", "id"]).size().rename("num_sampled").reset_index()
repeat_ids_check[repeat_ids_check.num_sampled > 1]

Unnamed: 0,country,publisher,year,id,num_sampled


In [13]:
key = ["country", "publisher", "year"]
num_articles_sampled = sampled_articles.groupby(key).to_sample_file_path.count().to_frame().reset_index()
num_articles_sampled.rename(columns={"to_sample_file_path": "num_articles_sampled"}, inplace=True)

sanity_check_df = sampled_articles.merge(num_articles_sampled, on=key, how="left")
sanity_check_df["articles_in_folder_check"] = sanity_check_df.total_articles == sanity_check_df.total_articles_found
sanity_check_df["num_articles_sampled_check"] = sanity_check_df.num_articles_to_sample == sanity_check_df.num_articles_sampled
sanity_check_df["sanity_check"] = sanity_check_df.articles_in_folder_check & sanity_check_df.articles_in_folder_check

In [14]:
total_articles_check = sanity_check_df.groupby(key + ["num_articles_to_sample"])
total_articles_check = total_articles_check.size().rename("num_articles_sampled").reset_index()
total_articles_check[["num_articles_to_sample", "num_articles_sampled"]].sum()

num_articles_to_sample    1465631
num_articles_sampled      1465240
dtype: int64

In [17]:
sanity_check_df[~sanity_check_df.sanity_check]

Unnamed: 0,country,year,publisher,total_articles,num_articles_to_sample,valid_articles_found,empty_articles_found,to_sample_file_path,id,month,total_articles_found,num_articles_sampled,articles_in_folder_check,num_articles_sampled_check,sanity_check
59999,AU,2016,9Honey,136,6,70,0,AU\9honey\2016\15895978_AU_13-12-16.txt,15895978.0,12.0,70,6,False,True,False
60000,AU,2016,9Honey,136,6,70,0,AU\9honey\2016\15855997_AU_11-12-16.txt,15855997.0,12.0,70,6,False,True,False
60001,AU,2016,9Honey,136,6,70,0,AU\9honey\2016\15849634_AU_11-12-16.txt,15849634.0,12.0,70,6,False,True,False
60002,AU,2016,9Honey,136,6,70,0,AU\9honey\2016\16008320_AU_20-12-16.txt,16008320.0,12.0,70,6,False,True,False
60003,AU,2016,9Honey,136,6,70,0,AU\9honey\2016\15732244_AU_04-12-16.txt,15732244.0,12.0,70,6,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454255,ZA,2019,moneyweb.co.za,671,37,2095,304,ZA\moneyweb-co-za\2019\30726246_ZA_01-04-19.txt,30726246.0,4.0,2399,37,False,True,False
1454256,ZA,2019,moneyweb.co.za,671,37,2095,304,ZA\moneyweb-co-za\2019\30456195_ZA_15-01-19.txt,30456195.0,1.0,2399,37,False,True,False
1454257,ZA,2019,moneyweb.co.za,671,37,2095,304,ZA\moneyweb-co-za\2019\30938411_ZA_03-06-19.txt,30938411.0,6.0,2399,37,False,True,False
1454258,ZA,2019,moneyweb.co.za,671,37,2095,304,ZA\moneyweb-co-za\2019\30961580_ZA_10-06-19.txt,30961580.0,6.0,2399,37,False,True,False


In [18]:
sanity_check_df[~sanity_check_df.articles_in_folder_check]

Unnamed: 0,country,year,publisher,total_articles,num_articles_to_sample,valid_articles_found,empty_articles_found,to_sample_file_path,id,month,total_articles_found,num_articles_sampled,articles_in_folder_check,num_articles_sampled_check,sanity_check
59999,AU,2016,9Honey,136,6,70,0,AU\9honey\2016\15895978_AU_13-12-16.txt,15895978.0,12.0,70,6,False,True,False
60000,AU,2016,9Honey,136,6,70,0,AU\9honey\2016\15855997_AU_11-12-16.txt,15855997.0,12.0,70,6,False,True,False
60001,AU,2016,9Honey,136,6,70,0,AU\9honey\2016\15849634_AU_11-12-16.txt,15849634.0,12.0,70,6,False,True,False
60002,AU,2016,9Honey,136,6,70,0,AU\9honey\2016\16008320_AU_20-12-16.txt,16008320.0,12.0,70,6,False,True,False
60003,AU,2016,9Honey,136,6,70,0,AU\9honey\2016\15732244_AU_04-12-16.txt,15732244.0,12.0,70,6,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454255,ZA,2019,moneyweb.co.za,671,37,2095,304,ZA\moneyweb-co-za\2019\30726246_ZA_01-04-19.txt,30726246.0,4.0,2399,37,False,True,False
1454256,ZA,2019,moneyweb.co.za,671,37,2095,304,ZA\moneyweb-co-za\2019\30456195_ZA_15-01-19.txt,30456195.0,1.0,2399,37,False,True,False
1454257,ZA,2019,moneyweb.co.za,671,37,2095,304,ZA\moneyweb-co-za\2019\30938411_ZA_03-06-19.txt,30938411.0,6.0,2399,37,False,True,False
1454258,ZA,2019,moneyweb.co.za,671,37,2095,304,ZA\moneyweb-co-za\2019\30961580_ZA_10-06-19.txt,30961580.0,6.0,2399,37,False,True,False


In [19]:
sanity_check_df[~sanity_check_df.num_articles_sampled_check]

Unnamed: 0,country,year,publisher,total_articles,num_articles_to_sample,valid_articles_found,empty_articles_found,to_sample_file_path,id,month,total_articles_found,num_articles_sampled,articles_in_folder_check,num_articles_sampled_check,sanity_check
22843,AU,2012,Bendigo Advertiser,1,1,0,1,,,,1,0,True,False,True
52796,AU,2015,Australasian Lawyer,8,1,0,8,,,,8,0,True,False,True
52802,AU,2015,Ballarat Courier,3,1,0,3,,,,3,0,True,False,True
54164,AU,2015,E! Online,21,4,0,21,,,,21,0,True,False,True
54188,AU,2015,Eurosport.com AU,10,2,0,10,,,,10,0,True,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
664615,JM,2020,loopjamaica.com,1839,1839,1535,304,JM\loopjamaica-com\2020\72288381_JM_30-09-20.txt,72288381.0,9.0,1839,1535,True,False,True
664616,JM,2020,loopjamaica.com,1839,1839,1535,304,JM\loopjamaica-com\2020\72288384_JM_30-09-20.txt,72288384.0,9.0,1839,1535,True,False,True
664617,JM,2020,loopjamaica.com,1839,1839,1535,304,JM\loopjamaica-com\2020\72288461_JM_30-09-20.txt,72288461.0,9.0,1839,1535,True,False,True
664618,JM,2020,loopjamaica.com,1839,1839,1535,304,JM\loopjamaica-com\2020\72288881_JM_30-09-20.txt,72288881.0,9.0,1839,1535,True,False,True


In [23]:
%%time
_ = to_sample_df[~to_sample_df.to_sample_file_path.isna()].to_sample_file_path.progress_apply(copy_sampled_file)

100%|█████████████████████████████████████████████████████████████████████| 1465233/1465233 [2:15:06<00:00, 180.74it/s]

Wall time: 2h 15min 6s





### Number of paths where the directory wasn't found.

In [20]:
sampled_articles[sampled_articles.to_sample_file_path.isna()].shape[0]

7

In [21]:
sampled_articles[sampled_articles.to_sample_file_path.isna()]

Unnamed: 0,country,year,publisher,total_articles,num_articles_to_sample,valid_articles_found,empty_articles_found,to_sample_file_path,id,month,total_articles_found
22843,AU,2012,Bendigo Advertiser,1,1,0,1,,,,1
52796,AU,2015,Australasian Lawyer,8,1,0,8,,,,8
52802,AU,2015,Ballarat Courier,3,1,0,3,,,,3
54164,AU,2015,E! Online,21,4,0,21,,,,21
54188,AU,2015,Eurosport.com AU,10,2,0,10,,,,10
59954,AU,2015,Western Advocate,7,1,0,7,,,,7
100129,AU,2020,CyclingTips,1,1,0,1,,,,1


#### Two reasons for this. 
#### 1. The folder wasn't created beacuse none of the files in the folder were selected when subsampled. <br> (i.e. our 'clean' folder is output of running Clean Text Files- SAMPLE.ipynb, with MAX_SIZE = 50). <br> For example, none of the articles from year 2010 of the publisher skysports-com were selected thus the folder wasn't created. This won't be a problem on your side since you are using all the data. 
#### 2. Most of the US publishers don't have a 2019/2020 year folder. I remember you saying that there was a problem in how some of the US raw files were formatted. I'm not sure if you solved this problem but I think this is the reason why there are so many US 2019/2020 directories not found. 

In [36]:
to_sample_df[to_sample_df.to_sample_file_paths.isna()]

Unnamed: 0,country,year,publisher,articles_found,to_sample_file_paths,id
0,AU,2010,ABC Local,,,
2263,AU,2010,AdNews,,,
2265,AU,2010,AdNews,,,
2367,AU,2010,Architecture and Design,,,
2568,AU,2010,Ballarat Courier,,,
...,...,...,...,...,...,...
1511934,ZA,2020,Reuters,,,
1514118,ZA,2020,channel24.co.za,,,
1515955,ZA,2020,fin24.com,,,
1516052,ZA,2020,goal.com,,,
