In [36]:
import os
import glob
import pathlib
from datetime import datetime

from tqdm import tqdm

import numpy as np
import pandas as pd

# Root folder of the cleaned article corpus. The helpers below glob it as
# <country>/<publisher>/<year>/*.txt.
# NOTE(review): this is an absolute, machine-specific path; point it at your
# local copy of the data before running the notebook.
CLEAN_DATA_FOLDER = os.path.join("/Users", "mmackenzie", "Data", "peace-speech-project", "clean_sample")

In [64]:
def get_countries():
    """Return the names of the folders found directly under CLEAN_DATA_FOLDER.

    Each folder name is taken as the last component of the matched path.
    """
    pattern = os.path.join(CLEAN_DATA_FOLDER, "*/")
    names = []
    for folder in glob.glob(pattern):
        names.append(pathlib.Path(folder).parts[-1])
    return names

def get_all_files(country: str, publisher="*", year="*"):
    """List the ``.txt`` files of one country as paths relative to CLEAN_DATA_FOLDER.

    ``publisher`` and ``year`` are glob patterns; the default ``"*"`` matches
    every publisher / year folder. Each returned path is rebuilt from the last
    four components of the match, i.e. ``country/publisher/year/file``.
    """
    pattern = os.path.join(CLEAN_DATA_FOLDER, country, publisher, year, "*.txt")
    relative_paths = []
    for match in glob.glob(pattern):
        tail = pathlib.Path(match).parts[-4:]
        relative_paths.append(os.path.join(*tail))
    return relative_paths

def clean_file_name(file_name):
    """Extract ``(month, id)`` from a file name of the form ``<id>_<x>_<dd-mm-yy>.txt``.

    The last four characters (the extension) are dropped, the remainder must
    split on ``"_"`` into exactly three fields, and the third field is parsed
    as a ``%d-%m-%y`` date. The middle field is ignored.

    Returns a tuple of the month number (int) and the id (str).
    """
    stem = file_name[:-4]  # strip the 4-character extension, e.g. ".txt"
    article_id, _, date_str = stem.split("_")
    parsed = datetime.strptime(date_str, "%d-%m-%y")
    return (parsed.month, article_id)

def get_details_from_path(file_path):
    """Turn an article path into a labelled Series of its components.

    The last four path components are read as country / publisher / year /
    file name; the file name is further decoded by ``clean_file_name`` into
    month and id.

    Returns a ``pd.Series`` indexed by
    ``["country", "publisher", "year", "month", "id"]``.
    """
    tail = pathlib.Path(file_path).parts[-4:]
    folders, file_name = tail[:-1], tail[-1]
    month, article_id = clean_file_name(file_name)

    values = (*folders, month, article_id)
    labels = ["country", "publisher", "year", "month", "id"]
    return pd.Series(values, index=labels)

def get_text_from_file(file_path, path=CLEAN_DATA_FOLDER):
    """Read one article file and return its title and body text.

    Parameters
    ----------
    file_path : str
        Path of the article file, relative to ``path``.
    path : str, optional
        Folder that ``file_path`` is joined onto. Defaults to
        ``CLEAN_DATA_FOLDER``.

    Returns
    -------
    pandas.Series
        Two entries: ``"title"`` is the second line of the file and ``"text"``
        is the last line, both stripped of surrounding whitespace.

    Raises
    ------
    IndexError
        If the file contains fewer than two lines.
    """
    # The `with` block closes the file on exit, so no explicit close() is needed.
    with open(os.path.join(path, file_path), "r", encoding="ISO-8859-1") as f:
        lines = f.readlines()

    title = lines[1].strip()
    text = lines[-1].strip()

    return pd.Series(
        (title, text),
        index=["title", "text"]
    )

In [27]:
# Gather the relative paths of every article, one Series per country folder.
countries = get_countries()
article_paths = [pd.Series(get_all_files(country)) for country in tqdm(countries)]

# Stack the per-country Series into a single Series named "path".
articles = pd.concat(article_paths).rename("path")

100%|██████████| 20/20 [00:28<00:00,  1.43s/it]


In [28]:
# Needed so that pandas objects expose `progress_apply`.
tqdm.pandas()

# Decode every path into its components, then attach the path itself.
# Both sides get a fresh positional index so the join lines rows up one-to-one.
path_details = (
    articles
    .progress_apply(get_details_from_path)
    .reset_index(drop=True)
    .join(articles.reset_index(drop=True))
)

  from pandas import Panel
100%|██████████| 1415825/1415825 [07:27<00:00, 3166.91it/s]


In [34]:
# Number of articles available for each (country, publisher, year) combination.
sample_counts = (
    path_details
    .groupby(["country", "publisher", "year"])
    .size()
    .reset_index(name="num_articles")
)

In [37]:
# Upper bound on the sampling target (`num_to_sample`) that is handed to
# get_num_samples as `num_samples`.
CAP = 500

def get_num_samples(x, num_samples):
    """Compute how many articles to sample for one row.

    ``x`` must provide ``"pct"`` (the row's share of the target) and
    ``"total_articles"`` (how many articles are available). The result is
    ``num_samples * pct`` rounded with ``np.around``, limited to
    ``total_articles``, and never less than 1.
    """
    requested = int(np.around(num_samples * x["pct"]))
    available = x["total_articles"]

    # Cannot sample more articles than exist.
    capped = available if requested > available else requested

    # Always take at least one article.
    if capped > 0:
        return capped
    return 1

# Sampling target shared out among the publishers of each (country, year),
# bounded above by CAP. It does not depend on the country, so it is computed
# once rather than on every loop iteration.
num_to_sample = min(CAP, 500)

sampled_counts = []
for country, country_df in sample_counts.groupby("country"):

    # Going through each year at a time per country.
    for year, year_df in country_df.groupby("year"):
        publishers = year_df.groupby("publisher")["num_articles"].sum().reset_index()
        publishers["country"] = country
        publishers["year"] = year

        # Each publisher's share of this country's articles for the year.
        publishers["pct"] = publishers.num_articles / publishers.num_articles.sum()
        publishers["total_articles"] = publishers.num_articles

        # Target * share, limited to what is available and at least 1.
        publishers["sampled_articles"] = publishers.apply(
            get_num_samples, axis=1, num_samples=num_to_sample
        )

        sampled_counts.append(publishers[[
            "country", "year", "publisher", "total_articles", "sampled_articles"
        ]].copy())

# One row per (country, year, publisher) with its planned sample size.
small_sample = pd.concat(sampled_counts)
small_sample.head()

Unnamed: 0,country,year,publisher,total_articles,sampled_articles
0,AU,2010,abc-local,248,12
1,AU,2010,abc-online,2010,101
2,AU,2010,adnews,5,1
3,AU,2010,architecture-and-design,108,5
4,AU,2010,australian-personal-computer,193,10


In [46]:
# Grand totals of available vs. planned-to-sample articles, with thousands separators.
print(
    small_sample[["total_articles", "sampled_articles"]]
    .sum()
    .map("{:,d}".format)
)

total_articles      1,415,825
sampled_articles      111,854
dtype: object


In [56]:
import sys
def sample_paths(x, random_state=None):
    """Randomly pick the planned number of article paths from one group.

    Parameters
    ----------
    x : pandas.DataFrame
        Rows belonging to a single (country, publisher, year) group. Its first
        three columns must be country, publisher and year (read from the first
        row), and it must have a ``"path"`` column.
    random_state : optional
        Forwarded to ``DataFrame.sample``. The default ``None`` keeps the
        original unseeded behaviour; pass a seed to make the draw reproducible.

    Returns
    -------
    pandas.Series
        The ``"path"`` values of the sampled rows, keeping their row labels.

    Raises
    ------
    IndexError
        If ``small_sample`` has no row for this group.
    """
    country, publisher, year = x.iloc[0, :3]

    # Look up how many articles were planned for this group in `small_sample`.
    plan = small_sample.loc[
        (small_sample.country == country) &
        (small_sample.publisher == publisher) &
        (small_sample.year == year)
    ]
    # Select by column name rather than by position so a reordered
    # `small_sample` cannot silently return the wrong column.
    to_sample = plan["sampled_articles"].iloc[0]

    return x.sample(to_sample, random_state=random_state)["path"]

# Sample paths within every (country, publisher, year) group, then flatten the
# grouped result into a regular frame. The unnamed fourth index level becomes
# the "id" column.
paths_to_sample = (
    path_details
    .groupby(["country", "publisher", "year"])
    .progress_apply(sample_paths)
    .reset_index()
    .rename(columns={"level_3": "id"})
)
paths_to_sample.head()

100%|██████████| 14288/14288 [01:09<00:00, 206.75it/s]


country  publisher  year         
AU       9honey     2016  1194839    AU/9honey/2016/16005675_AU_20-12-16.txt
                    2017  1194702    AU/9honey/2017/19839531_AU_04-08-17.txt
                          1194701    AU/9honey/2017/17381786_AU_12-03-17.txt
                          1194706    AU/9honey/2017/18885940_AU_09-06-17.txt
                          1194653    AU/9honey/2017/21389159_AU_30-10-17.txt
Name: path, dtype: object

In [67]:
# Read the title and body text of every sampled article (one file read per row).
text = paths_to_sample["path"].progress_apply(get_text_from_file)

100%|██████████| 111854/111854 [10:04<00:00, 185.16it/s]


In [70]:
# Put the title/text columns next to the sampled paths and write the result to CSV.
sample_with_text = paths_to_sample.join(text)
sample_with_text.to_csv("../data/small_sample_text.csv", index=False)