In [1]:
import os
import glob
import pathlib
from datetime import datetime

from tqdm import tqdm

import numpy as np
import pandas as pd

# Root folder of the cleaned article sample.
# Set the PEACE_SPEECH_CLEAN_DATA environment variable to point the notebook at
# another copy of the data without editing this cell, e.g.
#   os.path.join("/Users", "mmackenzie", "Data", "peace-speech-project", "clean_sample")
# When the variable is unset the original local path below is used unchanged.
CLEAN_DATA_FOLDER = os.environ.get(
    "PEACE_SPEECH_CLEAN_DATA",
    r"C:\Users\mattb\Documents\GitHub\peace-speech-project\data\clean_sample",
)
CLEAN_DATA_FOLDER

'C:\\Users\\mattb\\Documents\\GitHub\\peace-speech-project\\data\\clean_sample'

In [2]:
def get_countries():
    """Return the name of every directory sitting directly inside CLEAN_DATA_FOLDER."""
    # The trailing "/" in the pattern restricts the glob to directories.
    pattern = os.path.join(CLEAN_DATA_FOLDER, "*/")
    names = []
    for folder in glob.glob(pattern):
        names.append(pathlib.Path(folder).parts[-1])
    return names

def get_all_files(country: str, publisher="*", year="*"):
    """Find the ``.txt`` files stored under ``country/publisher/year``.

    ``publisher`` and ``year`` are glob patterns; the default ``"*"`` matches
    every folder at that level. Each match is returned as its last four path
    components re-joined with ``os.path.join``.
    """
    pattern = os.path.join(CLEAN_DATA_FOLDER, country, publisher, year, "*.txt")
    relative_paths = []
    for match in glob.glob(pattern):
        tail = pathlib.Path(match).parts[-4:]
        relative_paths.append(os.path.join(*tail))
    return relative_paths

def clean_file_name(file_name):
    """Extract ``(month, id)`` from an article file name.

    The last four characters of ``file_name`` are dropped and the remainder
    must split into exactly three ``"_"``-separated fields: the id, a field
    that is ignored, and a date in ``%d-%m-%y`` format. A ``ValueError`` is
    raised when the field count or the date format does not match.
    """
    stem = file_name[:-4]
    article_id, _ignored, date_text = stem.split("_")
    parsed = datetime.strptime(date_text, "%d-%m-%y")
    return (parsed.month, article_id)

def get_details_from_path(file_path):
    """Turn an article path into a labelled Series of its components.

    The last four path components are read as three folder levels followed by
    the file name; the file name is decoded with ``clean_file_name``. The
    result is indexed by country, publisher, year, month and id, in that order.
    """
    tail = pathlib.Path(file_path).parts[-4:]
    folders, file_name = tail[:-1], tail[-1]
    month_and_id = clean_file_name(file_name)
    # Both pieces are tuples, so "+" concatenates them into one flat tuple.
    values = folders + month_and_id
    return pd.Series(values, index=["country", "publisher", "year", "month", "id"])

def get_text_from_file(file_path, path=CLEAN_DATA_FOLDER):
    """Read one article file and return its title and text.

    Parameters
    ----------
    file_path : str
        Location of the file relative to ``path``.
    path : str
        Folder that ``file_path`` is joined onto. The default is the value
        CLEAN_DATA_FOLDER had when this function was defined.

    Returns
    -------
    pd.Series
        Indexed by ``"title"`` (the second line of the file, stripped) and
        ``"text"`` (the last line of the file, stripped).

    Raises
    ------
    IndexError
        If the file contains fewer than two lines.
    """
    # The with-block closes the file on exit, so no explicit close() is needed.
    with open(os.path.join(path, file_path), "r", encoding="ISO-8859-1") as f:
        lines = f.readlines()

    title = lines[1].strip()
    text = lines[-1].strip()

    return pd.Series((title, text), index=["title", "text"])

In [3]:
countries = get_countries()

# One Series of relative file paths per country folder.
article_paths = [
    pd.Series(get_all_files(country))
    for country in tqdm(countries, desc="Finding files")
]

# Stack them into a single Series named "path"; each piece keeps its own
# 0-based index, so index labels repeat across countries.
articles = pd.concat(article_paths).rename("path")

Finding files: 100%|███████████████████████████████████████████████████████████████████| 20/20 [01:01<00:00,  3.05s/it]


In [8]:
tqdm.pandas(desc="Getting details")

# Parse every path into its components, renumber the rows 0..n-1, and attach
# the raw path (renumbered the same way) as an extra column.
path_details = (
    articles
    .progress_apply(get_details_from_path)
    .reset_index(drop=True)
    .join(articles.reset_index(drop=True))
)

  from pandas import Panel
Getting details: 100%|█████████████████████████████████████████████████████| 1415825/1415825 [07:04<00:00, 3335.10it/s]


In [9]:
# Number of articles for each (country, publisher, year) combination.
group_sizes = path_details.groupby(["country", "publisher", "year"]).size()
sample_counts = group_sizes.reset_index(name="num_articles")

In [10]:
# Sampling budget (a number of articles) passed on as ``num_samples`` further down.
CAP = 2500

def get_num_samples(x, num_samples):
    """Work out how many articles to draw for one row.

    ``x`` must provide ``"pct"`` (the row's share of the budget) and
    ``"total_articles"`` (how many articles are available). The share of
    ``num_samples`` is rounded with ``np.around``, limited to the number
    available, and never allowed to fall below 1.
    """
    share = x["pct"]
    available = x["total_articles"]

    target = int(np.around(num_samples * share))
    capped = available if target > available else target

    if capped > 0:
        return capped
    return 1

sampled_counts = []
for country, country_df in sample_counts.groupby("country"):

    # Per-year article totals for this country (not read again in this cell).
    articles_by_year = country_df.groupby("year")["num_articles"].sum()

    # Every (country, year) slice gets the same budget of CAP articles.
    num_to_sample = CAP

    # Split each year's budget across publishers in proportion to how many
    # articles each of them has in that year.
    for year, year_df in country_df.groupby("year"):
        publishers = (
            year_df
            .groupby("publisher")["num_articles"]
            .sum()
            .reset_index()
            .assign(
                country=country,
                year=year,
                pct=lambda df: df.num_articles / df.num_articles.sum(),
                total_articles=lambda df: df.num_articles,
            )
        )

        publishers["sampled_articles"] = publishers.apply(
            get_num_samples, axis=1, num_samples=num_to_sample
        )

        sampled_counts.append(
            publishers[
                ["country", "year", "publisher", "total_articles", "sampled_articles"]
            ].copy()
        )

small_sample = pd.concat(sampled_counts)
small_sample.head()

Unnamed: 0,country,year,publisher,total_articles,sampled_articles
0,AU,2010,abc-local,248,62
1,AU,2010,abc-online,2010,503
2,AU,2010,adnews,5,1
3,AU,2010,architecture-and-design,108,27
4,AU,2010,australian-personal-computer,193,48


In [11]:
# Grand totals of available vs. sampled articles, with thousands separators.
print(
    small_sample[["total_articles", "sampled_articles"]]
    .sum()
    .apply("{:,d}".format)
)

total_articles      1,415,825
sampled_articles      483,870
dtype: object


In [12]:
import sys
def sample_paths(x, random_state=None):
    """Draw one group's quota of article paths at random.

    Parameters
    ----------
    x : pd.DataFrame
        The rows belonging to a single (country, publisher, year) group. It
        must have ``country``, ``publisher``, ``year`` and ``path`` columns;
        the group is identified from the first row.
    random_state : optional
        Forwarded to ``DataFrame.sample``. The default ``None`` leaves the
        draw unseeded; pass a fixed value to make the sample reproducible.

    Returns
    -------
    pd.Series
        The ``path`` values of the sampled rows, keeping their index labels.

    Raises
    ------
    IndexError
        If ``small_sample`` holds no row for this group.
    """
    # Look columns up by label so the result does not depend on column order.
    country, publisher, year = x[["country", "publisher", "year"]].iloc[0]

    is_group = (
        (small_sample.country == country)
        & (small_sample.publisher == publisher)
        & (small_sample.year == year)
    )
    to_sample = small_sample.loc[is_group, "sampled_articles"].iloc[0]

    return x.sample(to_sample, random_state=random_state)["path"]

# Sample within every (country, publisher, year) group. After reset_index the
# unnamed fourth index level shows up as "level_3" and is renamed to "id".
# NOTE(review): that level presumably holds the row labels of path_details,
# not the article id parsed from the file name - confirm before relying on it.
paths_to_sample = (
    path_details
    .groupby(["country", "publisher", "year"])
    .progress_apply(sample_paths)
    .reset_index()
    .rename(columns={"level_3": "id"})
)
paths_to_sample.head()

Getting details: 100%|██████████████████████████████████████████████████████████| 14288/14288 [00:57<00:00, 246.39it/s]


Unnamed: 0,country,publisher,year,id,path
0,AU,9honey,2016,4,AU\9honey\2016\16005675_AU_20-12-16.txt
1,AU,9honey,2016,3,AU\9honey\2016\15895978_AU_13-12-16.txt
2,AU,9honey,2017,49,AU\9honey\2017\18514519_AU_18-05-17.txt
3,AU,9honey,2017,104,AU\9honey\2017\21657914_AU_13-11-17.txt
4,AU,9honey,2017,8,AU\9honey\2017\16640224_AU_29-01-17.txt


In [13]:
# Read the title and text of every sampled file (one file read per row).
text = paths_to_sample["path"].progress_apply(get_text_from_file)

Getting details: 100%|███████████████████████████████████████████████████████| 483870/483870 [05:54<00:00, 1363.25it/s]


In [14]:
# Write the sampled paths together with their title/text columns to CSV.
(
    paths_to_sample
    .join(text)
    .to_csv("../data/medium_sample_text.csv", index=False)
)

In [15]:
# Keep the identifying columns plus the text, and count the
# whitespace-separated tokens in each text.
text_df = (
    paths_to_sample
    .join(text)
    [["country", "publisher", "year", "id", "text"]]
    .assign(n_words=lambda df: df.text.str.split().str.len())
)

text_df.head()

Unnamed: 0,country,publisher,year,id,text,n_words
0,AU,9honey,2016,4,meghan markle target cruel topless photo leak ...,181
1,AU,9honey,2016,3,kid go viral perfect review london chicken sho...,187
2,AU,9honey,2017,49,look come today blow dry bubble stunningjasyar...,81
3,AU,9honey,2017,104,mum four try one last baby mother four mary mc...,253
4,AU,9honey,2017,8,'s amazing love good haircut first meet bindi ...,73
