In [5]:
import os
import glob
import pathlib
from datetime import datetime

from tqdm import tqdm
import pandas as pd

# Root folder of the cleaned articles. The helpers in this notebook glob it as
# <country>/<publisher>/<year>/*.txt, and paths are built with os.path.join
# so they use the separator of the current platform.
CLEAN_DATA_FOLDER = os.path.join("..", "data", "clean")

In [35]:
def get_countries():
    """List the names of the folders sitting directly under CLEAN_DATA_FOLDER.

    Returns:
        list[str]: the last path component of every directory matched by
        the ``*/`` glob, in whatever order ``glob`` yields them.
    """
    folder_pattern = os.path.join(CLEAN_DATA_FOLDER, "*/")
    names = []
    for folder in glob.glob(folder_pattern):
        # The trailing separator from the pattern is dropped by pathlib,
        # so the last part is the bare folder name.
        names.append(pathlib.Path(folder).parts[-1])
    return names

def get_all_files(country: str, publisher="*", year="*"):
    """Find the ``.txt`` files of one country, optionally narrowed down.

    Args:
        country: folder name under CLEAN_DATA_FOLDER to search in.
        publisher: glob pattern for the publisher folder (default: any).
        year: glob pattern for the year folder (default: any).

    Returns:
        list[str]: one path per matching file, made of the last four
        components of the match (country/publisher/year/file name), i.e.
        without the CLEAN_DATA_FOLDER prefix.
    """
    search_pattern = os.path.join(CLEAN_DATA_FOLDER, country, publisher, year, "*.txt")
    relative_paths = []
    for match in glob.glob(search_pattern):
        tail = pathlib.Path(match).parts[-4:]
        relative_paths.append(os.path.join(*tail))
    return relative_paths

def clean_file_name(file_name):
    """Split an article file name into its month and id.

    The name, minus its last four characters (the ``.txt`` suffix), must
    consist of exactly three ``_``-separated fields: the id, a middle field
    that is ignored, and a date written as ``dd-mm-yy``.

    Args:
        file_name: e.g. ``"15732244_AU_04-12-16.txt"``.

    Returns:
        tuple: ``(month, id)`` where month is an int and id is a str.

    Raises:
        ValueError: if the name does not split into three fields or the
            date does not match ``%d-%m-%y``.
    """
    stem = file_name[:-4]
    article_id, _middle, date_text = stem.split("_")
    parsed_date = datetime.strptime(date_text, "%d-%m-%y")
    return (parsed_date.month, article_id)

def get_details_from_path(file_path):
    """Derive article metadata from the location and name of its file.

    Only the last four path components are used: the first three are taken
    as country, publisher and year, and the file name is decoded with
    ``clean_file_name`` into month and id.

    Args:
        file_path: path ending in ``country/publisher/year/file name``.

    Returns:
        pd.Series: indexed by country, publisher, year, month, id.
    """
    segments = pathlib.Path(file_path).parts[-4:]
    folder_fields = segments[:-1]
    name_fields = clean_file_name(segments[-1])
    return pd.Series(
        (*folder_fields, *name_fields),
        index=["country", "publisher", "year", "month", "id"],
    )

def get_details_from_file(file_path, path=CLEAN_DATA_FOLDER):
    """Read one article file and extract the metadata stored in its lines.

    The file is decoded as ISO-8859-1. Its first line is taken as the
    article id, its third line as the publisher, and the article counts as
    having text when its last line is non-blank after stripping.

    Args:
        file_path: path of the file relative to ``path``.
        path: folder that ``file_path`` is resolved against
            (default: CLEAN_DATA_FOLDER).

    Returns:
        pd.Series: indexed by article_id, article_publisher, has_text.

    Raises:
        IndexError: if the file has fewer than three lines.
        OSError: if the file cannot be opened.
    """
    # The context manager closes the file on exit; no explicit close() needed.
    with open(os.path.join(path, file_path), "r", encoding="ISO-8859-1") as f:
        lines = f.readlines()

    id_ = lines[0].strip()
    publisher = lines[2].strip()
    has_text = bool(lines[-1].strip())

    return pd.Series(
        (id_, publisher, has_text),
        index=["article_id", "article_publisher", "has_text"]
    )

In [13]:
# Collect the relative path of every article file, one country at a time so
# tqdm can report progress per country.
countries = get_countries()
sampled_article_paths = []
for country in tqdm(countries):
    country_files = get_all_files(country)
    sampled_article_paths.append(pd.Series(country_files))

# Stack the per-country Series into one Series named "path". Each piece
# keeps its own 0-based index, so index labels repeat across countries.
sampled_articles = pd.concat(sampled_article_paths)
sampled_articles = sampled_articles.rename("path")

100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:23<00:00,  1.15s/it]


In [16]:
# Register `progress_apply` on pandas objects so both passes show a progress bar.
tqdm.pandas()

# Metadata encoded in each path; the index is reset to a clean 0..n-1 range
# so this frame can be joined row-by-row with the one below.
path_details = (
    sampled_articles
    .progress_apply(get_details_from_path)
    .reset_index(drop=True)
)

# Metadata stored inside each file (this pass opens every file on disk).
in_file_details = (
    sampled_articles
    .progress_apply(get_details_from_file)
    .reset_index(drop=True)
)

  from pandas import Panel
100%|██████████████████████████████████████████████████████████████████████| 1464949/1464949 [06:34<00:00, 3709.60it/s]


In [55]:
# Put path-derived and file-derived columns side by side (both frames share
# a 0..n-1 index), then append the relative path as the last column.
sampled_articles_details = (
    path_details
    .join(in_file_details)
    .assign(path=sampled_articles.values)
)

# Persist the combined table without the row index.
sampled_articles_details.to_csv("../data/actual_sampled_articles.csv", index=False)

In [56]:
# Sanity check: report (rows, columns), then preview the first records.
details_shape = sampled_articles_details.shape
print(details_shape)
sampled_articles_details.head()

(1464949, 9)


Unnamed: 0,country,publisher,year,month,id,article_id,article_publisher,has_text,path
0,AU,9honey,2016,12,15732244,15732244,9Honey,True,AU\9honey\2016\15732244_AU_04-12-16.txt
1,AU,9honey,2016,12,15849634,15849634,9Honey,True,AU\9honey\2016\15849634_AU_11-12-16.txt
2,AU,9honey,2016,12,15855997,15855997,9Honey,True,AU\9honey\2016\15855997_AU_11-12-16.txt
3,AU,9honey,2016,12,15895978,15895978,9Honey,True,AU\9honey\2016\15895978_AU_13-12-16.txt
4,AU,9honey,2016,12,16005675,16005675,9Honey,True,AU\9honey\2016\16005675_AU_20-12-16.txt


In [57]:
# Show the articles whose last line was blank, i.e. rows where has_text is False.
sampled_articles_details.loc[~sampled_articles_details["has_text"]]

Unnamed: 0,country,publisher,year,month,id,article_id,article_publisher,has_text,path
