Code to produce folders for people to review the preprocessing.

REQUIRES:
- Text folders "orig", "n_gram", "cosine", and "merged", each containing the output (cleaned) text files laid out as `<country>/<publisher>/<year>/<file>.txt`. Every folder must contain exactly the same set of articles.

In [26]:
import os
import glob
import random
import pathlib

import numpy as np 
import pandas as pd


# Maps a version label to the folder holding that version of every article.
# The labels are also used as file-name prefixes by output_files(); the first
# entry ("ORIGINAL") is the folder that get_countries()/get_all_files() scan.
DIRECTORIES = {
    "ORIGINAL": "orig",
    "NGRAM": "n_gram", 
    "COSINE": "cosine", 
    "MERGED": "merged"
}

# Number of reviewer folders to generate (numbered from 1).
NUM_REVIEWERS = 2
# Number of articles sampled for each reviewer within each country.
NUM_ARTICLES_PER_COUNTRY = 1

In [50]:
def get_countries():
    """Return the names of the sub-directories directly under the original-text folder.

    Each sub-directory of ``DIRECTORIES["ORIGINAL"]`` is treated as one
    country; the order is whatever ``glob.glob`` yields.
    """
    pattern = os.path.join(DIRECTORIES["ORIGINAL"], "*/")
    names = []
    for folder in glob.glob(pattern):
        # The last path component is the directory's own name.
        names.append(pathlib.Path(folder).parts[-1])
    return names

def get_all_files(country: str):
    """List the ``.txt`` files found two directory levels below a country folder.

    Args:
        country: Name of a sub-directory of ``DIRECTORIES["ORIGINAL"]``.

    Returns:
        One path per matching file, reduced to its last four components
        (``country/<dir>/<dir>/<file>.txt`` for the glob pattern used here),
        so the same relative path can be joined onto any version folder.
    """
    pattern = os.path.join(DIRECTORIES["ORIGINAL"], country, "*", "*", "*.txt")
    relative_paths = []
    for match in glob.glob(pattern):
        tail = pathlib.Path(match).parts[-4:]
        relative_paths.append(os.path.join(*tail))
    return relative_paths

def sample_files(text_files, num_articles):
    """Pick a random set of distinct articles for one reviewer.

    Sampling is done *without* replacement so the same article is never
    handed to a reviewer twice in one batch.

    Args:
        text_files: List of candidate file paths.
        num_articles: Desired number of files.

    Returns:
        A new list of distinct entries from ``text_files`` in random order.
        It holds ``num_articles`` items, or every available file when fewer
        than ``num_articles`` exist (an empty list for an empty input).
    """
    # random.sample raises ValueError when k exceeds the population size or is
    # negative, so clamp k to the valid range first.
    sample_size = max(0, min(num_articles, len(text_files)))
    return random.sample(text_files, k=sample_size)

def get_all_versions(text_file, paths):
    """Read every version of one article.

    Args:
        text_file: Path of the article relative to each version folder.
        paths: Mapping of version label to version folder. Only the folder
            values are used here.

    Returns:
        A list with one entry per folder, in the mapping's iteration order.
        Each entry is the list of lines returned by ``readlines()`` (line
        endings kept).

    Raises:
        OSError: If the file cannot be opened in one of the folders
            (e.g. ``FileNotFoundError`` when a version is missing).
    """
    versions = []
    for folder in paths.values():
        # The ``with`` block closes the file on exit; no explicit close() needed.
        with open(os.path.join(folder, text_file), "r") as f:
            versions.append(f.readlines())
    return versions

def output_files(reviewer_num, article_num, text_file, texts, paths, base_dir="to_review"):
    """Write all versions of one article into a reviewer's folder.

    Files are written to
    ``<base_dir>/reviewer_<reviewer_num>/<country>/article_<article_num>/``
    and named ``<label>__<publisher>__<year>__<file>``, where ``<label>`` is
    the key of ``paths`` paired with that text.

    Args:
        reviewer_num: Reviewer number used in the folder name.
        article_num: Article number used in the folder name.
        text_file: Relative article path with exactly four components
            (country, publisher, year, file name).
        texts: One sequence of lines per version, in the same order as the
            keys of ``paths``.
        paths: Mapping whose keys label the versions.
        base_dir: Root folder for all review output.

    Raises:
        ValueError: If ``text_file`` does not have exactly four components.
    """
    country, publisher, year, file = pathlib.Path(text_file).parts

    # The destination is the same for every version, so build and create it
    # once instead of on every loop iteration.
    article_dir = pathlib.Path(
        base_dir, f"reviewer_{reviewer_num}", country, f"article_{article_num}"
    )
    article_dir.mkdir(parents=True, exist_ok=True)

    for text, name in zip(texts, paths):
        new_file_name = f"{name}__{publisher}__{year}__{file}"
        # The ``with`` block closes the file on exit; no explicit close() needed.
        with open(article_dir / new_file_name, "w") as f:
            f.writelines(text)

In [51]:
# Driver cell: for every country and every reviewer, sample articles, copy all
# versions of each sampled article into the reviewer's folder, and record one
# row per exported article in `exported`.
countries = get_countries()

# NOTE(review): `id_and_title` is never used in the visible code.
exported, id_and_title = [], []
for country in countries:
    all_text_files = get_all_files(country)
    # Reviewer numbers are 1-based.
    for reviewer_num in range(1, NUM_REVIEWERS + 1):
        # A separate random draw is made for each reviewer within the country.
        sampled_files = sample_files(all_text_files, NUM_ARTICLES_PER_COUNTRY)
        # Article numbers are 1-based as well.
        for article_num, text_file in enumerate(sampled_files, 1):
            
            texts = get_all_versions(text_file, DIRECTORIES)
            output_files(reviewer_num, article_num, text_file, texts, DIRECTORIES)
            # Row = the article's path components followed by the first two
            # lines of the first version (the "ORIGINAL" entry of DIRECTORIES).
            # Presumably those lines hold the article id and title - confirm
            # against the text file format.
            exported.append(list(pathlib.Path(text_file).parts) + texts[0][:2])

In [62]:
# Turn the collected rows into a table, clean the id/title fields read from
# the files, and save the review index as review.csv with id and title first.
exported_df = pd.DataFrame(
    exported,
    columns=["country", "publisher", "year", "text_file", "id", "title"],
)
# Both fields come straight from file lines, so strip surrounding whitespace
# (including the trailing newline); the id is additionally parsed as an int.
exported_df["id"] = exported_df["id"].apply(lambda raw_id: int(raw_id.strip()))
exported_df["title"] = exported_df["title"].apply(lambda raw_title: raw_title.strip())
column_order = ["id", "title", "country", "publisher", "year", "text_file"]
exported_df[column_order].to_csv("review.csv", index=False)