# Prepare Texts

This notebook combines code by Moacir P. de Sá Pereira with default TDMStudio code provided by ProQuest/Clarivate.

It assumes the existence of a set of corpora available as various directories like `./data/{corpus_name}`, each of which contains $n$ xml files of the name `{goid}.xml`, where `goid` is a global id used by ProQuest for their articles.

For each corpus, it generates a set of csv files of at most 10,000 records, where each row corresponds to an article in the corpus. The csvs are written to `./dataframe_files` with the name `{corpus}_nnn.csv`. The csvs have the following columns:

- `goid`: Int. As above
- `title`: Str. The headline of the article
- `date`: Str. The publication date, in `YYYY-MM-DD` format
- `publisher`: Str. The article's publisher
- `pub_title`: Str. The title of the publication
- `author`: Str. The display name of the author, when available
- `tokens`: Int. A naive word count, derived from splitting the full text on whitespace.
    
The csvs are subsequently used in the `concatenate-corpora` notebook.

In [None]:
%conda install lxml

In [None]:
# Libraries for parsing data
import os
import random
import pandas as pd
from lxml import etree
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm

In [None]:
# Function to strip html tags from text portion
def strip_html_tags(text):
    """Return *text* with HTML markup removed and whitespace tidied.

    The markup is parsed with Beautiful Soup, the visible text is extracted,
    newlines are replaced by single spaces, backslashes are removed, and
    leading/trailing whitespace is stripped.

    Args:
        text: A string that may contain HTML markup.

    Returns:
        The plain-text content as a single-line string.
    """
    # Name the parser explicitly: without it Beautiful Soup guesses (emitting
    # GuessedAtParserWarning) and the result can vary with whichever parsers
    # happen to be installed. lxml is what this notebook installs and imports.
    soup = BeautifulSoup(text, "lxml")
    return soup.get_text().replace('\n', ' ').replace('\\', '').strip()

In [None]:
def _element_text(root, path):
    """Return the ``.text`` of the first element matching *path*, or None."""
    element = root.find(path)
    return element.text if element is not None else None


def getxmlcontent(corpus_path, file, strip_html=True):
    """Parse one article XML file into a dict of metadata and text.

    Only articles whose ``ISOExpansion`` element reads ``"English"`` are
    extracted; for any other file (including one with no ``ISOExpansion``
    element) every value in the returned dict is left as None.

    Args:
        corpus_path: Directory containing the XML file.
        file: File name of the XML file inside *corpus_path*.
        strip_html: When true, HTML markup is removed from the article text
            with ``strip_html_tags``.

    Returns:
        A dict with the keys ``goid``, ``title``, ``date``, ``publisher``,
        ``pub_title``, ``author`` and ``text``. A value is None when the
        corresponding element is absent or has no text. If parsing fails, the
        error is printed and whatever was extracted so far is returned.
    """
    result = {
        "goid": None,
        "title": None,
        "date": None,
        "publisher": None,
        "pub_title": None,
        "author": None,
        "text": None
    }

    # Metadata fields and the XPath each one is read from.
    metadata_paths = (
        ("goid", './/GOID'),
        ("title", './/Title'),
        ("pub_title", './/PubFrosting/Title'),
        ("date", './/NumericDate'),
        ("publisher", './/PublisherName'),
        ("author", './/Author/NormalizedDisplayForm'),
    )
    # Places the article body may live, in order of preference.
    text_paths = ('.//FullText', './/HiddenText', './/Text')

    try:
        root = etree.parse(os.path.join(corpus_path, file)).getroot()

        # Only use English articles. A missing language element is treated as
        # "not English" rather than surfacing as an AttributeError.
        if _element_text(root, ".//ISOExpansion") != "English":
            return result

        for key, path in metadata_paths:
            result[key] = _element_text(root, path)

        # Take the first location that actually holds text, so an empty
        # FullText element does not hide content stored in a later one.
        for path in text_paths:
            text = _element_text(root, path)
            if text:
                result["text"] = text
                break

        # Strip html from text portion
        if result["text"] is not None and strip_html:
            result["text"] = strip_html_tags(result["text"])

    except Exception as e:
        # Best-effort: one unreadable file must not abort a whole corpus run.
        print(f"Error while parsing file {file}: {e}")

    return result

In [None]:
def _write_batch(rows, corpus, batch_number, output_dir="./dataframe_files"):
    """Write one batch of parsed article dicts to a csv and return its path."""
    df = pd.DataFrame(rows)
    # Drop rows with no text.
    df = df.dropna(subset=['text'])
    # Naively calculate a word count for each article.
    df["tokens"] = df["text"].apply(lambda x: len(x.split(" ")))
    # Drop the text column.
    df = df.drop(columns=["text"])

    # Write csv.
    os.makedirs(output_dir, exist_ok=True)
    file_name = f"{output_dir}/{corpus}_{str(batch_number).zfill(3)}.csv"
    df.to_csv(file_name)
    print(f"Wrote {file_name}")
    return file_name


def prepare_files(corpus, sample_size = None, batch_size=10000,
                  data_root="/home/ec2-user/SageMaker/data",
                  output_dir="./dataframe_files"):
    """Parse every file of a corpus and write the metadata out as batched csvs.

    Each file in ``{data_root}/{corpus}/`` is parsed with ``getxmlcontent``.
    The parsed rows are written in batches of at most *batch_size* files to
    ``{output_dir}/{corpus}_NNN.csv``, where ``NNN`` is a zero-padded batch
    number starting at ``000``. Rows without text are dropped, a ``tokens``
    column (count of space-separated pieces of the text) is added, and the
    text itself is not written.

    Args:
        corpus: Name of the corpus directory under *data_root*.
        sample_size: If truthy, process only a random sample of this many
            files instead of the whole directory.
        batch_size: Maximum number of parsed files per csv. Must be >= 1.
        data_root: Directory that holds one subdirectory per corpus.
        output_dir: Directory the csv files are written to; created if it
            does not exist.

    Raises:
        ValueError: If *batch_size* is less than 1, or (from
            ``random.sample``) if *sample_size* exceeds the number of files.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    corpus_path = f"{data_root}/{corpus}/"
    files = os.listdir(corpus_path)
    if sample_size:
        files = random.sample(files, sample_size)

    rows = []
    # Batches are numbered independently of the file index so the trailing
    # partial batch can never reuse (and overwrite) the previous batch's name.
    batch_number = 0
    for file in tqdm(files):
        rows.append(getxmlcontent(corpus_path, file, strip_html=True))
        if len(rows) == batch_size:
            _write_batch(rows, corpus, batch_number, output_dir)
            batch_number += 1
            rows = []

    # Flush whatever is left over after the last full batch.
    if rows:
        _write_batch(rows, corpus, batch_number, output_dir)

In [None]:
# Names of the corpora to process; each is passed to prepare_files, which
# reads the directory of that name.
corpora = ["dollar-tree", "lululemon", "ulta", "walgreens", "walmart"]

results = {}

# Build the batched csv files for one corpus after another.
for corpus in corpora:
    print(f"Starting {corpus}")
    df = prepare_files(corpus)