In [2]:
import pandas as pd
import glob
import xml.etree.ElementTree as ET
import pickle
import os
import numpy as np
from tqdm import tqdm


In [5]:
def _meta_content(root, name):
    """Return the ``content`` attribute of ``head/meta[@name=<name>]``, or None if absent."""
    node = root.find(f'head/meta[@name="{name}"]')
    return None if node is None else node.get('content')


def parse_xml(xml_file):
    """Parse one NYT corpus article and return its title, date, text and doc id.

    Args:
        xml_file: Path (or open file object) handed to ``ET.parse``.

    Returns:
        dict with keys:
          'title'  - text of ``head/title``, or None if the element is missing.
          'date'   - ``"<year>-<month>-<day>"`` built from the three
                     ``publication_*`` meta tags (values used verbatim, no
                     zero padding), or None if any component is missing.
          'text'   - list with one entry per ``<p>`` inside the ``full_text``
                     block (None for a paragraph with no text at all), or
                     None if the article has no such block.
          'doc_id' - ``id-string`` attribute of the first ``doc-id`` element,
                     or None if the element or attribute is missing.

    Raises:
        xml.etree.ElementTree.ParseError: if the input is not well-formed XML.
    """
    root = ET.parse(xml_file).getroot()

    title_node = root.find('head/title')
    title = None if title_node is None else title_node.text

    # Only build a date string when every component is present; otherwise the
    # f-string would embed the literal text "None" (e.g. "2007-None-None").
    year = _meta_content(root, 'publication_year')
    month = _meta_content(root, 'publication_month')
    day = _meta_content(root, 'publication_day_of_month')
    if year is None or month is None or day is None:
        date = None
    else:
        date = f"{year}-{month}-{day}"

    full_text_block = root.find(".//block[@class='full_text']")
    if full_text_block is None:
        text_by_para = None
    else:
        # itertext() also collects text inside/after inline child elements,
        # which a bare `.text` would drop. Empty paragraphs stay None.
        text_by_para = [
            "".join(para.itertext()) or None
            for para in full_text_block.findall(".//p")
        ]

    doc_id_node = root.find(".//doc-id")
    doc_id = None if doc_id_node is None else doc_id_node.get('id-string')

    return {'title': title, 'date': date, 'text': text_by_para, 'doc_id': doc_id}

In [2]:
# Parse every article XML under XML_DIR and store the records as one pickled DataFrame.
XML_DIR = '/scr/em7/LDC2008T19'
OUTPUT_PKL = 'NYT_aggregated_info.pkl'

# glob returns paths in arbitrary order; sort so the row order is reproducible.
xml_paths = sorted(glob.glob(f'{XML_DIR}/**/*.xml', recursive=True))
if not xml_paths:
    # Fail loudly instead of silently overwriting the pickle with an empty frame.
    raise FileNotFoundError(f'No XML files found under {XML_DIR}')

file_info = []
failed_files = []  # (path, error message) for files that are not well-formed XML
for xml_file in tqdm(xml_paths):
    try:
        file_info.append(parse_xml(xml_file))
    except ET.ParseError as err:
        # One corrupt file should not discard the rest of a long-running pass.
        failed_files.append((xml_file, str(err)))

if failed_files:
    print(f'Skipped {len(failed_files)} unparseable file(s); see `failed_files`.')

corpus_df = pd.DataFrame(file_info)
corpus_df.to_pickle(OUTPUT_PKL)

KeyboardInterrupt: 