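This notebook can be used to generate the ArchivalQA question-document pairings: each question is matched to its source article in the NYT corpus, and the paired data is split by publication year.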

In [8]:
import pandas as pd
import glob
import xml.etree.ElementTree as ET
from tqdm import tqdm
import os

# Edit this path to point to the CSV file containing the questions and their
# source paragraph IDs (the `para_id` column).
archivalqa_path = '/data/archivalqa/ArchivalQA_val.csv'

# Edit this path to point to the directory containing the NYT corpus
# (it is searched recursively for .xml article files).
nyc_corpus_path = '/data/archivalqa/NYC_corpus/'
# Directory that receives the aggregated corpus pickle and the split CSVs.
save_path = '/data/archivalqa/ArchivalQA_paired/'

archival_df = pd.read_csv(archivalqa_path)

# `para_id` has the form "<doc_id>_<paragraph index>"; the part before the
# first underscore is the key used to join questions against the corpus.
# The .str accessor does this column-wise instead of one Python call per row.
archival_df['doc_id'] = archival_df['para_id'].str.split('_').str[0]

In [54]:
# Given an XML file corresponding to an article in the NYT corpus, return a
# dictionary with title, date, text, and doc_id.
def parse_xml(xml_file):
    """Extract the fields needed for question-document pairing from one article.

    Args:
        xml_file: Path (or file object) of the article XML; passed to ``ET.parse``.

    Returns:
        dict with the keys
        ``title``  - text of ``head/title``, or None if the element is absent;
        ``date``   - ``"{year}-{month}-{day}"`` built from the ``publication_*``
                     meta tags, not zero-padded; a missing tag appears as the
                     literal ``"None"`` in its position;
        ``text``   - list with one string per ``<p>`` found inside the
                     ``full_text`` block, or None if there is no such block;
        ``doc_id`` - ``id-string`` attribute of the first ``doc-id`` element,
                     or None if absent.

    Raises:
        xml.etree.ElementTree.ParseError: if the file is not well-formed XML.
        KeyError: if a matched meta/doc-id element lacks the expected attribute.
    """
    root = ET.parse(xml_file).getroot()

    def meta_content(name):
        # Publication fields are stored as <meta name="..." content="..."/> under <head>.
        node = root.find(f'head/meta[@name="{name}"]')
        return None if node is None else node.attrib['content']

    title_node = root.find('head/title')
    title = None if title_node is None else title_node.text

    # Get the publication date
    year = meta_content('publication_year')
    month = meta_content('publication_month')
    day = meta_content('publication_day_of_month')
    date = f"{year}-{month}-{day}"

    full_text_block = root.find(".//block[@class='full_text']")
    if full_text_block is None:
        text_by_para = None
    else:
        # itertext() gathers the whole paragraph, including text nested in or
        # following inline child elements; `.text` alone stops at the first
        # child and is None for an empty <p>, which would break a later str.join.
        text_by_para = [''.join(p.itertext()) for p in full_text_block.findall(".//p")]

    doc_id_node = root.find(".//doc-id")
    doc_id = None if doc_id_node is None else doc_id_node.attrib['id-string']

    return {'title': title, 'date': date, 'text': text_by_para, 'doc_id': doc_id}


# Parse every article XML below `nyc_corpus_path` into one DataFrame and cache
# it as a pickle so the corpus does not have to be re-parsed.

# Create the output directory first: a missing directory would otherwise only
# surface in to_pickle(), after the whole corpus has already been parsed.
os.makedirs(save_path, exist_ok=True)

# glob returns files in arbitrary order; sort so the row order is reproducible.
xml_files = sorted(glob.glob(os.path.join(nyc_corpus_path, '**/*.xml'), recursive=True))

file_info = []
for xml_file in tqdm(xml_files):
    file_info.append(parse_xml(xml_file))

corpus_df = pd.DataFrame(file_info)
corpus_df.to_pickle(os.path.join(save_path, 'NYT_aggregated_info.pkl'))

In [67]:
# Attach the article fields (title, date, text) to every question via `doc_id`.
# The result is built in a temporary frame so that a failure below does not
# leave `archival_df` half-updated.
paired_df = pd.merge(archival_df, corpus_df, on='doc_id', how='left')

# Second component of `para_id` is the index of the answer paragraph in `text`.
paired_df['para_num'] = paired_df['para_id'].str.split('_').str[1].astype(int)

# With a left merge, questions whose document is missing from the corpus (or
# whose article has no full_text block) end up with an empty `text`. Report
# that explicitly instead of failing with an opaque TypeError further down.
unmatched = paired_df['text'].isna()
if unmatched.any():
    examples = paired_df.loc[unmatched, 'doc_id'].unique()[:5].tolist()
    raise ValueError(
        f"{int(unmatched.sum())} question(s) have no article text after merging "
        f"with the corpus (example doc_ids: {examples})"
    )

# Paragraph that contains the answer.
paired_df['ans_paragraph'] = [
    paragraphs[idx] for paragraphs, idx in zip(paired_df['text'], paired_df['para_num'])
]
# Whole article as one newline-separated string; a paragraph stored as None is
# treated as empty so that str.join does not raise.
paired_df['ans_text'] = [
    '\n'.join(para or '' for para in paragraphs) for paragraphs in paired_df['text']
]

archival_df = paired_df

In [98]:
# Zero-pad the components of a date string.
def pad_date(date):
    """Left-pad every '-'-separated component of `date` with zeros to width 2.

    For example '1999-3-7' becomes '1999-03-07'. Components that are already
    two or more characters long are returned unchanged.
    """
    return '-'.join(component.zfill(2) for component in date.split('-'))
# Zero-pad the publication date, then take its first component as the year
# (kept as a string). Operating on the column directly avoids a row-wise
# DataFrame.apply, which is slow and misbehaves on an empty frame.
archival_df['date'] = archival_df['date'].map(pad_date)
archival_df['year'] = archival_df['date'].str.split('-').str[0]

In [104]:
# Split the paired questions by publication year (ranges are inclusive):
#qa_lt years 1987-1990 10k
#qa_lt_val years 1991-1992 5k
#meta_train years 1993-2001 20k
#meta_val years 2002-2003 5k
#test years 2004-2007 10k

def _rows_in_years(first_year, last_year):
    """Return the rows of archival_df whose 'year' string is in [first_year, last_year]."""
    wanted = [str(year) for year in range(first_year, last_year + 1)]
    return archival_df[archival_df['year'].isin(wanted)]

qa_lt_df = _rows_in_years(1987, 1990)
qa_lt_val_df = _rows_in_years(1991, 1992)
train_df = _rows_in_years(1993, 2001)
val_df = _rows_in_years(2002, 2003)
test_df = _rows_in_years(2004, 2007)

# Output file stem -> split; the stem is also used for the printed label.
splits = {
    'qa_lt': qa_lt_df,
    'qa_lt_val': qa_lt_val_df,
    'train': train_df,
    'val': val_df,
    'test': test_df,
}

for split_name, split_df in splits.items():
    print(f'{split_name}_df size:', len(split_df))

# Join directory and file name as separate arguments so the output lands inside
# `save_path` even when the configured path has no trailing slash.
os.makedirs(save_path, exist_ok=True)
for split_name, split_df in splits.items():
    split_df.to_csv(os.path.join(save_path, f'{split_name}.csv'), index=False)


qa_lt_df 12378
qa_lt_val_df 5121
train_df 21716
val_df 5285
test_df 8743
