# Create Long Form QA Training Data

In this notebook, a long-form question answering data set is created from the addiction recovery links scraped from lds.org. There are two flavors of it: (1) one patterned after the *Explain like I'm 5* (ELI5) dataset from Reddit and (2) another patterned after the *Wiki40b* dataset. The structures that we are going for are shown in the figures that follow.

The primary objective is to create two sets for each of the flavors mentioned above: one for the general conference texts and another for the magazine texts.

In [3]:
import json
from glob import glob
import os
from tqdm import tqdm
import random
import string
import re
import uuid
import pandas as pd

## The ELI5 Flavor
<img src='images/eli5.png'>
<br>
<center><b>Figure 1. ELI5 sample structure</b></center>
<br>
<br>

In [3]:
# peek at one record of the QA data that feeds the ELI5 conversion
answer_size = 2
source_dir = f'../data/qa-{answer_size}/general-conference'
files = glob(os.path.join(source_dir, '*.json'))

# only the first path returned by glob is loaded
for file in files[:1]:
    with open(file) as f:
        data = json.load(f)
data

{'page': 14,
 'index': 118,
 'title': 'Are You Sleeping through the Restoration?',
 'context': 'Rip makes his way back to his village only to discover that everything has changed. His wife has died, his friends are gone, and the portrait of King George III in the tavern has been replaced by a portrait of someone he does not recognize—by General George Washington. Rip Van Winkle had been sleeping for 20 years! And in the process, he had missed one of the most exciting periods in the history of his country—he had slept through the American Revolution. In May 1966, Dr. Martin Luther King Jr. used this story as an illustration for his speech “Don’t Sleep Through the Revolution.”1 Sometimes we think of the Restoration of the gospel as something that is complete, already behind us—Joseph Smith translated the Book of Mormon, he received priesthood keys, the Church was organized. In reality, the Restoration is an ongoing process; we are living in it right now. It includes “all that God has rev

In [4]:
# The significance of two-answer questions is that a new q-a pair is created
# from each available answer whose score exceeds a threshold;
# otherwise no new q-a pair is constructed.
def _generate_id(length):
    # number string
    nums = "".join([str(i) for i in range(10)])
    digits = string.ascii_lowercase + nums
    
    return "".join(random.choice(digits) for i in range(length))

def restrucuture_lfqa(source_dir, dest_dir):
    """Restructure QA data to the flavor of the ELI5 LFQA dataset.

    Each ``*.json`` file found directly in ``source_dir`` is loaded, re-keyed
    and saved to ``dest_dir`` as ``<q_id>.json``, where ``q_id`` is a freshly
    generated random id. Running the function again therefore adds new files
    next to the earlier ones instead of replacing them.

    Parameters
    ----------
    source_dir : str
        path to the QA dataset structured in the flavor of SQuAD; every file
        must provide the keys ``title``, ``question``, ``answers`` (holding a
        ``text`` list), ``page`` and ``index``
    dest_dir : str
        save directory; created when at least one file is converted
    """
    files = glob(os.path.join(source_dir, '*.json'))
    if files:
        # exist_ok avoids the race of a separate exists() check followed by
        # makedirs(), and the directory only has to be created once
        os.makedirs(dest_dir, exist_ok=True)

    for file in tqdm(files):
        # JSON is UTF-8 by specification; do not depend on the locale default
        with open(file, encoding='utf-8') as f:
            data = json.load(f)

        # question id creation: draw ids until one is unused so that a
        # collision cannot silently overwrite a record written earlier
        q_id = _generate_id(length=6)
        save_path = os.path.join(dest_dir, f"{q_id}.json")
        while os.path.exists(save_path):
            q_id = _generate_id(length=6)
            save_path = os.path.join(dest_dir, f"{q_id}.json")

        texts = list(data['answers']['text'])

        lfqa_data = {
            'article_title': data['title'],
            # to be consistent with lfqa_utils.py, 'title' holds the question
            'title': data['question'],
            'q_id': q_id,
            'answers': {
                'a_id': [_generate_id(length=7) for _ in texts],  # answer id creation
                'text': texts,
                'score': [5] * len(texts),  # arbitrary score for the answer
            },
            'document': '',
            'selftext': '',
            'url': '',
            'page': data['page'],
            'index': data['index'],
        }

        with open(save_path, 'w', encoding='utf-8') as f:
            json.dump(lfqa_data, f)

In [7]:
# list the contents of the data directory
!ls ../data/

contexts  expanded  filtered  qa-1  qa-2  qa-3	urls


In [6]:
# Destructive: recursively deletes ../data/lfqa, the directory the export loop
# below writes into.
# NOTE(review): the execution counts show this cell was run before the listing
# cell above it — confirm the intended order.
!rm -rf ../data/lfqa

In [8]:
facets = ['general-conference', 'magazines']
answer_sizes = [1, 2, 3] # sentences after the question sentence

# one ELI5-style export per (answer size, facet) combination
combinations = [(size, name) for size in answer_sizes for name in facets]
for answer_size, facet in combinations:
    source_dir = f'../data/qa-{answer_size}/{facet}'
    dest_dir = f'../data/lfqa/qa-{answer_size}/{facet}'
    restrucuture_lfqa(source_dir, dest_dir)

100%|██████████| 48/48 [00:00<00:00, 617.54it/s]
100%|██████████| 274/274 [00:00<00:00, 589.99it/s]
100%|██████████| 48/48 [00:00<00:00, 729.25it/s]
100%|██████████| 269/269 [00:00<00:00, 494.11it/s]
100%|██████████| 48/48 [00:00<00:00, 491.26it/s]
100%|██████████| 265/265 [00:00<00:00, 448.59it/s]


In [33]:
answer_size = 3
files = glob(f"../data/lfqa/qa-{answer_size}/*/*.json")

# one row per LFQA record, collected for the manual filtering sheet
to_pandas = []
for file in tqdm(files):
    # os.path helpers instead of splitting on "/": the result no longer
    # depends on which separator the platform's glob puts into the paths
    fname = os.path.basename(file)
    source = os.path.basename(os.path.dirname(file))  # name of the parent directory
    with open(file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    to_write = {
        "filename": fname,
        "question": data['title'],  # the LFQA 'title' field holds the question
        "answer": data["answers"]["text"][0],  # only the first answer is kept
        "source": source
    }
    to_pandas.append(to_write)
json_str = json.dumps(to_pandas)

100%|██████████| 313/313 [00:00<00:00, 2353.82it/s]


In [34]:
# Build the frame straight from the collected rows: passing a literal JSON
# string to pd.read_json is deprecated since pandas 2.1.
df = pd.DataFrame(to_pandas, columns=['filename', 'question', 'answer', 'source'])
# keep only the first row of every distinct question
df = df.drop_duplicates(subset='question')
print(df.shape)
df.to_csv("../data/manual-qa-filtering.csv", index=False)

(225, 4)


## The Wiki40b Flavor
<br>
<img src='images/wiki40b.png'>
<br>
<center><b>Figure 2. wiki40b sample structure</b></center>

In [9]:
def restructure_context(data, context_size=100, inspect_only=False, dest_dir=None):
    """Cut an article body into passages in the flavor of the wiki40b snippets.

    The body is split on single whitespace characters and grouped into
    consecutive, non-overlapping passages of exactly ``context_size`` tokens.
    Newlines are treated as paragraph breaks: they are counted for the
    paragraph offsets but never become part of a passage. A trailing
    remainder shorter than ``context_size`` tokens is dropped.

    Every passage record holds ``article_title``, ``section_title`` (always
    empty), ``start_paragraph``/``end_paragraph`` (number of paragraph breaks
    before the first/last token), ``start_character``/``end_character``
    (inclusive offsets into the body, a newline counting as one character),
    ``passage_text``, ``kicker``, ``page``, ``index`` and ``context_id``
    (``page-index-start_paragraph-start_character-<4 random characters>``).

    Parameters
    ----------
    data : dict
        article record with the keys ``title``, ``body``, ``kicker``,
        ``page`` and ``index``
    context_size : int
        number of tokens per passage; must be at least 1
    inspect_only : bool
        if True nothing is written and only the record of the first passage
        is returned
    dest_dir : str
        save directory, needed unless ``inspect_only`` is True; every passage
        is saved there as ``<context_id>.json``

    Returns
    -------
    dict or None
        the first passage record when ``inspect_only`` is True (``None`` if
        the body holds no complete passage); ``None`` otherwise

    Raises
    ------
    ValueError
        if ``context_size`` is smaller than 1 (the scan could never advance)
    """
    if context_size < 1:
        raise ValueError("context_size must be at least 1")

    par_token = "</PAR>"
    # paragraph breaks become a sentinel token so they can be counted while
    # walking the tokens without ever being copied into a passage
    body_text = re.sub("\n", " </PAR> ", data['body'])
    body_tokens = re.split(r"\s", body_text)
    num_tokens = len(body_tokens)

    passages = []
    position = 0      # index of the next unread token
    par_counter = 0   # paragraph breaks consumed so far
    char_counter = 0  # body offset at which the next passage starts

    while position < num_tokens:
        # consume a break that directly precedes the passage first, so that
        # start_paragraph names the paragraph of the passage's first token
        while position < num_tokens and body_tokens[position] == par_token:
            position += 1
            par_counter += 1
        start_paragraph = par_counter

        snippet_tokens = []
        # bounded by num_tokens: paragraph markers in the tail must not push
        # the scan past the end of the token list
        while position < num_tokens and len(snippet_tokens) < context_size:
            token = body_tokens[position]
            position += 1
            if token == par_token:
                par_counter += 1
            else:
                snippet_tokens.append(token)

        if len(snippet_tokens) < context_size:
            break  # incomplete trailing remainder is dropped

        snippet = " ".join(snippet_tokens)
        start_character = char_counter
        char_counter += len(snippet) + 1 # includes space or new line connecting two snippets
        passages.append({
            'start_paragraph': start_paragraph,
            'end_paragraph': par_counter,
            'start_character': start_character,
            # step back over the connecting space/newline onto the last
            # character of the passage (inclusive offset)
            'end_character': char_counter - 2,
            'passage_text': snippet,
        })

    def build_record(passage):
        # assemble the wiki40b-style record; the random suffix means every
        # call produces a new context_id for the same passage
        to_dump = {
            'article_title' : data['title'],
            'section_title': '',
            'start_paragraph': passage['start_paragraph'],
            'end_paragraph': passage['end_paragraph'],
            'start_character': passage['start_character'],
            'end_character': passage['end_character'],
            'passage_text': passage['passage_text'],
            'kicker' : data['kicker'],
            'page': data['page'],
            'index': data['index'],
        }
        random_suffix = _generate_id(length=4)
        to_dump['context_id'] = "-".join([str(to_dump['page']),
                                          str(to_dump['index']),
                                          str(to_dump['start_paragraph']),
                                          str(to_dump['start_character']),
                                          random_suffix])
        return to_dump

    if not passages:
        return None

    if inspect_only:
        return build_record(passages[0])

    # exist_ok: several workers may try to create the directory at once
    os.makedirs(dest_dir, exist_ok=True)
    for passage in passages:
        to_dump = build_record(passage)
        save_path = os.path.join(dest_dir, to_dump['context_id'] + '.json')
        with open(save_path, 'w', encoding='utf-8') as f:
            json.dump(to_dump, f)

In [20]:
# load one magazine article (the eleventh path returned by glob) to inspect it
source_glob = '../data/filtered/magazines/*.json'
dest_dir = '../data/contexts/magazines'

files = glob(source_glob)
sample_path = files[10]
with open(sample_path) as f:
    data = json.load(f)

In [21]:
# display the article record currently held in `data`
data

{'page': 260,
 'index': 2581,
 'title': 'My Visiting Neighbors',
 'kicker': 'I would like to tell you a love story. I am not a member of The Church of Jesus Christ of Latter-day Saints, but I consider the women of your church as my sisters. The two main reasons I feel that way are Leora Duke and Loraine Stoddard.',
 'body': 'Shortly after my husband and I moved to Farmington, Utah, a few years ago, those two ladies appeared at our door. They delivered the following statement—all in one breath, if I remember correctly:\n“Hi! We’re Loraine and Leora, your neighbors and visiting teachers from the LDS Church. We’d like to visit with you monthly—with or without a spiritual message, whichever you prefer but we would like to come by to be sure that you and your family are okay.”\nUp to this point, my experience with any type of visiting Mormon had been less than positive. Young and somewhat intolerant myself, I felt Mormons were rigid and pushy, so previous visitors were never invited to come

In [22]:
# preview only: with inspect_only=True the record of the first passage is
# returned and nothing is written to disk
restructure_context(data, inspect_only=True)

{'article_title': 'My Visiting Neighbors',
 'section_title': '',
 'start_paragraph': 0,
 'end_paragraph': 2,
 'start_character': 0,
 'end_character': 577,
 'passage_text': 'Shortly after my husband and I moved to Farmington, Utah, a few years ago, those two ladies appeared at our door. They delivered the following statement—all in one breath, if I remember correctly: “Hi! We’re Loraine and Leora, your neighbors and visiting teachers from the LDS Church. We’d like to visit with you monthly—with or without a spiritual message, whichever you prefer but we would like to come by to be sure that you and your family are okay.” Up to this point, my experience with any type of visiting Mormon had been less than positive. Young and somewhat intolerant',
 'kicker': 'I would like to tell you a love story. I am not a member of The Church of Jesus Christ of Latter-day Saints, but I consider the women of your church as my sisters. The two main reasons I feel that way are Leora Duke and Loraine Stodda

In [19]:
# convert every filtered article of both facets into wiki40b-style passages
facets = ['general-conference', 'magazines']
passage_options = dict(context_size=100, inspect_only=False)

for facet in facets:
    source_glob = f'../data/filtered/{facet}/*.json'
    dest_dir = f'../data/contexts/{facet}'
    files = glob(source_glob)

    for file in tqdm(files):
        with open(file) as f:
            data = json.load(f)
        restructure_context(data=data, dest_dir=dest_dir, **passage_options)

100%|██████████| 10/10 [00:04<00:00,  2.03it/s]
100%|██████████| 177/177 [00:02<00:00, 83.06it/s]


### Code Dump

In [None]:
# pack the per-passage json files into one file, one JSON document per line
source_dir = '../data/contexts/general-conference'
files = glob(os.path.join(source_dir, '*.json'))

dest_file = '../data/general-conference_contexts.json'

with open(dest_file, 'w', encoding='utf-8') as handle:
    for file in tqdm(files):
        with open(file, 'r', encoding='utf-8') as f:
            data = json.load(f)
        # End every record with a newline: back-to-back json.dump calls would
        # write "{...}{...}", which neither json.load nor a line-by-line
        # reader can parse back.
        handle.write(json.dumps(data) + '\n')

In [None]:
# dask: imported here instead of the import cell at the top; only the
# unexecuted dask cell under "Code Dump" uses these names
from dask.distributed import Client
import dask.bag as db
# NOTE(review): Client() without arguments presumably starts a local cluster —
# confirm against the dask.distributed documentation
client = Client()

In [None]:
# dask variant of the general-conference passage export
source_glob = '../data/filtered/general-conference/*.json'
dest_dir = '../data/contexts/general-conference'

# NOTE(review): assumes every element produced by read_text parses as a JSON
# document on its own — confirm against the input files
raw_lines = db.read_text(source_glob)
articles = raw_lines.map(json.loads)
bag = articles.map(
    lambda x: restructure_context(data=x, inspect_only=False, dest_dir=dest_dir)
)
bag.compute()

# file exists error
# NOTE(review): probably raised when several workers try to create dest_dir at
# the same time; creating it up front with os.makedirs(dest_dir, exist_ok=True)
# before compute() should avoid it — confirm

In [27]:
# peek at one filtered general-conference record
source_dir = '../data/filtered/general-conference'
files = glob(os.path.join(source_dir, '*.json'))

# only the first path returned by glob is loaded
for file in files[:1]:
    with open(file) as f:
        data = json.load(f)
data

{'page': 14,
 'index': 118,
 'title': 'Are You Sleeping through the Restoration?',
 'kicker': 'There is too much at stake for us as individuals, as families, and as Christ’s Church to give only a halfhearted effort to this sacred work.',
 'author': 'President Dieter F. Uchtdorf',
 'calling': 'Second Counselor in the First Presidency',
 'url': 'https://www.churchofjesuschrist.org/study/general-conference/2014/04/are-you-sleeping-through-the-restoration'}