# Data Processing

## Filtering URLs

The crawling step must account for search results that do not span the page range requested for crawling. Example: the general conference results for the *addiction recovery* search phrase return only one (1) page, but the crawler still continues through the full requested page range, so the same urls are returned more than once.

In this step, we fix the returned urls by way of filtering out the duplicates.

In [48]:
from glob import glob
from tqdm import tqdm
import json
import os
import numpy as np
import matplotlib.pyplot as plt
from nltk import tokenize
import uuid

In [49]:
def filter_dup_urls(source_file, dest_dir, save_file, inspect=True):
    """Deduplicate crawled urls and save them as a sorted JSON list.

    Args:
        source_file: Path to a JSON file holding a list of records; the url
            is read from index 2 of each record.
        dest_dir: Directory to write the result to. It is created, together
            with any missing parent directories, if it does not exist.
        save_file: File name of the output JSON inside ``dest_dir``.
        inspect: When true, return the filtered list.

    Returns:
        The sorted list of unique urls if ``inspect`` is true, else ``None``.
    """
    with open(source_file, 'r') as f:
        records = json.load(f)
    filtered_url_list = sorted({record[2] for record in records})

    # makedirs rather than mkdir: a nested dest_dir such as '../data/urls'
    # must work even when its parent directory does not exist yet.
    os.makedirs(dest_dir, exist_ok=True)
    save_path = os.path.join(dest_dir, save_file)
    with open(save_path, 'w') as f:
        json.dump(filtered_url_list, f)
    if inspect:
        return filtered_url_list
    return None

In [50]:
# Deduplicate the urls stored in the general-conference crawl output and
# write the unique, sorted list to ../data/urls/general-conference-urls.json.
source_file = '../general-conference-1-1000.json'
dest_dir = '../data/urls'
save_file = 'general-conference-urls.json'

# inspect defaults to True, so the filtered list is returned for inspection.
gc_urls = filter_dup_urls(source_file, dest_dir, save_file)
print("New length: ", len(gc_urls))

New length:  13


In [51]:
# Deduplicate the urls stored in the magazines crawl output and write the
# unique, sorted list to ../data/urls/magazines-urls.json.
source_file = '../magazines-1-1000.json'
dest_dir = '../data/urls'
save_file = 'magazines-urls.json'

# inspect defaults to True, so the filtered list is returned for inspection.
mag_urls = filter_dup_urls(source_file, dest_dir, save_file)
print("New length: ", len(mag_urls))

New length:  209


## Filter scraped data

In this step, we filter the already scraped data so that the scraping step does not have to be repeated.

* Remove duplicates
* Remove non-english text based on `?lang=eng`
* Remove files with no body text

In [71]:
def filter_scraped_data(source_dir, dest_dir, inspect=True):
    """Copy the usable scraped JSON files from source_dir into dest_dir.

    A file is skipped when its url was already kept from an earlier file, or
    when its url contains ``?lang=`` followed by a value that does not start
    with ``eng``. Of the remaining files, those with a non-empty ``body`` are
    written to dest_dir under their original file name; those with an empty
    ``body`` are collected but not written.

    Args:
        source_dir: Directory containing the scraped ``*.json`` files. Each
            file must hold an object with ``url`` and ``body`` keys.
        dest_dir: Directory the kept files are written to. It is created on
            the first write if it does not exist.
        inspect: When true, return the bookkeeping lists.

    Returns:
        ``(urls_filtered, no_body, non_english)`` if ``inspect`` is true,
        else ``None``. ``urls_filtered`` holds the urls of the written files,
        ``no_body`` the full records with an empty body, and ``non_english``
        the urls rejected by the language check.
    """
    # glob returns files in arbitrary order; sorting makes the choice of which
    # duplicate gets kept reproducible between runs.
    files = sorted(glob(os.path.join(source_dir, '*.json')))

    urls_filtered = []
    seen_urls = set()  # O(1) duplicate check; urls_filtered keeps the order
    no_body = []
    non_english = []
    for file in tqdm(files):
        with open(file) as f:
            data = json.load(f)
        url = data['url']

        if url in seen_urls:
            continue

        # check if english; urls without a ?lang= parameter are kept
        if "?lang=" in url and url.split("?lang=")[1][:3] != "eng":
            non_english.append(url)
            continue

        if data['body']:
            urls_filtered.append(url)
            seen_urls.add(url)
            os.makedirs(dest_dir, exist_ok=True)
            # basename instead of split('/') so Windows-style paths work too
            save_path = os.path.join(dest_dir, os.path.basename(file))
            with open(save_path, 'w') as f:
                json.dump(data, f)
        else:
            no_body.append(data)

    if inspect:
        return urls_filtered, no_body, non_english
    return None

In [72]:
# Filter the scraped general-conference files into ../data/filtered.
source_dir = '../general-conference'
dest_dir = '../data/filtered/general-conference'
# Returns the kept urls, the records with no body, and the non-english urls.
gc_urls_new, gc_nb, gc_noneng = filter_scraped_data(source_dir, dest_dir)
print("New length: ", len(gc_urls_new))

100%|██████████| 8965/8965 [00:04<00:00, 1890.58it/s]

New length:  10





In [74]:
# urls in the deduplicated crawl list that did not survive the scraped-data
# filter (this shows the urls themselves, not their count)
(set(gc_urls) - set(gc_urls_new))

{'https://www.churchofjesuschrist.org/study/general-conference/1981/10/_manifest?lang=eng',
 'https://www.churchofjesuschrist.org/study/general-conference/1981/10?lang=eng',
 'https://www.churchofjesuschrist.org/study/general-conference/2007/10/blessed-are-all-the-pure-in-heart?lang=ase'}

In [76]:
# Filter the scraped magazines files into ../data/filtered.
source_dir = '../magazines'
dest_dir = '../data/filtered/magazines'
# Returns the kept urls, the records with no body, and the non-english urls.
mag_urls_new, mag_nb, mag_noneng = filter_scraped_data(source_dir, dest_dir)
print("New length: ", len(mag_urls_new))

100%|██████████| 9978/9978 [00:06<00:00, 1640.31it/s]

New length:  177





In [79]:
# number of urls in the deduplicated crawl list that did not survive the
# scraped-data filter
len(set(mag_urls) - set(mag_urls_new))

32

## Transform to SQuAD flavor

In this task, we create the Q&A dataset by parsing each body text to find a question sentence, an answer sentence, and a context.

Assumptions:

1. The sentence following the question sentence is the answer.
2. To mimic popular Q&A datasets, we also need to input a starting index, and a context. See figure that follows.
3. The context is defined as the sentences before a question, if any exist, and the sentences after the question, including the answer sentence. The choice of how many sentences to take from before or after a question is arbitrary.

The format we are going for is shown below:

<img src="images/squad_format.png"></img>
<center>Figure 1. A snippet of the SQuAD dataset</center>

In [81]:
def extract_qa(source_dir, dest_dir, context_size=5, answer_size=1):
    """Build SQuAD-style question/answer records from scraped body text.

    The ``body`` of every ``*.json`` file in source_dir is split into
    sentences. Each sentence ending in ``?`` is treated as a question, the
    ``answer_size`` sentences that follow it as the answer, and the
    surrounding sentences (question itself excluded) as the context. One JSON
    file with a random UUID name is written to dest_dir per question.
    Questions with fewer than ``answer_size`` sentences after them are
    skipped.

    Args:
        source_dir: Directory containing the filtered ``*.json`` files.
        dest_dir: Directory the records are written to; created on the first
            write if it does not exist.
        context_size: Maximum number of sentences taken from each side of the
            question. The right side is widened to ``answer_size`` when that
            is larger, so the answer is always contained in the context.
        answer_size: Number of sentences after the question that form the
            answer. Must be at least 1.

    Raises:
        ValueError: If ``answer_size`` is smaller than 1.
    """
    if answer_size < 1:
        raise ValueError("answer_size must be at least 1")

    # Keys of the source record that are not carried over into a QA record.
    excluded_keys = ('kicker', 'body', 'author', 'calling', 'url')
    right_window = max(context_size, answer_size)

    files = glob(os.path.join(source_dir, '*.json'))
    for file in tqdm(files):
        with open(file) as f:
            data = json.load(f)

        sentences = []
        for paragraph in data['body'].split('\n'):
            sentences.extend(tokenize.sent_tokenize(paragraph))

        metadata = {k: v for k, v in data.items() if k not in excluded_keys}

        for q, question in enumerate(sentences):
            if not question.endswith('?'):
                continue
            # Not enough sentences left after the question for a full answer.
            if q + answer_size >= len(sentences):
                continue

            # Slicing clamps at the list boundaries, so no special cases are
            # needed near the start or end of the body.
            answer_sentences = sentences[q + 1: q + 1 + answer_size]
            context_left = ' '.join(sentences[max(0, q - context_size): q])
            context_right = ' '.join(sentences[q + 1: q + 1 + right_window])

            if context_left:
                context = context_left + ' ' + context_right
                # +1 for the space that joins the left and right context
                answer_start = len(context_left) + 1
            else:
                # The question is not part of the context, so without a left
                # context the answer begins at the very first character.
                context = context_right
                answer_start = 0

            qa_data = dict(metadata)
            qa_data['context'] = context
            qa_data['question'] = question
            qa_data['answers'] = {
                'answer_start': [answer_start],
                'text': [' '.join(answer_sentences)],
            }

            os.makedirs(dest_dir, exist_ok=True)
            save_path = os.path.join(dest_dir, str(uuid.uuid4()) + '.json')
            with open(save_path, 'w') as f:
                json.dump(qa_data, f)

In [94]:
# Build one QA dataset per (answer size, facet) combination; each is written
# to its own ../data/qa-<answer_size>/<facet> directory.
facets = ['general-conference', 'magazines']
answer_sizes = [1, 2, 3] # sentences after the question sentence

for answer_size in answer_sizes:
    for facet in facets:
        source_dir = f'../data/filtered/{facet}'
        dest_dir = f'../data/qa-{answer_size}/{facet}'
        extract_qa(source_dir, dest_dir, answer_size=answer_size)

100%|██████████| 10/10 [00:00<00:00, 53.58it/s]
100%|██████████| 177/177 [00:00<00:00, 182.53it/s]
100%|██████████| 10/10 [00:00<00:00, 66.62it/s]
100%|██████████| 177/177 [00:00<00:00, 194.35it/s]
100%|██████████| 10/10 [00:00<00:00, 68.76it/s]
100%|██████████| 177/177 [00:00<00:00, 221.45it/s]


## Expanding to paragraph level documents for LM

In [95]:
def expand(source_dir, dest_dir, facet):
    """Write one JSON document per body paragraph of every scraped file.

    For each ``*.json`` file in source_dir, the ``body`` is split on newlines
    and every piece is saved as its own JSON file in ``dest_dir/facet``. Each
    output carries all non-body fields of the source record, the paragraph
    text under ``body`` and its position under ``paragraph_index``. The file
    name is built from the facet, the record's ``index`` field and the
    paragraph position.
    """
    for path in tqdm(glob(os.path.join(source_dir, '*.json'))):
        with open(path) as src:
            record = json.load(src)

        pieces = record['body'].split('\n')
        # Fields shared by every paragraph document of this record.
        shared = {key: value for key, value in record.items() if key != 'body'}

        for position, text in enumerate(pieces):
            document = {**shared, 'body': text, 'paragraph_index': position}
            out_name = f"{facet}-index-{document['index']}-paragraph-{position}.json"

            out_dir = os.path.join(dest_dir, facet)
            if not os.path.exists(out_dir):
                os.makedirs(out_dir)

            with open(os.path.join(out_dir, out_name), 'w') as dst:
                json.dump(document, dst)

In [96]:
# Expand the filtered files of every facet into paragraph-level documents;
# expand() places them in a per-facet subdirectory of ../data/expanded.
for facet in facets:
    source_dir = f'../data/filtered/{facet}'
    dest_dir = '../data/expanded'
    expand(source_dir, dest_dir, facet)

100%|██████████| 10/10 [00:00<00:00, 25.79it/s]
100%|██████████| 177/177 [00:04<00:00, 36.23it/s]
