In [1]:
import json
import multiprocessing as mp
from copy import deepcopy
from pathlib import Path
from typing import *

import stanfordnlp
import tqdm
from conllu import parse
from sacremoses import MosesTokenizer, MosesDetokenizer
from spacy_conll import ConllFormatter
from spacy_stanfordnlp import StanfordNLPLanguage

from qa2nli.qa_readers.race import read_data as race_read_data
from qa2nli.qa_readers.race import process_samples_lazy
from rule import Question, AnswerSpan

In [2]:
# One-time setup: uncomment the next line to download the English StanfordNLP models.
#stanfordnlp.download('en')
# Config
# Directory of the RACE dev split, relative to the notebook's working directory.
data_path = Path('../qa-labeling/RACE/dev')
# Presumably the destination JSON file for the converted samples.
# NOTE(review): '.data' is a hidden directory relative to the working
# directory -- confirm this is intended (rather than e.g. 'data').
output_path = Path('.data/RACE/converted_dev.json')

In [3]:
# Build the English StanfordNLP pipeline and wrap it so it can be called like
# a spaCy Language object (`nlp(text)`).
snlp = stanfordnlp.Pipeline(lang='en')
nlp = StanfordNLPLanguage(snlp)
# Register the CoNLL formatter as the last pipe; the conversion functions read
# its result from `doc._.conll_str`.
conllformatter = ConllFormatter(nlp)
nlp.add_pipe(conllformatter, last=True)
# Used by the conversion functions to turn the output of
# `Question.format_declr()` back into a single string.
detokenizer = MosesDetokenizer()

Use device: cpu
---
Loading: tokenize
With settings: 
{'model_path': '/Users/dhruv/stanfordnlp_resources/en_ewt_models/en_ewt_tokenizer.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: pos
With settings: 
{'model_path': '/Users/dhruv/stanfordnlp_resources/en_ewt_models/en_ewt_tagger.pt', 'pretrain_path': '/Users/dhruv/stanfordnlp_resources/en_ewt_models/en_ewt.pretrain.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: lemma
With settings: 
{'model_path': '/Users/dhruv/stanfordnlp_resources/en_ewt_models/en_ewt_lemmatizer.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
Building an attentional Seq2Seq model...
Using a Bi-LSTM encoder
Using soft attention for LSTM.
Finetune all embeddings.
[Running seq2seq lemmatizer with edit classifier]
---
Loading: depparse
With settings: 
{'model_path': '/Users/dhruv/stanfordnlp_resources/en_ewt_models/en_ewt_parser.pt', 'pretrain_path': '/Users/dhruv/stanfordnlp_resources/en_ewt_models/en_

In [9]:
# load data
# Read the RACE samples found under `data_path`.
qa_samples = race_read_data(data_path, qa_only=False)
# Used further down as the progress-bar total for the parallel conversion.
# NOTE(review): the factor 4 presumably reflects four answer options per
# question -- confirm against process_samples_lazy.
num_processed_samples = 4*len(qa_samples)
# NOTE(review): the name suggests a lazy iterable; if it is a one-shot
# generator it is exhausted after a single pass, so this cell must be re-run
# before iterating over `processed_samples` again -- confirm.
processed_samples = process_samples_lazy(qa_samples)

Read 1020 files from /Users/dhruv/UnsyncedDocuments/IBM_project/qa-to-nli/../qa-labeling/RACE/dev/high
Read 367 files from /Users/dhruv/UnsyncedDocuments/IBM_project/qa-to-nli/../qa-labeling/RACE/dev/middle


In [10]:
import warnings
# Silence every warning category for the rest of the kernel session.
# NOTE(review): this blanket filter also hides warnings unrelated to the
# parser; consider restricting it to a specific category or module.
warnings.filterwarnings("ignore")
def _make_hypothesis(question_tokens, option_text: str) -> Optional[str]:
    """Build the declarative hypothesis for one answer option.

    Args:
        question_tokens: CoNLL tokens of the parsed question.  They are
            deep-copied before use so the same tokens can be reused for every
            option of the question.
        option_text: Raw text of the answer option.

    Returns:
        The detokenized declarative sentence, or ``None`` when the option
        does not yield a valid ``AnswerSpan``.
    """
    option_tokens = parse(nlp(option_text)._.conll_str)[0].tokens
    span = AnswerSpan(option_tokens)
    if not span.isvalid:
        return None
    question = Question(deepcopy(question_tokens))
    question.insert_answer_default(span)
    return detokenizer.detokenize(question.format_declr(), return_str=True)


def convert(qa_samples: List[Dict]):
    """Convert multiple-choice QA samples into premise/hypothesis NLI samples.

    Each question is parsed once.  The correct answer and every entry of
    ``other_options`` are then inserted into a fresh copy of the parsed
    question to form a hypothesis; the article serves as the premise.  The
    correct answer is labelled 1, every other option 0.

    Options are numbered from 1 with the correct answer first, so the NLI ids
    are ``<id>_1`` for the answer and ``<id>_2`` onwards for the other
    options.  This keeps ids unique within a question.

    Args:
        qa_samples: Dicts providing the keys ``'id'``, ``'question'``,
            ``'answer'``, ``'article'`` and ``'other_options'``.

    Returns:
        A dict with the keys

        * ``'nli_samples'``: list of dicts with ``'premise'``,
          ``'hypothesis'``, ``'label'`` and ``'id'``;
        * ``'failed_questions'``: ids of samples whose question is not valid;
        * ``'total_questions'``: number of samples processed;
        * ``'failed_cases'``: bare ids of invalid questions plus the
          ``<id>_<n>`` ids of invalid options.
    """
    total_questions = 0
    nli_samples = []
    failed_questions = []
    failed_cases = []
    for qa_sample in tqdm.tqdm(qa_samples):
        total_questions += 1
        sample_id = qa_sample['id']
        question_tokens = parse(nlp(qa_sample['question'])._.conll_str)[0].tokens
        # Validate the question before spending parser time on its options.
        if not Question(deepcopy(question_tokens)).isvalid:
            failed_questions.append(sample_id)
            failed_cases.append(sample_id)
            continue

        labelled_options = [(qa_sample['answer'], 1)]
        labelled_options.extend((option, 0) for option in qa_sample['other_options'])
        for position, (option_text, label) in enumerate(labelled_options, start=1):
            nli_id = sample_id + '_' + str(position)
            hypothesis = _make_hypothesis(question_tokens, option_text)
            if hypothesis is None:
                failed_cases.append(nli_id)
                continue
            nli_samples.append({
                'premise': qa_sample['article'],
                'hypothesis': hypothesis,
                'label': label,
                'id': nli_id,
            })
    # Return only after every sample has been processed.
    return {'nli_samples': nli_samples, 'failed_questions': failed_questions,
            'total_questions': total_questions, 'failed_cases': failed_cases}




In [11]:
def convert2(qa_sample: Dict):
    """Convert a single question/option pair into a declarative hypothesis.

    The input dict is updated in place and also returned, which makes the
    function usable as a ``multiprocessing`` map target.

    Args:
        qa_sample: Dict providing at least the keys ``'question'`` and
            ``'option'``.

    Returns:
        The same dict with these keys added or overwritten:

        * ``'valid_question'``: ``True`` when the parsed question is valid;
        * ``'valid_option'``: ``True`` when the parsed option is valid
          (only evaluated for valid questions);
        * ``'conversion_success'``: ``True`` when a hypothesis was produced;
        * ``'hypothesis'``: the detokenized declarative sentence, or ``""``
          when the conversion did not succeed.
    """
    # Start from the "nothing worked" state; flags are flipped as stages pass.
    qa_sample.update({'valid_question': False, 'valid_option': False,
                      'conversion_success': False, 'hypothesis': ""})

    question_tokens = parse(nlp(qa_sample['question'])._.conll_str)[0].tokens
    question = Question(deepcopy(question_tokens))
    if not question.isvalid:
        # Skip parsing the option: it cannot be used without a valid question.
        return qa_sample
    qa_sample['valid_question'] = True

    option_tokens = parse(nlp(qa_sample['option'])._.conll_str)[0].tokens
    answer = AnswerSpan(option_tokens)
    if not answer.isvalid:
        return qa_sample
    qa_sample['valid_option'] = True

    question.insert_answer_default(answer)
    qa_sample['hypothesis'] = detokenizer.detokenize(question.format_declr(), return_str=True)
    qa_sample['conversion_success'] = True
    return qa_sample

In [13]:
# Run convert2 over every sample in a pool of three worker processes.
# Results are appended one by one so that whatever has been converted so far
# remains available in `converted` if the cell is interrupted.
converted = []
with mp.Pool(processes=3) as pool, tqdm.tqdm(total=num_processed_samples) as progress:
    # imap_unordered yields results as workers finish them, so the order of
    # `converted` does not necessarily match the input order.
    for converted_sample in pool.imap_unordered(convert2, processed_samples):
        converted.append(converted_sample)
        progress.update()

  4%|▍         | 850/19548 [01:16<27:58, 11.14it/s] Process ForkPoolWorker-10:



KeyboardInterrupt: 

In [None]:
# Write the converted samples to the configured output file as JSON.
# NOTE(review): assumes every value in the converted dicts is JSON-serializable -- confirm.
output_path.parent.mkdir(parents=True, exist_ok=True)  # create the target directory on first run
with output_path.open('w', encoding='utf-8') as output_file:
    json.dump(converted, output_file)