In [1]:
import random
import numpy as np

# One seed shared by the stdlib `random` module and NumPy's global generator;
# later cells also pass it as `random_state` to DataFrame.sample.
seed = 1
random.seed(seed)
np.random.seed(seed)

In [2]:
import pickle
import json
def load_pickle(addr):
    """Return the object unpickled from the binary file at ``addr``."""
    with open(addr, 'rb') as source:
        payload = pickle.load(source)
    return payload
    
def dump_pickle(data, addr):
    """Pickle ``data`` into the file at ``addr`` (created or overwritten)."""
    with open(addr, 'wb') as sink:
        pickle.dump(data, sink)

def load_json(addr):
    """Parse and return the JSON document stored in the file at ``addr``.

    The file is decoded as UTF-8 explicitly (matching ``read_jsonl``) so the
    result does not depend on the platform's default text encoding.
    """
    with open(addr, 'r', encoding='utf-8') as f:
        return json.load(f)
    
def dump_json(data, addr):
    """Serialize ``data`` as JSON, indented by four spaces, into ``addr``."""
    with open(addr, 'w') as sink:
        json.dump(data, sink, indent=4)

In [3]:
import jsonlines
import pandas as pd

def read_jsonl(addr):
    """Read the JSON Lines file at ``addr`` and return its items as a list.

    The file is opened as UTF-8 text and every item yielded by
    ``jsonlines.Reader`` is collected in order.
    """
    with open(addr, "r", encoding="utf8") as handle:
        return [item for item in jsonlines.Reader(handle)]

## answerable

In [4]:
from tqdm import tqdm
import os
import traceback

def process_answerable(df, cats, spl, size):
    """Build and save the answerability (yes/no) dataset for one split.

    Rows of ``df`` whose ``category`` is one of ``cats`` are gathered (grouped
    in the order of ``cats``), down-sampled to ``size`` rows using the
    module-level ``seed``, converted to instruction/input/output records and
    written with ``dump_json``.

    Args:
        df: DataFrame providing the columns ``category``, ``questionText``,
            ``review_snippets`` and ``is_answerable``.
        cats: list of category names to keep. More than one category selects
            the ``answerable_mix`` output directory, otherwise
            ``answerable_OOD``.
        spl: split name, used as the prefix of the output file name.
        size: number of rows to sample.

    The output file is named ``<spl>_<len(records)//1000>k.json``.
    """
    # Concatenate once instead of growing a frame inside the loop; the row
    # order (grouped by category, in `cats` order) is unchanged, so the seeded
    # sample below selects the same rows as before.
    per_cat = [df[df['category'] == cat] for cat in cats]
    df_cats = pd.concat(per_cat) if per_cat else pd.DataFrame()
    try:
        df_cats = df_cats.sample(n=size, random_state=seed)
    except ValueError as err:
        # DataFrame.sample raises ValueError e.g. when `size` exceeds the
        # number of available rows. Stay best-effort: keep every row, but say
        # so explicitly instead of swallowing all exception types.
        print('process_answerable: cannot sample {} rows for split {!r} ({}); '
              'keeping all {} rows'.format(size, spl, err, len(df_cats)))

    stru_data = []
    for _, row in tqdm(df_cats.iterrows(), total=len(df_cats)):
        stru_data.append({
            "instruction": "Given a question and the related document, predict if the question is answerable based on the information provided in the document. Output only yes or no.",
            "input": json.dumps({
                "question": row['questionText'],
                "document": row['review_snippets']
            }),
            "output": 'yes' if row['is_answerable'] == 1 else 'no',
        })

    # `out_dir` rather than `dir`, which would shadow the builtin.
    if len(cats) > 1:
        out_dir = './stru_data/answerable_mix/'
    else:
        out_dir = './stru_data/answerable_OOD/'
    os.makedirs(out_dir, exist_ok=True)
    dump_json(stru_data, '{}/{}_{}k.json'.format(out_dir, spl, len(stru_data)//1000))

def process_split_answerable(path, cats, spl, size):
    """Load ``<path>/<spl>-qar.jsonl`` and pass it to ``process_answerable``."""
    split_file = '{}/{}-qar.jsonl'.format(path, spl)
    records = read_jsonl(split_file)
    process_answerable(pd.DataFrame(records), cats, spl, size)

def call_process_answerable(cats, path, train_size, val_size, test_size):
    """Run the answerable pipeline over the splits appropriate for ``cats``.

    With more than one category the train, val and test splits are processed
    (in that order); otherwise only the test split is.
    """
    schedule = []
    if len(cats) > 1:
        schedule.extend([('train', train_size), ('val', val_size)])
    schedule.append(('test', test_size))
    for split_name, split_size in schedule:
        process_split_answerable(path, cats, split_name, split_size)

In [5]:
# Build the answerability datasets from the productQA dump.
path = './datasets/productQA'
train_size, val_size, test_size = 10000, 1000, 1000
# Two categories -> train/val/test splits written under answerable_mix.
call_process_answerable(["Sports_and_Outdoors","Tools_and_Home_Improvement"], path, train_size, val_size, test_size)
# One category -> only the test split, written under answerable_OOD.
call_process_answerable(["Cell_Phones_and_Accessories"], path, train_size, val_size, test_size)

100%|██████████| 10000/10000 [00:00<00:00, 17540.35it/s]
100%|██████████| 1000/1000 [00:00<00:00, 17350.55it/s]
100%|██████████| 1000/1000 [00:00<00:00, 17419.87it/s]
100%|██████████| 1000/1000 [00:00<00:00, 18200.89it/s]


## generation

In [20]:
from tqdm import tqdm
import os
import traceback

def _select_best_answer(answers):
    """Return the answer with the largest positive ``helpful[0]``, or None.

    Answers whose ``helpful[1]`` is 0 are ignored. Ties keep the earliest
    answer. ``None`` is returned when no answer has ``helpful[0] > 0``,
    including when ``answers`` is empty.
    NOTE(review): ``helpful`` looks like [helpful votes, total votes] - confirm
    against the dataset description.
    """
    best, best_score = None, 0
    for answer in answers:
        helpful = answer["helpful"]
        if helpful[1] == 0:
            continue
        if helpful[0] > best_score:
            best_score = helpful[0]
            best = answer
    return best


def process_generation(df, cats, spl, size):
    """Build and save the answer-generation dataset for one split.

    Rows of ``df`` whose ``category`` is one of ``cats`` are gathered, filtered
    to answerable descriptive questions, shuffled with the module-level
    ``seed`` and scanned in that order. Each row contributes one record whose
    output is the text of its best-voted answer; rows without such an answer
    are skipped. Scanning stops once ``size`` records have been collected.

    Args:
        df: DataFrame providing the columns ``category``, ``questionType``,
            ``is_answerable``, ``answers``, ``questionText`` and
            ``review_snippets``.
        cats: list of category names to keep. More than one category selects
            the ``generation_mix`` output directory, otherwise
            ``generation_OOD``.
        spl: split name, used as the prefix of the output file name.
        size: maximum number of records to emit.

    The output file is named ``<spl>_<len(records)//1000>k.json``, so the name
    depends on how many records were actually collected.
    """
    # Concatenate once instead of growing a frame inside the loop; row order
    # (grouped by category, in `cats` order) is unchanged.
    per_cat = [df[df['category'] == cat] for cat in cats]
    df_cats = pd.concat(per_cat) if per_cat else pd.DataFrame()
    df_cats = df_cats[(df_cats["questionType"] == "descriptive") & (df_cats["is_answerable"] == 1)]
    df_cats = df_cats.sample(frac=1, random_state=seed)

    stru_data = []
    for _, row in tqdm(df_cats.iterrows(), total=len(df_cats)):
        # Previously `answers[0]` was read up front and raised IndexError on a
        # row with an empty answer list; such rows are now simply skipped.
        gt = _select_best_answer(row["answers"])
        if gt is None:
            continue
        stru_data.append({
            "instruction": "Given a question and the related document, and generate the answer to the question based on the information provided in the document.",
            "input": json.dumps({
                'question': row['questionText'],
                'document': row['review_snippets']
            }),
            "output": gt["answerText"],
        })
        if len(stru_data) == size:
            break

    # `out_dir` rather than `dir`, which would shadow the builtin.
    if len(cats) > 1:
        out_dir = './stru_data/generation_mix/'
    else:
        out_dir = './stru_data/generation_OOD/'
    os.makedirs(out_dir, exist_ok=True)
    print(len(stru_data))
    dump_json(stru_data, '{}/{}_{}k.json'.format(out_dir, spl, len(stru_data)//1000))

def process_split_generation(path, cats, spl, size):
    """Load ``<path>/<spl>-qar.jsonl`` and pass it to ``process_generation``."""
    split_file = '{}/{}-qar.jsonl'.format(path, spl)
    records = read_jsonl(split_file)
    process_generation(pd.DataFrame(records), cats, spl, size)

def call_process_generation(cats, path, train_size, val_size, test_size):
    """Run the generation pipeline over the splits appropriate for ``cats``.

    With more than one category the train, val and test splits are processed
    (in that order); otherwise only the test split is.
    """
    schedule = []
    if len(cats) > 1:
        schedule.extend([('train', train_size), ('val', val_size)])
    schedule.append(('test', test_size))
    for split_name, split_size in schedule:
        process_split_generation(path, cats, split_name, split_size)

In [21]:
# Build the answer-generation datasets from the productQA dump.
path = './datasets/productQA'
train_size, val_size, test_size = 10000, 1000, 1000
# Two categories -> train/val/test splits written under generation_mix.
call_process_generation(["Electronics","Home_and_Kitchen"], path, train_size, val_size, test_size)
# One category -> only the test split, written under generation_OOD.
call_process_generation(["Cell_Phones_and_Accessories"], path, train_size, val_size, test_size)

 12%|█▏        | 18260/148443 [00:00<00:06, 18665.73it/s]


10000


 10%|▉         | 1811/18820 [00:00<00:00, 18614.81it/s]


1000


  9%|▉         | 1791/19245 [00:00<00:00, 18060.38it/s]


1000


 72%|███████▏  | 2062/2870 [00:00<00:00, 21455.20it/s]


1000


## diverse instruction

In [10]:
def diverse_instruction(path, instrs, unseen):
    """Write instruction-diversified copies of every JSON dataset in ``path``.

    For each ``.json`` file in ``path``:
      * every entry's ``instruction`` is replaced by one drawn at random from
        ``instrs`` and the result is saved under ``<path>_di/<file>``;
      * if the file name starts with ``test``, every entry's ``instruction``
        is then set to ``unseen`` and the result is saved under
        ``<path>_ui/<file>``.

    Args:
        path: directory containing the source JSON files (no trailing slash,
            since the output directories are named ``path + '_di'`` /
            ``path + '_ui'``).
        instrs: list of instruction strings to draw from.
        unseen: instruction applied to all entries of the ``_ui`` test copy.

    Draws come from the global ``random`` generator, so seed it beforehand for
    reproducible output.
    """
    di_dir = '{}_di'.format(path)
    ui_dir = '{}_ui'.format(path)
    # os.listdir returns entries in arbitrary order; sorting fixes the order in
    # which random draws are consumed so a given seed always yields the same
    # assignment.
    for file in sorted(os.listdir(path)):
        # Skip anything that is not a dataset file (e.g. the
        # `.ipynb_checkpoints` directory Jupyter may create), which load_json
        # could not read.
        if not file.endswith('.json'):
            continue
        data = load_json(os.path.join(path, file))
        for entry in data:
            entry["instruction"] = random.sample(instrs, k=1)[0]
        os.makedirs(di_dir, exist_ok=True)
        dump_json(data, os.path.join(di_dir, file))

        if file.startswith('test'):
            # The `_di` copy is already on disk, so the entries can be
            # overwritten in place for the unseen-instruction copy.
            for entry in data:
                entry["instruction"] = unseen
            os.makedirs(ui_dir, exist_ok=True)
            dump_json(data, os.path.join(ui_dir, file))

In [11]:
# Re-seed so the held-out choice and the per-entry draws below are repeatable.
random.seed(seed)
# Instruction originally written into the answerable datasets.
base_instr = "Given a question and the related document, predict if the question is answerable based on the information provided in the document. Output only yes or no."
# Paraphrases of the base instruction.
instrs = [
"Evaluate the answerability of a question by analyzing the related document, outputting yes if the document contains information addressing the question, and no otherwise.",
"Predict whether it is possible to answer the given question using the supporting document, and output a yes or no response.",
"Analyze a question and its supporting document. Predicting answerability based on the information provided in the document. Output yes if the document contains relevant information to answer the question, otherwise output no.",
"Given a question and its related document, determine if the question is answerable by analyzing the information in the document. Output yes if the document addresses the question, or no otherwise.",
"Output yes if the supporting document can answer the given question. Otherwise, output no."
]
# Hold one paraphrase out of the pool; it is only used for the `_ui` test copies.
unseen = random.sample(instrs, k=1)[0]
instrs.remove(unseen)
print(unseen)
# The pool used for the `_di` copies is the remaining paraphrases plus the base instruction.
instrs.append(base_instr)
diverse_instruction('./stru_data/answerable_mix', instrs, unseen)
diverse_instruction('./stru_data/answerable_OOD', instrs, unseen)

Predict whether it is possible to answer the given question using the supporting document, and output a yes or no response.


## few-shot

In [4]:
import os
def few_shot(path, fallback_train='./stru_data/answerable_mix/train_10k.json'):
    """Write a one-shot version of ``<path>/test_1k.json``.

    The i-th test entry is paired with the i-th training entry as its
    demonstration. The result is saved to ``<path>_few_shot/test_1k.json``.

    Args:
        path: dataset directory containing ``test_1k.json`` and, optionally,
            ``train_10k.json``.
        fallback_train: training file to draw demonstrations from when
            ``<path>/train_10k.json`` does not exist.

    Raises:
        IndexError: if the training file has fewer entries than the test file.
    """
    test_data = load_json('{}/test_1k.json'.format(path))
    try:
        train_data = load_json('{}/train_10k.json'.format(path))
    except FileNotFoundError:
        # Only a missing file triggers the fallback; other failures (such as
        # malformed JSON) now surface instead of being silently masked.
        train_data = load_json(fallback_train)

    # Named `records` so the local list does not shadow this function's name.
    records = []
    for index, entry in enumerate(test_data):
        demo = train_data[index]
        records.append({
            'instruction': entry['instruction'],
            'example': json.dumps({
                'input': demo['input'],
                'output': demo['output']
            }),
            'test example': json.dumps({
                'input': entry['input'],
                'output': entry["output"]
            }),
        })

    out_dir = '{}_few_shot'.format(path)
    os.makedirs(out_dir, exist_ok=True)
    dump_json(records, '{}/test_1k.json'.format(out_dir))

In [5]:
# One-shot test sets for the answerable data, original and `_di` instruction variants.
few_shot('./stru_data/answerable_mix')
few_shot('./stru_data/answerable_mix_di')
few_shot('./stru_data/answerable_OOD')
few_shot('./stru_data/answerable_OOD_di')

## generation - diverse instruction (di) & few-shot

In [22]:
# Re-seed so the held-out choice and the per-entry draws below are repeatable.
random.seed(seed)
# Instruction originally written into the generation datasets.
base_instr = "Given a question and the related document, and generate the answer to the question based on the information provided in the document."
# Paraphrases of the base instruction.
instrs = [
"Generate an answer to the question by utilizing the information contained in the document.",
"Utilize the information provided in the supporting document to generate an answer to the given question.",
"Extract information from the supporting document to answer the given question.",
"Answer the given question using the supporting document.",
"Answer the given question by extracting information from the supporting document."
]
# Hold one paraphrase out of the pool; it is only used for the `_ui` test copies.
unseen = random.sample(instrs, k=1)[0]
instrs.remove(unseen)
print(unseen)
# The pool used for the `_di` copies is the remaining paraphrases plus the base instruction.
instrs.append(base_instr)
diverse_instruction('./stru_data/generation_mix', instrs, unseen)
diverse_instruction('./stru_data/generation_OOD', instrs, unseen)

Utilize the information provided in the supporting document to generate an answer to the given question.


In [6]:
import os
def few_shot(path, fallback_train='./stru_data/generation_mix/train_10k.json'):
    """Write a one-shot version of ``<path>/test_1k.json``.

    The i-th test entry is paired with the i-th training entry as its
    demonstration. The result is saved to ``<path>_few_shot/test_1k.json``.

    Args:
        path: dataset directory containing ``test_1k.json`` and, optionally,
            ``train_10k.json``.
        fallback_train: training file to draw demonstrations from when
            ``<path>/train_10k.json`` does not exist.

    Raises:
        IndexError: if the training file has fewer entries than the test file.
    """
    test_data = load_json('{}/test_1k.json'.format(path))
    try:
        train_data = load_json('{}/train_10k.json'.format(path))
    except FileNotFoundError:
        # Only a missing file triggers the fallback; other failures (such as
        # malformed JSON) now surface instead of being silently masked.
        train_data = load_json(fallback_train)

    # Named `records` so the local list does not shadow this function's name.
    records = []
    for index, entry in enumerate(test_data):
        demo = train_data[index]
        records.append({
            'instruction': entry['instruction'],
            'example': json.dumps({
                'input': demo['input'],
                'output': demo['output']
            }),
            'test example': json.dumps({
                'input': entry['input'],
                'output': entry["output"]
            }),
        })

    out_dir = '{}_few_shot'.format(path)
    os.makedirs(out_dir, exist_ok=True)
    dump_json(records, '{}/test_1k.json'.format(out_dir))

In [7]:
# One-shot test sets for the generation data, original and `_di` instruction variants.
few_shot('./stru_data/generation_mix')
few_shot('./stru_data/generation_mix_di')
few_shot('./stru_data/generation_OOD')
few_shot('./stru_data/generation_OOD_di')