In [1]:
import random
import numpy as np

# One seed shared by every random number generator this notebook touches,
# so that reruns from a fresh kernel are reproducible.
seed = 1
np.random.seed(seed)  # NumPy's global generator
random.seed(seed)     # Python's built-in `random` module

In [2]:
import pickle
import json
def load_pickle(addr):
    """Read the file at ``addr`` and return the object pickled inside it.

    NOTE(review): unpickling can execute arbitrary code, so only call this on
    files that come from a trusted source.
    """
    with open(addr, 'rb') as handle:
        obj = pickle.load(handle)
    return obj
    
def dump_pickle(data, addr):
    """Pickle ``data`` into the file at ``addr``, replacing any existing file."""
    with open(addr, 'wb') as handle:
        pickle.Pickler(handle).dump(data)

def load_json(addr):
    """Parse the JSON document stored at ``addr`` and return the resulting object.

    The file is decoded as UTF-8 explicitly rather than with the platform's
    default encoding, so non-ASCII text loads the same way on every machine
    (``readjsonl`` in this notebook opens its input the same way).
    """
    with open(addr, 'r', encoding='utf-8') as f:
        return json.load(f)
    
def dump_json(data, addr):
    """Serialize ``data`` as JSON (4-space indentation) into the file at ``addr``.

    Any existing file at ``addr`` is overwritten.
    """
    with open(addr, 'w') as handle:
        json.dump(data, fp=handle, indent=4)

In [3]:
import jsonlines
import pandas as pd
import re
import os

def readjsonl(file):
    """Load every record of the JSON Lines file ``file`` into a list, in file order.

    The file is opened as UTF-8 text and iterated with ``jsonlines.Reader``.
    """
    with open(file, "r", encoding="utf8") as stream:
        reader = jsonlines.Reader(stream)
        return [record for record in reader]
    
def concat_text(raw_text):
    """Join the strings in ``raw_text`` with single spaces and strip the
    leading/trailing whitespace from the result."""
    return ' '.join(raw_text).strip()

def is_similar(text, target):
    """Return True when ``text`` and ``target`` are identical once every run of
    non-word characters and underscores has been removed from both.

    The comparison is case-sensitive.
    """
    noise = re.compile(r'[\W_]+')
    return noise.sub('', text) == noise.sub('', target)

In [4]:
def read_split(path):
    """Load the positive and negative records of each split found under ``path``.

    For every split name in ('train', 'val', 'test') this reads
    ``<path>/<split>/00_All/mave_positives.jsonl`` and
    ``<path>/<split>/00_All/mave_negatives.jsonl`` with ``readjsonl``.

    Returns:
        dict mapping each split name to a single list: positives first,
        negatives after.
    """
    splits = {}
    for split in ('train', 'val', 'test'):
        positives = readjsonl(os.path.join(path, split, '00_All/mave_positives.jsonl'))
        negatives = readjsonl(os.path.join(path, split, '00_All/mave_negatives.jsonl'))
        splits[split] = positives + negatives
    return splits

def add_attr(path):
    """Read all splits under ``path`` and return them as DataFrames with an
    extra ``attr_key`` column.

    ``attr_key`` is the ``key`` of the first item in each record's
    ``attributes`` list, so every record must carry at least one attribute.

    Returns:
        dict mapping split name to a ``pandas.DataFrame`` of its records.
    """
    frames = {}
    for split, records in read_split(path).items():
        for record in records:
            record["attr_key"] = record["attributes"][0]["key"]
        frames[split] = pd.DataFrame(records)
    return frames

def holdout_attr(df_test):
    """Randomly pick attribute keys of ``df_test`` to hold out.

    The distinct ``attr_key`` values are shuffled (twice, with the global
    ``random`` generator) and then taken in that order until the rows they
    cover in ``df_test`` add up to at least 1000, or the keys run out.

    Returns:
        list of the selected attribute keys, in selection order.
    """
    candidates = df_test["attr_key"].unique().tolist()
    # Two shuffles on purpose: dropping one would change the RNG stream and
    # therefore which attributes end up held out.
    random.shuffle(candidates)
    random.shuffle(candidates)

    selected = []
    covered_rows = 0
    for name in candidates:
        if covered_rows >= 1000:
            break
        covered_rows += int((df_test["attr_key"] == name).sum())
        selected.append(name)
    return selected

def filter_df(data):
    """Split every DataFrame in ``data`` into in-distribution and held-out rows.

    The held-out attribute keys are chosen once, from ``data['test']``, by
    ``holdout_attr``. For each split, rows whose ``attr_key`` is one of them go
    to the held-out frame and all other rows stay in the main frame. Both
    frames are then shuffled with ``sample(frac=1, random_state=seed)``.

    Args:
        data: dict mapping split name to a DataFrame with an ``attr_key``
            column. It is updated in place with the filtered frames.

    Returns:
        ``(data, data_holdouts)``: the updated input dict and a dict with the
        held-out rows of each split.
    """
    holdouts = holdout_attr(data['test'])
    data_holdouts = {'train': [], 'val': [], 'test': []}
    for key in data:
        df = data[key]
        held_mask = df["attr_key"].isin(holdouts)
        # Collect the held-out rows attribute by attribute, in holdout order,
        # and concatenate once. This keeps the pre-shuffle row order of the
        # original loop (so the seeded shuffle gives the same result) without
        # repeatedly concatenating onto an initially empty DataFrame.
        parts = [df[df["attr_key"] == h] for h in holdouts]
        df_wh = pd.concat(parts) if parts else df.iloc[0:0]
        df_woh = df[~held_mask]
        data[key] = df_woh.sample(frac=1, random_state=seed)
        data_holdouts[key] = df_wh.sample(frac=1, random_state=seed)
    return data, data_holdouts

In [5]:
from tqdm import tqdm

def process_stru_data(df, size):
    """Convert up to ``size`` rows of ``df`` into instruction-style records.

    Each row must provide:
      * ``paragraphs``: list of dicts with ``source`` and ``text``;
      * ``attributes``: list of dicts with ``key`` and ``evidences``, where
        every evidence has ``value`` and ``pid`` (index into ``paragraphs``).

    For every row the result contains a dict with
      * ``instruction``: the fixed task prompt;
      * ``input``: JSON object mapping ``'product <source>'`` to the texts of
        that source joined by ``', '``, plus ``'target attributes'``;
      * ``output``: JSON list of ``{attribute, value, source}`` objects, all
        lower-cased, with value/source ``'None'`` for an attribute that has no
        evidence.

    Duplicate (attribute, value, source) triples are emitted once, in order of
    first appearance, so the output is identical from run to run.

    Args:
        df: DataFrame of records.
        size: maximum number of records to produce.

    Returns:
        list of the generated record dicts.
    """
    instruction = "Given the title, description, feature, price, and brand of a product and a set of target attributes, extract the value of each target attribute from the product information. Output the extracted value and the corresponding source (e.g., title or feature) denoting where the value is extracted."
    stru_data = []
    for _, entry in tqdm(df.iterrows(), total=len(df)):
        if len(stru_data) == size:
            return stru_data
        paragraphs = entry['paragraphs']
        attributes = entry['attributes']

        # Group paragraph texts by source and remember each paragraph's group.
        texts = {}
        pids = {}
        for i, para in enumerate(paragraphs):
            key = 'product ' + para['source']
            texts.setdefault(key, []).append(para['text'])
            pids[i] = key
        for key in texts:
            texts[key] = ', '.join(texts[key])

        attr_cand = []
        # A dict is used as an insertion-ordered set: iterating a real set of
        # string tuples depends on hash randomization, which made the order of
        # the output list change between runs.
        pairs = {}
        for attr in attributes:
            attr_cand.append(attr['key'])
            evids = attr['evidences']
            if len(evids) == 0:
                pairs[(attr['key'].lower(), 'None', 'None')] = None
            for evid in evids:
                pairs[(attr['key'].lower(), evid['value'].lower(), pids[evid['pid']].lower())] = None

        texts['target attributes'] = ' and '.join(attr_cand)
        new_entry = {
            "instruction": instruction,
            "input": json.dumps(texts),
            "output": json.dumps([{
                'attribute': attribute,
                'value': value,
                'source': source
            } for attribute, value, source in pairs]),
        }
        stru_data.append(new_entry)
    return stru_data

def process_df(data, train_size, vt_size, ood=False):
    """Convert each split in ``data`` with ``process_stru_data`` and save it as JSON.

    Files are written to ``./stru_data/attr_value_extraction`` (or
    ``./stru_data/attr_value_extraction_OOD`` when ``ood`` is true) under the
    name ``<split>_<N>k.json``, where ``N`` is the number of records produced
    divided by 1000, rounded down. The directory is created if missing.

    Args:
        data: dict mapping split name to a DataFrame.
        train_size: maximum number of records for the 'train' split.
        vt_size: maximum number of records for every other split.
        ood: when true, only the 'test' split is processed.
    """
    if ood:
        out_dir = './stru_data/attr_value_extraction_OOD'
    else:
        out_dir = './stru_data/attr_value_extraction'
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    for split, frame in data.items():
        if ood and split != 'test':
            continue
        # 'train' gets its own cap; validation and test share vt_size.
        limit = train_size if split == 'train' else vt_size
        records = process_stru_data(frame, limit)
        dump_json(records, '{}/{}_{}k.json'.format(out_dir, split, len(records)//1000))
    return

def process_ave(path, train_size, vt_size):
    """Run the whole pipeline for the dataset splits stored under ``path``.

    Loads the splits (``add_attr``), separates held-out attributes
    (``filter_df``), then writes the in-distribution files followed by the
    out-of-distribution test file (``process_df``).

    Args:
        path: root directory of the splits.
        train_size: cap on the number of training records.
        vt_size: cap on the number of validation/test records.
    """
    in_distribution, held_out = filter_df(add_attr(path))
    process_df(in_distribution, train_size, vt_size)
    process_df(held_out, train_size, vt_size, ood=True)

In [None]:
# Re-seed right before the run so the held-out attribute selection does not
# depend on how much randomness earlier cells consumed.
random.seed(seed)

path = './datasets/mave/datasets/splits/PRODUCT'  # root of the train/val/test splits
train_size = 10000  # cap on training records
vt_size = 1000      # cap on validation and test records
process_ave(path, train_size=train_size, vt_size=vt_size)

## diverse instruction

In [5]:
import os
def diverse_instruction(path, instrs, unseen):
    """Write copies of the JSON datasets in ``path`` with varied instructions.

    For every ``*.json`` file in ``path``:
      * each entry's ``instruction`` is replaced by one drawn at random from
        ``instrs`` and the file is saved under ``<path>_di``;
      * if the file name starts with ``'test'``, every entry's ``instruction``
        is then set to ``unseen`` and the file is also saved under
        ``<path>_ui``.

    Files are visited in sorted name order: ``os.listdir`` returns entries in
    arbitrary order, which would otherwise make the seeded random assignment
    depend on the filesystem. Entries that are not regular ``.json`` files
    (sub-directories, editor/checkpoint artefacts) are skipped.

    Args:
        path: directory containing the source JSON files.
        instrs: list of instructions to sample from.
        unseen: instruction reserved for the ``_ui`` test copies.
    """
    di_dir = '{}_di'.format(path)
    ui_dir = '{}_ui'.format(path)
    for file in sorted(os.listdir(path)):
        source = os.path.join(path, file)
        if not (file.endswith('.json') and os.path.isfile(source)):
            continue
        data = load_json(source)
        for entry in data:
            entry["instruction"] = random.sample(instrs, k=1)[0]
        os.makedirs(di_dir, exist_ok=True)
        dump_json(data, os.path.join(di_dir, file))

        if file.startswith('test'):
            # Reuse the same entries; only the instruction differs in the _ui copy.
            for entry in data:
                entry["instruction"] = unseen
            os.makedirs(ui_dir, exist_ok=True)
            dump_json(data, os.path.join(ui_dir, file))

In [7]:
# Re-seed so that the choice of the held-out ("unseen") instruction and the
# per-entry sampling below do not depend on randomness used by earlier cells.
random.seed(seed)
# Baseline prompt; same wording as the instruction written by process_stru_data.
base_instr = "Given the title, description, feature, price, and brand of a product and a set of target attributes, extract the value of each target attribute from the product information. Output the extracted value and the corresponding source (e.g., title or feature) denoting where the value is extracted."
# Alternative phrasings of the task prompt.
instrs = [
"Extract the value of the target attribute from the given product information and output it along with the corresponding source.",
# NOTE(review): the next prompt lists "title" twice ("title, description, feature, or title");
# it is left untouched because the exact text ends up in the generated datasets.
"Retrieve the value associated with the target attribute from the product information and specify the source (e.g., title, description, feature, or title) where the value was found.",
"Parse the product information to locate the target attribute, and then provide the extracted value of the target attribute and its source in the output, specifying None if the attribute is not present.",
"First, identify the target attributes from the provided list. Then, scan the product title, description, feature, and brand to extract the values associated with each target attribute. Finally, create a list of dictionaries, each containing the extracted attribute, its corresponding value, and the source where it was found.",
"Using the product's title, description, features, price, and brand, identify and retrieve the values associated with a specified set of target attributes. Output the extracted values along with their respective sources (e.g., title or feature) indicating where each value was found."
]
# Pull one phrasing out of the pool; diverse_instruction uses it only for the
# "_ui" copies of the test files.
unseen = random.sample(instrs, k=1)[0]
instrs.remove(unseen)
print(unseen)
# The sampling pool is the remaining phrasings plus the baseline prompt.
instrs.append(base_instr)
print(instrs)
# Generate the "_di" / "_ui" variants for both the regular and the OOD data.
diverse_instruction('./stru_data/attr_value_extraction', instrs, unseen)
diverse_instruction('./stru_data/attr_value_extraction_OOD', instrs, unseen)

Retrieve the value associated with the target attribute from the product information and specify the source (e.g., title, description, feature, or title) where the value was found.
['Extract the value of the target attribute from the given product information and output it along with the corresponding source.', 'Parse the product information to locate the target attribute, and then provide the extracted value of the target attribute and its source in the output, specifying None if the attribute is not present.', 'First, identify the target attributes from the provided list. Then, scan the product title, description, feature, and brand to extract the values associated with each target attribute. Finally, create a list of dictionaries, each containing the extracted attribute, its corresponding value, and the source where it was found.', "Using the product's title, description, features, price, and brand, identify and retrieve the values associated with a specified set of target attribute

## few-shot

In [8]:
def few_shot(path):
    """Build a one-shot version of ``<path>/test_1k.json``.

    The i-th test entry is paired with the i-th training entry as its
    demonstration. Training data is read from ``<path>/train_10k.json``; if
    that file does not exist (process_df writes only a test split for the OOD
    directory), ``./stru_data/attr_value_extraction/train_10k.json`` is used
    instead. Only a missing file triggers the fallback: a corrupt or
    unreadable training file raises instead of being silently replaced.

    Each output entry has ``instruction`` (copied from the test entry),
    ``example`` (JSON with the demonstration's input/output) and
    ``test example`` (JSON with the test entry's input/output). The result is
    written to ``<path>_few_shot/test_1k.json``.

    Raises:
        IndexError: if the training file has fewer entries than the test file.
    """
    test_data = load_json('{}/test_1k.json'.format(path))
    try:
        train_data = load_json('{}/train_10k.json'.format(path))
    except FileNotFoundError:
        train_data = load_json('./stru_data/attr_value_extraction/train_10k.json')

    examples = []
    for index, entry in enumerate(test_data):
        demo = train_data[index]
        examples.append({
            'instruction': entry['instruction'],
            'example': json.dumps({
                'input': demo['input'],
                'output': demo['output']
            }),
            'test example': json.dumps({
                'input': entry['input'],
                'output': entry["output"]
            }),
        })

    out_dir = '{}_few_shot'.format(path)
    os.makedirs(out_dir, exist_ok=True)
    dump_json(examples, '{}/test_1k.json'.format(out_dir))

In [9]:
# Build the one-shot test file for the base data, its diverse-instruction
# variant, and the two corresponding OOD directories, in this order.
for few_shot_dir in (
    './stru_data/attr_value_extraction',
    './stru_data/attr_value_extraction_di',
    './stru_data/attr_value_extraction_OOD',
    './stru_data/attr_value_extraction_OOD_di',
):
    few_shot(few_shot_dir)