In [1]:
import random
import numpy as np

# One seed for the whole notebook: it seeds Python's `random`, NumPy's global
# RNG, and is also passed as `random_state=` to the pandas `.sample()` calls
# in the cells below.
seed = 1
random.seed(seed)
np.random.seed(seed)

In [2]:
import pickle
import json
def load_pickle(addr):
    """Open ``addr`` in binary mode and return the object unpickled from it.

    Unpickling can execute arbitrary code, so ``addr`` should only point at
    files from a trusted source.
    """
    with open(addr, mode='rb') as stream:
        obj = pickle.load(stream)
    return obj
    
def dump_pickle(data, addr):
    """Pickle ``data`` into the file at ``addr``.

    The file is opened in binary write mode, so existing content is replaced.
    """
    with open(addr, mode='wb') as stream:
        pickle.dump(data, stream)

def load_json(addr):
    """Open ``addr`` in text mode and return the parsed JSON document."""
    with open(addr, mode='r') as stream:
        parsed = json.load(stream)
    return parsed
    
def dump_json(data, addr):
    """Serialise ``data`` as JSON (4-space indent) into the file at ``addr``.

    The file is opened in text write mode, so existing content is replaced.
    """
    with open(addr, mode='w') as stream:
        json.dump(data, stream, indent=4)

In [3]:
import pandas as pd
import unicodedata
import traceback
import os
from tqdm import tqdm

def nor_unicode(text):
    """Return an ASCII-only version of ``text``.

    The text is NFKD-normalised (which splits accented characters into a base
    character plus combining marks and expands compatibility forms), then every
    character that cannot be encoded as ASCII is dropped.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8')

def load_raw_data(path):
    """Load the JSON dataset at ``path`` and split it into train / val / test frames.

    Rows whose ``split`` column is ``"train"`` form the training frame. Rows
    whose ``split`` is ``"test"`` are divided in two: a random half (sampled
    with the notebook-level ``seed``) becomes the test frame and the remaining
    rows become the validation frame.

    Returns:
        tuple: ``(df_train, df_val, df_test)``.
    """
    with open(path, 'r', encoding='utf-8') as f:
        df_task = pd.DataFrame(json.load(f))
    df_train = df_task[df_task["split"] == "train"]
    df_test_dev = df_task[df_task["split"] == "test"]
    df_test = df_test_dev.sample(frac=0.5, random_state=seed)
    # Take the complement by row index. Comparing row *content* (concat +
    # drop_duplicates(keep=False)) would also throw away held-out rows that
    # merely look identical to another row.
    df_val = df_test_dev.drop(index=df_test.index)
    return df_train, df_val, df_test

## ranking

In [4]:
def rank_query(df, spl, n):
    """Build ``n`` distinct ranking sets and cache them as a pickle.

    Only rows with ``small_version == 1`` are used. Each ranking set is drawn
    for one randomly chosen query and holds between 2 and 4 rows of that
    query, each with a different ``esci_label`` out of E/S/C/I. A set is kept
    only if its combination of ``example_id`` values has not been seen before.

    The loop keeps drawing until ``n`` sets are collected, so it does not
    terminate if the data cannot supply ``n`` distinct sets.

    Args:
        df: frame with ``small_version``, ``query``, ``esci_label`` and
            ``example_id`` columns.
        spl: split name, used only in the cache file name.
        n: number of ranking sets to build.

    Returns:
        list: ``n`` lists of rows (one pandas Series per selected product).

    Raises:
        ValueError: if ``n > 0`` but there is no query to draw from.
    """
    exampleids = set()    # sorted example_id tuples of the sets accepted so far
    rankinglist = []
    df_1 = df[df['small_version'] == 1]
    queries = df_1['query'].unique().tolist()
    if n > 0 and not queries:
        # Without this guard the loop below would spin forever on errors.
        raise ValueError("no rows with small_version == 1 to build ranking sets from")
    while len(rankinglist) < n:
        try:
            query = random.sample(queries, 1)[0]    # random query
            items = df_1[df_1['query'] == query]
            length = random.randint(2, 4)    # number of items in this ranking set
            # skip queries that do not have enough distinct labels
            if len(items) == 0 or len(items['esci_label'].unique()) < length:
                continue
            esci = ['E', 'S', 'C', 'I']    # labels not tried yet for this set
            rank = []    # items in this ranking set
            pair = []    # their example ids
            while len(rank) < length:
                label = random.sample(esci, 1)[0]
                esci.remove(label)    # every label is tried at most once
                candidates = items[items['esci_label'] == label]
                if candidates.empty:
                    continue
                item = candidates.sample(n=1, random_state=seed).iloc[0]
                pair.append(item['example_id'])
                rank.append(item)
            pair_set = tuple(sorted(pair))
            if pair_set not in exampleids:
                rankinglist.append(rank)
                exampleids.add(pair_set)
        except Exception:
            # Best effort: report and draw again. `Exception` (not a bare
            # except) so Ctrl-C / kernel interrupt can still stop the loop.
            traceback.print_exc()

    # Make sure the cache directory exists before writing into it.
    os.makedirs('./raw_data', exist_ok=True)
    dump_pickle(rankinglist, './raw_data/rankinglist_{}_{}k.pickle'.format(spl, n//1000))
    return rankinglist

def process_rank(df, spl, size):
    """Turn ranking sets into instruction-tuning records and write them as JSON.

    The ranking sets are read from the pickle cache under ``./raw_data`` when
    it exists; otherwise they are generated with ``rank_query``. Each set
    becomes one record whose input lists the products as ``A``, ``B``, ... in
    stored order and whose output lists those letters ordered by label:
    E first, then S, then C, then anything else.

    Two files are written to ``./stru_data/rank``: the records and, in the
    same order, the per-record list of labels.
    """
    cache_path = './raw_data/rankinglist_{}_{}k.pickle'.format(spl, size//1000)
    if os.path.exists(cache_path):
        rankinglist = load_pickle(cache_path)
    else:
        rankinglist = rank_query(df, spl, size)

    slot_of = {'E': 0, 'S': 1, 'C': 2}    # every other label goes to slot 3
    stru_data = []
    label_list = []
    for rank in tqdm(rankinglist):
        query = nor_unicode(rank[0]['query'])
        # random.shuffle(rank)
        products = [
            [chr(65 + pos), nor_unicode(entry['product_title']), entry['esci_label']]
            for pos, entry in enumerate(rank)
        ]
        ordered = ['', '', '', '']
        for letter, _title, label in products:
            ordered[slot_of.get(label, 3)] = letter
        stru_data.append({
            "instruction": "Given a query and a list of products denoted as A, B, C, ... with their titles, rank the products according to their relevance to the query. Output only a ranked list in which the most relevant product is at the top of the list.",
            "input": json.dumps({
                'query': query,
                'product list': [(str(letter) + ': ' + str(title)) for letter, title, _label in products]
            }),
            "output": ','.join(''.join(ordered)).strip(','),
        })
        label_list.append([label for _letter, _title, label in products])

    out_dir = './stru_data/rank'
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    size_k = len(stru_data)//1000
    dump_json(stru_data, '{}/{}_{}k.json'.format(out_dir, spl, size_k))
    dump_json(label_list, '{}/label_{}_{}k.json'.format(out_dir, spl, size_k))
    # return stru_data

def process_split_rank(train, val, test, train_size, val_size, test_size):
    """Run ``process_rank`` for the train, val and test splits, in that order."""
    jobs = (
        (train, 'train', train_size),
        (val, 'val', val_size),
        (test, 'test', test_size),
    )
    for frame, split_name, split_size in jobs:
        process_rank(frame, split_name, split_size)

## multiclass product classification

In [11]:
def process_mulclass(df, spl, size):
    """Build the four-way relevance classification records for one split.

    ``size`` rows are sampled from ``df`` (``random_state=seed+1``). For each
    row a record is produced with the instruction, the query and product title
    as JSON input, the four answer options, and the option matching the row's
    ``esci_label`` as output.

    Two files are written: the records to ``./stru_data/multi_classification``
    and ``[query, title, label]`` triples to ``./baseline/data_mc``. Both
    directories are created if missing.

    Raises:
        KeyError: if a row carries a label other than E, S, C or I.
    """
    df = df.sample(n=size, random_state=seed+1)
    labels = {'E':'A: The product is relevant to the query, and satisfies all the query specifications.',
          'S':'B: The product is somewhat relevant. It fails to fulfill some aspects of the query but the product can be used as a functional substitute.',
          'C':'C: The product does not fulfill the query, but could be used in combination with a product exactly matching the query.',
          'I':'D: The product is irrelevant to the query.'}
    instruction = "What is the relevance between the query and the product title below? Answer from one of the options."
    # The option list is exactly the label texts in A-D order; derive it from
    # the table so the two cannot drift apart, and serialise it only once.
    options = json.dumps(list(labels.values()))
    stru_data = []
    baseline = []
    for _, row in df.iterrows():
        query = nor_unicode(row['query'])
        title = nor_unicode(row['product_title'])
        stru_data.append({
            "instruction": instruction,
            "input": json.dumps({
                'query': query,
                'product title': title
            }),
            "options": options,
            "output": labels[row['esci_label']],
        })
        baseline.append([query, title, row['esci_label']])

    out_dir = './stru_data/multi_classification'
    baseline_dir = './baseline/data_mc'
    os.makedirs(out_dir, exist_ok=True)
    # The baseline directory is written to as well, so it has to exist too.
    os.makedirs(baseline_dir, exist_ok=True)
    size_k = len(stru_data)//1000
    dump_json(stru_data, '{}/{}_{}k.json'.format(out_dir, spl, size_k))
    dump_json(baseline, '{}/{}_{}k.json'.format(baseline_dir, spl, size_k))
    # return stru_data

def process_split_mulclass(train, val, test, train_size, val_size, test_size):
    """Run ``process_mulclass`` for the train, val and test splits, in that order."""
    jobs = (
        (train, 'train', train_size),
        (val, 'val', val_size),
        (test, 'test', test_size),
    )
    for frame, split_name, split_size in jobs:
        process_mulclass(frame, split_name, split_size)

## substitute identification

In [12]:
def process_iden(df, spl, size):
    """Build the yes/no substitute-identification records for one split.

    ``size`` rows are sampled from ``df`` (``random_state=seed``). Each row
    becomes a record whose output is ``yes`` when the row's ``esci_label`` is
    ``S`` and ``no`` otherwise. The records are written to
    ``./stru_data/s_identification``.
    """
    positive_label = 'S'
    sampled = df.sample(n=size, random_state=seed)
    stru_data = []
    for _, row in sampled.iterrows():
        record = {
            "instruction": "Given a query and a product, identify if the product is somewhat relevant to the query. It fails to fulfill some aspects of the query but the product can be used as a functional substitute. Only output yes or no.",
            "input": json.dumps({
                'query': nor_unicode(row['query']),
                'product': nor_unicode(row['product_title'])
            }),
        }
        if row['esci_label'] == positive_label:
            record["output"] = 'yes'
        else:
            record["output"] = 'no'
        stru_data.append(record)

    out_dir = './stru_data/s_identification'
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    dump_json(stru_data, '{}/{}_{}k.json'.format(out_dir, spl, len(stru_data)//1000))

def process_split_iden(train, val, test, train_size, val_size, test_size):
    """Run ``process_iden`` for the train, val and test splits, in that order."""
    jobs = (
        (train, 'train', train_size),
        (val, 'val', val_size),
        (test, 'test', test_size),
    )
    for frame, split_name, split_size in jobs:
        process_iden(frame, split_name, split_size)

## Processing

In [None]:
import spacy
from spacy.language import Language
from spacy_langdetect import LanguageDetector
from tqdm import tqdm
import json
import pandas as pd
import traceback

def get_lang_detector(nlp, name):
    """Return a fresh ``LanguageDetector``.

    Registered below as the ``language_detector`` component factory; the
    ``nlp`` and ``name`` arguments are accepted but not used.
    """
    detector = LanguageDetector()
    return detector

# Load the `en_core_web_sm` spaCy pipeline, register get_lang_detector as the
# "language_detector" factory, and append that component as the last stage.
nlp = spacy.load("en_core_web_sm")
Language.factory("language_detector", func=get_lang_detector)
nlp.add_pipe('language_detector', last=True)
def is_en(text):
    """Return True when ``text`` is empty/falsy or is detected as English.

    Non-empty text is run through the module-level ``nlp`` pipeline and the
    detected language code is compared with ``'en'``.
    """
    if not text:
        return True
    detected = nlp(text)._.language
    return detected['language'] == 'en'

In [None]:
# Load the shopping-queries dataset and join every (query, product) example
# with its product record.
df_examples = pd.read_parquet('./datasets/shopping_queries_dataset/shopping_queries_dataset_examples.parquet')
df_products = pd.read_parquet('./datasets/shopping_queries_dataset/shopping_queries_dataset_products.parquet')
df_sources = pd.read_csv("./datasets/shopping_queries_dataset/shopping_queries_dataset_sources.csv")
df_examples_products = pd.merge(
    df_examples,
    df_products,
    how='left',
    left_on=['product_locale','product_id'],
    right_on=['product_locale', 'product_id']
)

# Keep the `us` locale only. `.copy()` makes this an independent frame so the
# clean-up below really lands in it: calling `.replace(..., inplace=True)` on
# `df[col]` of a filtered frame triggers SettingWithCopyWarning and changes
# nothing under pandas copy-on-write.
df_task_2 = df_examples_products[df_examples_products['product_locale'] == 'us'].copy()
df_task_2 = df_task_2.fillna("")
# One pass replaces every whitespace character and every <p>, </p>, <br>,
# </br> tag with a single space. This matches the former sequence of
# replacements, because a space can never complete one of these tags.
df_task_2['product_title'] = df_task_2['product_title'].replace(
    r'\s|</?p>|</?br>', ' ', regex=True)

# Keep only rows whose query and product title are both detected as English
# (empty strings count as English, see is_en).
list_en = []
for index, row in tqdm(df_task_2.iterrows(), total=len(df_task_2), mininterval=20):
    if is_en(row['query']) and is_en(row['product_title']):
        list_en.append(row.to_dict())

with open('./datasets/query_dataset.json', 'w', encoding="utf-8") as f:
    json.dump(list_en, f)

In [6]:
# Re-seed Python's RNG before generating data; rank_query draws from `random`.
random.seed(seed)
path = './datasets/query_dataset.json'
train, val, test = load_raw_data(path)
# Number of examples to generate per split for every task.
train_size, val_size, test_size = 10000, 1000, 1000
process_split_mulclass(train, val, test, train_size, val_size, test_size)
process_split_iden(train, val, test, train_size, val_size, test_size)
process_split_rank(train, val, test, train_size, val_size, test_size)

100%|██████████| 10000/10000 [00:00<00:00, 29699.07it/s]


done train


100%|██████████| 1000/1000 [00:00<00:00, 30297.56it/s]


done val


100%|██████████| 1000/1000 [00:00<00:00, 31201.82it/s]

done test





## diverse instruction

In [14]:
def diverse_instruction(path, instrs, unseen):
    """Write instruction-diversified copies of every JSON dataset in ``path``.

    For each ``*.json`` file in ``path`` (visited in sorted name order so the
    random draws are reproducible for a given seed), every record's
    ``instruction`` is replaced by one drawn at random from ``instrs`` and the
    file is written under the same name to ``{path}_di``. Files whose name
    starts with ``test`` are additionally written to ``{path}_ui`` with every
    record's instruction set to ``unseen``.

    Sub-directories and non-JSON files are skipped. Entries that are not
    dicts (for example the label lists stored next to the ranking data) are
    copied through unchanged.

    Args:
        path: directory holding the source JSON files.
        instrs: pool of instructions to sample from.
        unseen: instruction reserved for the ``_ui`` test copies.
    """
    di_dir = '{}_di'.format(path)
    ui_dir = '{}_ui'.format(path)
    for file in sorted(os.listdir(path)):
        source = os.path.join(path, file)
        if not (file.endswith('.json') and os.path.isfile(source)):
            continue
        data = load_json(source)
        for entry in data:
            if isinstance(entry, dict):
                entry["instruction"] = random.sample(instrs, k=1)[0]
        os.makedirs(di_dir, exist_ok=True)
        dump_json(data, os.path.join(di_dir, file))

        if file.startswith('test'):
            # Reuse the same records, now with the held-out instruction.
            for entry in data:
                if isinstance(entry, dict):
                    entry["instruction"] = unseen
            os.makedirs(ui_dir, exist_ok=True)
            dump_json(data, os.path.join(ui_dir, file))

def run_di(instrs, base_instr, path):
    """Hold out one instruction and diversify the datasets in ``path``.

    One instruction is drawn at random from ``instrs`` and printed; it becomes
    the unseen instruction. The remaining ones plus ``base_instr`` form the
    pool passed to ``diverse_instruction``.

    The caller's ``instrs`` list is left untouched, so calling this again with
    the same list draws from the full pool again.
    """
    unseen = random.sample(instrs, k=1)[0]
    print(unseen)
    # Work on a copy; same element order as before, so later draws match.
    seen = list(instrs)
    seen.remove(unseen)
    seen.append(base_instr)
    diverse_instruction(path, seen, unseen)

In [15]:
# Re-seed so the instruction that run_di holds out for each task is reproducible.
random.seed(seed)
# Ranking task: `base_rank` is the instruction hard-coded in process_rank;
# `instrs_rank` holds paraphrases of it.
base_rank = "Given a query and a list of products denoted as A, B, C, ... with their titles, rank the products according to their relevance to the query. Output only a ranked list in which the most relevant product is at the top of the list."
instrs_rank = [
"Evaluate each product title in the given list, assess its relevance to the given query, and then arrange the products in descending order of relevance, with the most relevant product at the top of the ranked list.",
"Evaluate the query against each product's title, determine the relevance between the query and the product, and organize the products in descending order of relevance, ensuring that the product with the highest relevance is positioned at the top of the list.",
"Rank the products A, B, C, ... based on their relevance to the provided query, and produce a ranked list with the most relevant product positioned at the top of the list.",
"Analyze the query and each product title. Sort the products in descending order based on their relevance to the query. The most relevant product should be at the top of the list, and output the ranked list.",
"Evaluate the relevance of each product title in the input to the given query, and then sort the products in descending order of relevance, placing the most relevant product at the top of the ranked list."
]
# Multi-class task: `base_mc` is the instruction hard-coded in process_mulclass.
base_mc = "What is the relevance between the query and the product title below? Answer from one of the options."
instrs_mc = [
"Analyze the query and product title to determine the relevance between the query and product, and select the appropriate option from the provided options.",
"Evaluate the relevance between the query and product title, and choose the most accurate option from the given options.",
"Analyze the query and product title to assess the level of relevance between them, and then output the corresponding option that best describes this relevance.",
"Determine the relevance between the query and the product title provided, and select your response from one of the available options.",
"Compare the query and the product title to determine if the product fully meets the query specifications. Choose the option that best describes the relevance between them."
]
# Substitute identification: `base_iden` is the instruction hard-coded in process_iden.
base_iden = "Given a query and a product, identify if the product is somewhat relevant to the query. It fails to fulfill some aspects of the query but the product can be used as a functional substitute. Only output yes or no."
instrs_iden = [
"Assess whether the product is a substitute for the query and provide a yes or no response.",
"Answer yes if the product is a substitute for the query and no otherwise.",
"Please respond with yes if the product is a suitable substitute for the query, and no if it is not.",
"Check if a product can function as a substitute for a given query, even if it doesn't fully meet all requirements. Output yes if it can or no otherwise.",
"Assess the relevance of a product to a given query by determining if it can function as a substitute, despite not fully meeting certain aspects of the query. Provide a binary output of yes or no based on this evaluation."
]

# For each task: hold out one paraphrase (printed below) and write the
# `_di` / `_ui` copies of the task directory.
run_di(instrs_rank, base_rank, './stru_data/rank')
run_di(instrs_mc, base_mc, './stru_data/multi_classification')
run_di(instrs_iden, base_iden, './stru_data/s_identification')

Evaluate the query against each product's title, determine the relevance between the query and the product, and organize the products in descending order of relevance, ensuring that the product with the highest relevance is positioned at the top of the list.
Compare the query and the product title to determine if the product fully meets the query specifications. Choose the option that best describes the relevance between them.
Assess whether the product is a substitute for the query and provide a yes or no response.


## few-shot

In [4]:
def few_shot(path):
    """Pair every test record with one training record as a demonstration.

    Reads ``test_1k.json`` and ``train_10k.json`` from ``path``. The i-th test
    record is paired with the i-th training record: the training record's
    input/output is stored as ``example``, the test record's input/output as
    ``test example``, both as JSON strings. The result is written to
    ``{path}_few_shot/test_1k.json``.
    """
    test_data = load_json('{}/test_1k.json'.format(path))
    train_data = load_json('{}/train_10k.json'.format(path))

    def build(position, target):
        # Demonstration comes from the training record at the same position.
        shot = {'instruction': target['instruction']}
        demo = train_data[position]
        shot['example'] = json.dumps({'input': demo['input'], 'output': demo['output']})
        shot['test example'] = json.dumps({'input': target['input'], 'output': target["output"]})
        return shot

    shots = [build(position, target) for position, target in enumerate(test_data)]

    out_dir = '{}_few_shot'.format(path)
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    dump_json(shots, '{}/test_1k.json'.format(out_dir))

In [5]:
import os
def few_shot_woption(path):
    """Like ``few_shot``, for records that also carry an ``options`` field.

    Reads ``test_1k.json`` and ``train_10k.json`` from ``path``. The i-th test
    record is paired with the i-th training record; input, options and output
    of each are stored as JSON strings under ``example`` (training record) and
    ``test example`` (test record). The result is written to
    ``{path}_few_shot/test_1k.json``.
    """
    test_data = load_json('{}/test_1k.json'.format(path))
    train_data = load_json('{}/train_10k.json'.format(path))

    def build(position, target):
        # Demonstration comes from the training record at the same position.
        shot = {'instruction': target['instruction']}
        demo = train_data[position]
        shot['example'] = json.dumps({
            'input': demo['input'],
            'options': demo['options'],
            'output': demo['output'],
        })
        shot['test example'] = json.dumps({
            'input': target['input'],
            'options': target['options'],
            'output': target["output"],
        })
        return shot

    shots = [build(position, target) for position, target in enumerate(test_data)]

    out_dir = '{}_few_shot'.format(path)
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    dump_json(shots, '{}/test_1k.json'.format(out_dir))

In [6]:
# NOTE(review): the following cell issues this same call as its last line, so
# this cell looks redundant on a full top-to-bottom run.
few_shot_woption('./stru_data/multi_classification_di')

In [6]:
# Build the few-shot test sets: ranking and substitute identification use the
# plain variant, multi-class classification the variant that keeps `options`.
for task_path in ('./stru_data/rank',
                  './stru_data/rank_di',
                  './stru_data/s_identification',
                  './stru_data/s_identification_di'):
    few_shot(task_path)
for task_path in ('./stru_data/multi_classification',
                  './stru_data/multi_classification_di'):
    few_shot_woption(task_path)