In [1]:
import random
import numpy as np

seed = 1
random.seed(seed)
np.random.seed(seed)

In [2]:
import pickle
import json
import pandas as pd

def load_pickle(addr):
    with open(addr, 'rb') as f:
        return pickle.load(f)
    
def dump_pickle(data, addr):
    with open(addr, 'wb') as f:
        pickle.dump(data, f)

def load_json(addr):
    with open(addr, 'r') as f:
        return json.load(f)
    
def dump_json(data, addr):
    with open(addr, 'w') as f:
        json.dump(data, f, indent=4)

def load_df(addr):
    with open(addr, 'r') as f:
        return pd.read_csv(f)

In [3]:
import os

def read_df(path):
    amazon = load_df(os.path.join(path, 'Amazon.csv'))
    google = load_df(os.path.join(path, 'GoogleProducts.csv'))
    matched = load_df(os.path.join(path, 'Amzon_GoogleProducts_perfectMapping.csv'))
    dfmerge_a = matched.merge(amazon,how='inner',left_on='idAmazon',right_on='id').drop('id', axis=1)
    dfmerge = dfmerge_a.merge(google,left_on='idGoogleBase',right_on='id',suffixes=('_a','_g')).drop('id', axis=1)
    return dfmerge, amazon, google

In [22]:
import random
from sklearn.model_selection import train_test_split
import json
import numpy as np

def process_desc(text):
    if len(str(text).split(' ')) > 50:
        return ' '.join(str(text).split(' ')[:50]).strip('.') + '...'
    return text

def process_positives(df):
    stru_data, product_pair = [], set()
    for index, entry in df.iterrows():
        prod_a = {
            'title': entry['title'],
            'description': process_desc(entry['description_a']),
            'manufacturer': str(entry['manufacturer_a']),
            'price': str(entry['price_a'])
        }
        prod_g = {
            'title': entry['name'],
            'description': process_desc(entry['description_g']),
            'manufacturer': str(entry['manufacturer_g']),
            'price': str(entry['price_g'])
        }
        if (str(prod_a), str(prod_g)) in product_pair or (str(prod_g), str(prod_a)) in product_pair:
            continue
        product_pair.add((str(prod_a), str(prod_g)))
        product_pair.add((str(prod_g), str(prod_a)))

        new_entry = {}
        new_entry["instruction"] = "Given the title, description, manufacturer, and price of two products, identify if they are the same product. Only output yes or no."
        new_entry["input"] = json.dumps({'product 1': prod_a, 'product 2': prod_g})
        new_entry["output"] = 'yes'
        stru_data.append(new_entry)

    return stru_data, product_pair
    
def process_negatives(size, product_pair, amazon, google):
    stru_data = []
    amazon = amazon.sample(frac=1, random_state=seed)
    google = google.sample(frac=1, random_state=seed)
    for index, prod_a in amazon.iterrows():
        if len(stru_data) == size:
            break
        prod_g = google.iloc[index]
        prod_a = {'title': prod_a['title'],
                  'description': process_desc(prod_a['description']),
                  'manufacturer': str(prod_a['manufacturer']),
                  'price': str(prod_a['price'])}
        prod_g = {'title': prod_g['name'],
                  'description': process_desc(prod_g['description']),
                  'manufacturer': str(prod_g['manufacturer']),
                  'price': str(prod_g['price'])}
        if (str(prod_a), str(prod_g)) not in product_pair and (str(prod_g), str(prod_a)) not in product_pair:
            new_entry = {}
            new_entry["instruction"] = "Given the title, description, manufacturer, and price of two products, identify if they are the same product. Only output yes or no."
            new_entry["input"] = json.dumps({'product 1': prod_a, 'product 2': prod_g})
            new_entry["output"] = 'no'
            stru_data.append(new_entry)
            product_pair.add((str(prod_a), str(prod_g)))
            product_pair.add((str(prod_g), str(prod_a)))
    return stru_data

def process_matching(path):
    df, amazon, google = read_df(path)
    pos, pairs = process_positives(df)
    neg = process_negatives(len(pos), pairs, amazon, google)
    stru_data = [*pos, *neg]
    train, test_val = train_test_split(stru_data, test_size=0.2, random_state=seed)
    val, test = train_test_split(test_val, test_size=0.5, random_state=seed)
    print(test[0]['input'])
    print(len(train))
    dump_json(train, './stru_data/matching/train_10k.json')
    dump_json(val, './stru_data/matching/val_1k.json')
    dump_json(test, './stru_data/matching/test_1k.json')

In [23]:
path = './datasets/Amazon_Google_products'
process_matching(path)

{"product 1": {"title": "weekly reader preparing for kindergarten", "description": NaN, "manufacturer": "fogware publishing", "price": "19.99"}, "product 2": {"title": "mcafee inc total protection 2007 3-user", "description": "mcafee\ufffd total protection peace of mind for your entire family today the variety of threats to your pc fi les and online identity is bewildering.viruses spyware hackers spam and emailscams online predators identitythie", "manufacturer": "nan", "price": "92.51"}}
2022


## diverse instruction

In [26]:
def diverse_instruction(path, instrs, unseen):
    for file in os.listdir(path):
        data = load_json(os.path.join(path, file))
        for entry in data:
            entry["instruction"] = random.sample(instrs, k=1)[0]
        if not os.path.exists(os.path.join('{}_di'.format(path))):
            os.makedirs(os.path.join('{}_di'.format(path)))
        dump_json(data, os.path.join('{}_di'.format(path), file))
    
        if file.startswith('test'):
            for entry in data:
                entry["instruction"] = unseen
            if not os.path.exists(os.path.join('{}_ui'.format(path))):
                os.makedirs(os.path.join('{}_ui'.format(path)))
            dump_json(data, os.path.join('{}_ui'.format(path), file))

In [27]:
random.seed(seed)
base_instr = "Given the title, description, manufacturer, and price of two products, identify if they are the same product. Only output yes or no."
instrs = [
"Analyze the title, description, manufacturer, and price between the two products below and generate an output of yes if the two products are the same, otherwise respond with no.",
"Determine whether the two products are the same by comparing their title, description, manufacturer, and price, and provide a simple yes or no answer as the output.",
"Check the details of the two products to see if they refer to the same product. Output only yes or no.",
"Based on the product information, predict if the two products are identical or not. Output yes if they are identical or no otherwise.",
"Compare the details of two given products to determine if they are identical. Output yes if they are identical or no otherwise."
]
unseen = random.sample(instrs, k=1)[0]
instrs.remove(unseen)
print(unseen)
instrs.append(base_instr)
diverse_instruction('./stru_data/matching', instrs, unseen)

Determine whether the two products are the same by comparing their title, description, manufacturer, and price, and provide a simple yes or no answer as the output.


## few-shot

In [28]:
test_data = load_json('./stru_data/matching/test_1k.json')
train_data = load_json('./stru_data/matching/train_10k.json')
few_shot = []
for index, entry in enumerate(test_data):
    new_entry = {}
    new_entry['instruction'] = base_instr
    new_entry['example'] = json.dumps({
        'input': train_data[index]['input'],
        'output': train_data[index]['output']
    })
    new_entry['test example'] = json.dumps({
        'input': entry['input'],
        'output': entry["output"]
    })
    few_shot.append(new_entry)
dump_json(few_shot, './stru_data/matching_few_shot/test_1k.json')

In [4]:
def few_shot(path):
    test_data = load_json('{}/test_1k.json'.format(path))
    try:
        train_data = load_json('{}/train_10k.json'.format(path))
    except:
        train_data = load_json('./stru_data/matching/train_10k.json'.format(path))
    few_shot = []
    for index, entry in enumerate(test_data):
        new_entry = {}
        new_entry['instruction'] = entry['instruction']
        new_entry['example'] = json.dumps({
            'input': train_data[index]['input'],
            'output': train_data[index]['output']
        })
        new_entry['test example'] = json.dumps({
            'input': entry['input'],
            'output': entry["output"]
        })
        few_shot.append(new_entry)
    if not os.path.exists('{}_few_shot'.format(path)):
        os.makedirs('{}_few_shot'.format(path))
    dump_json(few_shot, '{}_few_shot/test_1k.json'.format(path))

In [5]:
few_shot('./stru_data/matching')
few_shot('./stru_data/matching_di')