In [1]:
import random
import numpy as np

# One seed shared by Python's `random` module and NumPy's legacy global RNG.
# Later cells reuse `seed` as well (e.g. as `random_state` for train_test_split).
seed = 1
random.seed(seed)
np.random.seed(seed)

In [2]:
import pickle
import json
def load_pickle(addr):
    """Read the file at `addr` in binary mode and return the unpickled object.

    NOTE: unpickling can execute arbitrary code, so only use this on files
    produced by this project.
    """
    with open(addr, mode='rb') as handle:
        payload = pickle.load(handle)
    return payload
    
def dump_pickle(data, addr):
    """Pickle `data` into the file at `addr`, overwriting any existing file."""
    with open(addr, mode='wb') as handle:
        pickle.dump(data, handle)

def load_json(addr):
    """Open the text file at `addr` and return its parsed JSON content."""
    with open(addr, mode='r') as handle:
        text = handle.read()
    return json.loads(text)
    
def dump_json(data, addr):
    """Serialize `data` as JSON (4-space indent) into the text file at `addr`."""
    with open(addr, mode='w') as handle:
        json.dump(data, handle, indent=4)

In [7]:
import gzip

def parse(path):
    """Yield one decoded JSON value per line of a gzip-compressed file.

    Args:
        path: location of a gzip file whose lines each hold one JSON document.

    Yields:
        The object produced by `json.loads` for each non-blank line, in file order.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    # The context manager closes the handle when the generator is exhausted,
    # explicitly closed, or garbage-collected, instead of leaking it.
    with gzip.open(path, 'rb') as g:
        for l in g:
            # Blank lines (e.g. a trailing newline) carry no record; skip them
            # rather than letting json.loads fail on empty input.
            if not l.strip():
                continue
            yield json.loads(l)

def getgz(path):
    """Load every record yielded by `parse(path)` into a dict keyed by its 'asin' field.

    If several records share an 'asin', the one appearing last in the file wins.
    """
    return {record['asin']: record for record in parse(path)}

In [8]:
# Load the Tools_and_Home_Improvement metadata into an {asin: record} dict; as the
# last expression of the cell, the returned dict is displayed as the cell output.
getgz('./datasets/Amazon_review/meta/all_category/meta_Tools_and_Home_Improvement.json.gz')

{'001212835X': {'category': ['Tools & Home Improvement',
   'Lighting & Ceiling Fans',
   'Lamps & Shades',
   'Table Lamps'],
  'tech1': '',
  'description': ['collectible table lamp'],
  'fit': '',
  'title': "Everett's Cottage Table Lamp",
  'also_buy': [],
  'tech2': '',
  'brand': '',
  'feature': [],
  'rank': ['>#3,780,135 in Tools & Home Improvement (See top 100)',
   '>#45,028 in Tools & Home Improvement > Lighting & Ceiling Fans > Lamps & Shades > Table Lamps'],
  'also_view': [],
  'main_cat': 'Tools & Home Improvement',
  'similar_item': '',
  'date': 'October 30, 2010',
  'price': '',
  'asin': '001212835X',
  'imageURL': [],
  'imageURLHighRes': []}}

In [90]:
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import os

def process_pairs(id, rawdata, mode):
    """Collect the title pairs that link one item to its related items.

    Args:
        id: key of the source item in `rawdata`. (The name shadows the builtin
            `id`; it is kept so existing callers keep working.)
        rawdata: mapping from item id to a metadata dict. The keys read here are
            'title' plus one of 'also_buy', 'also_view' or 'similar_item'.
        mode: 0 reads the 'also_buy' list, 1 reads the 'also_view' list, any
            other value extracts candidate ids from the 'similar_item' string.

    Returns:
        A set of 2-tuples of titles, each tuple sorted so that (a, b) and (b, a)
        collapse into one entry. Candidates that are the item itself, are not in
        `rawdata`, or have the same title as the item are skipped. An empty set
        is returned when the item or any required field cannot be read.
    """
    try:
        item = rawdata[id]
        if mode == 0:
            candidates = item['also_buy']
        elif mode == 1:
            candidates = item['also_view']
        else:
            # Every chunk following an 'asin' marker contributes characters 1..10
            # as a candidate id (one separator character is skipped). Chunks that
            # do not yield a known id are filtered by the `in rawdata` check below.
            candidates = [chunk[1:11] for chunk in item['similar_item'].split('asin')]
        pairs = set()
        for cand in candidates:
            if id != cand and cand in rawdata and item['title'] != rawdata[cand]['title']:
                pairs.add(tuple(sorted([item['title'], rawdata[cand]['title']])))
        return pairs
    except Exception:
        # Best effort: malformed or missing metadata yields no pairs for this item.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate, so the long outer loop can still be interrupted.
        return set()

def process_rawdata(addr):
    """Build labelled title pairs from one gzip metadata file.

    Args:
        addr: path of the gzip metadata file, loaded through `getgz`.

    Returns:
        A list of `[title_a, title_b, label]` entries where label is 0 for pairs
        found only via 'also_buy', 1 for pairs found only via 'also_view' and 2
        for pairs found only via 'similar_item'. Pairs that appear under more
        than one relation are dropped. The list is sorted by title pair.
    """
    rawdata = getgz(addr)
    also_buys, also_views, similars = set(), set(), set()
    for item_id in tqdm(rawdata):
        also_buys |= process_pairs(item_id, rawdata, 0)
        also_views |= process_pairs(item_id, rawdata, 1)
        similars |= process_pairs(item_id, rawdata, 2)

    # Position in this tuple is the label assigned to a pair.
    relation_sets = (also_buys, also_views, similars)

    processed_data = []
    # Iterating a set of string tuples follows the interpreter's per-process string
    # hashing, so a plain list(...) would come out in a different order on each
    # kernel start and make the seeded splits downstream non-reproducible.
    # Sorting pins the order.
    for pair in sorted(also_buys | also_views | similars):
        labels = [label for label, members in enumerate(relation_sets) if pair in members]
        if len(labels) != 1:
            # The pair belongs to several relations: ambiguous, so leave it out.
            continue
        processed_data.append([pair[0], pair[1], labels[0]])
    return processed_data

def process_stru_split(data, size):
    """Turn labelled pairs into instruction-style records, stopping once `size` records exist.

    Each element of `data` is indexed as (product 1 title, product 2 title, label);
    the label is mapped to a letter by offsetting from 'A' (0 -> 'A', 1 -> 'B', ...).
    The 'input' and 'options' fields are stored as JSON-encoded strings.
    """
    instruction = "Given the title of two products, predict if the two products are similar, if the two products will be purchased or viewed together. Answer only from the options."
    # The option list is the same for every record, so encode it once.
    options = json.dumps([
        "A: Users who buy product 1 may also buy product 2.",
        "B: Users who view product 1 may also view product 2.",
        "C: The product 1 is similar with the product 2."
    ])
    records = []
    for sample in data:
        if len(records) == size:
            break
        records.append({
            'instruction': instruction,
            "input": json.dumps({
                "Product 1:": sample[0],
                "Product 2:": sample[1]
            }),
            "options": options,
            "output": chr(sample[2] + ord('A')),
        })
    return records

def process_stru_data(addr, train_size, val_size, test_size, ood=False):
    """Split one category's labelled pairs and write raw and structured datasets.

    The pairs from `process_rawdata(addr)` are split 80/10/10 into train/val/test
    using the module-level `seed`. The three raw splits are always pickled under
    './raw_data/final_version/<category>'. Structured JSON files are then written:
    train, val and test under './stru_data/in_cat/relation_prediction_<category>'
    by default, or only the test split under './stru_data/relation_prediction_OOD'
    when `ood` is true.

    Args:
        addr: path of the gzip metadata file; its file name is expected to look
            like 'meta_<category>.json.gz'.
        train_size, val_size, test_size: maximum number of structured records
            written for each split.
        ood: when true, only the test split is structured and it goes to the
            OOD directory.
    """
    processed_data = process_rawdata(addr)
    train, val_test = train_test_split(processed_data, test_size=0.2, random_state=seed)
    val, test = train_test_split(val_test, test_size=0.5, random_state=seed)
    print(train[0])
    print(test[0])

    # Drop the first 5 and last 8 characters of the file name, i.e. the 'meta_'
    # prefix and the '.json.gz' suffix for names of the expected form.
    category = addr.split('/')[-1][5:-8]

    rawdir = './raw_data/final_version/{}'.format(category)
    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    os.makedirs(rawdir, exist_ok=True)
    dump_pickle(train, '{}/train.pickle'.format(rawdir))
    dump_pickle(val, '{}/val.pickle'.format(rawdir))
    dump_pickle(test, '{}/test.pickle'.format(rawdir))

    if ood:
        out_dir = './stru_data/relation_prediction_OOD'
    else:
        out_dir = './stru_data/in_cat/relation_prediction_{}'.format(category)
    os.makedirs(out_dir, exist_ok=True)

    if not ood:
        stru_train = process_stru_split(train, train_size)
        stru_val = process_stru_split(val, val_size)
        # File names carry the number of records actually written, in thousands.
        dump_json(stru_train, '{}/train_{}k.json'.format(out_dir, len(stru_train)//1000))
        dump_json(stru_val, '{}/val_{}k.json'.format(out_dir, len(stru_val)//1000))
    stru_test = process_stru_split(test, test_size)
    dump_json(stru_test, '{}/test_{}k.json'.format(out_dir, len(stru_test)//1000))

def process_mix_data(cats, train_size, val_size, test_size):
    """Pool the per-category structured datasets and write a sampled mixture.

    For every category in `cats`, the in-category dataset is generated through
    `process_stru_data` if its directory does not exist yet. The train/val/test
    records of all categories are then pooled, and `train_size`, `val_size` and
    `test_size` records are drawn without replacement with the global `random`
    module and written to './stru_data/relation_prediction_mix'.

    Args:
        cats: category names as used in the 'meta_<category>.json.gz' file names.
        train_size, val_size, test_size: number of records sampled per split;
            also forwarded to `process_stru_data` as per-category caps.

    Raises:
        ValueError: from `random.sample` if a pooled split has fewer records
            than requested.
    """
    trains, vals, tests = [], [], []
    for cat in cats:
        cat_dir = './stru_data/in_cat/relation_prediction_{}'.format(cat)
        if not os.path.exists(cat_dir):
            process_stru_data('./datasets/Amazon_review/meta/all_category/meta_{}.json.gz'.format(cat), train_size, val_size, test_size)
        # NOTE: the per-category file names are fixed to the 10k/1k/1k layout.
        trains += load_json('{}/train_10k.json'.format(cat_dir))
        vals += load_json('{}/val_1k.json'.format(cat_dir))
        tests += load_json('{}/test_1k.json'.format(cat_dir))
    mix_train = random.sample(trains, k=train_size)
    mix_val = random.sample(vals, k=val_size)
    mix_test = random.sample(tests, k=test_size)
    print(mix_train[0]['input'])
    print(mix_test[0]['input'])
    # Create the output directory first; without this the dumps below fail with
    # FileNotFoundError on a fresh checkout.
    os.makedirs('./stru_data/relation_prediction_mix', exist_ok=True)
    dump_json(mix_train, './stru_data/relation_prediction_mix/train_{}k.json'.format(len(mix_train)//1000))
    dump_json(mix_val, './stru_data/relation_prediction_mix/val_{}k.json'.format(len(mix_val)//1000))
    dump_json(mix_test, './stru_data/relation_prediction_mix/test_{}k.json'.format(len(mix_test)//1000))

In [91]:
# train, val, and IND test sets
# Reseed the global `random` module so the sampling inside process_mix_data is repeatable.
random.seed(seed)
train_size, val_size, test_size = 10000, 1000, 1000
# Mix three categories; the two printed lines are the first sampled train and test inputs.
process_mix_data(["Electronics","Home_and_Kitchen","Sports_and_Outdoors"], train_size, val_size, test_size)

{"Product 1:": "Cooler Master HAF X - Full Tower Computer Case with USB 3.0 Ports and Windowed Side Panel (RC-942-KKN1)", "Product 2:": "G.Skill Ripjaws V Series 16GB (2 x 8GB) 288-Pin SDRAM DDR4 3200 (PC4 25600) Intel Z170 Desktop Memory F4-3200C16D-16GVGB"}
{"Product 1:": "Cerwin-Vega XED52 Speaker 275 W PMPO 2-Way, 2 Count, Black", "Product 2:": "Rockford R169X2 6 x 9 Inches Full Range Coaxial Speaker, Set of 2"}


In [75]:
# OOD test set
random.seed(seed)
train_size, val_size, test_size = 10000, 1000, 1000
# NOTE(review): this repeats the process_mix_data call of the in-distribution cell
# with the same seed and arguments — presumably a copy-paste leftover; confirm
# whether it is still needed here.
process_mix_data(["Electronics","Home_and_Kitchen","Sports_and_Outdoors"], train_size, val_size, test_size)
# With ood=True only the structured test split is written, into
# './stru_data/relation_prediction_OOD'; the raw train/val/test pickles are still saved.
process_stru_data('./datasets/Amazon_review/meta/all_category/meta_Tools_and_Home_Improvement.json.gz', train_size, val_size, test_size, ood=True)

{"Product 1:": "Cooler Master HAF X - Full Tower Computer Case with USB 3.0 Ports and Windowed Side Panel (RC-942-KKN1)", "Product 2:": "G.Skill Ripjaws V Series 16GB (2 x 8GB) 288-Pin SDRAM DDR4 3200 (PC4 25600) Intel Z170 Desktop Memory F4-3200C16D-16GVGB"}
{"Product 1:": "Cerwin-Vega XED52 Speaker 275 W PMPO 2-Way, 2 Count, Black", "Product 2:": "Rockford R169X2 6 x 9 Inches Full Range Coaxial Speaker, Set of 2"}


100%|██████████| 559340/559340 [00:11<00:00, 47491.64it/s]


1866414
1625725
['18&quot; Wire Burner 3 Piece Set', 'Anchorseal 1 gal, 2 Green Wood Sealer Gallon', 0]
['DEWALT Drill/Driver Set, 80-Piece  (DW2587)', 'DeWalt (2 Pack) Bit Holder for 20V Max DCD980 DCD985 DCD980L2 DCD985L2 # N131745-2pk', 0]


## diverse instruction

In [100]:
def diverse_instruction(path, instrs, unseen):
    """Write instruction-diversified copies of every JSON dataset in `path`.

    For each '.json' file in `path`, every record's 'instruction' is replaced by
    one drawn at random from `instrs` (global `random` module) and the result is
    written under the same file name to '<path>_di'. For files whose name starts
    with 'test', a second copy in which every record uses `unseen` as its
    instruction is written to '<path>_ui'.

    Args:
        path: directory containing the source JSON files.
        instrs: list of instruction strings to sample from.
        unseen: instruction string used for the '<path>_ui' test copies.
    """
    di_dir = '{}_di'.format(path)
    ui_dir = '{}_ui'.format(path)
    # os.listdir returns entries in arbitrary order; sorting fixes the order in
    # which files consume the random stream so a seeded run is repeatable.
    for file in sorted(os.listdir(path)):
        # Skip anything that is not a dataset file (e.g. notebook checkpoint folders).
        if not file.endswith('.json'):
            continue
        data = load_json(os.path.join(path, file))
        for entry in data:
            entry["instruction"] = random.sample(instrs, k=1)[0]
        os.makedirs(di_dir, exist_ok=True)
        dump_json(data, os.path.join(di_dir, file))

        if file.startswith('test'):
            # The '_di' copy is already on disk, so the records can be reused in place.
            for entry in data:
                entry["instruction"] = unseen
            os.makedirs(ui_dir, exist_ok=True)
            dump_json(data, os.path.join(ui_dir, file))

In [101]:
# Reseed so that the held-out instruction chosen below is repeatable.
random.seed(seed)
# Candidate paraphrases of the task instruction.
instrs = [
    "Analyze the titles of Product 1 and Product 2 to determine if they are similar, if they will be purchased or viewed together, and choose the corresponding option.",
"Analyze the titles of Product 1 and Product 2 and select the option that indicates the relation of the two products.",
"Evaluate the titles of Product 1 and Product 2, then choose the option that best describes the relation between the two products.",
"Evaluate the titles of Product 1 and Product 2 to assess their similarity and whether they are likely to be purchased or viewed together. Then, select the appropriate option.",
"Predict whether two products are similar, whether two products are likely to be purchased or viewed together based on their titles. Choose your answer from the provided options."
]
# Hold one paraphrase out as the "unseen" instruction and remove it from the pool.
unseen = random.sample(instrs, k=1)[0]
instrs.remove(unseen)
print(unseen)
# Add the original instruction (the same text process_stru_split writes) to the pool.
instrs.append("Given the title of two products, predict if the two products are similar, if the two products will be purchased or viewed together. Answer only from the options.")
diverse_instruction('./stru_data/relation_prediction_mix', instrs, unseen)
diverse_instruction('./stru_data/relation_prediction_OOD', instrs, unseen)

Analyze the titles of Product 1 and Product 2 and select the option that indicates the relation of the two products.


## few-shot

In [5]:
import os
def few_shot(path):
    """Build a one-example few-shot version of the 1k test set stored in `path`.

    The i-th test record is paired with the i-th training record as its
    demonstration. Training records come from '<path>/train_10k.json'; when that
    file does not exist, the mixed training set
    './stru_data/relation_prediction_mix/train_10k.json' is used instead. The
    result is written to '<path>_few_shot/test_1k.json'.

    Args:
        path: dataset directory containing 'test_1k.json'.

    Raises:
        IndexError: if there are fewer training records than test records.
    """
    test_data = load_json('{}/test_1k.json'.format(path))
    try:
        train_data = load_json('{}/train_10k.json'.format(path))
    except FileNotFoundError:
        # Only a missing train file triggers the fallback; other failures
        # (corrupt JSON, interrupts) surface instead of being silently masked.
        train_data = load_json('./stru_data/relation_prediction_mix/train_10k.json')
    prompts = []
    for index, entry in enumerate(test_data):
        demo = train_data[index]
        new_entry = {}
        new_entry['instruction'] = entry['instruction']
        # Demonstration and query are each stored as a JSON-encoded string.
        new_entry['example'] = json.dumps({
            'input': demo['input'],
            'options': demo['options'],
            'output': demo['output']
        })
        new_entry['test example'] = json.dumps({
            'input': entry['input'],
            'options': entry['options'],
            'output': entry["output"]
        })
        prompts.append(new_entry)
    out_dir = '{}_few_shot'.format(path)
    os.makedirs(out_dir, exist_ok=True)
    dump_json(prompts, '{}/test_1k.json'.format(out_dir))

In [6]:
# Build few-shot test sets for the mixed and OOD datasets and for their
# diverse-instruction ('_di') variants; each call writes '<path>_few_shot/test_1k.json'.
few_shot('./stru_data/relation_prediction_mix')
few_shot('./stru_data/relation_prediction_mix_di')
few_shot('./stru_data/relation_prediction_OOD')
few_shot('./stru_data/relation_prediction_OOD_di')