In [1]:
import random
import numpy as np

# Seed both Python's `random` module and NumPy's global RNG with the same value.
# `seed` is read again by the later cell that calls `random.seed(seed)`.
seed = 1
random.seed(seed)
np.random.seed(seed)

In [2]:
import pickle
import json
def load_pickle(addr):
    """Return the object unpickled from the file at ``addr``.

    NOTE: unpickling executes code embedded in the file, so only use this on
    files produced by this notebook (or another trusted source).
    """
    with open(addr, 'rb') as pickle_file:
        obj = pickle.load(pickle_file)
    return obj
    
def dump_pickle(data, addr):
    """Pickle ``data`` into the file at ``addr``, replacing any existing file."""
    with open(addr, 'wb') as pickle_file:
        pickle.dump(data, pickle_file)

def load_json(addr):
    """Parse the JSON document stored at ``addr`` and return the resulting object."""
    with open(addr, 'r') as json_file:
        parsed = json.load(json_file)
    return parsed
    
def dump_json(data, addr):
    """Write ``data`` as JSON (4-space indentation) to ``addr``, replacing any existing file."""
    with open(addr, 'w') as json_file:
        json.dump(data, json_file, indent=4)

In [3]:
import collections
import gzip
import json
import os
from tqdm import tqdm
import re
from bs4 import BeautifulSoup

# clean text from title, description, feature and others
def clean_text(raw_text):
    """Normalise one metadata field into a single whitespace-trimmed string.

    * list  -> every element goes through ``clean_str``; non-empty results are
      joined with single spaces
    * dict  -> its ``str()`` representation is cleaned
    * other -> the value is passed to ``clean_str`` as is
    """
    if isinstance(raw_text, list):
        cleaned_parts = (clean_str(element) for element in raw_text)
        joined = ' '.join(part for part in cleaned_parts if part != '')
        return joined.strip()

    if isinstance(raw_text, dict):
        return clean_str(str(raw_text)).strip()

    return clean_str(raw_text).strip()

def clean_str(s):
    """Strip quotes, line breaks and HTML markup from ``s`` and end it with one period.

    Returns '' for an empty string and for strings of 2000 characters or more.
    """
    if s == '':
        return ''
    if len(s) >= 2000:
        return ''
    without_quotes = re.sub(r'["\n\r]*', '', s)
    plain_text = BeautifulSoup(without_quotes, 'html.parser').get_text()
    # drop leading/trailing periods, then terminate with exactly one
    return plain_text.strip('.') + '.'

def load_ratings(file):
    """Parse a ratings CSV into sets of users, items and interactions.

    Every line is expected to be ``item,user,rating,timestamp``. A line that
    does not have exactly four comma-separated fields, or whose rating /
    timestamp cannot be converted, is printed and skipped.

    Args:
        file: path of the CSV file.

    Returns:
        ``(users, items, inters)`` where ``inters`` is a set of
        ``(user, item, float(rating), int(timestamp))`` tuples and ``users`` /
        ``items`` contain only IDs that occur in ``inters``.
    """
    users, items, inters = set(), set(), set()
    with open(file, 'r') as fp:
        for line in tqdm(fp, desc='Load ratings'):
            try:
                item, user, rating, timestamp = line.strip().split(',')
                # Convert *before* touching any set: a malformed row (e.g. a
                # header line) must be skipped as a whole instead of leaving
                # its user/item behind in `users` / `items`.
                inter = (user, item, float(rating), int(timestamp))
            except ValueError:
                print(line)
                continue
            users.add(user)
            items.add(item)
            inters.add(inter)
    return users, items, inters

def load_meta_items(file):
    """Return the set of ``asin`` values found in a gzipped JSON-lines metadata file."""
    with gzip.open(file, 'r') as fp:
        # one JSON object per line; only its 'asin' field is kept
        return {json.loads(raw_line)['asin'] for raw_line in tqdm(fp, desc='Load metas')}

def get_user2count(inters):
    """Count interactions per user (element 0 of each interaction).

    Returns a ``defaultdict(int)`` keyed by user, in first-seen order.
    """
    tally = collections.Counter(unit[0] for unit in inters)
    return collections.defaultdict(int, tally)

def get_item2count(inters):
    """Count interactions per item (element 1 of each interaction).

    Returns a ``defaultdict(int)`` keyed by item, in first-seen order.
    """
    tally = collections.Counter(unit[1] for unit in inters)
    return collections.defaultdict(int, tally)

def generate_candidates(unit2count, threshold):
    """Select the keys whose count reaches ``threshold``.

    Returns ``(kept, n_dropped)``: the set of surviving keys and how many keys
    of ``unit2count`` fell below the threshold.
    """
    kept = {unit for unit, count in unit2count.items() if count >= threshold}
    n_dropped = len(unit2count) - len(kept)
    return kept, n_dropped

def filter_inters(inters, can_items=None, user_k_core_threshold=0, item_k_core_threshold=0):
    """Filter interactions by item whitelist and by iterative k-core pruning.

    Args:
        inters: iterable of interactions; element 0 is the user, element 1 the item.
        can_items: optional container of allowed items. When it is falsy
            (``None`` or empty) the whitelist step is skipped.
        user_k_core_threshold: minimum number of interactions a user must keep.
        item_k_core_threshold: minimum number of interactions an item must keep.

    Returns:
        The surviving interactions. This is a list whenever a filter step ran;
        if neither step applies, ``inters`` is returned unchanged.
    """
    # filter by meta items
    if can_items:
        inters = [unit for unit in inters if unit[1] in can_items]

    # filter by k-core
    if user_k_core_threshold or item_k_core_threshold:
        print('Filtering by {}-core:'.format(user_k_core_threshold))
        user2count = get_user2count(inters)
        item2count = get_item2count(inters)

        # Dropping sparse users can push items below their threshold (and the
        # other way round), so repeat until one round removes nothing.
        while True:
            users, n_filtered_users = generate_candidates(
                user2count, user_k_core_threshold)
            items, n_filtered_items = generate_candidates(
                item2count, item_k_core_threshold)
            if n_filtered_users == 0 and n_filtered_items == 0:
                break
            kept_inters = []
            new_user2count = collections.defaultdict(int)
            new_item2count = collections.defaultdict(int)
            # keep and recount in one pass over the interactions
            for unit in inters:
                if unit[0] in users and unit[1] in items:
                    kept_inters.append(unit)
                    new_user2count[unit[0]] += 1
                    new_item2count[unit[1]] += 1
            inters = kept_inters
            user2count, item2count = new_user2count, new_item2count
    return inters

def make_inters_in_order(inters):
    """Group interactions by user and sort each user's interactions by timestamp.

    Args:
        inters: iterable of ``(user, item, rating, timestamp)`` records.

    Returns:
        A flat list of ``(user, item, rating, timestamp)`` tuples. Users appear
        in the order they are first encountered in ``inters``; within one user
        the records are sorted by ascending timestamp, and ties keep their
        input order because the sort is stable.
    """
    user2inters = collections.defaultdict(list)
    for user, item, rating, timestamp in inters:
        user2inters[user].append((user, item, rating, timestamp))

    new_inters = []
    for user_inters in user2inters.values():
        user_inters.sort(key=lambda inter: inter[3])
        new_inters.extend(user_inters)
    return new_inters

def preprocess_rating(input_path, dataset, user_k, item_k):
    """Load, filter and chronologically order the ratings of one dataset.

    Reads ``rating/all_category/<dataset>.csv`` and
    ``meta/all_category/meta_<dataset>.json.gz`` below ``input_path``, keeps
    only interactions whose item has metadata, applies k-core filtering with
    ``user_k`` / ``item_k`` and sorts every user's interactions by timestamp.

    Returns:
        list of ``(user_ID, item_ID, rating, timestamp)`` tuples.
    """
    print('Process rating data: ')
    print('Dataset: ', dataset)

    ratings_csv = os.path.join(input_path, 'rating/all_category', dataset + '.csv')
    # only the interaction set is needed; the user / item sets are discarded
    _, _, raw_inters = load_ratings(ratings_csv)

    meta_gz = os.path.join(input_path, 'meta/all_category', f'meta_{dataset}.json.gz')
    items_with_meta = load_meta_items(meta_gz)

    # 1. drop items without metadata; 2. k-core filtering
    kept_inters = filter_inters(
        raw_inters,
        can_items=items_with_meta,
        user_k_core_threshold=user_k,
        item_k_core_threshold=item_k,
    )

    # per-user chronological order
    return make_inters_in_order(kept_inters)

def load_inters(path, dataset):
    """Return the 5-core filtered, time-ordered interactions of ``dataset``
    together with the per-item interaction counts of that filtered set."""
    ordered_inters = preprocess_rating(path, dataset, user_k=5, item_k=5)
    return ordered_inters, get_item2count(ordered_inters)

In [4]:
def load_metadata(input_path, dataset, itemcnt):
    """Load the metadata records of all items that occur in ``itemcnt``.

    Two files below ``input_path`` are consulted:

    * ``meta/all_category/meta_<dataset>.json.gz`` - one JSON object per line
      (required);
    * ``meta/all_category_2014/meta_<dataset>.json.gz`` - one Python dict
      literal per line (optional; skipped when the file does not exist).

    Args:
        input_path: root directory of the raw data.
        dataset: category name used in the file names.
        itemcnt: container of item IDs (e.g. the item -> count mapping); only
            records whose ``asin`` is in it are kept.

    Returns:
        ``(metadata14, metadata18)``: two dicts mapping ``asin`` to its record.
        ``metadata14`` is empty when the 2014 file is missing.
    """
    path14 = os.path.join(input_path, 'meta/all_category_2014', f'meta_{dataset}.json.gz')
    path18 = os.path.join(input_path, 'meta/all_category', f'meta_{dataset}.json.gz')

    metadata18 = {}
    # iterate the stream line by line instead of materialising it with readlines()
    with gzip.GzipFile(path18) as g_file18:
        for line in g_file18:
            dic = json.loads(line)
            # filter the items that in interaction
            if dic['asin'] in itemcnt:
                metadata18[dic['asin']] = dic

    # The joined path is never '', so test for the file itself: the 2014 dump
    # is an optional supplement.
    if not os.path.exists(path14):
        return {}, metadata18

    metadata14 = {}
    with gzip.GzipFile(path14) as g_file14:
        for line in g_file14:
            # SECURITY NOTE: eval() executes whatever the file contains. It is
            # kept because these lines are Python literals rather than JSON;
            # only run this on a trusted copy of the dataset.
            dic = eval(line)
            # filter the items that also in interaction
            if dic['asin'] in itemcnt:
                metadata14[dic['asin']] = dic

    return metadata14, metadata18

def generate_text(input_path, dataset, contents):
    """Build one text description per item from its metadata fields.

    For every item with 2018 metadata, the fields listed in ``contents`` are
    cleaned with ``clean_text`` and concatenated with single spaces. A field
    falls back to the 2014 record only when the 2018 value is empty. The
    ``'category'`` field is truncated to its first two entries.

    Args:
        input_path: root directory of the raw data.
        dataset: category name.
        contents: metadata field names, in the order they should appear.

    Returns:
        ``(item_text_dict, rating_inters)``: item ID -> text, and the filtered
        interactions returned by ``load_inters``.
    """
    rating_inters, items = load_inters(input_path, dataset)
    print('Process text data: ')
    item_text_dict = {}
    metadata14, metadata18 = load_metadata(input_path, dataset, items)

    # metadata18 is a dict, so each item ID is visited exactly once
    for asin, item18 in metadata18.items():
        if asin not in items:
            continue
        item14 = metadata14.get(asin, {})
        parts = []
        # merge the 2018 metainfo with 2014, and generate text for the item
        for content in contents:
            con18 = item18.get(content, '')
            con14 = item14.get(content, '')
            if len(con18) == 0 and len(con14) != 0:
                con_value = con14
            else:
                con_value = con18
            if content == 'category' and len(con_value) > 2:
                con_value = con_value[:2]
            parts.append(clean_text(con_value))
        item_text_dict[asin] = ' '.join(parts).strip(' ')
    return item_text_dict, rating_inters

def process_history(input_path, dataset, inter_threshold):
    """Turn the interactions of ``dataset`` into per-user lists of item texts.

    Item texts are built from the 'title', 'category' and 'brand' fields. Each
    user's list keeps at most the first ``inter_threshold`` interactions (in
    the order delivered by ``generate_text``). The average number of
    space-separated tokens per interaction is printed.

    Returns:
        list with one list of item texts per user.
    """
    item_text_dict, rating_inters = generate_text(input_path, dataset, ['title', 'category', 'brand'])
    histories = {}
    total_tokens = 0
    for inter in rating_inters:
        user, item = inter[0], inter[1]
        item_text = item_text_dict[item]
        history = histories.setdefault(user, [])
        if len(history) < inter_threshold:
            history.append(item_text)
        # every interaction counts towards the average, even beyond the cap
        total_tokens += len(item_text.split(' '))
    print('average text length: {}'.format(total_tokens/len(rating_inters)))
    return list(histories.values())

In [5]:
def ordinal(i):
    """Return ``i`` followed by its English ordinal suffix, e.g. '1st', '22nd', '113th'.

    Numbers whose last two digits are 11, 12 or 13 take 'th' (11th, 112th),
    as do zero and negative numbers.
    """
    suffix = 'th'
    # only positive numbers outside the "teens" get st / nd / rd
    if i > 0 and i % 100 not in (11, 12, 13):
        suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(i % 10, 'th')
    return str(i) + suffix

def cutitem(item, word_threshold=25):
    """Truncate ``item`` to its first ``word_threshold`` space-separated words.

    Texts within the limit are returned unchanged; longer ones are cut, have
    surrounding periods stripped and end with '...'.
    """
    tokens = item.split(' ')
    if len(tokens) <= word_threshold:
        return item
    head = ' '.join(tokens[:word_threshold])
    return head.strip('.') + '...'

def allitems(datas):
    """Collect the distinct items found in a three-level nesting:
    collection of datasets -> entries -> items."""
    return {item for data in datas for entry in data for item in entry}

In [6]:
def process_candidates(entry, cand_potential, num_cand, mode):
    """Draw ``num_cand - 1`` negatives from ``cand_potential``, add the target
    ``entry[mode-3]`` and return all of them in shuffled order.

    Uses the global ``random`` state, so results depend on the current seed.
    """
    negatives = random.sample(cand_potential, k=num_cand-1)
    candidates = negatives + [entry[mode-3]]
    random.shuffle(candidates)
    return candidates

def process_entry(entry, base_instr, cand_potential, num_cand, mode, word_threshold=25):
    """Create one multiple-choice prompt from a user history.

    The target is ``entry[mode-3]``; the items before it form the numbered
    history ('1st: ...'), and the shuffled candidates become lettered options
    ('A: ...'). Every item text is shortened with ``cutitem``.

    Returns:
        ``(new_entry, candidates)`` where ``new_entry`` has the keys
        'instruction', 'input', 'options' and 'output' (the target's letter).
    """
    candidates = process_candidates(entry, cand_potential, num_cand, mode)
    answer_pos = candidates.index(entry[mode-3])

    history_len = len(entry) + mode - 3
    history_lines = [
        ordinal(pos + 1) + ': ' + cutitem(entry[pos], word_threshold)
        for pos in range(history_len)
    ]
    option_lines = [
        chr(65 + pos) + ': ' + cutitem(candidate, word_threshold)
        for pos, candidate in enumerate(candidates)
    ]

    new_entry = {
        "instruction": base_instr,
        "input": str(history_lines),
        "options": str(option_lines),
        "output": chr(65 + answer_pos),
    }
    return new_entry, candidates

In [7]:
def dump_stru_data(dir, data_dict, ood=False):
    """Write the splits in ``data_dict`` (0: train, 1: val, 2: test) as JSON files.

    Files are named ``<split>_<len // 1000>k.json`` inside ``dir``, which is
    created if missing. With ``ood=True`` only the test split is written.
    """
    if not os.path.exists(dir):
        os.makedirs(dir)

    targets = [(2, '{}/test_{}k.json')]
    if not ood:
        targets = [(0, '{}/train_{}k.json'), (1, '{}/val_{}k.json')] + targets

    for key, template in targets:
        split = data_dict[key]
        dump_json(split, template.format(dir, len(split)//1000))

def process_stru_data(data, base_instr, cand_pool, cat, train_size=10000, test_size=1000, num_cand=20, ood=False, isbaseline=False, word_threshold=25):
    """Build train / val / test prompts from per-user histories of item texts.

    Every history ``entry`` yields up to three prompts, indexed by ``mode``:
    0 (train) targets ``entry[-3]``, 1 (val) targets ``entry[-2]`` and
    2 (test) targets ``entry[-1]``. Negatives are sampled from ``cand_pool``
    minus the last three items of the entry.

    Args:
        data: list of histories (each a list of item texts, at least 3 long).
        base_instr: instruction text stored in every prompt.
        cand_pool: collection of all item texts usable as negatives.
        cat: unused; kept for interface compatibility.
        train_size: number of train prompts to produce.
        test_size: number of val and of test prompts to produce.
        num_cand: options per prompt, including the target.
        ood: when true, ``train_size`` is replaced by ``test_size``.
        isbaseline: unused; kept for interface compatibility.
        word_threshold: per-item word limit forwarded to ``process_entry``.

    Returns:
        ``(stru_dict, raw_data)``, both keyed by mode. ``stru_dict[mode]``
        holds the prompt dicts, ``raw_data[mode]`` holds
        ``[history up to and including the target, candidates]`` pairs.
    """
    stru_dict = {0: [], 1: [], 2: []}
    raw_data = {0: [], 1: [], 2: []}
    if ood:
        train_size = test_size

    # Sort the pool once. Iteration order of a set of strings changes between
    # interpreter runs (hash randomisation), which made the seeded sampling
    # irreproducible; a sorted list gives random.sample a stable population.
    ordered_pool = sorted(set(cand_pool))
    position = {item: idx for idx, item in enumerate(ordered_pool)}

    for entry in tqdm(data):
        if len(stru_dict[0]) == train_size:
            break
        held_out = {entry[-3], entry[-2], entry[-1]}
        # Copy the pool and delete the (at most three) held-out items by index,
        # highest index first so the remaining indices stay valid.
        cand_potential = ordered_pool.copy()
        drop_positions = sorted(
            (position[item] for item in held_out if item in position), reverse=True)
        for idx in drop_positions:
            del cand_potential[idx]
        for mode in [0,1,2]:
            # val / test stop growing once test_size test prompts exist
            if mode != 0 and len(stru_dict[2]) == test_size:
                continue
            new_entry, candidates = process_entry(entry, base_instr, cand_potential, num_cand, mode, word_threshold=word_threshold)
            stru_dict[mode].append(new_entry)
            raw_data[mode].append([[entry[i] for i in range(len(entry)+mode-2)], candidates])
    # sanity print of the first test prompt's options (skipped when there is none)
    if stru_dict[2]:
        print(stru_dict[2][0]['options'])
    return stru_dict, raw_data

In [8]:
def preprocess(cats, base_instr, train_size=10000, test_size=1000, ood=False):
    """Generate structured and raw sequential-recommendation data per category.

    For every category the per-user histories are built with
    ``process_history`` (or loaded from the pickle cache
    ``./raw_data/raw_inters/<cat>.pickle``), shuffled with the global
    ``random`` state, and turned into prompts by ``process_stru_data``. The
    negative pool is the union of the items of all categories in ``cats``.

    Args:
        cats: category names.
        base_instr: instruction text stored in every prompt.
        train_size: train prompts per category.
        test_size: val / test prompts per category.
        ood: when true, only test data is written, into the shared
            ``sequential_rec_OOD`` directories; otherwise train/val/test are
            written into per-category directories.
    """
    path = './datasets/Amazon_review'
    datas = {}
    inter_threshold = 50
    word_threshold = 25
    for cat in cats:
        inter_path = './raw_data/raw_inters/{}.pickle'.format(cat)
        if not os.path.exists(inter_path):
            # Make sure the cache directory exists *before* the long-running
            # history build, otherwise the result is lost when the dump fails.
            os.makedirs(os.path.dirname(inter_path), exist_ok=True)
            inters = process_history(path, cat, inter_threshold)
            dump_pickle(inters, inter_path)
        else:
            # NOTE: the cache is a pickle - only load files written by this notebook
            inters = load_pickle(inter_path)
        random.shuffle(inters)
        datas[cat] = inters
    cand_pool = allitems([datas[i] for i in datas])
    for cat in datas:
        if ood:
            stru_dict, raw_data = process_stru_data(datas[cat], base_instr, cand_pool, cat, train_size, test_size, ood=True, word_threshold=word_threshold)
            dump_stru_data('./stru_data/sequential_rec_OOD', stru_dict, ood=True)
            dump_stru_data('./raw_data/sequential_rec_OOD', raw_data, ood=True)
        else:
            stru_dict, raw_data = process_stru_data(datas[cat], base_instr, cand_pool, cat, train_size, test_size, isbaseline=True, word_threshold=word_threshold)
            dump_stru_data('./stru_data/in_cat/sequential_rec_{}'.format(cat), stru_dict)
            dump_stru_data('./raw_data/sequential_rec_{}'.format(cat), raw_data)

def mix_data(dir, dump_dir, cats, train_size=10000, test_size=1000):
    """Merge the per-category train / val / test files into one mixed dataset.

    Each category directory ``<dir>_<cat>`` is expected to contain
    ``train_10k.json``, ``val_1k.json`` and ``test_1k.json``. Every category
    but the last contributes ``size // len(cats)`` entries per split; the last
    one fills the split up to the requested size. Results are written to
    ``dump_dir`` as ``<split>_<len // 1000>k.json``.
    """
    trains, vals, tests = [], [], []
    # (accumulator, source path template, requested total) for each split
    plan = (
        (trains, '{}_{}/train_10k.json', train_size),
        (vals, '{}_{}/val_1k.json', test_size),
        (tests, '{}_{}/test_1k.json', test_size),
    )
    last_idx = len(cats) - 1
    for idx, cat in enumerate(cats):
        for bucket, template, total in plan:
            loaded = load_json(template.format(dir, cat))
            if idx == last_idx:
                quota = total - len(bucket)
            else:
                quota = total // len(cats)
            bucket += loaded[:quota]
    dump_json(trains, '{}/train_{}k.json'.format(dump_dir, len(trains)//1000))
    dump_json(vals, '{}/val_{}k.json'.format(dump_dir, len(vals)//1000))
    dump_json(tests, '{}/test_{}k.json'.format(dump_dir, len(tests)//1000))

def process_mix_data(cats, base_instr, train_size=10000, test_size=1000):
    """Create the mixed-category dataset from the per-category datasets.

    The per-category data is (re)generated with ``preprocess`` only when
    ``./stru_data/sequential_rec_mix`` does not exist yet. Afterwards both the
    structured and the raw per-category files are mixed with ``mix_data``.

    Args:
        cats: category names to mix.
        base_instr: instruction text stored in every prompt.
        train_size: total number of train entries in the mix.
        test_size: total number of val / test entries in the mix.
    """
    dir = './stru_data/sequential_rec_mix'
    raw_mix_dir = './raw_data/sequential_rec_mix'
    if not os.path.exists(dir):
        os.makedirs(dir)
        preprocess(cats, base_instr, train_size, test_size)
    if not os.path.exists(raw_mix_dir):
        os.makedirs(raw_mix_dir)
    # forward the requested sizes instead of the hard-coded defaults
    mix_data('./stru_data/in_cat/sequential_rec', dir, cats, train_size=train_size, test_size=test_size)
    mix_data('./raw_data/sequential_rec', raw_mix_dir, cats, train_size=train_size, test_size=test_size)

In [10]:
# Re-seed both RNGs right before data generation (same value as the first cell).
seed = 1
random.seed(seed)
np.random.seed(seed)

# Instruction text passed as `base_instr` to both pipelines below.
base_instr = "Given the products the user has purchased in history, rank the items in the listed options and output the item that the user is most likely to purchase next. Answer from one of the options."
# ood=True: only the test split is written, into the sequential_rec_OOD directories.
preprocess(['Tools_and_Home_Improvement'], base_instr, ood=True)
# Builds the per-category data (only when the mix directory is new) and mixes the three categories.
process_mix_data(["Electronics","Home_and_Kitchen","Sports_and_Outdoors"], base_instr)

  0%|          | 1000/219152 [00:03<13:43, 265.06it/s]


['A: IIT 48795 Glow-In-The Dark Rope. Tools & Home Improvement. Hardware. IIT.', 'B: 8 Kwikset Emergency Keys for Interior Door Locksets (8). Tools & Home Improvement. Hardware. Kwikset.', 'C: Krylon K05125000 ColorMaster Paint & Primer Brushed Metallic Spray Paint, Caramel Latte, 11 Ounce. Tools & Home Improvement. Paint, Wall Treatments & Supplies. Krylon.', 'D: Hitachi UC18YSL3 18V Lithium-Ion Battery Rapid Charger w/ USB Port. Tools & Home Improvement. Power & Hand Tools. Hitachi.', 'E: Dragway Tools Rubber Sandblasting Gloves for Model 60, 90, 110, 260 Sandblast Cabinets. Tools & Home Improvement. Safety & Security. Dragway Tools.', 'F: UltraFire 1000 Lumens CREE XM-L T6 LED Flashlight Torch+2x18650 Battery+Charger. Tools & Home Improvement. Safety & Security. ULTRAFIRE.', 'G: Full Overlay Blum 110 deg Soft-Close BLUMotion Clip Top Frameless Hinges, Pair. Tools & Home Improvement. Hardware. Blum.', 'H: Industrial Air Contractor CTA5090412 4-Gallon Grade Direct Drive Pontoon Air Co

  1%|▏         | 10000/695321 [07:17<8:19:24, 22.87it/s]


['A: Case Logic TBC-302BLACK Black Ultra-Compact Camera Case With Storage. Electronics. Camera & Photo. Case Logic.', 'B: DreamCatcher~ DreamCatcher Feathers~ Approx 4.5" Diameter 12" long. Home & Kitchen. Home Dcor. PennyLaneGifts.com.', 'C: LQM 671731-001 MO06 MO09 Laptop Battery for HP Pavilion DV4-5000 DV6-7000 DV7-7000 Envy DV4-5200 Compatible 671567-321 H2L55AA (MO06). Electronics. Computers & Accessories. LQM.', 'D: Crystal Allies Natural Himalayan Salt Lamp and 2 Piece Cylinder Tea Candle Holder Combo with Dimmable Cord and Bulb. Home & Kitchen. Home Dcor. Crystal...', 'E: Personalized Mailman Christmas Holiday Gift Expertly Handwritten Ornament. Home & Kitchen. Home Dcor. Ornaments.', 'F: Chillz 3-in-1 Wine Bottle Cooler Stick - Best Barware Tool - Stainless Steel Chiller Cooling Rod - Air Aerator and Pourer (1pack). Home & Kitchen...', 'G: HiRO H50320 Dual Band Wireless 802.11ac AC1200 11ac WiFi 2T2R 867Mbps Low Profile PCIe PCI Express PCI-E x1 Adapter 2x 2dBi Dipole Antenna

  1%|▏         | 10000/731913 [06:52<8:16:41, 24.22it/s]


['A: Vacu Vin Food Saver for Tea and Nuts .65 liter. Home & Kitchen. Kitchen & Dining. Vacu Vin.', 'B: Kanex K166-1013 MultiSync Aluminum Bluetooth Full Size Keyboard w/Numeric keypad-Compatible w/iPhone/iPad/MacBook/Mac. Electronics. Computers & Accessories. Kanex.', 'C: OAproda 2 Pack EN-EL12 Battery and Ultra Slim USB Charger for Nikon KeyMission 360, KeyMission 170, Coolpix AW100, AW110, AW100s, AW120, AW130, S9500, S9300, S9200,...', 'D: Teyeleec Qanliiy 10-100x21 Pocket-size Mini Hd Monocular Telescope Objective Lens 21mm 66m/8000m. Electronics. Camera & Photo. TRADERPLUS.', 'E: Noctua NF-S12B FLX 120 mm 3 Speed Setting Beveled Blade Tips Design SSO Bearing Fan SCD 2 - Retail. Electronics. Computers & Accessories. Axpertec Inc.', 'F: Adorama 20-Inch Standard Cable Release with Screw Lock. Electronics. Camera & Photo. Adorama.', 'G: Eathtek Replacement CPU Cooling Cooler Fan for Dell Inspiron 17R N7010 series, Compatible with part numbers 0RKVVP RKVVP MF60100V1-C010-G99 (Note: Onl

  3%|▎         | 10000/300007 [06:46<3:16:40, 24.58it/s]


['A: Spring Creek Horse Sculpted Mug by Chris Cummings. Home & Kitchen. Kitchen & Dining. Wild Wings.', 'B: Bentology Lunch Bag and Box Set - Includes Insulated Bag with Handle, Bento Box, 5 Containers and Ice Pack (Kitty). Home & Kitchen. Kitchen &...', 'C: Black Mountain Products Yoga and Exercise Mat, 1/2 x 73 1/2 x 24 1/2-Inch. Sports & Outdoors. Sports & Fitness. Black Mountain.', 'D: SWFA SS 10x42 Tactical Riflescope MOA-Quad Reticle 1/4 MOA Adjustments Rear Focus SS10X42MOA. Sports & Outdoors. Sports & Fitness. SWFA.', 'E: Krismile New Romantic Personalised Wooden Mr & Mrs Love Wedding Table Decoration Favour (white). Home & Kitchen. Wall Art. Krismile.', 'F: Lorex QLR464 4-Channel PCI DVR Card with 4 Indoor/Outdoor Night Vision Security Camera (Black). Electronics. Camera & Photo. Lorex.', 'G: Household Essentials 1842 Three-Shelf Natural Canvas Hanging Sweater Organizer. Home & Kitchen. Storage & Organization. Household Essentials.', 'H: SET OF TWO (2) - 6-Inch Serving Ladle, 

## diverse instruction

In [11]:
def diverse_instruction(path, instrs, unseen):
    """Write copies of the JSON datasets in ``path`` with varied instructions.

    For every ``*.json`` file in ``path``:

    * ``<path>_di/<file>``: each entry gets an instruction drawn at random
      from ``instrs`` (global ``random`` state);
    * ``<path>_ui/<file>`` (only for files starting with 'test'): each entry
      gets the ``unseen`` instruction.

    Args:
        path: directory holding the source JSON files.
        instrs: list of instruction texts to sample from.
        unseen: instruction text reserved for the ``_ui`` test files.
    """
    di_dir = '{}_di'.format(path)
    ui_dir = '{}_ui'.format(path)
    # os.listdir returns entries in arbitrary order; sort so that the seeded
    # random draws are assigned to the same files and entries on every machine.
    for file in sorted(os.listdir(path)):
        # skip anything that is not a dataset file (sub-directories, caches, ...)
        if not file.endswith('.json'):
            continue
        data = load_json(os.path.join(path, file))
        for entry in data:
            entry["instruction"] = random.sample(instrs, k=1)[0]
        os.makedirs(di_dir, exist_ok=True)
        dump_json(data, os.path.join(di_dir, file))

        if file.startswith('test'):
            # same entries, instruction overwritten with the held-out one
            for entry in data:
                entry["instruction"] = unseen
            os.makedirs(ui_dir, exist_ok=True)
            dump_json(data, os.path.join(ui_dir, file))

In [12]:
# Re-seed so the choice of the held-out instruction and the per-entry draws start from a fixed state.
random.seed(seed)
base_instr = "Given the products the user has purchased in history, rank the items in the listed options and output the item that the user is most likely to purchase next. Answer from one of the options."
# Paraphrases of the base instruction.
instrs = [
"Based on the user's historical purchases, rank the items in options and predict the next product of the user's interest from the provided options.",
"Estimate the user's intent based on the user's purchase history, and predict the next product that the user is most likely to purchase from the given options.",
"Rank the items in options and predict the user's next purchase from the listed options by analyzing her historical purchases.",
"The user's purchase history implies her preferences. Rank the items in the options based on the user's preferences. Output the item that the user is most likely to purchase next from the options.",
"Rank items in listed options based on the user's purchase history to determine the item that the user is most likely to purchase next. Output the item with the highest likelihood of being the next purchase."
]
# Hold one paraphrase out as the "unseen" instruction and remove it from the sampling pool.
unseen = random.sample(instrs, k=1)[0]
instrs.remove(unseen)
print(unseen)
# The base instruction joins the remaining paraphrases in the pool.
instrs.append(base_instr)
diverse_instruction('./stru_data/sequential_rec_mix', instrs, unseen)
diverse_instruction('./stru_data/sequential_rec_OOD', instrs, unseen)

Estimate the user's intent based on the user's purchase history, and predict the next product that the user is most likely to purchase from the given options.


## few-shot

In [4]:
import os
def few_shot(path):
    """Build a one-shot version of ``<path>/test_1k.json``.

    Test entry number ``i`` is paired with train entry number ``i`` as its
    demonstration. The train entries come from ``<path>/train_10k.json`` or,
    when that file does not exist (e.g. directories that only hold a test
    split), from ``./stru_data/sequential_rec_mix/train_10k.json``. The result
    is written to ``<path>_few_shot/test_1k.json``.

    Each output entry has the keys 'instruction', 'example' and
    'test example'; the latter two are JSON strings with 'input', 'options'
    and 'output'.
    """
    test_data = load_json('{}/test_1k.json'.format(path))
    try:
        train_data = load_json('{}/train_10k.json'.format(path))
    except FileNotFoundError:
        # Only a missing train file triggers the fallback; any other problem
        # (corrupt JSON, permissions, interrupts) is allowed to surface.
        train_data = load_json('./stru_data/sequential_rec_mix/train_10k.json')

    shots = []
    for index, entry in enumerate(test_data):
        demo = train_data[index]
        shots.append({
            'instruction': entry['instruction'],
            'example': json.dumps({
                'input': demo['input'],
                'options': demo['options'],
                'output': demo['output']
            }),
            'test example': json.dumps({
                'input': entry['input'],
                'options': entry['options'],
                'output': entry["output"]
            }),
        })

    out_dir = '{}_few_shot'.format(path)
    os.makedirs(out_dir, exist_ok=True)
    dump_json(shots, '{}/test_1k.json'.format(out_dir))

In [5]:
# Create one-shot test sets for the mixed and OOD datasets and their diverse-instruction (_di) variants.
few_shot('./stru_data/sequential_rec_mix')
few_shot('./stru_data/sequential_rec_mix_di')
few_shot('./stru_data/sequential_rec_OOD')
few_shot('./stru_data/sequential_rec_OOD_di')