In [1]:
import random
import numpy as np

# Seed both Python's and NumPy's global RNGs. `seed` is also read by later
# cells (as `random_state` in split_data and when re-seeding `random`).
seed = 1
random.seed(seed)
np.random.seed(seed)

In [2]:
import pickle
import json
def load_pickle(addr):
    """Return the object stored in the pickle file at ``addr``.

    NOTE: unpickling can execute arbitrary code, so only call this on files
    from a trusted source.
    """
    with open(addr, mode='rb') as stream:
        obj = pickle.load(stream)
    return obj
    
def dump_pickle(data, addr):
    """Pickle ``data`` into the file at ``addr``, overwriting any existing file."""
    with open(addr, mode='wb') as stream:
        pickle.dump(data, stream)

def load_json(addr):
    """Parse and return the JSON document stored in the file at ``addr``.

    The file is read as UTF-8 (the encoding JSON files are expected to use)
    rather than the platform default, so files containing non-ASCII text
    load the same way on every operating system.
    """
    with open(addr, 'r', encoding='utf-8') as f:
        return json.load(f)
    
def dump_json(data, addr):
    """Serialise ``data`` as JSON (4-space indent) into the file at ``addr``.

    Any existing file at ``addr`` is overwritten.
    """
    with open(addr, mode='w') as stream:
        json.dump(data, stream, indent=4)

In [3]:
import gzip

def parse(path):
    """Lazily yield one decoded JSON object per line of a gzip file.

    Args:
        path: Location of a gzip-compressed file with one JSON document per
            line.

    Yields:
        The object decoded from each non-blank line, in file order.

    The file is opened in a ``with`` block so the handle is released when the
    generator is exhausted or closed early. Blank lines are skipped instead of
    raising a JSON decoding error.
    """
    with gzip.open(path, 'rb') as g:
        for l in g:
            # A blank line is not a JSON document; skip it.
            if not l.strip():
                continue
            yield json.loads(l)

def getgz(path, max_entries=200000):
    """Read up to ``max_entries`` records from a gzipped JSON-lines file.

    Args:
        path: Location of the gzip file, passed straight to ``parse``.
        max_entries: Maximum number of records to read. Defaults to 200000,
            the cap that used to be hard-coded. Pass ``None`` to read the
            whole file.

    Returns:
        A list with the first ``max_entries`` records (or fewer if the file
        is shorter), in file order.
    """
    raw_data = []
    print('start to read')
    if max_entries is not None and max_entries <= 0:
        return raw_data
    for d in parse(path):
        raw_data.append(d)
        # Stop as soon as the cap is reached so the rest is never decoded.
        if max_entries is not None and len(raw_data) >= max_entries:
            break
    return raw_data

In [4]:
from tqdm import tqdm
import random
import os
from sklearn.model_selection import train_test_split

def filter_review(path, min_words=10):
    """Load reviews from ``path`` and keep only the usable ones.

    An entry is kept when it has a ``"reviewText"`` value, its ``"overall"``
    value can be converted to ``int``, and the review text has at least
    ``min_words`` space-separated tokens.

    Args:
        path: Gzipped JSON-lines review file, read through ``getgz``.
        min_words: Minimum number of tokens (as produced by ``split(' ')``)
            a review must have. Defaults to 10, the previous fixed threshold.

    Returns:
        The list of entries that passed the filter, in their original order.
    """
    raw_data = getgz(path)
    print('read done')
    filtered = []
    for entry in tqdm(raw_data):
        try:
            review = entry["reviewText"]
            # Conversion is done only to validate the rating; the value itself
            # is re-read later by the code that builds the samples.
            int(entry["overall"])
            n_words = len(review.split(' '))
        except (KeyError, TypeError, ValueError, OverflowError, AttributeError):
            # Missing field, non-numeric rating or non-string review text:
            # drop the malformed entry. Unlike a bare ``except``, this leaves
            # KeyboardInterrupt and unrelated errors visible.
            continue
        if n_words < min_words:
            continue
        filtered.append(entry)
    print('after filter: {}'.format(len(filtered)))
    return filtered

def process_entry(datas, size):
    """Turn raw review entries into instruction-style samples.

    Samples are taken from each list in ``datas`` in turn. The i-th list may
    fill the result up to ``(i+1)*size/len(datas)`` samples in total, so the
    lists contribute roughly equal shares and a short list is compensated for
    by the ones after it.

    Args:
        datas: List of lists of review entries; each entry must provide
            ``"reviewText"`` and an ``"overall"`` rating from 1 to 5.
        size: Total number of samples wanted across all lists.

    Returns:
        A ``(stru_data, baseline)`` tuple. ``stru_data`` is a list of dicts
        with the keys ``instruction``, ``input``, ``options`` and ``output``.
        ``baseline`` is a list of ``[review, label]`` pairs where ``label`` is
        the rating shifted to the range 0-4; it is only filled when ``datas``
        holds more than one list and is empty otherwise.

    Raises:
        KeyError: If an entry lacks a required field or its rating is not one
            of 1-5.
    """
    ratinglevel = {
        5: "A: very positive",
        4: "B: positive",
        3: "C: neutral",
        2: "D: negative",
        1: "E: very negative"
    }
    # Both values are the same for every sample, so build them once.
    instruction = "Given the user's review, identify the user's sentiment from the listed options. Answer using one of the options."
    options = json.dumps(list(ratinglevel.values()))
    keep_baseline = len(datas) > 1

    stru_data = []
    baseline = []
    for i, data in enumerate(datas):
        # Cumulative number of samples allowed once this list is processed.
        quota = (i+1)*size/len(datas)
        for entry in tqdm(data):
            if len(stru_data) >= quota:
                # Quota reached: nothing more can be taken from this list.
                break
            review = entry["reviewText"]
            rating = int(entry["overall"])
            stru_data.append({
                "instruction": instruction,
                "input": review,
                "options": options,
                "output": ratinglevel[rating],
            })
            if keep_baseline:
                # Baseline labels are zero-based (rating 1 -> label 0).
                baseline.append([review, rating - 1])

    return stru_data, baseline

def split_data(data):
    """Split ``data`` into train, validation and test parts (80% / 10% / 10%).

    Both splits use the notebook-level ``seed`` as ``random_state``.
    """
    train_part, holdout = train_test_split(data, test_size=0.2, random_state=seed)
    val_part, test_part = train_test_split(holdout, test_size=0.5, random_state=seed)
    return train_part, val_part, test_part

def process_stru_data(cats, meta_dir, train_size, val_size, test_size):
    """Build and save the sentiment datasets for the given categories.

    Every category file ``<meta_dir>/<cat>.json.gz`` is filtered and split
    into train/validation/test parts.

    * With exactly one category, only a test set is written, to
      ``./stru_data/sentiment_OOD``.
    * Otherwise train, validation and test sets are written to
      ``./stru_data/sentiment_mix``, together with the matching baseline
      files in ``./baseline/data/mix``.

    Output files are named ``<split>_<n>k.json`` where ``n`` is the number of
    samples divided by 1000 (integer division).

    Args:
        cats: List of category names (file stems inside ``meta_dir``).
        meta_dir: Directory holding the gzipped review files.
        train_size: Number of training samples to produce.
        val_size: Number of validation samples to produce.
        test_size: Number of test samples to produce.
    """
    trains, vals, tests = [], [], []
    for cat in cats:
        path = '{}/{}.json.gz'.format(meta_dir, cat)
        data = filter_review(path)
        print('filter done')
        train, val, test = split_data(data)
        trains.append(train)
        vals.append(val)
        tests.append(test)

    if len(cats) == 1:
        out_dir = './stru_data/sentiment_OOD'
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(out_dir, exist_ok=True)
        stru_data, _ = process_entry(tests, test_size)
        dump_json(stru_data, '{}/test_{}k.json'.format(out_dir, len(stru_data)//1000))
        return

    out_dir = './stru_data/sentiment_mix'
    baseline_dir = './baseline/data/mix'
    os.makedirs(out_dir, exist_ok=True)
    os.makedirs(baseline_dir, exist_ok=True)
    splits = (
        ('train', trains, train_size),
        ('val', vals, val_size),
        ('test', tests, test_size),
    )
    for split_name, split_parts, split_size in splits:
        stru_data, baseline = process_entry(split_parts, split_size)
        dump_json(stru_data, '{}/{}_{}k.json'.format(out_dir, split_name, len(stru_data)//1000))
        dump_json(baseline, '{}/{}_{}k.json'.format(baseline_dir, split_name, len(baseline)//1000))

In [None]:
# Location of the per-category gzipped review files and the sample counts
# requested for each split.
meta_dir = './datasets/Amazon_review/review/all_category'
train_size, val_size, test_size = 10000, 1000, 1000
# Single category: process_stru_data writes only a test set (sentiment_OOD).
process_stru_data(['Tools_and_Home_Improvement'], meta_dir, train_size, val_size, test_size)
# Several categories: writes train/val/test sets (sentiment_mix) plus baseline files.
process_stru_data(["Electronics","Home_and_Kitchen","Sports_and_Outdoors"], meta_dir, train_size, val_size, test_size)

## diverse instruction

In [6]:
def diverse_instruction(path, instrs, unseen):
    """Write instruction-varied copies of every JSON dataset in ``path``.

    For each ``.json`` file in ``path``:

    * every entry gets an instruction drawn at random from ``instrs`` and the
      result is saved under the same file name in ``<path>_di``;
    * if the file name starts with ``test``, every entry is then given the
      ``unseen`` instruction and the result is saved in ``<path>_ui``.

    Files are processed in sorted name order so that, for a fixed random
    seed, the same instructions are assigned regardless of the order in which
    the filesystem lists the directory. Directory entries that do not end in
    ``.json`` are ignored.

    Args:
        path: Directory containing the JSON datasets (lists of dicts).
        instrs: Sequence of instruction strings to sample from.
        unseen: Instruction string used for the ``_ui`` test copies.
    """
    di_dir = '{}_di'.format(path)
    ui_dir = '{}_ui'.format(path)
    for file in sorted(os.listdir(path)):
        if not file.endswith('.json'):
            # Skip checkpoints, hidden files and anything else that is not a dataset.
            continue
        data = load_json(os.path.join(path, file))
        for entry in data:
            entry["instruction"] = random.sample(instrs, k=1)[0]
        os.makedirs(di_dir, exist_ok=True)
        dump_json(data, os.path.join(di_dir, file))

        if file.startswith('test'):
            # The diverse copy is already on disk, so the same entries can be
            # overwritten in place for the unseen-instruction copy.
            for entry in data:
                entry["instruction"] = unseen
            os.makedirs(ui_dir, exist_ok=True)
            dump_json(data, os.path.join(ui_dir, file))

In [7]:
# Re-seed so the choice of the held-out instruction below is repeatable.
random.seed(seed)
# Instruction used when the datasets were first generated.
base_instr = "Given the user's review, identify the user's sentiment from the listed options. Answer using one of the options."
# Paraphrases of the base instruction.
instrs = [
    "Assess the user's sentiment in the provided review and select the appropriate sentiment option from the list as the answer.",
"Analyze the user's review and determine the sentiment based on the listed options.",
"Determine the sentiment expressed by the user in her review from the provided choices, and respond by selecting one of the available options.",
"Carefully assess the user's review for any strong expressions of sentiment, either positive or negative. Based on your analysis, select the most fitting sentiment option from the provided list as output.",
"Analyze the user's review text and determine the overall sentiment expressed, then choose the corresponding sentiment option from the provided list (A: very positive, B: positive, C: neutral, D: negative, E: very negative) based on the identified sentiment."
]
# Hold one paraphrase out as the "unseen" instruction and remove it from the pool.
unseen = random.sample(instrs, k=1)[0]
instrs.remove(unseen)
print(unseen)
# The sampling pool is the remaining paraphrases plus the base instruction.
instrs.append(base_instr)
diverse_instruction('./stru_data/sentiment_mix', instrs, unseen)
diverse_instruction('./stru_data/sentiment_OOD', instrs, unseen)

Analyze the user's review and determine the sentiment based on the listed options.


## few-shot

In [3]:
import os
def few_shot(path):
    """Create an example-augmented copy of the test set stored in ``path``.

    Each entry of ``<path>/test_1k.json`` is paired with the training entry
    at the same index, so every test item carries exactly one demonstration.
    Training data comes from ``<path>/train_10k.json``; when that file does
    not exist, ``./stru_data/sentiment_mix/train_10k.json`` is used instead.
    The result is written to ``<path>_few_shot/test_1k.json``.

    Args:
        path: Directory holding ``test_1k.json`` (and optionally
            ``train_10k.json``).

    Raises:
        IndexError: If the training set has fewer entries than the test set.
    """
    test_data = load_json('{}/test_1k.json'.format(path))
    try:
        train_data = load_json('{}/train_10k.json'.format(path))
    except FileNotFoundError:
        # Only a missing training file triggers the fallback; a corrupt or
        # unreadable file is reported instead of being silently replaced.
        train_data = load_json('./stru_data/sentiment_mix/train_10k.json')

    # Named differently from the function so the function is not shadowed.
    samples = []
    for index, entry in enumerate(test_data):
        demo = train_data[index]
        new_entry = {}
        new_entry['instruction'] = entry['instruction']
        new_entry['example'] = json.dumps({
            'input': demo['input'],
            'options': demo['options'],
            'output': demo['output']
        })
        new_entry['test example'] = json.dumps({
            'input': entry['input'],
            'options': entry['options'],
            'output': entry["output"]
        })
        samples.append(new_entry)

    out_dir = '{}_few_shot'.format(path)
    os.makedirs(out_dir, exist_ok=True)
    dump_json(samples, '{}/test_1k.json'.format(out_dir))

In [4]:
# Build the example-augmented test set for each dataset variant; each call
# writes '<path>_few_shot/test_1k.json'. Directories without their own
# 'train_10k.json' take examples from './stru_data/sentiment_mix/train_10k.json'.
few_shot('./stru_data/sentiment_mix')
few_shot('./stru_data/sentiment_mix_di')
few_shot('./stru_data/sentiment_OOD')
few_shot('./stru_data/sentiment_OOD_di')