In [131]:
import pandas as pd
import random
import numpy as np
from datasets import list_datasets, load_dataset
from nltk.corpus import wordnet

In [None]:
# Dataset configuration names passed to load_dataset, one per entry.
subsets = [
    'Wireless_v1_00',
    'Watches_v1_00',
    'Video_Games_v1_00',
    'Video_DVD_v1_00',
    'Video_v1_00',
    'Toys_v1_00',
    'Tools_v1_00',
    'Sports_v1_00',
    'Software_v1_00',
    'Shoes_v1_00',
    'Pet_Products_v1_00',
    'Personal_Care_Appliances_v1_00',
    'PC_v1_00',
    'Outdoors_v1_00',
    'Office_Products_v1_00',
    'Musical_Instruments_v1_00',
    'Music_v1_00',
    'Mobile_Electronics_v1_00',
    'Mobile_Apps_v1_00',
    'Major_Appliances_v1_00',
    'Luggage_v1_00',
    'Lawn_and_Garden_v1_00',
    'Kitchen_v1_00',
    'Jewelry_v1_00',
    'Home_Improvement_v1_00',
    'Home_Entertainment_v1_00',
    'Home_v1_00',
    'Health_Personal_Care_v1_00',
    'Grocery_v1_00',
    'Gift_Card_v1_00',
    'Furniture_v1_00',
    'Electronics_v1_00',
    'Digital_Video_Games_v1_00',
    'Digital_Video_Download_v1_00',
    'Digital_Software_v1_00',
    'Digital_Music_Purchase_v1_00',
    'Digital_Ebook_Purchase_v1_00',
    'Camera_v1_00',
    'Books_v1_00',
    'Beauty_v1_00',
    'Baby_v1_00',
    'Automotive_v1_00',
    'Apparel_v1_00',
    'Digital_Ebook_Purchase_v1_01',
]

# Open every configuration in streaming mode and keep only its first 20000
# records; all records accumulate in one flat list.
results = []
for sub in subsets:
    print(sub)
    ds = load_dataset("amazon_us_reviews", sub, split='train', streaming=True)
    results += list(ds.take(20000))

In [95]:
# process yahoo_answer_data
ds = load_dataset("yahoo_answers_topics", split='train')

# label_to_topic maps each position in TOPICS to the topic name at that position.
TOPICS = [
    "Society & Culture",
    "Science & Mathematics",
    "Health",
    "Education & Reference",
    "Computers & Internet",
    "Sports",
    "Business & Finance",
    "Entertainment & Music",
    "Family & Relationships",
    "Politics & Government",
]
label_to_topic = dict(enumerate(TOPICS))

df = ds.to_pandas()

# Draw 200000 random rows for pair construction. The rows that were not drawn
# are kept separately and supply the per-topic answer pools below.
sampled_rows = df.sample(200000)
part_df = sampled_rows[["topic", "question_title", "best_answer"]]
remain_df = df.drop(index=part_df.index).reset_index(drop=True)
part_df = part_df.reset_index(drop=True)

# For every topic label, the best answers of the remaining rows with that
# label, re-indexed from 0.
answer_pools = {
    topic_id: remain_df.loc[remain_df["topic"] == topic_id, "best_answer"].reset_index(drop=True)
    for topic_id in range(len(TOPICS))
}

def gen_fake_answer(topic_id, pools=None):
    """Return a random answer taken from a topic other than ``topic_id``.

    Args:
        topic_id: Topic label of the question that needs a mismatched answer.
        pools: Optional mapping from topic label to a sequence of answers
            (a list or an integer-indexed Series). Defaults to the
            module-level ``answer_pools``.

    Returns:
        One answer chosen at random from a randomly chosen other topic.

    Raises:
        ValueError: If no topic other than ``topic_id`` has any answers.
    """
    if pools is None:
        pools = answer_pools

    # Take the candidate topics from the pools themselves rather than assuming
    # exactly ten labels, and skip empty pools so the final pick cannot fail.
    candidate_topics = [
        label for label, pool in pools.items()
        if label != topic_id and len(pool) > 0
    ]
    if not candidate_topics:
        raise ValueError(
            "no non-empty answer pool available for a topic other than %r" % (topic_id,)
        )

    fake_topic = random.choice(candidate_topics)
    fake_answer = random.choice(pools[fake_topic])
    return fake_answer

# Give every sampled question an answer drawn from a different topic.
part_df["fake_answer"] = part_df["topic"].apply(gen_fake_answer)
part_df = part_df.rename(
    columns={"topic": "topic_id", "question_title": "title", "best_answer": "answer"}
)
part_df["topic_text"] = [label_to_topic[topic_id] for topic_id in part_df["topic_id"]]
# The final topic string is "<topic name>. <question title>".
part_df["topic"] = part_df["topic_text"] + ". " + part_df["title"]

# One random draw per row decides whether the row becomes a positive pair
# (real answer) or a negative pair (fake answer).
pos_index = np.array([random.random() < 0.5 for _ in range(len(part_df))])

pos = part_df.loc[pos_index, ["topic", "answer"]].rename(columns={"answer": "context"})
neg = part_df.loc[~pos_index, ["topic", "fake_answer"]].rename(columns={"fake_answer": "context"})

# Positives whose context has three or fewer whitespace-separated tokens are
# dropped; negatives are not filtered.
pos["label"] = 1
has_enough_words = pos["context"].str.split().apply(lambda words: len(words) > 3)
pos = pos[has_enough_words].copy()

neg["label"] = 0
merge_df = pd.concat([pos, neg])

def clean_text(text):
    """Replace line-break markers in ``text`` with single spaces.

    Covers the escaped form (a backslash followed by the letter ``n``), real
    newline characters, and the HTML tags ``<br />``, ``<br/>`` and ``<br>``.

    Args:
        text: String to clean.

    Returns:
        ``text`` with every marker replaced by one space.
    """
    # r"\n" is the two-character escaped form; "\n" is an actual newline.
    for marker in (r"\n", "\n", "<br />", "<br/>", "<br>"):
        text = text.replace(marker, " ")
    return text
# Run clean_text over both text columns of the merged pairs.
for column in ("topic", "context"):
    merge_df[column] = merge_df[column].apply(clean_text)

# merge_df.to_csv("data/pretrain/yahoo_answer_190722.csv", index=False)

In [121]:
# Read the Amazon review CSV and keep a random sample of 200000 rows.
amazon_review = pd.read_csv("data/pretrain/amazon_reviews_879819.csv").sample(200000)

# Keep only rows whose context has more than five whitespace-separated tokens.
long_enough = amazon_review["context"].str.split().apply(lambda words: len(words) > 5)
amazon_review = amazon_review[long_enough].copy()

In [130]:
# Stack the Yahoo pairs on top of the Amazon rows, then shuffle every row
# (sample with frac=1) before writing the combined set to disk.
stacked = pd.concat([merge_df, amazon_review])
pretrain_df = stacked.sample(frac=1)
pretrain_df.to_csv("data/pretrain/pretrain_data.csv", index=False)

In [None]:
# NOTE(review): this cell needs `amazon_review` to carry `review_body` and
# `product_category` columns, while the CSV-loading cell filters the same
# variable on a `context` column. Confirm which frame this is meant to run on.
df = amazon_review[["review_body", "product_category"]].rename(
    columns={"review_body": "context", "product_category": "topic"}
)
# Lower-case each category and turn underscores into spaces.
df["topic"] = [category.lower().replace("_", " ") for category in df["topic"]]

# amazon_review.to_csv("data/amazon_review.csv", index=False)

In [None]:
import random
import numpy as np
# Each row independently becomes a positive with probability 0.3; all other
# rows become negatives.
pos_index = np.array([random.random() < 0.3 for _ in range(len(df))])
pos = df[pos_index].assign(label=1)
neg = df[~pos_index].assign(label=0)



In [None]:
def _wordnet_entries(word, limit=5):
    """Collect up to ``limit`` distinct WordNet lemma entries for ``word``.

    Each entry is a dict with the lemma ``name`` (underscores replaced by
    spaces) and the ``definition`` of the synset that lemma belongs to.
    Entries keep the order in which WordNet yields them.
    """
    entries = []
    for syn in wordnet.synsets(word):
        definition = syn.definition()  # the same for every lemma of this synset
        for lm in syn.lemmas():
            enrich = {'name': lm.name().replace("_", " "), 'definition': definition}
            if enrich not in entries:
                entries.append(enrich)
    return entries[:limit]


topics = df["topic"].unique()

# knowledge[topic] is a list of entries for a single-word topic, or a dict
# mapping each usable word of a multi-word topic to its own list of entries.
knowledge = {}

for topic in topics:
    words = topic.split(" ")
    if len(words) == 1:
        knowledge[topic] = _wordnet_entries(topic)
    else:
        # One-character words and the connective "and" are skipped.
        knowledge[topic] = {
            each: _wordnet_entries(each)
            for each in words
            if len(each) > 1 and each != "and"
        }
        
    

    

In [None]:
import random
def topic_transform(topic, knowledge_base=None):
    """Randomly rewrite ``topic`` using synonym/definition knowledge.

    One random draw ``p`` selects the behaviour:

    * ``p > 0.5``: the topic is returned unchanged.
    * ``p < 0.25``: one word of the topic is replaced by a synonym.
    * otherwise: a definition is appended to the topic after a space.

    Whenever the required knowledge is missing (no entries, or no synonym
    that differs from the word itself) the topic is returned unchanged.

    Args:
        topic: Topic string; must be a key of the knowledge mapping.
        knowledge_base: Optional mapping from topic to either a list of
            ``{'name', 'definition'}`` entries (single-word topics) or a dict
            from word to such a list (multi-word topics). Defaults to the
            module-level ``knowledge``.

    Returns:
        The original or the rewritten topic string.
    """
    if knowledge_base is None:
        knowledge_base = knowledge

    p = random.random()
    if p > 0.5:
        return topic

    entry = knowledge_base[topic]
    if isinstance(entry, list):
        # Single-word topic: the entries describe the topic itself.
        target, candidates = topic, entry
    else:
        if not entry:
            # No usable word was recorded for this multi-word topic.
            return topic
        target = random.choice(list(entry.keys()))
        candidates = entry[target]

    if not candidates:
        return topic

    if p < 0.25:
        # Exclude the word being replaced (not the whole topic) so a swap
        # always changes something; sort so a seeded run is reproducible.
        names = sorted({each['name'] for each in candidates if each['name'] != target})
        if not names:
            return topic
        replacement = random.choice(names)
        # Replace whole words only, never substrings inside other words.
        return " ".join(
            replacement if word == target else word for word in topic.split(" ")
        )

    defini = random.choice(candidates)['definition']
    return topic + " " + defini

def gen_fake_topic(topic):
    """Pick a random key of ``knowledge`` that differs from ``topic``."""
    others = []
    for name in knowledge.keys():
        if name != topic:
            others.append(name)
    return random.choice(others)

# Positives keep their own topic (possibly rewritten by topic_transform);
# negatives receive a different topic chosen by gen_fake_topic.
pos['topic'] = pos['topic'].map(topic_transform)
neg['topic'] = neg['topic'].map(gen_fake_topic)

In [None]:
# Recombine both halves, order the rows by their index, and write them to CSV.
all_df = pd.concat([pos, neg]).sort_index()
all_df.to_csv("data/amazon_review_processed.csv", index=False)

In [None]:
# Sanity check: report every topic whose per-word knowledge dict is empty.
# The topic key is printed (not the empty dict) so the affected topic can be
# identified from the output.
for k, v in knowledge.items():
    if isinstance(v, dict) and not v:
        print(k)

In [None]:
# Lower-case every topic and turn underscores into spaces.
df["topic"] = [name.lower().replace("_", " ") for name in df["topic"]]
# Not the cell's last expression, so the result is computed but not displayed.
df["topic"].unique()
df["label"] = 1

In [None]:
import tensorflow as tf
from transformers import AutoTokenizer
# Load the 'bert-base-cased' tokenizer, then pass '[PAD]' as its pad token.
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
pad_token_spec = {'pad_token': '[PAD]'}
tokenizer.add_special_tokens(pad_token_spec)

In [None]:
from my_datasets import PreTrainDataset
from tqdm.notebook import tqdm
# Build the project dataset wrapper from the processed review CSV.
processed_csv = "data/amazon_review_processed.csv"
pt = PreTrainDataset(processed_csv, tokenizer=tokenizer)

In [None]:
# Elements produced by pt.data_generator are declared as
# (string, string, int32) triples.
element_types = (tf.string, tf.string, tf.int32)
ds = tf.data.Dataset.from_generator(pt.data_generator, output_types=element_types)

# Group the elements into batches of 16, then apply pt.wrap_map to each batch.
batched = ds.batch(16)
loader = batched.map(pt.wrap_map)

In [None]:
# Pull the first batch from the loader and display it as the cell output.
iterator = iter(loader)
first_batch = next(iterator)
first_batch

In [None]:
# Iterate once over the whole loader to check that every batch can be built.
# The progress total is the number of batches, i.e. the ceiling of
# len(pt.data) / 16, not the fractional quotient.
n_batches = (len(pt.data) + 16 - 1) // 16
for idx, each in tqdm(enumerate(loader), total=n_batches):
    pass

In [None]:
# Bare expression: evaluates to the tf.data.Dataset class so the notebook
# displays it; nothing is assigned or called here.
tf.data.Dataset