In [1]:
import pandas as pd
import random
import numpy as np
import matplotlib.pyplot as plt
import re
import json
from datasets import list_datasets, load_dataset
from nltk.corpus import wordnet
from nltk import word_tokenize
from tqdm.notebook import tqdm

In [None]:
# subsets = ['Wireless_v1_00', 'Watches_v1_00', 'Video_Games_v1_00', 'Video_DVD_v1_00', 'Video_v1_00', 'Toys_v1_00', 'Tools_v1_00', 'Sports_v1_00', 'Software_v1_00', 'Shoes_v1_00', 'Pet_Products_v1_00', 'Personal_Care_Appliances_v1_00', 'PC_v1_00', 'Outdoors_v1_00', 'Office_Products_v1_00', 'Musical_Instruments_v1_00', 'Music_v1_00', 'Mobile_Electronics_v1_00', 'Mobile_Apps_v1_00', 'Major_Appliances_v1_00', 'Luggage_v1_00', 'Lawn_and_Garden_v1_00', 'Kitchen_v1_00', 'Jewelry_v1_00', 'Home_Improvement_v1_00', 'Home_Entertainment_v1_00', 'Home_v1_00', 'Health_Personal_Care_v1_00', 'Grocery_v1_00', 'Gift_Card_v1_00', 'Furniture_v1_00', 'Electronics_v1_00', 'Digital_Video_Games_v1_00', 'Digital_Video_Download_v1_00', 'Digital_Software_v1_00', 'Digital_Music_Purchase_v1_00', 'Digital_Ebook_Purchase_v1_00', 'Camera_v1_00', 'Books_v1_00', 'Beauty_v1_00', 'Baby_v1_00', 'Automotive_v1_00', 'Apparel_v1_00', 'Digital_Ebook_Purchase_v1_01']
# results = []
# for sub in subsets:
#     print(sub)
#     ds = load_dataset("amazon_us_reviews", sub, split='train', streaming=True)
#     results.extend(list(ds.take(20000)))

In [7]:
# Load the stop-word set and the stop-punctuation list (one entry per line).
# Context managers close the files; the bare open() calls left both handles open.
with open("./stop_words.txt") as _fh:
    stopwords = set(each.strip() for each in _fh.readlines())
with open("./stop_punctaion.txt") as _fh:
    stop_punct = [each.strip() for each in _fh.readlines()]
def clean_text(text):
    """Replace escaped line breaks, ``<br />`` tags and ``&#34;`` entities.

    Values flagged by ``pd.isna`` are returned unchanged.
    """
    if pd.isna(text):
        return text
    # The first pattern is the two-character sequence backslash + "n",
    # not an actual newline character.
    substitutions = ((r"\n", " "), (r"<br />", " "), ("&#34;", "\""))
    cleaned = text
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)
    return cleaned
def avg_token_len(l):
    """Return the mean number of whitespace-separated tokens per entry of ``l``."""
    token_counts = l.apply(lambda entry: len(entry.split()))
    return np.mean(token_counts)

def tokenize(content_list, stopwords, punct_pattern):
    """Lower-case and whitespace-split each text, dropping stop words.

    Matches of ``punct_pattern`` are deleted before splitting. Entries flagged
    by ``pd.isna`` are skipped, so the result may be shorter than the input.

    Returns:
        A list with one token list per non-missing entry.
    """
    tokenized = []
    for text in tqdm(content_list):
        if not pd.isna(text):
            stripped = re.sub(punct_pattern, "", text)
            # Plain str.split is used here rather than nltk's word_tokenize.
            tokens = stripped.lower().split()
            tokenized.append([tok for tok in tokens if tok not in stopwords])
    return tokenized

def word_count(text):
    """Count the whitespace-separated tokens in ``text``."""
    tokens = text.split()
    return len(tokens)

In [47]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF
def lda_model(data, n_topics, n_top_words, n_jobs=1, method='lda', vectorizer='bow'):
    """Fit a topic model on ``data`` and list the top words of every topic.

    Args:
        data: iterable of raw documents (strings).
        n_topics: number of topics / components to fit.
        n_top_words: number of highest-weighted words reported per topic.
        n_jobs: passed to ``LatentDirichletAllocation`` only; unused for NMF.
        method: ``'lda'`` or ``'nmf'``.
        vectorizer: ``'bow'`` (CountVectorizer) or ``'tfidf'`` (TfidfVectorizer).

    Returns:
        ``(model, res, docres)``: the fitted estimator, a DataFrame with one
        ``Topic_<i>`` column per topic holding its top words, and the
        document-topic matrix returned by ``fit_transform``.

    Raises:
        AssertionError: if ``method`` or ``vectorizer`` is not a supported value.
    """
    assert method in ['lda', 'nmf']
    assert vectorizer in ['bow', 'tfidf']

    # Document x vocabulary matrix.
    vectorizer_cls = CountVectorizer if vectorizer == "bow" else TfidfVectorizer
    tf_vectorizer = vectorizer_cls(max_df=0.9, min_df=2, stop_words='english', max_features=10000)
    countvector = tf_vectorizer.fit_transform(data)

    # Topic model.
    if method == 'lda':
        lda = LatentDirichletAllocation(n_components=n_topics, max_iter=50, learning_method='batch', n_jobs=n_jobs,
                                        random_state=10, batch_size=256)
    else:
        lda = NMF(n_components=n_topics, max_iter=500, random_state=10)
    # Per-document topic distribution.
    docres = lda.fit_transform(countvector)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on old installations.
    if hasattr(tf_vectorizer, "get_feature_names_out"):
        feature_names = tf_vectorizer.get_feature_names_out()
    else:
        feature_names = tf_vectorizer.get_feature_names()

    # Per-topic word distribution: keep the n_top_words largest weights.
    top_words = {
        f'Topic_{topic_idx}': [feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
        for topic_idx, topic in enumerate(lda.components_)
    }
    res = pd.DataFrame(top_words)
    return lda, res, docres

### yahoo_answer

In [48]:
# Load the Yahoo! Answers topic-classification training split.
ds = load_dataset("yahoo_answers_topics", split='train')

# Topic names; the list position is used as the integer topic id below.
TOPICS = [
    "Society and Culture",
    "Science and Mathematics",
    "Health",
    "Education and Reference",
    "Computers and Internet",
    "Sports",
    "Business and Finance",
    "Entertainment and Music",
    "Family and Relationships",
    "Politics and Government",
]
label_to_topic = dict(enumerate(TOPICS))

# Treat empty strings as missing and drop every row with a missing field.
df = ds.to_pandas().replace("", np.nan).dropna()

Downloading builder script:   0%|          | 0.00/3.60k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.88k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/5.02k [00:00<?, ?B/s]

Downloading and preparing dataset yahoo_answers_topics/yahoo_answers_topics to /home1/liumiao/.cache/huggingface/datasets/yahoo_answers_topics/yahoo_answers_topics/1.0.0/0edb353eefe79d9245d7bd7cac5ae6af19530439da520d6dde1c206ee38f4439...


Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1400000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/60000 [00:00<?, ? examples/s]

Dataset yahoo_answers_topics downloaded and prepared to /home1/liumiao/.cache/huggingface/datasets/yahoo_answers_topics/yahoo_answers_topics/1.0.0/0edb353eefe79d9245d7bd7cac5ae6af19530439da520d6dde1c206ee38f4439. Subsequent calls will reuse this data.


In [49]:
# Inspect the cleaned Yahoo! Answers frame.
df

Unnamed: 0,id,topic,question_title,question_content,best_answer
0,0,4,why doesn't an optical mouse work on a glass t...,or even on some surfaces?,Optical mice use an LED and a camera to rapidl...
1,1,5,What is the best off-road motorcycle trail ?,long-distance trail throughout CA,i hear that the mojave road is amazing!<br />\...
2,2,2,What is Trans Fat? How to reduce that?,I heard that tras fat is bad for the body. Wh...,Trans fats occur in manufactured foods during ...
3,3,6,How many planes Fedex has?,I heard that it is the largest airline in the ...,according to the www.fedex.com web site:\nAir ...
4,4,6,"In the san francisco bay area, does it make se...",the prices of rent and the price of buying doe...,renting vs buying depends on your goals. <br /...
...,...,...,...,...,...
1399991,1399991,0,Why believe in such hopelessness?,"Why believe in a religion, or lack there of, t...","Right on!! I live a sheltered life, honestly...."
1399992,1399992,3,where can i get a horse's skeletal and muscula...,mainly for a Breyer horse:Anatomy in Motion,Here's everything you can possibly wanna know....
1399993,1399993,9,In a quest to promote racial equality does the...,"For example, by allowing jobs to advertise for...",Yes. It also promotes gang culture in minorit...
1399996,1399996,6,Ways to sell your video games?,Like if you want to sell your video games how ...,"ebay, electronic boutique, babbages or flea ma..."


In [50]:
# Hold out 300k rows for pair construction; the remaining rows supply the
# per-topic pools from which mismatched ("fake") answers are drawn.
part_df = df.sample(300000)[["topic", "question_title", "question_content", "best_answer"]]
remain_df = df.drop(index=part_df.index).reset_index(drop=True)
part_df = part_df.reset_index(drop=True)
answer_pools = {
    k: remain_df.loc[remain_df["topic"] == k, "best_answer"].reset_index(drop=True)
    for k in range(len(TOPICS))
}

def gen_fake_answer(topic_id, pools=None):
    """Return a random answer that belongs to a topic other than ``topic_id``.

    Args:
        topic_id: label of the true topic; it is excluded from the draw.
        pools: mapping ``topic label -> indexable collection of answers``.
            Defaults to the module-level ``answer_pools``.

    Returns:
        One answer picked uniformly from a uniformly chosen other topic.
    """
    if pools is None:
        pools = answer_pools
    # Derive the candidates from the pools instead of hard-coding ten topics
    # and relying on list position == topic id.
    candidate_topics = [topic for topic in pools if topic != topic_id]
    fake_topic = random.choice(candidate_topics)
    return random.choice(pools[fake_topic])

# Attach a mismatched answer to every row, then switch to the working column names.
part_df["fake_answer"] = part_df["topic"].apply(gen_fake_answer)
part_df.columns = [
    "topic_id",
    "title",
    "title_content",
    "answer",
    "fake_answer",
]
# Human-readable topic name for each integer topic id.
part_df["topic_text"] = part_df["topic_id"].apply(label_to_topic.__getitem__)
# for each in ["title", "title_content", "answer"]:
#     print(np.mean(part_df[each].apply(lambda x:len(x.split()))))
candiate_col = ["topic_text", "title", "title_content"]
def random_concat(row):
    """Join a random subset of the ``candiate_col`` fields of ``row`` with spaces.

    Each field is kept or dropped by an independent ``random.randint(0, 1)``
    draw. If the stripped result is empty, the row's ``title`` is returned.
    """
    # Draw all coin flips up front, one per candidate column.
    keep_flags = [random.randint(0, 1) for _ in candiate_col]
    pieces = [row[col] + " " for col, keep in zip(candiate_col, keep_flags) if keep]
    joined = "".join(pieces).strip()
    return joined if joined else row["title"]
        
    
# Build the randomized topic prompt, then clean every text column.
part_df["topic"] = part_df.apply(random_concat, axis=1)
for _col in ("topic", "answer", "fake_answer"):
    part_df[_col] = part_df[_col].apply(clean_text)
# part_df["topic"] = part_df["topic_text"] + ". " + part_df["title"]

In [55]:
# Split rows roughly 50/50: true-answer pairs (label 1) vs fake-answer pairs (label 0).
pos_index = np.array([random.random() < 0.5 for _ in range(len(part_df))])

pos = part_df.loc[pos_index, ["topic", "answer"]].copy()
pos.columns = ["topic", "context"]
neg = part_df.loc[~pos_index, ["topic", "fake_answer"]].copy()
neg.columns = ["topic", "context"]

# Keep contexts longer than three tokens, then draw 100k of each class.
pos["label"] = 1
pos = pos[pos["context"].str.split().apply(len) > 3]
pos = pos.sample(100000)

neg["label"] = 0
neg = neg[neg["context"].str.split().apply(len) > 3]
neg = neg.sample(100000)

merge_df = pd.concat([pos, neg], ignore_index=True)


# merge_df.to_csv("data/pretrain/yahoo_answer_aug_200000.csv", index=False)

### DBpedia14

In [4]:
# dbpedia_14: load the training split and convert it to a pandas DataFrame.
ds_dbpedia = load_dataset("dbpedia_14", split='train')
df_db = ds_dbpedia.to_pandas()

Found cached dataset dbpedia_14 (/home1/liumiao/.cache/huggingface/datasets/dbpedia_14/dbpedia_14/2.0.0/01dab9e10d969eadcdbc918be5a09c9190a24caeae33b10eee8f367a1e3f1f0c)


In [4]:
# DBpedia-14 class names; the list position is used as the integer label.
categories = [
    "Company",
    "EducationalInstitution",
    "Artist",
    "Athlete",
    "OfficeHolder",
    "MeanOfTransportation",
    "Building",
    "NaturalPlace",
    "Village",
    "Animal",
    "Plant",
    "Album",
    "Film",
    "WrittenWork"
]
label_to_cate_db14 = {k: v for k, v in enumerate(categories)}

# label -> [definition sentence, keyword list]. The first keyword is the head
# word of the class; the remaining ones are optional augmentation words.
# ("singler" in the Artist keywords was a typo for "singer".)
cate_dic = {
    0: ["institution created to conduct business", ["company", "corporation", "firm", "business", "commerce"]],
    1: ["an institution dedicated to education", ["educational institution", "education", "school", "university", "college", "student", "teaching"]],
    2: ["person whose creative work shows sensitivity and imagination", ["artist", "art", "singer", "writer", "drawer", "musician"]],
    3: ["a person trained to compete in sports", ["athlete", "sports", "player", "sportsman", "ballplayer", "competition"]],
    4: ["someone who is appointed or elected to an office and who holds a position of trust", ["officeholder", "politician", "party", "national", "governor", "election"]],
    5: ["facility consisting of the means and equipment necessary for the movement of passengers or goods", ["transportation", "ship", "car", "railway", "aircraft"]],
    6: ["a structure that has a roof and walls and stands in one place", ["building", "house", "build", "place", "location"]],
    7: ["a place in the natural physical world including plants and animals and landscapes etc.", ["natural place", "river", "mountain", "lake", "sea", "nature"]],
    8: ["a community of people smaller than a town", ["village", "small town", "countryside", "rural"]],
    9: ["a living creature in nature characterized by voluntary movement", ["animal", "creature", "organism", "wild"]],
    10: ["a living organism lacking the power of locomotion and movement, has flower and leaves", ["plant", "flower", "tree", "leaves", "grow in soil"]],
    11: ["one or more music recordings issued together", ["album", "record", "music", "song","studio", "band", "singer"]],
    12: ["a form of entertainment that enacts a story by a sequence of images and video", ["film", "movie", "actor", "director", "directed", "drama","screenplay", "story and plot", "role", "character", "theater"]],
    13: ["a written work or composition that has been published or printed on paper or online", ["written work", "book", "author", "content", "journal", "publish", "novel", "fiction", "magazine", "newspaper"]]
}

def words_augment(label):
    """Build a topic string: the label's head keyword plus random extra keywords.

    Reads the keyword list of ``cate_dic[label]``. Between one and all of the
    non-head keywords are sampled without replacement and appended after the
    head keyword, separated by spaces.
    """
    keywords = cate_dic[label][-1]
    head, extras = keywords[0], keywords[1:]
    n_extra = random.randint(1, len(extras))
    return " ".join([head, *random.sample(extras, n_extra)])

def augment_dbpedia(df, n_pos=100000, n_neg=100000):
    """Build matching / mismatching (topic, context) pairs from DBpedia rows.

    Every row gets one of three topic renderings, chosen uniformly at random:
    the head keyword of its class, the class definition sentence, or the head
    keyword plus random extra keywords (``words_augment``).

    Args:
        df: frame with ``label`` and ``content`` columns.
        n_pos: number of pairs that keep their true topic (``label`` 1).
        n_neg: number of pairs whose topic is replaced by a topic string taken
            from a row of a different class (``label`` 0).

    Returns:
        A frame with columns ``topic_id``, ``topic``, ``context`` and ``label``
        holding the positives followed by the negatives.
    """
    aug_type = np.random.randint(0, 3, len(df))
    ori = df[aug_type == 0].copy()
    def_aug = df[aug_type == 1].copy()
    word_aug = df[aug_type == 2].copy()
    ori["topic"] = ori["label"].apply(lambda x: cate_dic[x][-1][0])
    def_aug["topic"] = def_aug["label"].apply(lambda x: cate_dic[x][0])
    word_aug["topic"] = word_aug["label"].apply(words_augment)
    df_aug = pd.concat([ori, def_aug, word_aug], ignore_index=True)[["label", "topic", "content"]]
    df_aug.columns = ["topic_id", "topic", "context"]

    pos = df_aug.sample(n_pos)
    pos["label"] = 1

    # Fake topics are drawn from all non-positive rows, but only the n_neg rows
    # that are actually returned get one; previously every remaining row was
    # processed and most of that work was thrown away.
    remaining = df_aug.drop(pos.index, axis=0)
    neg_topics = {
        i: remaining.loc[remaining["topic_id"] != i, "topic"].values
        for i in range(len(cate_dic))
    }
    neg = remaining.sample(n_neg)
    neg["topic"] = neg["topic_id"].apply(lambda x: random.choice(neg_topics[x]))
    neg["label"] = 0
    # pos is already a random sample, so no second shuffle is needed.
    return pd.concat([pos, neg], ignore_index=True)
    
    

In [204]:
# To regenerate the cached file:
# dbpedia_aug = augment_dbpedia(df_db)
# dbpedia_aug.drop("topic_id", axis=1).to_csv("data/pretrain/dbpedia14_200000.csv", index=False)
dbpedia_aug = pd.read_csv("data/pretrain/dbpedia14_200000.csv")
# The statistics previously read from `out`, a name no cell in this notebook
# defines; they now use the frame loaded above.
print(avg_token_len(dbpedia_aug["topic"]))
print(avg_token_len(dbpedia_aug["context"]))
# topic_id is only available when the CSV was saved without dropping it.
dbpedia_aug["topic_id"].value_counts() if "topic_id" in dbpedia_aug.columns else None

5.510215
46.085285


5     14523
9     14390
13    14334
2     14315
10    14311
12    14289
0     14286
3     14282
4     14250
1     14246
7     14235
6     14234
8     14222
11    14083
Name: topic_id, dtype: int64

In [109]:
# Topic-model the DBpedia rows with label 13 ("WrittenWork" in `categories`)
# using NMF on bag-of-words counts: 5 topics, 20 top words each.
class_df = df_db.loc[df_db["label"] == 13]
model, class_res, class_docres = lda_model(
    class_df["content"],
    5,
    20,
    n_jobs=-1,
    method='nmf',
    vectorizer='bow',
)

### amazon_review

In [14]:
# Load the Amazon reviews, clean the body text, keep the modelling columns,
# drop incomplete rows and keep reviews shorter than 200 tokens.
amazon = pd.read_csv("data/amazon_review.csv")
amazon["review_body"] = amazon["review_body"].apply(clean_text)
amazon = amazon[
    ["product_title", "product_category", "star_rating", "review_headline", "review_body"]
].dropna()
_is_short_review = amazon["review_body"].apply(word_count) < 200
amazon = amazon[_is_short_review].copy().reset_index(drop=True)

In [18]:
# Raw product categories as they appear in the `product_category` column.
product_cate = [
    'Digital_Ebook_Purchase',
    'Baby',
    'Watches',
    'Digital_Software',
    'Jewelry',
    'Personal_Care_Appliances',
    'Music',
    'Beauty',
    'Pet Products',
    'Office Products',
    'Furniture',
    'Camera',
    'Major Appliances',
    'Mobile_Electronics',
    'Books',
    'Automotive',
    'Outdoors',
    'PC',
    'Apparel',
    'Lawn and Garden',
    'Mobile_Apps',
    'Health & Personal Care',
    'Grocery',
    'Kitchen',
    'Digital_Music_Purchase',
    'Digital_Video_Download',
    'Tools',
    'Gift Card',
    'Toys',
    'Video',
    'Software',
    'Video Games',
    'Electronics',
    'Video DVD',
    'Home Improvement',
    'Musical Instruments',
    'Sports',
    'Wireless',
    'Home',
    'Home Entertainment',
    'Luggage',
    'Digital_Video_Games',
    'Shoes',
]

# Index <-> category lookups.
amazon_idx_to_category = dict(enumerate(product_cate))
amazon_category_to_idx = {name: idx for idx, name in amazon_idx_to_category.items()}

# Related raw categories that are merged under one aggregated name.
merge_category = {
    "book": ["Digital_Ebook_Purchase", "Books"],
    "video": ["Digital_Video_Download", "Video", "Video DVD"],
    "game and entertainment": ["Video Games", "Home Entertainment", "Digital_Video_Games"],
    "software and app": ["Digital_Software", "Mobile_Apps", "Software"],
    "household appliances and product": ["Furniture", "Major Appliances", "Kitchen", "Home", "Home Improvement"],
    "health and personal care": ["Personal_Care_Appliances", "Health & Personal Care"],
    "music and musical instruments":["Music", "Digital_Music_Purchase", "Musical Instruments"],
    "electronics product":["Mobile_Electronics", "Electronics", "Wireless"]
}

# raw category -> aggregated category. Merged groups come first; every other
# category maps to its own name with underscores replaced and lower-cased.
category_to_aggregated_category = {
    member: merged_name
    for merged_name, members in merge_category.items()
    for member in members
}
for oc in product_cate:
    category_to_aggregated_category.setdefault(oc, oc.replace("_", " ").lower())

# Distinct aggregated names as an array (allows boolean masking later on).
aggregated_category = np.array(list(set(category_to_aggregated_category.values())))
# Load the per-category augmentation words; the context manager closes the file.
with open("data/pretrain/amazon_category_aug.json") as _fh:
    cate_aug = json.load(_fh)

# The JSON keys may be raw category names ("Digital_Ebook_Purchase") or their
# normalized form ("digital ebook purchase", as written by the export cell of
# this notebook). Resolve both through a normalized lookup.
_agg_by_normalized_name = {
    name.replace("_", " ").lower(): agg
    for name, agg in category_to_aggregated_category.items()
}

# aggregated category -> union of the augmentation words of its members.
aggregated_cate_aug = dict()
for k, v in cate_aug.items():
    ag_c = _agg_by_normalized_name[k.replace("_", " ").lower()]
    aggregated_cate_aug.setdefault(ag_c, set()).update(v)
aggregated_cate_aug = {k: list(v) for k, v in aggregated_cate_aug.items()}

amazon["agg_category"] = amazon["product_category"].apply(lambda x: category_to_aggregated_category[x])

In [20]:
# category relevant
def amazon_topic_augment(category):
    """Return ``category`` followed by a random selection of its augmentation words.

    The words come from ``aggregated_cate_aug[category]``; anywhere from zero
    to all of them are sampled without replacement and joined with spaces.
    """
    extra_pool = aggregated_cate_aug[category]
    how_many = random.randint(0, len(extra_pool))
    return " ".join([category, *random.sample(extra_pool, how_many)])

def category_relevant_aug(df, n_pos, n_neg):
    """Build category-relevance pairs from Amazon reviews.

    The context is the product title and review body joined by ". " in random
    order; the topic is the augmented aggregated category.

    Args:
        df: frame with ``product_title``, ``review_body`` and ``agg_category``.
        n_pos: number of pairs that keep their true topic (``label`` 1).
        n_neg: number of pairs whose topic is replaced by a topic string taken
            from a sampled row of a different aggregated category (``label`` 0).

    Returns:
        A frame with columns ``agg_category``, ``topic``, ``context``, ``label``.
    """
    # Work on a private copy so the caller's frame does not silently gain
    # `context` / `topic` columns.
    df = df.copy()
    df["context"] = df.apply(lambda x: x["product_title"] + ". " + x["review_body"] if random.random() > 0.5
                      else x["review_body"] + ". " + x["product_title"], axis=1)
    df["topic"] = df["agg_category"].apply(amazon_topic_augment)

    pos = df.sample(n_pos)
    pos["label"] = 1

    neg = df.drop(pos.index, axis=0).sample(n_neg)
    # For each category, the topic strings of sampled rows from other categories.
    neg_topics = {c: neg[neg["agg_category"] != c]["topic"].values for c in aggregated_cate_aug}
    neg["topic"] = neg["agg_category"].apply(lambda x: random.choice(neg_topics[x]))
    neg["label"] = 0

    columns = ["agg_category", "topic", "context", "label"]
    return pd.concat([pos[columns], neg[columns]], ignore_index=True)

# 50k matching + 50k mismatching category pairs from a 200k-review sample.
amazon_cate_df = category_relevant_aug(
    amazon.sample(200000),
    n_pos=50000,
    n_neg=50000,
)

In [23]:
# sentiment relevant
# Sentiment label (0 = negative, 1 = positive) -> words describing it.
senti_label_map = {
    0: ["negative", "bad", "worse", "terrible", "awful", "sucks", "useless", "disappoint", "frustrating"],
    1: ["positive", "good", "nice", "awesome", "comfortable", "excellent", "lovely", "perfect", "like"]
}
def sentiment_word_aug(senti_label):
    """Return a space-joined random sample of the words for ``senti_label``.

    Between one and all words of ``senti_label_map[senti_label]`` are drawn
    without replacement.
    """
    vocabulary = senti_label_map[senti_label]
    sample_size = random.randint(1, len(vocabulary))
    chosen = random.sample(vocabulary, sample_size)
    return " ".join(chosen)
def sentiment_relevant_aug(df, n_pos, n_neg):
    """Build sentiment-relevance pairs from Amazon reviews.

    Reviews rated below 3 stars are labelled negative (0); an equally sized
    sample of reviews rated above 3 stars is labelled positive (1). The context
    is headline and body joined by ". " in random order. ``n_pos`` rows receive
    sentiment words matching their label (``label`` 1) and ``n_neg`` other rows
    receive words of the opposite sentiment (``label`` 0).

    Returns:
        A frame with columns ``senti_label``, ``context``, ``topic``, ``label``.
    """
    def _join_headline_and_body(row):
        headline, body = row["review_headline"], row["review_body"]
        if random.random() > 0.5:
            return headline + ". " + body
        return body + ". " + headline

    negative_reviews = df[df["star_rating"] < 3].copy()
    negative_reviews["senti_label"] = 0
    positive_reviews = df[df["star_rating"] > 3].sample(len(negative_reviews))
    positive_reviews["senti_label"] = 1
    balanced = pd.concat([positive_reviews, negative_reviews], ignore_index=True)
    balanced["context"] = balanced.apply(_join_headline_and_body, axis=1)

    keep = ["senti_label", "context"]
    pos = balanced.sample(n_pos)[keep]
    pos["topic"] = pos["senti_label"].apply(sentiment_word_aug)
    pos["label"] = 1

    neg = balanced.drop(pos.index, axis=0).sample(n_neg)[keep]
    neg["topic"] = neg["senti_label"].apply(lambda x: sentiment_word_aug(1 - x))
    neg["label"] = 0
    return pd.concat([pos, neg], ignore_index=True)

# 50k matching + 50k mismatching sentiment pairs.
amazon_senti_df = sentiment_relevant_aug(
    amazon,
    n_pos=50000,
    n_neg=50000,
)

In [37]:
# category + sentiment relevant
def sc_context_aug(row, punct=None):
    """Join product title, review headline and review body in random order.

    Every non-empty field that does not already end with a character listed in
    ``punct`` gets a trailing "."; fields are separated by single spaces and
    the result is stripped.

    Args:
        row: mapping with ``product_title``, ``review_headline``, ``review_body``.
        punct: collection of characters that count as sentence-ending
            punctuation. Defaults to the module-level ``stop_punct``.

    Returns:
        The combined context string.
    """
    if punct is None:
        punct = stop_punct
    text_list = [row["product_title"], row["review_headline"], row["review_body"]]
    random.shuffle(text_list)
    sentences = []
    for each in text_list:
        if not each:
            # An empty field has no last character to inspect: it used to raise
            # IndexError when shuffled first and add a stray "." otherwise.
            continue
        sentences.append(each if each[-1] in punct else each + ".")
    return " ".join(sentences).strip()

def sc_topic_neg_aug(row):
    """Build a mismatching category + sentiment topic string for ``row``.

    At least one of the two parts is wrong, selected by one uniform draw ``p``:
    ``p < 0.2`` both wrong; ``0.2 <= p < 0.6`` true category with wrong
    sentiment; otherwise wrong category with true sentiment. The two parts are
    shuffled and joined with a space.
    """
    p = random.random()
    # All four variants are generated up front, in this fixed order.
    true_category = amazon_topic_augment(row["agg_category"])
    true_senti = sentiment_word_aug(row["senti_label"])
    other_categories = aggregated_category[aggregated_category != row["agg_category"]]
    fake_category = amazon_topic_augment(random.choice(other_categories))
    fake_senti = sentiment_word_aug(1 - row["senti_label"])

    if p >= 0.6:
        topics = [fake_category, true_senti]
    elif p >= 0.2:
        topics = [true_category, fake_senti]
    else:
        topics = [fake_category, fake_senti]
    random.shuffle(topics)
    return " ".join(topics)
        

        
        
def sc_topic_pos_aug(row):
    """Build a matching topic string: true category and true sentiment words.

    The two parts are shuffled and joined with a space.
    """
    topics = [
        amazon_topic_augment(row["agg_category"]),
        sentiment_word_aug(row["senti_label"]),
    ]
    random.shuffle(topics)
    return " ".join(topics)
    
    
    
def sentiment_category_relevant_aug(df, n_pos, n_neg):
    """Build pairs whose topic combines product category and sentiment.

    Reviews rated below 3 stars are labelled negative (0); an equally sized
    sample of reviews rated above 3 stars is labelled positive (1). Contexts
    come from ``sc_context_aug``. ``n_pos`` rows get a matching topic from
    ``sc_topic_pos_aug`` (``label`` 1) and ``n_neg`` other rows get a
    mismatching topic from ``sc_topic_neg_aug`` (``label`` 0).

    Returns:
        A frame with columns ``senti_label``, ``agg_category``, ``context``,
        ``topic`` and ``label``.
    """
    low_rated = df[df["star_rating"] < 3].copy()
    low_rated["senti_label"] = 0
    high_rated = df[df["star_rating"] > 3].sample(len(low_rated))
    high_rated["senti_label"] = 1
    balanced = pd.concat([high_rated, low_rated], ignore_index=True)
    balanced["context"] = balanced.apply(sc_context_aug, axis=1)

    wanted = ["senti_label", "agg_category", "context"]
    pos = balanced.sample(n_pos)[wanted]
    pos["topic"] = pos.apply(sc_topic_pos_aug, axis=1)
    pos["label"] = 1

    neg = balanced.drop(pos.index, axis=0).sample(n_neg)[wanted]
    neg["topic"] = neg.apply(sc_topic_neg_aug, axis=1)
    neg["label"] = 0
    return pd.concat([pos, neg], ignore_index=True)

# 50k matching + 50k mismatching category-and-sentiment pairs.
amazon_senti_cate_relevant_aug = sentiment_category_relevant_aug(
    amazon,
    50000,
    50000,
)

In [60]:
# Stack the three Amazon pair sets on their shared columns and export them.
_export_cols = ["topic", "context", "label"]
a = amazon_cate_df[_export_cols]
b = amazon_senti_df[_export_cols]
c = amazon_senti_cate_relevant_aug[_export_cols]
amazon_aug = pd.concat([a, b, c], ignore_index=True)
# NOTE(review): unlike the other exports, this path has no ".csv" suffix - confirm intended.
amazon_aug.to_csv("data/pretrain/amazon_senti_cate_aug_300000", index=False)

### Merge

In [68]:
# Combine the Yahoo, DBpedia and Amazon pair sets and shuffle the rows (sample(frac=1)).
pretrain_data = pd.concat([merge_df, dbpedia_aug, amazon_aug], ignore_index=True).sample(frac=1)

In [71]:
# Persist the shuffled pre-training set without the index column.
pretrain_data.to_csv("data/pretrain/pretrain_data_700000.csv", index=False)

In [181]:
# product_cate_aug = dict()
# for c in product_cate:
#     class_df = amazon[amazon["product_category"] == c]
#     model, res, docres = lda_model((class_df["product_title"] + " " + class_df["review_body"]).dropna(), 1, 20, n_jobs=-1, method='nmf', vectorizer='bow')
#     product_cate_aug[c.replace("_", " ").lower()] = res["Topic_0"].values.tolist()
#     print(c)
#     break


Unnamed: 0,senti_label,topic,context,label
0,1,excellent good nice,i love it. Five Stars,1
1,1,awesome like perfect nice lovely good excellent,Easy to put on. Much easier than the barbed fi...,1
2,1,good positive like excellent perfect comfortab...,I have ordered the six inches wide and liked w...,1
3,0,bad disappoint,"What I received was a complete joke, completel...",1
4,1,perfect,"Exactly as described, Fast Shipping and great ...",1
...,...,...,...,...
99995,1,frustrating terrible worse bad,Makes life easier. Wife loves these,0
99996,0,perfect excellent nice awesome like,Two Stars. Better if you know the source mater...,0
99997,0,good awesome,Waste of 8 dollars. This isn't even the origin...,0
99998,1,worse negative,Fond Memories. That was a wonderful night. I r...,0


In [167]:
# Inspect the reviews rated above 3 stars.
amazon[amazon["star_rating"] >3]

Unnamed: 0,product_title,product_category,star_rating,review_headline,review_body,agg_category,context,topic
1,Selfie Stick Fiblastiq&trade; Extendable Wirel...,Wireless,4,A fun little gadget,"I’m embarrassed to admit that until recently, ...",electronics product,"I’m embarrassed to admit that until recently, ...",electronics product cable transmitter usb wire...
2,Tribe AB40 Water Resistant Sports Armband with...,Wireless,5,Five Stars,Fits iPhone 6 well,electronics product,Tribe AB40 Water Resistant Sports Armband with...,electronics product transmitter receiver reall...
3,RAVPower® Element 10400mAh External Battery US...,Wireless,5,Great charger,Great charger. I easily get 3+ charges on a S...,electronics product,RAVPower® Element 10400mAh External Battery US...,electronics product battery
4,Fosmon Micro USB Value Pack Bundle for Samsung...,Wireless,5,Five Stars,Great for the price :-),electronics product,Great for the price :-). Fosmon Micro USB Valu...,electronics product pad battery receiver headp...
5,"iPhone 6 Case, Vofolen Impact Resistant Protec...",Wireless,5,Five Stars,"Great Case, better customer service!",electronics product,"Great Case, better customer service!. iPhone 6...",electronics product wireless phone pad really ...
...,...,...,...,...,...,...,...,...
879992,Giving It to the Bad Boy (Tattooed and Pierced...,Digital_Ebook_Purchase,5,A Surprise Gem!,I am not a fan of the New Adult/Young Adult ge...,book,Giving It to the Bad Boy (Tattooed and Pierced...,book author fiction novel
879993,One Night in Bridgeport,Digital_Ebook_Purchase,4,Loved the way it ended!!,"Much to my surprise, I found myself getting so...",book,"Much to my surprise, I found myself getting so...",book series novel
879994,"Takeover (Comet Clement series, #9)",Digital_Ebook_Purchase,4,Takeover 9 comet Clement series,Less format issues. I have a system now to mak...,book,"Takeover (Comet Clement series, #9). Less form...",book author story series reading characters re...
879996,Pyromarne (The Heart of the Caveat Whale Book 2),Digital_Ebook_Purchase,5,Pyromarne,This book was much better than the first book ...,book,This book was much better than the first book ...,book


In [165]:
# Topic-model the headlines of 3-star reviews (NMF on bag-of-words counts,
# 10 topics, 20 top words each) and show the top words.
class_df = amazon.loc[amazon["star_rating"] == 3]
model, res, docres = lda_model(
    class_df["review_headline"].dropna(),
    10,
    20,
    n_jobs=-1,
    method='nmf',
    vectorizer='bow',
)
res

Unnamed: 0,Topic_0,Topic_1,Topic_2,Topic_3,Topic_4,Topic_5,Topic_6,Topic_7,Topic_8,Topic_9
0,stars,good,great,like,ok,nice,better,just,works,product
1,gave,quality,quality,really,price,looks,quality,okay,fine,love
2,given,price,price,don,game,small,price,fine,pretty,bad
3,reason,pretty,looks,looks,just,quality,work,bad,use,use
4,giving,bad,worked,didn,read,looking,expected,small,little,disappointed
5,fact,read,sound,look,guess,watch,little,work,doesn,fine
6,original,sound,fit,doesn,book,price,needs,little,expected,does
7,super,looks,work,does,best,size,version,don,cheap,decent
8,cover,book,game,feel,cheap,really,bad,really,time,work
9,time,game,idea,did,movie,little,thought,love,battery,did


In [61]:
# Inspect the product titles of whatever slice `class_df` currently holds.
class_df["product_title"]

500000    Aketek 1080P LED Protable Projector HD PC AV V...
500001               TiVo Mini with IR Remote (Old Version)
500002    Apple TV MD199LL/A Bundle including remote and...
500003               New Roku 3 6.5 Foot HDMI - Bundle - v1
500004    Generic DVI-I Dual-Link (M) to 15-Pin VGA (F) ...
                                ...                        
519995    TICTID Powertv Newest and Most Professional Ar...
519996    VIZIO E601i-A3 60-inch 1080p Razor LED Smart H...
519997    BenQ HT1075 1080P 3D DLP Home Theater Projecto...
519998       VIZIO E24-C1 E Series Class Razor LED Smart TV
519999               New Roku 3 6.5 Foot HDMI - Bundle - v1
Name: product_title, Length: 20000, dtype: object

In [1]:
from gensim.models import Word2Vec

In [9]:

# Characters removed before tokenizing: , . ! ? ' " ( )
pat = re.compile(r'[,.!?\'"()]')
# Tokenize "product title + review body" for every review, dropping stop words.
sentences = tokenize(amazon["product_title"] + " " + amazon["review_body"], stopwords, pat)

  0%|          | 0/880000 [00:00<?, ?it/s]

In [30]:
# Train word vectors on the tokenized title + review sentences.
w2v = Word2Vec(
    sentences=sentences,
    vector_size=100,
    sg=1,
    hs=0,
    negative=5,
    workers=16,
    window=8,
    max_vocab_size=7000,
)

In [174]:
# List the 20 words closest to "worse" in the trained embedding.
w2v.wv.similar_by_word('worse', topn=20)

[('terrible', 0.7654217481613159),
 ('horrible', 0.7610396146774292),
 ('awful', 0.7588950395584106),
 ('worst', 0.7330824136734009),
 ('happened', 0.7127538919448853),
 ('sucks', 0.7082301378250122),
 ('basically', 0.7047510147094727),
 ('caused', 0.697221040725708),
 ('useless', 0.6885159611701965),
 ('disappointment', 0.679756760597229),
 ('luck', 0.6780253648757935),
 ('bad', 0.6755885481834412),
 ('constantly', 0.6664032936096191),
 ('disappointing', 0.6648461818695068),
 ('stuck', 0.65578293800354),
 ('avoid', 0.6516667008399963),
 ('eventually', 0.6492887139320374),
 ('happen', 0.6312088966369629),
 ('poor', 0.6267474889755249),
 ('frustrating', 0.6244062781333923)]

In [296]:
# Normalize the category keys: underscores become spaces, everything lower-cased.
# NOTE(review): product_cate_aug is only assigned in a commented-out cell of
# this notebook, so this line depends on leftover kernel state.
product_cate_aug = {k.replace("_", " ").lower(): v for k, v in product_cate_aug.items()}

In [297]:
# Write the augmentation words; the context manager flushes and closes the
# file, which the bare open(..., "w") passed to json.dump never did explicitly.
with open("data/pretrain/amazon_category_aug.json", "w") as _fh:
    json.dump(product_cate_aug, _fh)

In [63]:
# Mean whitespace-token count of the review bodies in the current `class_df`.
avg_token_len(class_df["review_body"].dropna())

33.30516103220644

In [240]:
# Number of reviews per raw product category, as a plain dict.
amazon.product_category.value_counts().to_dict()

{'Digital_Ebook_Purchase': 40000,
 'Baby': 20000,
 'Watches': 20000,
 'Digital_Software': 20000,
 'Jewelry': 20000,
 'Personal_Care_Appliances': 20000,
 'Music': 20000,
 'Beauty': 20000,
 'Pet Products': 20000,
 'Office Products': 20000,
 'Furniture': 20000,
 'Camera': 20000,
 'Major Appliances': 20000,
 'Mobile_Electronics': 20000,
 'Books': 20000,
 'Automotive': 20000,
 'Outdoors': 20000,
 'PC': 20000,
 'Apparel': 20000,
 'Lawn and Garden': 20000,
 'Mobile_Apps': 20000,
 'Health & Personal Care': 20000,
 'Grocery': 20000,
 'Kitchen': 20000,
 'Digital_Music_Purchase': 20000,
 'Digital_Video_Download': 20000,
 'Tools': 20000,
 'Gift Card': 20000,
 'Toys': 20000,
 'Video': 20000,
 'Software': 20000,
 'Video Games': 20000,
 'Electronics': 20000,
 'Video DVD': 20000,
 'Home Improvement': 20000,
 'Musical Instruments': 20000,
 'Sports': 20000,
 'Wireless': 20000,
 'Home': 20000,
 'Home Entertainment': 20000,
 'Luggage': 20000,
 'Digital_Video_Games': 20000,
 'Shoes': 20000}

In [7]:
# yahoo = pd.read_csv("data/pretrain/yahoo_answer_190722.csv")

In [121]:
# Load the prepared Amazon pairs, sample 200k rows and keep contexts longer
# than five tokens.
amazon_review = pd.read_csv("data/pretrain/amazon_reviews_879819.csv")
amazon_review = amazon_review.sample(200000)
_has_enough_tokens = amazon_review["context"].str.split().apply(len) > 5
amazon_review = amazon_review[_has_enough_tokens].copy()

In [130]:
# Stack the Yahoo pairs and the sampled Amazon pairs, shuffle the rows and save.
pretrain_df = pd.concat([merge_df, amazon_review]).sample(frac=1)
pretrain_df.to_csv("data/pretrain/pretrain_data.csv", index=False)

In [None]:
# Review body becomes the context, the product category becomes the topic.
# NOTE(review): this expects `review_body` / `product_category` columns, while
# the cell that loads amazon_review filters on a `context` column - confirm
# which file this cell was run against.
df = amazon_review[["review_body", "product_category"]].rename(
    columns={"review_body": "context", "product_category": "topic"}
)
# Normalize topic names: lower-case, underscores replaced by spaces.
df["topic"] = df["topic"].apply(lambda name: name.lower().replace("_", " "))

# amazon_review.to_csv("data/amazon_review.csv", index=False)

In [None]:
import random
import numpy as np
# Mark roughly 30% of the rows as positives (label 1); the rest are negatives (label 0).
pos_index = np.array([random.random() < 0.3 for _ in range(len(df))])
pos = df.loc[pos_index].assign(label=1)
neg = df.loc[~pos_index].assign(label=0)



In [None]:
def _wordnet_senses(word, limit=5):
    """Return up to ``limit`` distinct {'name', 'definition'} entries for ``word``.

    One entry is built per (synset, lemma) pair in WordNet iteration order;
    lemma underscores are replaced by spaces and duplicates are skipped.
    """
    senses = []
    for syn in wordnet.synsets(word):
        for lm in syn.lemmas():
            enrich = {'name': lm.name().replace("_", " "), 'definition': syn.definition()}
            if enrich not in senses:
                senses.append(enrich)
                if len(senses) == limit:
                    return senses
    return senses


# topic -> WordNet knowledge. Single-word topics map to a list of senses;
# multi-word topics map to {word: senses} for every word longer than one
# character other than "and".
topics = df["topic"].unique()
knowledge = {}

for topic in topics:
    words = topic.split(" ")
    if len(words) == 1:
        knowledge[topic] = _wordnet_senses(topic)
    else:
        knowledge[topic] = {
            each: _wordnet_senses(each)
            for each in words
            if len(each) > 1 and each != "and"
        }
        
    

    

In [None]:
import random
def topic_transform(topic, kb=None):
    """Randomly paraphrase ``topic`` using WordNet-derived knowledge.

    One uniform draw ``p`` selects the transformation: ``p > 0.5`` returns the
    topic unchanged, ``p < 0.25`` swaps one word for a synonym, and otherwise a
    definition is appended. Whenever the knowledge base has no material for
    the chosen transformation, the topic is returned unchanged.

    Args:
        topic: topic string; must be a key of the knowledge base.
        kb: knowledge base laid out like the module-level ``knowledge``: each
            value is either a list of ``{'name', 'definition'}`` dicts
            (single-word topics) or a dict mapping a word of the topic to such
            a list. Defaults to ``knowledge``.

    Returns:
        The transformed (or original) topic string.
    """
    if kb is None:
        kb = knowledge
    p = random.random()
    if p > 0.5:
        return topic

    entry = kb[topic]
    if isinstance(entry, list):
        # Single-word topic: the word being paraphrased is the topic itself.
        key, senses = topic, entry
    else:
        if not entry:
            # No usable word was recorded for this multi-word topic.
            return topic
        key = random.choice(list(entry.keys()))
        senses = entry[key]

    if not senses:
        # WordNet knows nothing about the chosen word. The single-word branch
        # used to raise IndexError here when asked for a definition.
        return topic

    if p < 0.25:
        # Synonym replacement. Exclude the word being replaced (not the whole
        # topic), otherwise a word could be "replaced" by itself. Sorting makes
        # the draw reproducible under a fixed random seed.
        names = sorted({each['name'] for each in senses if each['name'] != key})
        if not names:
            return topic
        return topic.replace(key, random.choice(names))

    # Definition augmentation.
    defini = random.choice(senses)['definition']
    return topic + " " + defini

def gen_fake_topic(topic):
    """Return a random key of ``knowledge`` that differs from ``topic``."""
    other_topics = []
    for known in knowledge.keys():
        if known != topic:
            other_topics.append(known)
    return random.choice(other_topics)

# Positives keep their (possibly paraphrased) topic; negatives get a different topic.
pos['topic'] = pos['topic'].apply(topic_transform)
neg['topic'] = neg['topic'].apply(gen_fake_topic)

In [None]:
# Recombine positives and negatives, restore index order and export.
all_df = pd.concat([pos, neg]).sort_index()
all_df.to_csv("data/amazon_review_processed.csv", index=False)

In [None]:
# Print every multi-word topic entry (dict) for which no word was kept.
for k, v in knowledge.items():
    if isinstance(v, dict) and not v:
        print(v)

In [None]:
# Normalize topic names (lower-case, underscores -> spaces).
df["topic"] = df["topic"].apply(lambda x: x.lower().replace("_", " "))
# Not the last expression of the cell, so this value is not displayed.
df["topic"].unique()
# Mark every row as a positive example.
df["label"] = 1

In [None]:
import tensorflow as tf
from transformers import AutoTokenizer
# Load the cased BERT tokenizer and register "[PAD]" as its padding token.
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

In [None]:
from my_datasets import PreTrainDataset
from tqdm.notebook import tqdm
# Wrap the processed Amazon CSV in the project's PreTrainDataset helper.
pt = PreTrainDataset("data/amazon_review_processed.csv", tokenizer=tokenizer)

In [None]:
# Stream (string, string, int32) triples from the dataset's generator.
# NOTE(review): `ds` is also the name of the Yahoo dataset loaded earlier in this notebook.
ds = tf.data.Dataset.from_generator(
        pt.data_generator,
        output_types=(tf.string, tf.string, tf.int32)
    )

# Batch 16 examples at a time and apply the dataset's own mapping to each batch.
loader = ds.batch(16).map(pt.wrap_map)

In [None]:
# Pull one batch to check that the input pipeline runs.
iterator = iter(loader)
next(iterator)

In [None]:
# Iterate over the whole loader once without using the batches.
# NOTE(review): `total` is a float (len / 16) and ignores a final partial batch - confirm.
for idx, each in tqdm(enumerate(loader), total=len(pt.data)/16):
    pass