In [8]:
import os
import re
import _pickle as cPickle
from collections import OrderedDict, defaultdict, Counter
import argparse
import multiprocessing
import math
import pdb
import random

import numpy as np
import pandas as pd
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
import tensorflow as tf
from scipy.sparse import csr_matrix
import nltk
from nltk import word_tokenize

from data_structure import Instance

np.random.seed(1234)
random.seed(1234)

pd.set_option('display.max_rows', 2000)

# configure

In [49]:
parser = argparse.ArgumentParser()

parser.add_argument('-seed', type=int, default=1234)
parser.add_argument('-n_reviews', type=int, default=8)
parser.add_argument('-item_min_reviews', type=int, default=8)
parser.add_argument('-n_per_item', type=int, default=12)
parser.add_argument('-filter_doc_l', type=int, default=50)
parser.add_argument('-filter_sent_l', type=int, default=40)
parser.add_argument('-min_tf', type=int, default=16)

parser.add_argument('-train_dir', type=str, default='data/yelp/train')
parser.add_argument('-val_dir', type=str, default='data/yelp/val')
parser.add_argument('-test_dir', type=str, default='data/yelp/test')
parser.add_argument('-ref_path', type=str, default='data/yelp/references.csv')

parser.add_argument('-stopwords_path', type=str, default='data/stopwords_mallet.txt')

config = parser.parse_args('')
config.raw_path = os.path.join('data', 'yelp/yelp_raw_df.pkl')
config.output_path = os.path.join('data', 'yelp/yelp_recursum_df.pkl')

In [10]:
# special tokens
PAD = '<pad>' # This has a vocab id, which is used to pad the encoder input, decoder input and target sequence
UNK = '<unk>' # This has a vocab id, which is used to represent out-of-vocabulary words
BOS = '<p>' # This has a vocab id, which is used at the beginning of every decoder input sequence
EOS = '</p>' # This has a vocab id, which is used at the end of untruncated target sequences

# load data 

## load train raw data

In [13]:
def get_data_raw_df(data_paths):
    data_raw_dfs = []
    for data_path in data_paths:
        f = open(data_path, 'r')
        item_raw_df = pd.read_json(f)
        f.close()
        data_raw_dfs.append(item_raw_df)
    data_raw_df = pd.concat(data_raw_dfs)
    data_raw_df['tokens'] = data_raw_df['text'].apply(get_tokens)
    data_raw_df = data_raw_df[data_raw_df['tokens'].apply(lambda tokens: len(tokens) > 2)]
    return data_raw_df

In [14]:
get_data_paths = lambda data_dir: [os.path.join(data_dir, data_name) for data_name in os.listdir(data_dir) if not data_name == 'store-to-nreviews.json']
train_data_paths = get_data_paths(config.train_dir)
val_data_paths = get_data_paths(config.val_dir)
test_data_paths = get_data_paths(config.test_dir)

In [15]:
%%time
train_raw_df = apply_parallel(train_data_paths, num_split=32, map_func=get_data_raw_df).reset_index()
len(train_raw_df)

CPU times: user 52.3 s, sys: 12.6 s, total: 1min 4s
Wall time: 7min 13s


2008099

## load ref raw data

In [16]:
%%time
val_tmp_df = apply_parallel(val_data_paths, num_split=8, map_func=get_data_raw_df).reset_index()

CPU times: user 8.19 s, sys: 5.21 s, total: 13.4 s
Wall time: 1min 12s


In [17]:
%%time
test_tmp_df = apply_parallel(test_data_paths, num_split=8, map_func=get_data_raw_df).reset_index()

CPU times: user 3.01 s, sys: 3.18 s, total: 6.19 s
Wall time: 1min 5s


In [18]:
def get_ref_raw_df(ref_path, train_raw_df, val_tmp_df, test_tmp_df):
    def get_review_business_id_dict(ref_df, train_raw_df, val_tmp_df, test_tmp_df):
        ref_review_ids = []
        for _, row in ref_df.iterrows():
            ref_review_ids += [row['Input.original_review_%i_id' % i] for i in range(config.n_reviews)]
        ref_review_id_df = pd.DataFrame(ref_review_ids, columns=['review_id']) # only review_id in reference.csv
        concat_raw_df = pd.concat([train_raw_df, val_tmp_df, test_tmp_df])[['review_id', 'business_id', 'stars']] # filter review_id and business_id pair
        review_business_id_dict = {row.review_id: row.business_id for _, row in pd.merge(ref_review_id_df, concat_raw_df).iterrows()}
        review_stars_dict = {row.review_id: row.stars for _, row in pd.merge(ref_review_id_df, concat_raw_df).iterrows()}
        return review_business_id_dict, review_stars_dict
    
    ref_df = pd.read_csv(ref_path)
    ref_df['business_id_csv'] = ref_df.apply(lambda row: row['Input.business_id'] if row['Input.business_id'] != '#NAME?' else 'null_%i' % row.name, axis=1)
    review_business_id_dict, review_stars_dict = get_review_business_id_dict(ref_df, train_raw_df, val_tmp_df, test_tmp_df)
    
    ref_raw_dfs = []
    for index, row in ref_df.iterrows():
        business_id = None 
        texts, review_ids, stars = [], [], []
        summary = row['Answer.summary']
        for i in range(config.n_reviews):
            text = row['Input.original_review_%i' % i]
            review_id = row['Input.original_review_%i_id' % i]
            star = review_stars_dict[review_id] if review_id in review_stars_dict else 3
            
            texts.append(text)
            review_ids.append(review_id)
            stars.append(star)
            
            if review_id == '#NAME?': review_id = None
            if business_id is None and review_id in review_business_id_dict: business_id = review_business_id_dict[review_id] # get business_id from review_business_id_dict
            
        if business_id is None: business_id = row['business_id_csv'] # if all review_id not in review_business_id_dict, then business_id in csv is used
        for text, review_id, star in zip(texts, review_ids, stars):
            ref_raw_dfs.append({'business_id': business_id, 'text': text, 'review_id': review_id, 'summary': summary, 'stars': star})
        
    ref_raw_df = pd.DataFrame(ref_raw_dfs)
    ref_raw_df['tokens'] = ref_raw_df['text'].apply(get_tokens)
    return ref_raw_df

In [19]:
ref_raw_df = get_ref_raw_df(config.ref_path, train_raw_df, val_tmp_df, test_tmp_df)
assert len(ref_raw_df) == 200*config.n_reviews

# group data by business id

## group ref df

In [20]:
def get_ref_group_df(data_df):
    group_df = data_df.groupby('business_id').agg({
        'text': lambda text_list: ' </DOC> '.join(list(text_list)),
        'tokens': lambda token_idxs_list: [sent_idxs for token_idxs in list(token_idxs_list) for sent_idxs in token_idxs],
        'summary': lambda summary_series: list(summary_series)[0], # only first column for each business id
        'stars': lambda stars_list: list(stars_list)
    })
    group_df = group_df.reset_index()
    group_df['doc_l'] = group_df['tokens'].apply(lambda tokens: len(tokens))
    group_df['max_sent_l'] = group_df['tokens'].apply(lambda tokens: max([len(line) for line in tokens]))
    group_df['sent_l'] = group_df['tokens'].apply(lambda tokens: [len(line) for line in tokens])
    group_df['summary_tokens'] = group_df['summary'].apply(get_tokens)
    group_df['summary_doc_l'] = group_df['summary_tokens'].apply(lambda tokens: len(tokens))
    group_df['summary_max_sent_l'] = group_df['summary_tokens'].apply(lambda tokens: max([len(line) for line in tokens]))
    
    return group_df

In [21]:
ref_df = get_ref_group_df(ref_raw_df)
assert len(ref_df) == 200

## group train tmp df

In [22]:
def get_group_tmp_df(data_df, ref_df):
    group_df = data_df.groupby('business_id').agg({
        'tokens': lambda tokens_list: list(tokens_list),
        'stars': lambda stars_list: list(stars_list)
    })
    group_df = group_df.reset_index()
    ref_business_ids = ref_df['business_id'].values
    n_pre = len(group_df)
    group_df = group_df[group_df['business_id'].apply(lambda business_id: business_id not in ref_business_ids)]
    n_post = len(group_df)
    print('filtered %i business ids in references from train datasets' % (n_pre-n_post))
    return group_df

In [23]:
train_tmp_df = get_group_tmp_df(train_raw_df, ref_df)

filtered 166 business ids in references from train datasets


## batch train df

In [24]:
def get_group_df(group_tmp_df, n_reviews, filter_sent_l=np.inf, filter_doc_l=np.inf, item_max_reviews=None, item_min_reviews=0,  min_std_stars=np.inf):
    item_to_nreviews = {row.business_id: len(row.tokens) for _, row in group_tmp_df.iterrows()}
    n_per_item = np.mean([n for n in item_to_nreviews.values() if n <= item_max_reviews])
    n_per_item = math.ceil(n_per_item / n_reviews)
    print('Each item will appear {} times'.format(n_per_item))

    batch_list = []
    for _, row in group_tmp_df.iterrows():
        tokens_stars_list = [(sents, stars) for sents, stars in zip(row.tokens, row.stars) if max([len(sent) for sent in sents]) <= filter_sent_l] # filter
        if len(tokens_stars_list) < item_min_reviews: continue
        i_per_item = 0
        while i_per_item < n_per_item:
            batch_tokens_stars = random.sample(tokens_stars_list, n_reviews)
            tokens = [sent for sents, _ in batch_tokens_stars for sent in sents]
            doc_l = len(tokens)
            sent_l = [len(sent) for sent in tokens]
            max_sent_l = max(sent_l)
            stars = [star for _, star in batch_tokens_stars]
            mean_stars = np.mean(stars)
            std_stars = np.std(stars)
            if doc_l <= filter_doc_l:
                batch_list.append({'business_id': row.business_id, 'tokens': tokens, 'doc_l': doc_l, 'sent_l': sent_l, 'max_sent_l': max_sent_l, \
                                                   'stars': stars, 'mean_stars': mean_stars, 'std_stars': std_stars})
                i_per_item += 1
    
    assert len(batch_list) == len(item_to_nreviews) * n_per_item
    
    group_df = pd.DataFrame(batch_list)
    return group_df

In [25]:
%%time
# Each item will appear 12 times
# 185496
# CPU times: user 34.6 s, sys: 1.8 s, total: 36.4 s
# Wall time: 35 s
train_df = get_group_df(train_tmp_df, n_reviews=config.n_reviews, filter_sent_l=config.filter_sent_l, filter_doc_l=config.filter_doc_l, \
                                            item_max_reviews=config.item_max_reviews, min_std_stars=config.min_std_stars)

Each item will appear 12 times
185496
CPU times: user 34.6 s, sys: 1.8 s, total: 36.4 s
Wall time: 35 s


## split ref df into val & test df

In [37]:
test_indices = [111,  74, 106,  45, 143, 147,  56, 150, 184,  38,  61, 125, 116,
                             58, 159, 182,  86,   2,  22, 126,  55,  20, 161, 118, 141,  57,
                            123,  68, 164,  14, 122,  64,  53,  85, 135,  79, 163, 198, 109,
                            110,  25, 115, 113, 114,  78,  94, 151,  88, 162, 176,  66, 136,
                             62, 137, 158, 148, 171, 145,  52,   1,  82,   5, 173, 124, 190,
                            129, 185,  67, 107,   3, 193, 132,  69,  31,  41,  11, 108, 167,
                             96,  12, 139,  90,  23,  95,  21,   7,  54, 174,  65,  47, 194,
                            181, 153, 199, 121, 165,  80,  44, 188,  36]

In [38]:
test_df = ref_df.iloc[test_indices]
val_df = ref_df.drop(test_df.index)
assert len(set(test_df.index) & set(val_df.index)) == 0

# build token idxs for language modeling 

In [39]:
%%time
words_list = train_df['tokens'].apply(lambda tokens: [token for line in tokens for token in line])
word_tf_dict = sorted(Counter([word for words in words_list for word in words]).items(), key=lambda x: x[1])

CPU times: user 21.8 s, sys: 1.93 s, total: 23.7 s
Wall time: 23.7 s


In [47]:
def filter_words(word_tf_dict, max_tf_rate=0., min_tf_rate=0., min_tf=None, min_df=0., max_df=np.inf, stop_words=[]):
    filtered_word_tf_dict = dict([(word, tf) for word, tf in word_tf_dict if word not in stop_words])
    if min_tf: filtered_word_tf_dict = {word: tf for word, tf in filtered_word_tf_dict.items() if tf >= min_tf}
    filtered_words = [word for word, _ in filtered_word_tf_dict.items()]        
    return filtered_words

In [50]:
special_words = [PAD, UNK, BOS, EOS]
lm_words = special_words + filter_words(word_tf_dict, min_tf=config.min_tf) # large, usual
idx_to_word = {idx: word for idx, word in enumerate(lm_words)}
word_to_idx = {word: idx for idx, word in idx_to_word.items()}
len(lm_words) # 32956

32956

In [42]:
def apply_token_idxs(tokens_series):
    def get_token_idxs(tokens):
        return [[word_to_idx[token] if token in word_to_idx else word_to_idx[UNK] for token in sent] for sent in tokens]
    return tokens_series.apply(get_token_idxs)

In [43]:
%%time
train_df['token_idxs'] = apply_parallel(train_df['tokens'], num_split=64, map_func=apply_token_idxs)
val_df['token_idxs'] = apply_parallel(val_df['tokens'], num_split=64, map_func=apply_token_idxs)
test_df['token_idxs'] = apply_token_idxs(test_df['tokens'])

CPU times: user 39.9 s, sys: 56.8 s, total: 1min 36s
Wall time: 1min 38s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


# write out

In [44]:
def get_save_df(data_df, summary=False):
    save_df = data_df[['business_id', 'doc_l', 'sent_l', 'max_sent_l', 'token_idxs', 'tokens']]
    if summary: save_df = data_df[['business_id', 'doc_l', 'sent_l', 'max_sent_l', 'tokens', 'token_idxs', \
                                                               'text', 'summary', 'summary_tokens', 'summary_doc_l', 'summary_max_sent_l']]
    return save_df

In [45]:
# (174794, 100, 100)
train_save_df = get_save_df(train_df)
val_save_df = get_save_df(val_df, summary=True)
test_save_df = get_save_df(test_df, summary=True)
len(train_save_df), len(val_save_df), len(test_save_df)

(185496, 100, 100)

In [46]:
config.output_path, len(word_to_idx)

('data/yelp/yelp_recursum_df.pkl', 32956)

In [59]:
print('saving preprocessed instances...')
cPickle.dump((train_save_df, val_save_df, test_save_df, word_to_idx, idx_to_word, None),open(config.output_path,'wb'))

saving preprocessed instances...
