In [15]:
%load_ext autoreload
%autoreload

import os
import re
import _pickle as cPickle
from collections import OrderedDict, defaultdict, Counter
import argparse
import multiprocessing
import math
import pdb
import random

import numpy as np
import pandas as pd
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
import tensorflow as tf
from scipy.sparse import csr_matrix
import nltk

from data_structure import Instance
from utils import apply_parallel, get_tokens, get_group_df, filter_words

# Seed both NumPy's and Python's global random number generators with the same fixed value.
np.random.seed(1234)
random.seed(1234)

# Raise the pandas display limit so up to 2000 rows are shown when a frame is displayed.
pd.set_option('display.max_rows', 2000)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# configure

In [3]:
# Notebook configuration, declared through argparse so the defaults live in one place.
parser = argparse.ArgumentParser()

# sampling / filtering options
parser.add_argument("-seed", default=1234, type=int)
parser.add_argument("-n_reviews", default=8, type=int)
parser.add_argument("-item_min_reviews", default=8, type=int)
parser.add_argument("-n_per_item", default=2, type=int)
parser.add_argument("-filter_doc_l", default=50, type=int)
parser.add_argument("-filter_sent_l", default=40, type=int)

# input locations
parser.add_argument("-train_dir", default="data/amazon/train", type=str)
parser.add_argument("-dev_path", default="data/amazon/dev.csv", type=str)
parser.add_argument("-test_path", default="data/amazon/test.csv", type=str)

# Parse an empty argument list so every option takes its default
# (the notebook kernel's own command line is ignored).
config = parser.parse_args([])
# Destination of the pickled frames and vocabulary.
config.output_path = os.path.join("data", "amazon/amazon_recursum_df.pkl")

In [4]:
# special tokens -- each one has its own vocab id

# pads the encoder input, decoder input and target sequence
PAD = '<pad>'
# represents out-of-vocabulary words
UNK = '<unk>'
# placed at the beginning of every decoder input sequence
BOS = '<p>'
# placed at the end of untruncated target sequences
EOS = '</p>'

In [5]:
# Maps the raw category labels found in the data files to four canonical category names.
category_dict = {
    # short labels
    'cloth': 'clothing',
    'electronics': 'electronics',
    'health_personal_care': 'health',
    'home_kitchen': 'home',
    # labels with the 'reviews_' prefix
    'reviews_clothing_shoes_and_jewelry': 'clothing',
    'reviews_health_and_personal_care': 'health',
    'reviews_electronics': 'electronics',
    'reviews_home_and_kitchen': 'home',
}

# load data 

## load dev & test df

In [6]:
def get_ref_df(ref_path, n_reviews=8, n_summaries=3):
    """Load a reference (dev/test) TSV and build one row per (product, reference summary).

    Each input row is expected to provide the columns ``prod_id``, ``cat``,
    ``rev1`` .. ``rev<n_reviews>`` and ``summ1`` .. ``summ<n_summaries>``.

    Args:
        ref_path: path (or file-like object) of the tab-separated reference file.
        n_reviews: number of ``rev<i>`` columns to read per product (default 8).
        n_summaries: number of ``summ<i>`` columns to read per product (default 3).

    Returns:
        pd.DataFrame with ``n_summaries`` rows per input row. The review-level
        fields (``text``, ``tokens``, ``stars``, ``doc_l``, ``sent_l``,
        ``max_sent_l``) are repeated for every summary of the same product; the
        ``summary*`` fields are specific to each reference summary.

    Raises:
        KeyError: if ``cat`` holds a label missing from ``category_dict`` or an
            expected column is absent.
    """
    ref_raw_df = pd.read_csv(ref_path, sep='\t')

    ref_list = []
    for _, row in ref_raw_df.iterrows():
        business_id = row['prod_id']
        category = category_dict[row['cat']]
        text_list = [row['rev%i' % (i_rev + 1)] for i_rev in range(n_reviews)]
        # get_tokens is applied per review and its items are flattened, so `tokens`
        # is one list covering all reviews (presumably one token list per sentence).
        tokens = [sent_tokens for text in text_list for sent_tokens in get_tokens(text)]
        text = ' </DOC> '.join(text_list)
        # The reference files carry no ratings here; a zero placeholder is stored per review.
        stars = [0 for _ in range(n_reviews)]
        doc_l = len(tokens)
        sent_l = [len(line) for line in tokens]
        # default=0 keeps an input with no tokens from raising ValueError.
        max_sent_l = max(sent_l, default=0)

        for i_summ in range(n_summaries):
            summary = row['summ%i' % (i_summ + 1)]
            summary_tokens = get_tokens(summary)
            summary_doc_l = len(summary_tokens)
            # Measure the summary's own sentences (previously this was computed
            # from the review tokens, duplicating max_sent_l).
            summary_max_sent_l = max((len(line) for line in summary_tokens), default=0)

            ref_list.append({
                'business_id': business_id, 'category': category, 'text': text, 'tokens': tokens,
                'summary': summary, 'stars': stars,
                'doc_l': doc_l, 'max_sent_l': max_sent_l, 'sent_l': sent_l,
                'summary_tokens': summary_tokens, 'summary_doc_l': summary_doc_l,
                'summary_max_sent_l': summary_max_sent_l,
            })

    ref_df = pd.DataFrame(ref_list)
    return ref_df

In [16]:
# Build the dev and test reference frames (dev first, then test) from the configured paths.
dev_df, test_df = map(get_ref_df, (config.dev_path, config.test_path))

## load train raw df

In [17]:
def get_data_raw_df(data_paths):
    """Read per-item review TSVs and aggregate them into one row per ``group_id``.

    Each file is expected to provide the columns ``group_id``, ``category``,
    ``review_text`` and ``rating``.

    Args:
        data_paths: iterable of paths (or file-like objects) of tab-separated files.

    Returns:
        pd.DataFrame indexed by ``group_id`` with columns ``category`` (mapped
        through ``category_dict`` from the group's first row), ``tokens`` (list
        of the per-review ``get_tokens`` results) and ``rating`` (list of the
        per-review ratings). An empty frame with the same columns is returned
        when no file contributes any row.
    """
    data_raw_dfs = []
    for data_path in data_paths:
        # `sep` must be passed by keyword: pandas >= 2.0 makes every read_csv
        # argument after the path keyword-only.
        item_raw_df = pd.read_csv(data_path, sep='\t')
        item_raw_df['tokens'] = item_raw_df['review_text'].apply(get_tokens)
        # Keep only reviews whose tokenized form has more than two items.
        item_raw_df = item_raw_df[item_raw_df['tokens'].apply(lambda tokens: len(tokens) > 2)]

        # A missing rating shows up either as the literal string 'None' or as NaN
        # (newer pandas parses "None" as NaN on read), so both forms are dropped.
        rating = item_raw_df['rating']
        missing_rating = rating.isna() | (rating.astype(str) == 'None')
        if missing_rating.any():
            # .copy() so the assignment below writes to an independent frame, not a slice.
            item_raw_df = item_raw_df[~missing_rating].copy()
            item_raw_df['rating'] = item_raw_df['rating'].astype(float)
        if len(item_raw_df) == 0: continue

        data_raw_df = item_raw_df.groupby('group_id').agg({
            'category': lambda category_list: category_dict[list(category_list)[0]],
            'tokens': lambda tokens_list: list(tokens_list),
            'rating': lambda stars_list: list(stars_list)
        })
        data_raw_dfs.append(data_raw_df)

    if not data_raw_dfs:
        # pd.concat([]) raises ValueError; return an empty frame with the expected layout instead.
        return pd.DataFrame(columns=['category', 'tokens', 'rating']).rename_axis('group_id')
    data_raw_df = pd.concat(data_raw_dfs)
    return data_raw_df

In [18]:
%%time
train_data_paths = [os.path.join(config.train_dir, data_name) for data_name in os.listdir(config.train_dir)]
train_raw_df = apply_parallel(train_data_paths, num_split=32, map_func=get_data_raw_df).reset_index().rename(columns={'group_id': 'business_id', 'rating': 'stars'})
len(train_raw_df)

FileNotFoundError: [Errno 2] No such file or directory: 'data/amazon/train'

## load train df

In [14]:
%%time
train_df = get_group_df(train_tmp_df, n_reviews=config.n_reviews, filter_sent_l=config.filter_sent_l, filter_doc_l=config.filter_doc_l, \
                        item_min_reviews=config.item_min_reviews, n_per_item=config.n_per_item)
print(len(train_df)) # 280692

Each item will appear 2 times
filtered unbalanced 0 instances from group_df
280692
CPU times: user 1min 41s, sys: 2.01 s, total: 1min 43s
Wall time: 1min 43s


In [15]:
# Number of distinct business ids before (raw) and after grouping.
tuple(len(set(frame.business_id)) for frame in (train_raw_df, train_df))

(182928, 140346)

# build token indices for language modeling

In [20]:
%%time
words_list = train_df['tokens'].apply(lambda tokens: [token for line in tokens for token in line])
word_tf_dict = sorted(Counter([word for words in words_list for word in words]).items(), key=lambda x: x[1])

CPU times: user 23 s, sys: 4.39 s, total: 27.4 s
Wall time: 27.3 s


In [23]:
# The four special tokens take the first vocabulary ids; the words kept by filter_words follow.
lm_words = [PAD, UNK, BOS, EOS]
lm_words.extend(filter_words(word_tf_dict, min_tf=16)) # large, usual
# Two lookup tables over the same vocabulary: id -> word and word -> id.
idx_to_word = dict(enumerate(lm_words))
word_to_idx = {word: idx for idx, word in enumerate(lm_words)}
len(lm_words) # 30732

30732

In [24]:
def apply_token_idxs(tokens_series):
    """Convert every document in `tokens_series` from tokens to vocabulary ids.

    Each element is a list of sentences and each sentence a list of tokens; the
    nesting is preserved in the result. Tokens missing from the module-level
    `word_to_idx` are given the id of `UNK`.
    """
    def lookup(token):
        # Known tokens map to their own id; everything else falls back to UNK.
        if token in word_to_idx:
            return word_to_idx[token]
        return word_to_idx[UNK]

    def encode_document(document):
        encoded = []
        for sentence in document:
            encoded.append([lookup(token) for token in sentence])
        return encoded

    return tokens_series.apply(encode_document)

In [25]:
%%time
# Add a `token_idxs` column to each frame by running apply_token_idxs on its `tokens` column.
# The training frame goes through apply_parallel (64 splits); dev and test are converted directly.
train_df['token_idxs'] = apply_parallel(train_df['tokens'], num_split=64, map_func=apply_token_idxs)
dev_df['token_idxs'] = apply_token_idxs(dev_df['tokens'])
test_df['token_idxs'] = apply_token_idxs(test_df['tokens'])

CPU times: user 44.9 s, sys: 28.1 s, total: 1min 13s
Wall time: 1min 13s


# write out

In [53]:
def get_save_df(data_df, summary=False):
    """Select the columns of `data_df` that are written to disk.

    Args:
        data_df: frame holding at least the columns listed below.
        summary: when True, also keep the raw text and the reference-summary columns.

    Returns:
        A frame restricted to the selected columns, in the order listed.
    """
    columns = ['business_id', 'doc_l', 'sent_l', 'max_sent_l', 'bows', 'tfidfbows', 'token_idxs']
    if summary:
        columns = columns + ['text', 'summary', 'summary_tokens', 'summary_doc_l', 'summary_max_sent_l']
    return data_df[columns]

In [54]:
# Expected sizes: (280692, 84, 96)
# Only the dev and test frames keep the text and reference-summary columns.
train_save_df = get_save_df(train_df)
dev_save_df, test_save_df = (get_save_df(ref_df, summary=True) for ref_df in (dev_df, test_df))
tuple(map(len, (train_save_df, dev_save_df, test_save_df)))

(280692, 84, 96)

In [55]:
# Show where the pickle will be written and how many words the vocabulary holds.
config.output_path, len(word_to_idx)

('data/amazon/amazon_df.pkl', 30732)

In [56]:
print('saving preprocessed instances...')
# NOTE(review): `bow_idxs` is not defined in any cell visible here -- confirm the cell that
# builds it has been run before this one.
# The `with` block closes the file (flushing the pickle to disk) even if dumping fails.
with open(config.output_path, 'wb') as f_out:
    cPickle.dump((train_save_df, dev_save_df, test_save_df, word_to_idx, idx_to_word, bow_idxs), f_out)

saving preprocessed instances...
