# init

In [1]:
import os
import numpy as np
import pandas as pd

import cPickle
from collections import OrderedDict, defaultdict

from data_structure import Instance, DataSet

In [2]:
# Special vocabulary symbols, each paired with the vocab id it is expected to have.
# NOTE(review): get_vocab_emb in this notebook only places PAD at index 0 and UNK
# at index 1; nothing visible here pins BOS/EOS to ids 2 and 3 -- confirm.

# Padding symbol: pads the encoder input, decoder input and target sequence.
PAD, PAD_IDX = '<pad>', 0
# Unknown symbol: represents out-of-vocabulary words.
UNK, UNK_IDX = '<unk>', 1
# Begin-of-sequence symbol: used at the beginning of every decoder input sequence.
BOS, BOS_IDX = '<p>', 2
# End-of-sequence symbol: used at the end of untruncated target sequences.
EOS, EOS_IDX = '</p>', 3

# load and preprocess data

## split review_df

In [3]:
# Directory (relative to the notebook's working directory) holding the pickled inputs and outputs.
dirname = 'data'

In [5]:
# cPickle.dump(review_df, open(os.path.join(dirname, 'reviews_df.pkl'), 'wb'))
# Load the full review DataFrame. The context manager closes the file handle
# as soon as the pickle has been read.
# NOTE: unpickling can execute arbitrary code -- only load files you produced yourself.
with open(os.path.join(dirname, 'sports_all_df.pkl'), 'rb') as f:
    review_df = cPickle.load(f)

In [25]:
# Fixed split by row order: first 1000 rows -> test, next 1000 -> dev, remainder -> train.
test_all_df, dev_all_df, train_all_df = (
    review_df[0:1000],
    review_df[1000:2000],
    review_df[2000:],
)

In [26]:
def _length_filtered(df, min_doc_l):
    """Keep rows with min_doc_l <= doc_l < 60 and max_sent_l <= 50."""
    keep = (df['doc_l']>=min_doc_l)&(df['doc_l']<60)&(df['max_sent_l']<=50)
    return df[keep]

# Training keeps only documents with doc_l >= 10; dev and test accept doc_l >= 5.
train_df = _length_filtered(train_all_df, 10)
dev_df = _length_filtered(dev_all_df, 5)
test_df = _length_filtered(test_all_df, 5)

In [27]:
# Number of rows left in each split after length filtering (train, dev, test).
tuple(map(len, (train_df, dev_df, test_df)))

(37445, 511, 466)

In [29]:
# Persist the filtered splits. The context manager flushes and closes the file
# handle instead of leaving it to garbage collection.
with open(os.path.join(dirname, 'sports_df.pkl'), 'wb') as f:
    cPickle.dump((train_df, dev_df, test_df), f)

## use raw fastText vec

In [30]:
def get_word_list(tokens_list):
    """Return the distinct words of ``tokens_list`` ordered by descending count.

    ``tokens_list`` is an iterable of token sequences. BOS, EOS and '.' are
    always included and are given an infinite count, so they come before
    every counted word.
    """
    counts = defaultdict(int)
    # Infinite counts pin the special symbols ahead of any real word.
    for pinned in (BOS, EOS, '.'):
        counts[pinned] = np.inf
    for sentence in tokens_list:
        for word in sentence:
            counts[word] += 1
    # Sort ascending by count, then reverse to obtain descending order.
    by_count = sorted(counts.items(), key=lambda item: item[1])
    by_count.reverse()
    return [word for word, _ in by_count]

def get_fasttext(word_vec_path):
    """Read a text vector file into a ``{word: float32 array}`` dict.

    The first line of the file is skipped (treated as a header). Every other
    line is split at its first space into the word and the remaining text,
    which is whitespace-split and converted to a float32 numpy array.
    """
    vectors = {}
    with open(word_vec_path) as vec_file:
        for line_no, line in enumerate(vec_file):
            if line_no == 0:
                # Header line: skipped.
                continue
            word, raw_values = line.split(' ', 1)
            vectors[word] = np.array(raw_values.split()).astype(np.float32)
    return vectors

def get_word_vec(word_list, fasttext_vec):
    """Select the vectors of the words in ``word_list`` that have one.

    Args:
        word_list: iterable of words, in the order the result should keep.
        fasttext_vec: mapping from word to its vector.

    Returns:
        OrderedDict mapping each word of ``word_list`` found in
        ``fasttext_vec`` to its vector, in ``word_list`` order. Words without
        a vector are skipped.

    Only the "word has no vector" case is skipped; any other error (for
    example a wrong argument type) propagates instead of being silently
    swallowed by a bare ``except``.
    """
    return OrderedDict(
        (word, fasttext_vec[word]) for word in word_list if word in fasttext_vec
    )

def get_vocab_emb(word_vec, word_emb_dim, N=0):
    """Build the vocabulary and embedding matrix from an ordered word->vector map.

    Args:
        word_vec: ordered mapping from word to its vector; iteration order
            determines the ids. It is not modified.
        word_emb_dim: length of the zero vectors created for PAD and UNK.
        N: if > 0, keep only the first N rows. The count includes the PAD and
            UNK rows, so at most N - 2 words of ``word_vec`` survive.

    Returns:
        (vocab, embeddings): ``vocab`` maps word -> row index with PAD at 0 and
        UNK at 1; ``embeddings`` is a float32 array whose row i is the vector
        of the word with id i.

    Raises:
        AssertionError: if the vocabulary and the embedding matrix disagree in
            size, e.g. because ``word_vec`` itself contains PAD or UNK.
    """
    # PAD and UNK get all-zero vectors and occupy the first two rows.
    entries = [
        (PAD, np.zeros([word_emb_dim], dtype=np.float32)),
        (UNK, np.zeros([word_emb_dim], dtype=np.float32)),
    ]
    # Build an explicit list: dict.items() is only a list on Python 2, so the
    # previous in-place insert() failed wherever items() returns a view.
    entries.extend(word_vec.items())

    if N > 0:
        entries = entries[:N]
    vocab = {word: i for i, (word, _) in enumerate(entries)}
    embeddings = np.array([vec for _, vec in entries]).astype(np.float32)
    assert len(vocab) == len(embeddings), 'duplicate words in vocabulary'
    return vocab, embeddings

In [31]:
# Pool every sentence of the train (and dev) documents into one flat list.
# Each element of `tokens` is one entry of a document's `tokens` column.
tokens = [sent for doc in train_df.tokens for sent in doc]
if dev_df is not None:
    tokens += [sent for doc in dev_df.tokens for sent in doc]
print(tokens[0])

['bought', 'these', 'as', 'a', 'wedding', 'gift', 'to', 'my', 'groomsmen']


In [32]:
# Words of the pooled train/dev sentences, ordered by descending count.
word_list = get_word_list(tokens_list=tokens)

In [35]:
%%time
word_vec_path = os.path.join('data', 'fastText/crawl-300d-2M.vec')
fasttext_vec = get_fasttext(word_vec_path)
cPickle.dump(fasttext_vec, open(os.path.join('data', 'tmp-crawl-300d-2M.dict'), 'wb'))

# fasttext_vec = cPickle.load(open(os.path.join('data', 'crawl-300d-2M.dict'), 'rb'))

In [36]:
# Keep only the corpus words that have a fastText vector, preserving word_list order.
word_vec = get_word_vec(word_list=word_list, fasttext_vec=fasttext_vec)
len(word_vec)

50047

In [37]:
# Length of the zero vectors that get_vocab_emb creates for PAD and UNK.
word_emb_dim = 300
# N caps the number of rows kept, and that count includes the PAD and UNK rows.
vocab, embeddings = get_vocab_emb(word_vec=word_vec, word_emb_dim=word_emb_dim, N=50000)

# build instance list

In [38]:
def to_line_idxs(token_idxs, vocab):
    """Flatten per-sentence id lists into one list, ending each sentence with the '.' id.

    ``token_idxs`` is a list of lists of ids; ``vocab['.']`` is appended after
    every sentence. The input lists are left untouched.
    """
    flat_idxs = []
    for sent_idxs in token_idxs:
        # Concatenation builds a new list, so the caller's sentence is not modified.
        flat_idxs += sent_idxs + [vocab['.']]
    return flat_idxs

In [39]:
def prepare_instancelst(data_df, vocab):
    """Turn every row of ``data_df`` into an ``Instance`` with vocabulary ids.

    For each row the function reads ``asin``, ``tokens``, ``overall``,
    ``summary``, ``summary_tokens``, ``doc_l`` and ``max_sent_l``. Tokens and
    summary tokens are mapped through ``vocab``; anything not in ``vocab``
    becomes ``vocab[UNK]``. Returns the list of instances in row order.
    """
    def lookup(token):
        # Out-of-vocabulary tokens fall back to the UNK id.
        return vocab[token] if token in vocab else vocab[UNK]

    instances = []
    for row_label, row in data_df.iterrows():
        inst = Instance()
        inst.idx = row_label
        inst.asin = row.asin
        # One list of ids per sentence of the document.
        sent_idxs = [[lookup(token) for token in sent_tokens]
                     for sent_tokens in row.tokens]
        inst.token_idxs = sent_idxs
        # Flat id sequence of the whole document, built by to_line_idxs.
        inst.line_idxs = to_line_idxs(sent_idxs, vocab)
        inst.goldLabel = row.overall
        inst.summary = row.summary
        inst.summary_tokens = row.summary_tokens
        inst.summary_idxs = [lookup(token) for token in inst.summary_tokens]
        inst.doc_l = row.doc_l
        inst.max_sent_l = row.max_sent_l
        instances.append(inst)
    return instances

In [40]:
# Convert each split into its list of Instance objects using the shared vocab.
instances_train, instances_dev, instances_test = [
    prepare_instancelst(split_df, vocab)
    for split_df in (train_df, dev_df, test_df)
]

In [41]:
out_filename = 'sports_fined.pkl'
out_path = os.path.join(dirname, out_filename)
# Pickles must be written in binary mode ('wb', like the other dumps in this
# notebook); text mode can corrupt the stream on some platforms. The context
# manager guarantees the file is flushed and closed.
with open(out_path, 'wb') as f:
    cPickle.dump((instances_train, instances_dev, instances_test, embeddings, vocab), f)