In [1]:
import os
import numpy as np
import pandas as pd

import _pickle as cPickle
from collections import OrderedDict, defaultdict

import tensorflow as tf

from data_structure import Instance

In [2]:
def del_all_flags(FLAGS):
    """Delete every flag currently registered on ``FLAGS``.

    Args:
        FLAGS: flag container exposing ``_flags()`` (a mapping keyed by flag
            name) and supporting attribute deletion by flag name.

    Presumably used so the flag-defining cell can be re-run in the same
    kernel without duplicate-definition errors -- TODO confirm.
    """
    # Snapshot the names up front so the deletions below cannot disturb
    # the iteration.
    registered_names = list(FLAGS._flags())
    for flag_name in registered_names:
        FLAGS.__delattr__(flag_name)

# Clear any previously registered flags before (re)defining them below
# (presumably so this cell can be re-executed in a live kernel -- confirm).
del_all_flags(tf.flags.FLAGS)

flags = tf.app.flags

# Input/output locations. The help strings were previously swapped.
flags.DEFINE_string('input_path', 'data/sports_df.pkl', 'path of input data')
flags.DEFINE_string('output_path', 'data/sports_sents.pkl', 'path of output data')
flags.DEFINE_string('word_vec_path', 'data/crawl-300d-2M.vec', 'path of pretrained word vec')

flags.DEFINE_integer('n_vocab', 50000, 'size of vocab')

# NOTE(review): looks like this exists to absorb the `-f` argument that the
# notebook kernel is launched with -- confirm.
flags.DEFINE_string('f', '', 'kernel')

config = flags.FLAGS

In [3]:
# Special tokens. Each one has its own vocab id.

# Pads the encoder input, decoder input and target sequence.
PAD = '<pad>'
# Stands in for out-of-vocabulary words.
UNK = '<unk>'
# Placed at the beginning of every decoder input sequence.
BOS = '<p>'
# Placed at the end of untruncated target sequences.
EOS = '</p>'

In [4]:
def get_word_count_dict(tokens_list):
    """Count token occurrences and rank them from most to least frequent.

    Args:
        tokens_list: iterable of token sequences; every token of every
            sequence is counted.

    Returns:
        A list of ``(word, count)`` pairs in descending count order (despite
        the function name, not a dict). ``'.'`` and the special tokens are
        seeded with an infinite count, so they always fill the first five
        positions, in the order PAD, UNK, BOS, EOS, ``'.'``. Words with equal
        counts appear in reverse order of first occurrence.
    """
    counts = defaultdict(int)
    # Seeding with +inf guarantees the reserved entries outrank every real
    # word; the seeding order fixes their final positions after the reverse.
    for reserved in ('.', EOS, BOS, UNK, PAD):
        counts[reserved] = np.inf

    for sentence in tokens_list:
        for token in sentence:
            counts[token] += 1

    # Stable ascending sort followed by a full reverse (not reverse=True),
    # which is what puts ties in reverse insertion order.
    ranked = list(counts.items())
    ranked.sort(key=lambda pair: pair[1])
    ranked.reverse()
    return ranked

In [5]:
print(str(config.flag_values_dict()))

print('loading input data...')
# The pickle is expected to hold a (train, dev, test) triple.
# NOTE: unpickling can execute arbitrary code -- only load files from a
# trusted source.
with open(config.input_path, 'rb') as input_file:
    train_df, dev_df, test_df = cPickle.load(input_file)

{'input_path': 'data/sports_df.pkl', 'output_path': 'data/sports_sents.pkl', 'word_vec_path': 'data/crawl-300d-2M.vec', 'n_vocab': 50000, 'f': ''}
loading input data...


In [6]:
# Gather the entries of every document's `tokens` cell from the train and
# dev splits into one flat list; the test split is deliberately not included.
tokens = []
for split_df in (train_df, dev_df):
    for doc in split_df.tokens:
        tokens.extend(doc)

In [13]:
# Frequency-ranked (word, count) pairs, most frequent first.
word_count_dict = get_word_count_dict(tokens)

# NOTE(review): the vocabulary size is hard-coded here and config.n_vocab
# is not consulted -- confirm which value is intended.
n_idx_to_word = 20000

# id -> word for the top-ranked entries; the id is the rank position.
idx_to_word = dict(enumerate(word for word, _ in word_count_dict[:n_idx_to_word]))
# Fails if fewer than n_idx_to_word distinct entries were counted.
assert len(idx_to_word) == n_idx_to_word

# Inverse mapping: word -> id.
word_to_idx = dict(zip(idx_to_word.values(), idx_to_word.keys()))

In [14]:
def prepare_data(data_df, word_to_idx):
    """Convert every tokenized sentence in ``data_df`` to vocabulary ids.

    Args:
        data_df: DataFrame with a ``tokens`` column; each cell is iterated as
            a sequence of sentences, each sentence a sequence of tokens.
        word_to_idx: mapping from token to id. Tokens missing from it are
            replaced by the id stored under ``UNK``.

    Returns:
        A flat list with one id list per sentence (document boundaries are
        dropped), sorted by ascending sentence length. Sentences of equal
        length keep their original relative order.

    Raises:
        KeyError: if an out-of-vocabulary token is encountered and ``UNK``
            itself is absent from ``word_to_idx``.
    """
    # Read the column directly: iterrows() would build a Series for every
    # row only to access this single field.
    data = [
        [word_to_idx[token] if token in word_to_idx else word_to_idx[UNK]
         for token in sent_tokens]
        for doc_tokens in data_df['tokens']
        for sent_tokens in doc_tokens
    ]
    return sorted(data, key=len)

In [15]:
# Encode each split with the shared vocabulary.
data_train = prepare_data(train_df, word_to_idx)
data_dev = prepare_data(dev_df, word_to_idx)
data_test = prepare_data(test_df, word_to_idx)

print('saving preprocessed data...')
# Use a context manager so the output file is closed as soon as the dump
# finishes.
with open(config.output_path, 'wb') as output_file:
    cPickle.dump((data_train, data_dev, data_test, word_to_idx, idx_to_word), output_file)

saving preprocessed data...
