In [120]:
import os
import numpy as np
import pandas as pd

import _pickle as cPickle
from collections import OrderedDict, defaultdict
from nltk import word_tokenize

import tensorflow as tf
from sklearn.feature_extraction.text import TfidfVectorizer
import re

from data_structure import Instance

# configure

In [196]:
def del_all_flags(FLAGS):
    """Delete every flag currently registered on ``FLAGS``.

    Presumably here so the cell can be re-run in a live kernel without the
    ``DEFINE_*`` calls that follow tripping over flags left from a previous
    run -- confirm against the flags library in use.

    Args:
        FLAGS: flag container exposing ``_flags()`` (iterating it yields the
            flag names) and supporting attribute deletion by flag name.
    """
    # Snapshot the names first so the mapping returned by _flags() is not
    # iterated while entries are being removed.
    for flag_name in list(FLAGS._flags()):
        delattr(FLAGS, flag_name)

# Clear any previously registered flags before (re)defining the ones below.
del_all_flags(tf.flags.FLAGS)

flags = tf.app.flags

# Corpus splits, each parsed by get_df().
# The train_path help text used to say "output data" although the file is only read.
flags.DEFINE_string('train_path', 'data/apnews/apnews50k_train.txt', 'path of input data')
flags.DEFINE_string('valid_path', 'data/apnews/apnews50k_valid.txt', 'path of input data')
flags.DEFINE_string('test_path', 'data/apnews/apnews50k_test.txt', 'path of input data')

# Stop-word file; each of its lines becomes one stop-word entry for the BoW vectorizer.
flags.DEFINE_string('stopwords_path', 'data/stopwords_mallet.txt', 'path of input data')

# Destination of the pickled instances and vocabulary.
flags.DEFINE_string('output_path', 'data/apnews/instances.pkl', 'path of output data')

# NOTE(review): n_vocab is not referenced by any other code visible in this
# notebook; the vocabulary size is determined by the min_tf cut-off instead.
flags.DEFINE_integer('n_vocab', 50000, 'size of vocab')

# Jupyter starts the kernel with "-f <connection file>"; this dummy flag is
# presumably declared so flag parsing accepts that argument -- confirm.
flags.DEFINE_string('f', '', 'kernel')

config = flags.FLAGS

In [29]:
# special tokens -- each one has its own vocab id

# Pads the encoder input, decoder input and target sequence.
PAD = '<pad>'
# Represents out-of-vocabulary words.
UNK = '<unk>'
# Placed at the beginning of every decoder input sequence.
BOS = '<p>'
# Placed at the end of untruncated target sequences.
EOS = '</p>'

# load source

In [156]:
def get_df(data_path):
    """Read a corpus file into a DataFrame with one row per document.

    Every line of the file is treated as one document whose sentences are
    separated by tabs. Before tokenization, hyphens are replaced by spaces
    and digit sequences by the placeholder ``<num>``.

    Args:
        data_path: path of the text file to read.

    Returns:
        pandas.DataFrame with the columns
        ``doc``        -- normalized document text, sentences joined by spaces,
        ``tokens``     -- one token list per sentence, each with its final
                          token removed,
        ``doc_l``      -- number of sentences in that document,
        ``max_sent_l`` -- token count of that document's longest sentence.
        An empty file yields an empty DataFrame with the same columns.
    """
    doc_list = []
    tokens_list = []
    # Lengths are collected per document. Previously only the values of the
    # last document survived the loop and were broadcast to every row.
    doc_l_list = []
    max_sent_l_list = []

    with open(data_path, 'r') as f:
        for doc in f:
            doc = re.sub(r'-', ' ', doc)
            # NOTE(review): the '.' in the first alternative is unescaped, so it
            # matches any character between two digit runs (e.g. "3 4" becomes a
            # single <num>). Left as is to keep the existing vocabulary --
            # confirm whether an escaped decimal point was intended.
            doc = re.sub(r'[0-9]+.[0-9]+|[0-9]+,[0-9]+|[0-9]+', '<num>', doc)
            lines = doc.split('\t')
            doc_list.append(' '.join(lines))
            # [:-1] drops the last token of every sentence.
            tokens = [word_tokenize(line)[:-1] for line in lines]
            tokens_list.append(tokens)
            doc_l_list.append(len(tokens))
            max_sent_l_list.append(max(len(sent) for sent in tokens))

    data_dict = {
        'doc': doc_list,
        'tokens': tokens_list,
        'doc_l': doc_l_list,
        'max_sent_l': max_sent_l_list,
    }
    return pd.DataFrame(data_dict)

In [157]:
# Parse each corpus split into its own DataFrame (one row per document).
train_df, valid_df, test_df = (
    get_df(split_path)
    for split_path in (config.train_path, config.valid_path, config.test_path)
)

In [158]:
# Number of documents in the train / valid / test splits.
tuple(len(split_df) for split_df in (train_df, valid_df, test_df))

(50000, 2000, 2000)

# build vocab for language modeling 

In [166]:
def get_lm_word_cnt_dict(train_df, min_tf):
    """Count word occurrences and return the frequency-ranked vocabulary.

    Args:
        train_df: object whose ``tokens`` attribute iterates over documents,
            each document being a list of token lists (one per sentence).
        min_tf: count threshold; only words occurring strictly more than
            ``min_tf`` times are kept.

    Returns:
        List of ``(word, count)`` pairs in descending count order. ``'.'`` and
        the special tokens carry an infinite count, so they are always kept
        and always come first.
    """
    counts = defaultdict(int)
    # Infinite counts keep these entries in the result for any threshold.
    # Insertion order matters: it decides their relative rank after the
    # stable sort + reversal below.
    for reserved in ('.', EOS, BOS, UNK, PAD):
        counts[reserved] = np.inf

    for doc in train_df.tokens:
        for sent_tokens in doc:
            for word in sent_tokens:
                counts[word] += 1

    # Ascending stable sort followed by a reversal; this is deliberately not
    # sorted(..., reverse=True), which would order ties differently.
    ranked = sorted(counts.items(), key=lambda item: item[1])
    ranked.reverse()

    return [item for item in ranked if item[1] > min_tf]

In [167]:
# Ranked (word, count) pairs for words occurring more than 10 times in the
# training split; the printed number is the resulting vocabulary size.
lm_word_cnt_dict = get_lm_word_cnt_dict(train_df, 10)
print(len(lm_word_cnt_dict))

29662


In [177]:
# A word's vocab id is its position in the frequency-ranked list.
idx_to_word = dict(enumerate(word for word, _ in lm_word_cnt_dict))
# Inverse mapping: word -> vocab id.
word_to_idx = dict(zip(idx_to_word.values(), idx_to_word.keys()))

# build bow vector

In [159]:
# Each line of the stop-word file becomes one entry, with newline characters removed.
with open(config.stopwords_path, 'r') as f:
    stop_words = [line.replace('\n', '') for line in f]

In [169]:
# Bag-of-words features for the training documents. With use_idf=False and
# norm='l1' the vectorizer emits plain term frequencies scaled so that each
# non-empty row sums to 1; terms found in fewer than 100 documents are dropped.
vectorizer = TfidfVectorizer(min_df=100, max_df=1.0, stop_words=stop_words, tokenizer=word_tokenize, norm='l1', use_idf=False, smooth_idf=False, dtype=np.float32)
corpus = list(train_df.doc)
bow_list = vectorizer.fit_transform(corpus)
# NOTE(review): get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2 (replaced by get_feature_names_out(), which returns an array
# rather than a list). Kept because the targeted scikit-learn version is not
# visible here and the result is pickled further down.
bow_features = vectorizer.get_feature_names()
print(len(bow_features))
# Row count is read from the sparse matrix's shape; converting the whole
# matrix to a dense array just to take its length is unnecessary.
assert len(train_df) == bow_list.shape[0]
# Every BoW feature must also be part of the language-model vocabulary.
assert all(word in word_to_idx for word in bow_features)

6770


# write out

In [194]:
def prepare_instances(data_df, word_to_idx, bow_list=None):
    """Turn every document row into an ``Instance`` holding vocab ids.

    Args:
        data_df: frame whose ``iterrows()`` yields ``(label, row)`` pairs; each
            row provides ``tokens`` (list of token lists), ``doc_l`` and
            ``max_sent_l``.
        word_to_idx: mapping from word to vocab id. Tokens missing from it are
            mapped to the id of ``UNK``.
        bow_list: optional matrix-like object with ``toarray()``; when given,
            the row selected by the document's label is stored as
            ``instance.bow``.

    Returns:
        List of ``Instance`` objects, one per row, in iteration order.
    """
    use_bow = bow_list is not None
    if use_bow:
        dense_bows = bow_list.toarray()

    def to_idx(token):
        # The UNK id is only looked up when a token is actually missing.
        if token in word_to_idx:
            return word_to_idx[token]
        return word_to_idx[UNK]

    instances = []
    for row_label, row in data_df.iterrows():
        instance = Instance()
        instance.idx = row_label
        instance.token_idxs = [
            [to_idx(token) for token in sent_tokens]
            for sent_tokens in row.tokens
        ]
        instance.doc_l = row.doc_l
        instance.max_sent_l = row.max_sent_l
        if use_bow:
            # The row label doubles as the row position in the dense matrix.
            instance.bow = dense_bows[row_label]
        instances.append(instance)
    return instances

In [195]:
# Only the training instances receive a bag-of-words vector.
instances_train = prepare_instances(train_df, word_to_idx, bow_list)
instances_valid, instances_test = (
    prepare_instances(split_df, word_to_idx)
    for split_df in (valid_df, test_df)
)

In [198]:
print('saving preprocessed instances...')
# Pickled payload, in order: train / valid / test instances, word->id map,
# id->word map, BoW feature names.
# The context manager closes the output file even if pickling raises; the
# bare open() used before left the handle open.
with open(config.output_path, 'wb') as f_out:
    cPickle.dump((instances_train, instances_valid, instances_test, word_to_idx, idx_to_word, bow_features), f_out)

saving preprocessed instances...
