In [1]:
import os
import re
import _pickle as cPickle
from collections import OrderedDict, defaultdict, Counter

import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
import tensorflow as tf
from nltk import word_tokenize

from sklearn.datasets import fetch_20newsgroups

from data_structure import Instance

# configure

In [2]:
def del_all_flags(FLAGS):
    """Delete every flag currently registered on ``FLAGS``.

    The flag names returned by ``FLAGS._flags()`` are snapshotted into a list
    before any deletion happens, then removed one by one.
    """
    for flag_name in list(FLAGS._flags()):
        FLAGS.__delattr__(flag_name)

# Drop any flags registered by a previous run of this cell
# (presumably so the cell can be re-executed without duplicate-flag errors).
del_all_flags(tf.flags.FLAGS)

flags = tf.app.flags

# Input files. The help strings now say what each path holds; previously
# `train_path` was described as output data although it is only read.
flags.DEFINE_string('train_path', 'data/20news/train.txt.npy', 'path of training data (input)')
flags.DEFINE_string('valid_path', 'data/20news/valid.txt.npy', 'path of validation data (input)')
flags.DEFINE_string('test_path', 'data/20news/test.txt.npy', 'path of test data (input)')
flags.DEFINE_string('vocab_path', 'data/20news/vocab.pkl', 'path of pickled vocabulary (input)')

flags.DEFINE_string('stopwords_path', 'data/stopwords_mallet.txt.npy', 'path of stopwords data (input)')

# Output file: pickled instances plus vocabulary mappings.
flags.DEFINE_string('output_path', 'data/20news/instances.pkl', 'path of output data')

flags.DEFINE_integer('n_vocab', 50000, 'size of vocab')

# NOTE(review): presumably absorbs the `-f <connection file>` argument that
# Jupyter passes to the kernel, so flag parsing does not fail — confirm.
flags.DEFINE_string('f', '', 'kernel')

config = flags.FLAGS

# load source

In [3]:
# Load the word -> index vocabulary. The file is opened in a `with` block so
# the handle is closed as soon as unpickling finishes.
# NOTE: unpickling can execute arbitrary code; only load vocab files you trust.
with open(config.vocab_path, 'rb') as vocab_file:
    word_to_idx = cPickle.load(vocab_file)
# Inverse mapping, used to turn token indices back into words.
idx_to_word = {idx: word for word, idx in word_to_idx.items()}
# Sorted, de-duplicated vocabulary indices; these define the bag-of-words layout.
bow_idxs = np.unique(list(word_to_idx.values()))

In [4]:
def get_df(data_path):
    """Load the documents stored at ``data_path`` into a DataFrame.

    Args:
        data_path: path of a ``.npy`` file whose entries are token-index
            sequences (one entry per document).

    Returns:
        A ``pandas.DataFrame`` with one row per document and two columns:
        ``token_idxs`` (the sequence as loaded) and ``doc_l`` (its length).
    """
    docs = np.load(data_path, allow_pickle=True, encoding='bytes')
    # Collect every document; the previous implementation overwrote the same
    # dict entries on each iteration, so only the last document survived.
    token_idxs_per_doc = list(docs)
    data_df = pd.DataFrame({
        'token_idxs': token_idxs_per_doc,
        'doc_l': [len(token_idxs) for token_idxs in token_idxs_per_doc],
    })
    return data_df

In [5]:
# Raw token-index documents of the train and test splits, loaded with the same
# np.load options for both files.
train_tmp, test_tmp = (
    np.load(split_path, allow_pickle=True, encoding='bytes')
    for split_path in (config.train_path, config.test_path)
)

In [6]:
# Number of documents in the raw train and test splits.
tuple(len(split) for split in (train_tmp, test_tmp))

(11259, 7488)

# write out

In [51]:
def prepare_instances(data_path, bow_idxs):
    """Build one ``Instance`` per usable document stored at ``data_path``.

    Args:
        data_path: path of a ``.npy`` file whose entries are token-index
            sequences (one entry per document).
        bow_idxs: vocabulary indices; their order defines the layout of each
            instance's bag-of-words vector.

    Returns:
        A list of ``Instance`` objects with ``idx`` (position of the document
        in the file), ``token_idxs``, ``doc_l`` and ``bow`` set. Empty
        documents and documents failing the consistency check are left out.
    """
    instances = []
    docs = np.load(data_path, allow_pickle=True, encoding='bytes')
    for idx_doc, token_idxs in enumerate(docs):
        # Empty documents carry no information.
        if len(token_idxs) == 0:
            continue
        instance = Instance()
        instance.idx = idx_doc
        instance.token_idxs = token_idxs
        instance.doc_l = len(token_idxs)
        token_idx_cnt = Counter(token_idxs)
        # Count of every vocabulary index in this document, in bow_idxs order.
        instance.bow = np.array([token_idx_cnt[bow_idx] for bow_idx in bow_idxs])
        # The bag-of-words total can only differ from the token count when the
        # document contains an index that is missing from bow_idxs.
        if not (sum(token_idx_cnt.values()) == np.sum(instance.bow) == len(instance.token_idxs)):
            print('skip: %i' % idx_doc)
            # Actually skip the document, as the message says; previously the
            # inconsistent instance was still appended.
            continue
        instances.append(instance)
    return instances

In [52]:
# Build the training instances from the train split.
instances_train = prepare_instances(data_path=config.train_path, bow_idxs=bow_idxs)

In [None]:
# Inspect the validation split file; the loaded array is only displayed, not stored.
np.load(file=config.valid_path, allow_pickle=True, encoding='bytes')

In [11]:
# instances_train_valid = prepare_instances(config.train_path, bow_idxs)
# len(instances_train_valid)

# train_rate = 0.8
# train_size = int(len(instances_train_valid)*train_rate)
# instances_train = instances_train_valid[:train_size]
# instances_valid = instances_train_valid[train_size:]

In [55]:
# No separate validation set is built here: `instances_valid` is bound to the
# very same list object as `instances_test`.
instances_valid = instances_test = prepare_instances(data_path=config.test_path, bow_idxs=bow_idxs)

In [56]:
# assert len(instances_train) + len(instances_valid) == len(instances_train_valid)
# Sizes of the three instance lists produced by prepare_instances.
tuple(map(len, (instances_train, instances_valid, instances_test)))

(11258, 7487, 7487)

In [16]:
print('saving preprocessed instances...')
# Write through a `with` block so the file is flushed and closed even if
# pickling fails part-way; the bare open() used before never closed the handle.
with open(config.output_path, 'wb') as output_file:
    cPickle.dump(
        (instances_train, instances_valid, instances_test, word_to_idx, idx_to_word, bow_idxs),
        output_file,
    )

saving preprocessed instances...


# write corpus (raw text)

In [29]:
# `fetch_20newsgroups` is already imported in the notebook's import cell, so
# it is not imported a second time here.
corpus_train = fetch_20newsgroups(subset='train')
corpus_test = fetch_20newsgroups(subset='test')
# Raw texts of both splits, train documents first.
data = corpus_train.data + corpus_test.data

In [32]:
len(data)

18846

In [34]:
# Write the tokenized raw documents to corpus/20news/20news.<k>, one document
# per line and at most `n_doc` documents per file.
dir_corpus = os.path.join('corpus', '20news')
# Create the output directory up front so the first open() cannot fail on it.
os.makedirs(dir_corpus, exist_ok=True)
n_doc = 1000
docs = []
n_total = len(data)
for i, instance in enumerate(data):
    doc = ' '.join(word_tokenize(instance))
    docs.append(doc)
    # Flush on every full batch AND on the last document; without the second
    # condition the trailing partial batch was never written to disk.
    if (i+1) % n_doc == 0 or (i+1) == n_total:
        print(i, end='')
        fname = '20news.%i' % (i // n_doc)
        with open(os.path.join(dir_corpus, fname), 'w', encoding='utf-8') as f:
            f.write('\n'.join(docs))
        docs = []

9991999299939994999599969997999899999991099911999129991399914999159991699917999

# write corpus (preprocessed tokens)

In [57]:
# Stack the preprocessed train and test documents into one array (train first).
# This rebinds `data`, which previously held the raw newsgroup texts.
data = np.concatenate((train_tmp, test_tmp))
assert len(train_tmp) + len(test_tmp) == len(data)
len(data)

18747

In [58]:
# Write the preprocessed documents (token indices mapped back to words) to
# corpus/20news/20news.<k>, one document per line, at most `n_doc` per file.
# NOTE(review): this uses the same directory and file names as the raw-text
# corpus cell earlier in this notebook, so those files are overwritten —
# confirm this is intended.
dir_corpus = os.path.join('corpus', '20news')
# Create the output directory up front so the first open() cannot fail on it.
os.makedirs(dir_corpus, exist_ok=True)
n_doc = 1000
docs = []
n_total = len(data)

for i, instance in enumerate(data):
    doc = ' '.join([idx_to_word[word_idx] for word_idx in instance])
    docs.append(doc)
    # Flush on every full batch AND on the last document; without the second
    # condition the trailing partial batch was never written to disk.
    if (i+1) % n_doc == 0 or (i+1) == n_total:
        print(i, end='')
        fname = '20news.%i' % (i // n_doc)
        with open(os.path.join(dir_corpus, fname), 'w', encoding='utf-8') as f:
            f.write('\n'.join(docs))
        docs = []

9991999299939994999599969997999899999991099911999129991399914999159991699917999