In [1]:
import os
import re
import _pickle as cPickle
from collections import OrderedDict, defaultdict, Counter

import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from nltk import word_tokenize

from data_structure import Instance

from sklearn.datasets import fetch_20newsgroups

# configure

In [2]:
def del_all_flags(FLAGS):
    """Delete every flag currently registered on ``FLAGS``.

    The flag names returned by ``FLAGS._flags()`` are copied into a list first,
    so that removing flags does not disturb the iteration. Presumably used to
    allow the flag definitions to be re-run in the same kernel -- TODO confirm.
    """
    registered_names = list(FLAGS._flags())
    for flag_name in registered_names:
        FLAGS.__delattr__(flag_name)

# Drop any flags left over from an earlier run of this cell before redefining them.
del_all_flags(tf.flags.FLAGS)

flags = tf.app.flags

# Input locations. train_path / test_path are path prefixes: get_docs appends
# 'data' and 'label' to them.
flags.DEFINE_string('train_path', 'data/20news/train.', 'path of train data')
flags.DEFINE_string('test_path', 'data/20news/test.', 'path of test data')
flags.DEFINE_string('vocab_path', 'data/20news/vocabulary.txt', 'path of vocab data')
flags.DEFINE_string('bow_path', 'data/20news/bow.pkl', 'path of bow data')
flags.DEFINE_string('label_path', 'data/20news/train.map', 'path of label map data')

# Output location of the pickled instances.
flags.DEFINE_string('output_path', 'data/20news/instances.pkl', 'path of output data')

flags.DEFINE_integer('n_vocab', 50000, 'size of vocab')

# Dummy flag; presumably absorbs the `-f <connection file>` argument that
# Jupyter passes to the kernel so flag parsing does not fail -- TODO confirm.
flags.DEFINE_string('f', '', 'kernel')

config = flags.FLAGS

# load source

In [14]:
def get_docs(subset_path):
    """Load one subset of sparse bag-of-words documents and its labels.

    Reads ``subset_path + 'data'``, where every non-blank line holds three
    whitespace-separated 1-based integers ``doc_idx word_idx word_cnt``, and
    ``subset_path + 'label'``, where every non-blank line holds one 1-based
    label index. All indices are shifted to 0-based.

    Args:
        subset_path: path prefix of the subset, e.g. ``'data/20news/train.'``.

    Returns:
        A tuple ``(docs, label_idxs, word_cnts)``:
        docs: list of ``(doc_idx, {word_idx: word_cnt})`` sorted by ``doc_idx``.
        label_idxs: list of 0-based label indices in file order.
        word_cnts: ``defaultdict(int)`` mapping ``word_idx`` to its total count
            over the whole subset.

    Raises:
        AssertionError: if the number of documents differs from the number of
            labels.
    """
    docs = defaultdict(dict)
    word_cnts = defaultdict(int)

    # Iterate the file line by line inside a context manager: the handle is
    # always closed, and the last record is kept even when the file does not
    # end with a newline (only blank lines are skipped).
    with open(subset_path + 'data', 'r') as data_file:
        for row in data_file:
            if not row.strip():
                continue
            doc_idx, word_idx, word_cnt = row.split()
            doc_idx = int(doc_idx) - 1
            word_idx = int(word_idx) - 1
            word_cnt = int(word_cnt)

            docs[doc_idx][word_idx] = word_cnt
            word_cnts[word_idx] += word_cnt

    docs = sorted(docs.items(), key=lambda x: x[0])

    with open(subset_path + 'label', 'r') as label_file:
        label_idxs = [int(line) - 1 for line in label_file if line.strip()]

    # Documents and labels are paired positionally downstream, so they must match.
    assert len(docs) == len(label_idxs)

    return docs, label_idxs, word_cnts

In [15]:
# Training subset (split into train/valid later); its word counts are kept.
docs_train_valid, label_idxs_train_valid, word_cnts = get_docs(subset_path=config.train_path)
# Test subset; its word counts are discarded.
docs_test, label_idxs_test, _ = get_docs(subset_path=config.test_path)

In [16]:
# Display the number of train+valid documents and the number of test labels.
len(docs_train_valid), len(label_idxs_test)

(11269, 7505)

In [17]:
# Hold out 20% of the training documents for validation. A fixed random_state
# makes the split, and everything derived from it, reproducible across runs.
docs_train, docs_valid, label_idxs_train, label_idxs_valid = train_test_split(
    docs_train_valid, label_idxs_train_valid, test_size=0.2, random_state=0)
len(docs_train), len(docs_valid), len(docs_test)

(9015, 2254, 7505)

In [25]:
# Show the 500 most frequent training words. The vocabulary is read here so
# that this cell does not depend on a variable defined further down the notebook.
with open(config.vocab_path, 'r') as vocab_file:
    vocab_words = vocab_file.read().split('\n')
# Slice to the top 500 before the lookup so only the displayed indices are mapped.
top_word_idxs = [word_idx for word_idx, _ in sorted(word_cnts.items(), key=lambda x: x[1])[::-1][:500]]
[vocab_words[word_idx] for word_idx in top_word_idxs]

['the',
 'to',
 'of',
 'and',
 'in',
 'is',
 'that',
 'it',
 'for',
 'you',
 'this',
 'on',
 'be',
 'are',
 'not',
 'have',
 'with',
 'as',
 'or',
 'if',
 'but',
 'they',
 'was',
 'edu',
 'can',
 'from',
 'by',
 'at',
 'an',
 'there',
 'what',
 'my',
 'all',
 'will',
 'we',
 'one',
 'would',
 'do',
 'he',
 'about',
 'writes',
 'so',
 'com',
 'has',
 'your',
 'no',
 'any',
 'article',
 'me',
 'some',
 'who',
 'out',
 'which',
 'people',
 'don',
 'like',
 'more',
 'when',
 'just',
 'their',
 'were',
 'up',
 'other',
 'know',
 'only',
 'how',
 'get',
 'them',
 'than',
 'had',
 'think',
 'been',
 'his',
 'also',
 'use',
 'does',
 'time',
 'then',
 'these',
 'should',
 'new',
 'good',
 'could',
 'well',
 'am',
 'because',
 'even',
 'very',
 'may',
 'now',
 'us',
 'why',
 'into',
 'see',
 'apr',
 'two',
 'way',
 'first',
 'god',
 'many',
 'much',
 'make',
 'most',
 'such',
 'those',
 'right',
 'here',
 'where',
 'system',
 'say',
 're',
 've',
 'want',
 'our',
 'said',
 'being',
 'anyone',
 

In [6]:
# One vocabulary word per line; a word's position in `words` is its 0-based index.
# Read through a context manager so the file handle is closed deterministically.
with open(config.vocab_path, 'r') as vocab_file:
    words = vocab_file.read().split('\n')
tmp_word_to_idx = {word: i for i, word in enumerate(words)}

In [7]:
# NOTE(review): unpickling can execute arbitrary code -- only load a trusted bow file.
with open(config.bow_path, 'rb') as bow_file:
    bow_words = list(cPickle.load(bow_file).keys())
# Keep only the bag-of-words terms that exist in the vocabulary; others are dropped.
word_to_idx = {word: tmp_word_to_idx[word] for word in bow_words if word in tmp_word_to_idx}
idx_to_word = {idx: word for word, idx in word_to_idx.items()}
bow_idxs = np.array(list(idx_to_word.keys()))
# Display how many bag-of-words terms were requested vs. how many were kept.
len(bow_words), len(bow_idxs)

(1995, 1991)

In [8]:
# Each non-blank line of the label map is "<label> <1-based idx>". Reading line
# by line inside a context manager closes the file and keeps the last entry even
# when the file has no trailing newline.
with open(config.label_path, 'r') as label_file:
    tmp_label_to_idxs = [line.split() for line in label_file if line.strip()]
idx_to_label = {int(idx) - 1: label for label, idx in tmp_label_to_idxs}

# write out

In [9]:
def prepare_instances(docs, label_idxs, bow_idxs, idx_to_label):
    """Turn sparse documents into ``Instance`` objects with dense BoW vectors.

    Args:
        docs: sequence of ``(doc_idx, {word_idx: word_cnt})`` pairs.
        label_idxs: label index of each document, aligned with ``docs``.
        bow_idxs: iterable of the word indices that make up the BoW vector.
        idx_to_label: mapping from label index to label name.

    Returns:
        A list of ``Instance`` objects with ``idx``, ``bow``, ``doc_l``,
        ``label_idx`` and ``label`` set. Documents whose BoW vector sums to
        zero are left out.

    Raises:
        AssertionError: if ``docs`` and ``label_idxs`` differ in length.
    """
    assert len(docs) == len(label_idxs)

    prepared = []
    for doc, label_idx in zip(docs, label_idxs):
        doc_idx, counts_by_word = doc[0], doc[1]
        # Dense count vector restricted to the BoW vocabulary; absent words count 0.
        bow_vector = np.array([counts_by_word.get(vocab_idx, 0) for vocab_idx in bow_idxs])
        n_tokens = np.sum(bow_vector)
        if n_tokens != 0:
            item = Instance()
            item.idx = doc_idx
            item.bow = bow_vector
            item.doc_l = n_tokens
            item.label_idx = label_idx
            item.label = idx_to_label[label_idx]
            prepared.append(item)
    return prepared

In [10]:
# Build the instance lists of all three splits with the same BoW indices and label map.
instances_train, instances_valid, instances_test = [
    prepare_instances(split_docs, split_label_idxs, bow_idxs, idx_to_label)
    for split_docs, split_label_idxs in (
        (docs_train, label_idxs_train),
        (docs_valid, label_idxs_valid),
        (docs_test, label_idxs_test),
    )
]
len(instances_train), len(instances_valid), len(instances_test)

(9012, 2253, 7501)

In [11]:
print('saving preprocessed instances...')
# Write through a context manager so the pickle is flushed and the file closed
# even if dumping fails part-way.
with open(config.output_path, 'wb') as output_file:
    cPickle.dump(
        (instances_train, instances_valid, instances_test, word_to_idx, idx_to_word, bow_idxs),
        output_file,
    )

saving preprocessed instances...
