In [1]:
import os
import re
import _pickle as cPickle
from collections import OrderedDict, defaultdict, Counter

import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import word_tokenize

from data_structure import Instance

from sklearn.datasets import fetch_20newsgroups

# configure

In [2]:
def del_all_flags(FLAGS):
    """Remove every flag currently registered on ``FLAGS``.

    The flag names are snapshotted into a list first, so the registry
    returned by ``FLAGS._flags()`` is not iterated while entries are deleted.
    """
    registered = FLAGS._flags()
    for flag_name in list(registered):
        FLAGS.__delattr__(flag_name)

# Drop any flags already registered on the global FLAGS object before
# (re)defining them below -- presumably so this cell can be re-run in the
# same kernel without duplicate-definition errors.
del_all_flags(tf.flags.FLAGS)

flags = tf.app.flags

# Input locations. train_path / test_path are prefixes: 'data' and 'label'
# are appended to them when the subsets are loaded.
flags.DEFINE_string('train_path', 'data/20news/train.', 'path of train data')
flags.DEFINE_string('test_path', 'data/20news/test.', 'path of test data')
flags.DEFINE_string('vocab_path', 'data/20news/vocabulary.txt', 'path of vocab data')
flags.DEFINE_string('bow_path', 'data/20news/bow.pkl', 'path of bow data')
flags.DEFINE_string('label_path', 'data/20news/train.map', 'path of label map data')

flags.DEFINE_string('stopwords_path', 'data/stopwords_mallet.txt', 'path of stopwords data')

flags.DEFINE_string('output_path', 'data/20news/instances.pkl', 'path of output data')

flags.DEFINE_integer('n_vocab', 50000, 'size of vocab')

# NOTE(review): looks like a placeholder for the `-f <connection file>`
# argument Jupyter passes to the kernel -- confirm before removing.
flags.DEFINE_string('f', '', 'kernel')

config = flags.FLAGS

# load source

In [3]:
# Read the vocabulary, one entry per line. The position of a line is the
# word's index, so empty lines are kept to preserve alignment.
# The file is closed deterministically, and a final line without a trailing
# newline is no longer dropped (the previous `split('\n')[:-1]` lost it).
with open(config.vocab_path, 'r') as vocab_file:
    words = [line.rstrip('\n') for line in vocab_file]
tmp_word_to_idx = {word: i for i, word in enumerate(words)}
tmp_idx_to_word = {i: word for i, word in enumerate(words)}

In [4]:
def get_docs(subset_path, idx_to_word=None):
    """Load one subset of the corpus from its ``data`` and ``label`` files.

    ``subset_path + 'data'`` is read as whitespace-separated rows of
    ``doc_idx word_idx word_cnt``; ``subset_path + 'label'`` holds one label
    index per line. All indices in the files are shifted down by one on
    load (presumably because the files are 1-based).

    Args:
        subset_path: path prefix to which ``'data'`` and ``'label'`` are appended.
        idx_to_word: optional mapping from 0-based word index to word.
            Defaults to the notebook-level ``tmp_idx_to_word``.

    Returns:
        A tuple ``(docs, label_idxs, word_cnts)``: ``docs`` is a list of
        space-joined documents ordered by document index, ``label_idxs`` is
        the list of 0-based label indices, and ``word_cnts`` maps each word
        to its total count over the subset.

    Raises:
        AssertionError: if the number of documents and labels differ.
    """
    if idx_to_word is None:
        idx_to_word = tmp_idx_to_word

    docs = defaultdict(list)
    word_cnts = defaultdict(int)
    # Stream the file inside a context manager so the handle is closed, and
    # so a last row without a trailing newline is still processed.
    with open(subset_path + 'data', 'r') as data_file:
        for row in data_file:
            if not row.strip():
                continue  # tolerate blank lines
            doc_idx, word_idx, word_cnt = row.split()
            doc_idx = int(doc_idx) - 1
            word = idx_to_word[int(word_idx) - 1]
            word_cnt = int(word_cnt)

            # Expand the count back into repeated tokens.
            docs[doc_idx] += [word] * word_cnt
            word_cnts[word] += word_cnt

    docs = [' '.join(doc) for doc_idx, doc in sorted(docs.items(), key=lambda x: x[0])]

    with open(subset_path + 'label', 'r') as label_file:
        label_idxs = [int(line) - 1 for line in label_file if line.strip()]

    assert len(docs) == len(label_idxs)

    return docs, label_idxs, word_cnts

In [5]:
# Load the train(+valid) and test subsets. Word counts are kept only for the
# training subset; the test subset's counts are discarded.
docs_train_valid, label_idxs_train_valid, word_cnts = get_docs(config.train_path)
docs_test, label_idxs_test, _ = get_docs(config.test_path)

In [6]:
# Number of train+valid documents and number of test labels.
len(docs_train_valid), len(label_idxs_test)

(11269, 7505)

In [7]:
# Hold out 20% of the training documents for validation. The seed is fixed so
# that the split -- and the vocabulary and instances derived from it -- is
# the same on every run.
SPLIT_SEED = 1234
docs_train, docs_valid, label_idxs_train, label_idxs_valid = train_test_split(
    docs_train_valid, label_idxs_train_valid, test_size=0.2, random_state=SPLIT_SEED)
len(docs_train), len(docs_valid), len(docs_test)

(9015, 2254, 7505)

# preprocess 

## stop words

In [89]:
# Read the stop-word file; each line becomes one entry with its newline removed.
with open(config.stopwords_path, 'r') as f:
    stop_words_mallet = [line.replace('\n', '') for line in f]
len(stop_words_mallet)

524

In [114]:
# Display the loaded stop-word list for a visual check.
stop_words_mallet

['a',
 'able',
 'about',
 'above',
 'according',
 'accordingly',
 'across',
 'actually',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'allow',
 'allows',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'an',
 'and',
 'another',
 'any',
 'anybody',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anyways',
 'anywhere',
 'apart',
 'appear',
 'appreciate',
 'appropriate',
 'are',
 'around',
 'as',
 'aside',
 'ask',
 'asking',
 'associated',
 'at',
 'available',
 'away',
 'awfully',
 'b',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'believe',
 'below',
 'beside',
 'besides',
 'best',
 'better',
 'between',
 'beyond',
 'both',
 'brief',
 'but',
 'by',
 'c',
 'came',
 'can',
 'cannot',
 'cant',
 'cause',
 'causes',
 'certain',
 'certainly',
 'changes',
 'clearly',
 'co',
 'com',
 'come',
 'comes',
 'concerning',
 'consequently',
 'consider',
 'con

In [119]:
# Corpus-specific stop words: the 200 words with the highest total count in
# the training subset, listed from most to least frequent.
stop_words_tf = [
    word
    for word, _ in reversed(sorted(word_cnts.items(), key=lambda item: item[1])[-200:])
]
len(stop_words_tf)

200

In [105]:
# Combined list: file-based stop words followed by the frequency-based ones
# (duplicates are not removed).
stop_words = [*stop_words_mallet, *stop_words_tf]
len(stop_words)

674

## tfidf vectorizer

In [106]:
# Settings tried earlier, kept for reference:
# vectorizer = TfidfVectorizer(min_df=100, max_df=1.0, stop_words=stop_words, norm=None, use_idf=False, dtype=np.float32)
# vectorizer = TfidfVectorizer(min_df=100, max_df=0.5, norm=None, use_idf=False, dtype=np.float32)

# No IDF weighting and no normalisation are requested; only the
# frequency-based stop words are filtered out.
vectorizer = TfidfVectorizer(
    stop_words=stop_words_tf,
    min_df=100,
    max_df=0.1,
    use_idf=False,
    norm=None,
    dtype=np.float32,
)

# Fit the vocabulary on the training split only, then reuse it unchanged for
# the validation and test splits.
bows_train = vectorizer.fit_transform(docs_train).toarray()
bows_valid, bows_test = (
    vectorizer.transform(split_docs).toarray() for split_docs in (docs_valid, docs_test)
)

In [107]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Use its replacement when it exists and fall back on older installations.
# .tolist() keeps bow_tokens a plain list of Python strings, as before.
if hasattr(vectorizer, 'get_feature_names_out'):
    bow_tokens = vectorizer.get_feature_names_out().tolist()
else:
    bow_tokens = vectorizer.get_feature_names()
len(bow_tokens)

1559

## get vocab

In [108]:
# Map every retained token to its index in the ORIGINAL vocabulary file (not
# to its column in the bag-of-words matrix), and build the inverse map.
word_to_idx = dict((token, tmp_word_to_idx[token]) for token in bow_tokens)
idx_to_word = dict((idx, token) for token, idx in word_to_idx.items())
# Original-vocabulary indices of the retained tokens, in dict iteration order.
bow_idxs = np.array(list(idx_to_word))
len(bow_tokens), len(bow_idxs), len(idx_to_word)

(1559, 1559, 1559)

In [109]:
# Each non-blank line of the label map is split into a (label, index) pair.
# The file is closed deterministically, and a final line without a trailing
# newline is no longer dropped (the previous `split('\n')[:-1]` lost it).
with open(config.label_path, 'r') as label_map_file:
    tmp_label_to_idxs = [line.split() for line in label_map_file if line.strip()]
# Shift the file's indices down by one to match the 0-based label indices
# produced when the subsets are loaded.
idx_to_label = {int(idx)-1: label for label, idx in tmp_label_to_idxs}

# write out

In [110]:
def prepare_instances(bows, label_idxs, bow_idxs, idx_to_label):
    """Wrap each non-empty bag-of-words row in an ``Instance``.

    Args:
        bows: sequence of bag-of-words vectors, one per document.
        label_idxs: label index of each document; same length as ``bows``.
        bow_idxs: not used by this function (kept in the signature for callers).
        idx_to_label: mapping from label index to label name.

    Returns:
        A list of ``Instance`` objects. Documents whose vector sums to zero
        are skipped; ``idx`` keeps the document's position in ``bows``.

    Raises:
        AssertionError: if ``bows`` and ``label_idxs`` differ in length.
    """
    assert len(bows) == len(label_idxs)

    kept = []
    for position, pair in enumerate(zip(bows, label_idxs)):
        term_counts, label_id = pair
        total = np.sum(term_counts)
        if total != 0:
            # Only documents with at least one retained token become instances.
            inst = Instance()
            inst.idx = position
            inst.bow = term_counts
            inst.doc_l = total
            inst.label_idx = label_id
            inst.label = idx_to_label[label_id]
            kept.append(inst)
    return kept

In [111]:
# Build the instance lists for the three splits, in train / valid / test order.
instances_train, instances_valid, instances_test = (
    prepare_instances(split_bows, split_labels, bow_idxs, idx_to_label)
    for split_bows, split_labels in (
        (bows_train, label_idxs_train),
        (bows_valid, label_idxs_valid),
        (bows_test, label_idxs_test),
    )
)
len(instances_train), len(instances_valid), len(instances_test)

(9006, 2253, 7491)

In [112]:
print('saving preprocessed instances...')
# Write inside a context manager so the file is flushed and closed as soon as
# the dump finishes (the handle was previously left open).
with open(config.output_path, 'wb') as output_file:
    cPickle.dump(
        (instances_train, instances_valid, instances_test, word_to_idx, idx_to_word, bow_idxs),
        output_file,
    )

saving preprocessed instances...
