In [None]:
#import graph, coarsening, utils
import time, shutil

import matplotlib.pyplot as plt
import numpy as np
import scipy.sparse
import tensorflow as tf

%matplotlib inline

%load_ext autoreload
%autoreload 1
%aimport graph
%aimport coarsening
%aimport utils

#import models
%run -n models.ipynb

In [None]:
# Notebook configuration, declared through the TensorFlow flags API and read
# below as attributes of FLAGS (e.g. FLAGS.number_edges).
# NOTE(review): re-running this cell may fail on TensorFlow versions whose flag
# registry rejects duplicate definitions -- confirm for the TF version in use.
flags = tf.app.flags
FLAGS = flags.FLAGS

# Graphs.
flags.DEFINE_integer('number_edges', 16, 'Graph: minimum number of edges per vertex.')
flags.DEFINE_string('metric', 'cosine', 'Graph: similarity measure (between features).')
# TODO: change cgcnn for combinatorial Laplacians.
flags.DEFINE_bool('normalized_laplacian', True, 'Graph Laplacian: normalized.')
flags.DEFINE_integer('coarsening_levels', 0, 'Number of coarsened graphs.')

# Data location and size of the optional validation split.
flags.DEFINE_string('dir_data', 'data_20news', 'Directory to store data.')
flags.DEFINE_integer('val_size', 400, 'Size of the validation set.')

# Data

In [None]:
# Fetch dataset. Scikit-learn already performs some cleaning.
# Alternatives for `remove`: (), ('headers',) or ('headers','footers','quotes').
# Mind the trailing comma: ('headers') is a plain string, not a one-element tuple.
remove = ('headers','footers','quotes')  # (), ('headers',) or ('headers','footers','quotes')
train = utils.Text20News(data_home=FLAGS.dir_data, subset='train', remove=remove)

# Pre-processing: transform everything to a-z and whitespace.
# The same document is printed before cleaning and again after vectorizing so
# that the effect of the pre-processing can be compared by eye.
print(train.show_document(1)[:400])
train.clean_text(num='substitute')

# Analyzing / tokenizing: transform documents to bags-of-words.
#stop_words = set(sklearn.feature_extraction.text.ENGLISH_STOP_WORDS)
# Or stop words from NLTK.
# Add e.g. don, ve.
train.vectorize(stop_words='english')
print(train.show_document(1)[:400])

In [None]:
# Remove short documents.
# data_info() is called before and after so the effect of the filtering shows
# up in the cell output.
train.data_info(True)
# `wc` is used below as a per-document word count (min/max and scatter plot).
# NOTE(review): presumably these counts are taken before the removal and over
# the full vocabulary (vocab='full') -- confirm in utils.
wc = train.remove_short_documents(nwords=20, vocab='full')
train.data_info()
print('shortest: {}, longest: {} words'.format(wc.min(), wc.max()))
# Log-scale scatter of the word counts, one dot per document.
plt.figure(figsize=(17,5))
plt.semilogy(wc, '.');

# Remove encoded images.
def remove_encoded_images(dataset, freq=1e3, word='ax'):
    """Drop the documents of `dataset` in which `word` occurs `freq` times or more.

    The notebook uses a very high count of the token 'ax' as the signature of
    an encoded image embedded in a message; such documents are discarded.

    Args:
        dataset: object exposing `vocab` (a sequence with `.index`), `data`
            (a sparse documents-by-words matrix) and `keep_documents(idx)`.
            It is modified in place through `keep_documents`.
        freq: exclusive upper bound; a document is kept only if its count of
            `word` is strictly smaller than `freq`.
        word: vocabulary entry whose count is inspected (default 'ax').

    Returns:
        1-D array with the count of `word` in every document, computed before
        any document is removed.

    Raises:
        ValueError: if `word` is not in `dataset.vocab` (raised by `.index`
            when `vocab` is a list).
    """
    # Operate on the dataset that was passed in, not on a notebook global.
    widx = dataset.vocab.index(word)
    # ravel()/flatnonzero() keep both arrays 1-D even when a single document
    # exists or survives; squeeze() would collapse them to 0-d scalars.
    wc = dataset.data[:, widx].toarray().ravel()
    idx = np.flatnonzero(wc < freq)
    dataset.keep_documents(idx)
    return wc
# Apply the filter to the training set with the default threshold (freq=1e3)
# and report the dataset statistics afterwards.
wc = remove_encoded_images(train)
train.data_info()
# Log-scale scatter of the returned per-document counts of the word 'ax'.
plt.figure(figsize=(17,5))
plt.semilogy(wc, '.');

In [None]:
# Feature selection.
# Other options include: mutual information or document count.
# NOTE(review): the two positional arguments are presumably the number of words
# to keep and the number of top words to display -- confirm in utils.
freq = train.keep_top_words(1000, 20)
train.data_info()
# NOTE(review): the return value is discarded here and this is not the last
# expression of the cell, so nothing is shown unless show_document prints by
# itself -- confirm, or wrap in print() as done in the loading cell.
train.show_document(1)
# Log-scale plot of the values returned by keep_top_words.
plt.figure(figsize=(17,5))
plt.semilogy(freq);

# Remove documents whose signal would be the zero vector.
# Counted on the selected vocabulary this time (vocab='selected').
wc = train.remove_short_documents(nwords=5, vocab='selected')
train.data_info(True)

In [None]:
# Normalize the training documents with the L1 norm.
# NOTE(review): presumably applied per document (row) -- confirm in utils.
train.normalize(norm='l1')
# The trailing semicolon suppresses the cell's output for this last expression.
train.show_document(1);

In [None]:
# Word embedding
# Manual toggle: the active branch calls embed() without arguments; the other
# branch passes the path of a pre-trained word2vec binary.
# The result is read later as `train.embeddings` when building the feature graph.
if True:
    train.embed()
else:
    train.embed('data_word2vec/GoogleNews-vectors-negative300.bin')
train.data_info()
# Further feature selection. (TODO)

In [None]:
# Test dataset.
# Loaded with the same `remove` setting and the same text cleaning as the
# training set.
test = utils.Text20News(data_home=FLAGS.dir_data, subset='test', remove=remove)
test.clean_text(num='substitute')
# Reuse the training vocabulary so that test features line up with the
# training features.
test.vectorize(vocabulary=train.vocab)
test.data_info()
# Same minimum length (nwords=5 on the selected vocabulary) as for training.
wc = test.remove_short_documents(nwords=5, vocab='selected')
print('shortest: {}, longest: {} words'.format(wc.min(), wc.max()))
test.data_info(True)
test.normalize(norm='l1')

In [None]:
# Pick the train / test split. Manual toggle.
if True:
    # Use the two subsets loaded above (subset='train' and subset='test').
    train_data, train_labels = train.data, train.labels
    test_data, test_labels = test.data, test.labels
else:
    # Random re-split of the training subset: the first Ntest shuffled
    # documents become the test set, the remaining ones the training set.
    Ntest = 6695
    perm = np.random.permutation(train.data.shape[0])
    perm_test, perm_train = perm[:Ntest], perm[Ntest:]
    train_data, train_labels = train.data[perm_train,:], train.labels[perm_train]
    test_data, test_labels = train.data[perm_test,:], train.labels[perm_test]

# Pick the per-feature representation from which the graph is built. Manual toggle.
if True:
    # Word embeddings computed by train.embed().
    graph_data = train.embeddings
else:
    # Transposed (dense) data matrix.
    graph_data = train.data.T.toarray()

# Everything needed has been extracted; drop the dataset objects.
del train, test

# Feature graph

In [None]:
def coarsen(A, levels):
    """Coarsen the graph `A` and build one Laplacian per resulting graph.

    Args:
        A: sparse adjacency matrix handed to `coarsening.metis`.
        levels: number of coarsening levels requested from `coarsening.metis`.

    Returns:
        A tuple `(laplacians, perm)`: `laplacians` holds one Laplacian per
        graph returned by `coarsening.metis`, and `perm` is the first
        permutation returned by `coarsening.compute_perm`, or None when that
        list is empty.
    """
    graphs, parents = coarsening.metis(A, levels)
    perms = coarsening.compute_perm(parents)

    laplacians = []
    for level, adjacency in enumerate(graphs):
        nodes_before = adjacency.shape[1]

        # No self-connections.
        adjacency = adjacency.tocoo()
        adjacency.setdiag(0)

        # Only the first `levels` graphs are reordered with their permutation.
        if level < levels:
            adjacency = coarsening.perm_adjacency(adjacency, perms[level])

        adjacency = adjacency.tocsr()
        adjacency.eliminate_zeros()
        nodes_after = adjacency.shape[1]
        # Each edge is stored twice in the matrix, hence nnz // 2.
        edge_count = adjacency.nnz // 2
        print('Layer {0}: M_{0} = |V| = {1} nodes ({2} added), |E| = {3} edges'.format(
            level, nodes_after, nodes_after - nodes_before, edge_count))

        laplacians.append(graph.laplacian(adjacency, normalized=FLAGS.normalized_laplacian))

    first_perm = perms[0] if len(perms) > 0 else None
    return laplacians, first_perm

# Build the feature graph and its Laplacian(s).
# time.process_time() measures CPU time of the process, not wall-clock time.
t_start = time.process_time()
# Nearest-neighbour search between the rows of graph_data, with
# FLAGS.number_edges neighbours and the FLAGS.metric similarity.
dist, idx = graph.distance_lshforest(graph_data, k=FLAGS.number_edges, metric=FLAGS.metric)
A = graph.adjacency(dist, idx)
# Compare the number of edges obtained (nnz counts each edge twice) with
# number_edges * number_of_nodes / 2.
print("{} > {} edges".format(A.nnz//2, FLAGS.number_edges*graph_data.shape[0]//2))
# NOTE(review): with a second argument of 0 this presumably leaves the graph
# unchanged -- confirm in graph.replace_random_edges.
A = graph.replace_random_edges(A, 0)
# L is the list of Laplacians, perm the node permutation (or None).
L, perm = coarsen(A, FLAGS.coarsening_levels)
print('Execution time: {:.2f}s'.format(time.process_time() - t_start))
graph.plot_spectrum(L)
# Free the intermediates; only L and perm are used afterwards.
del graph_data, A, dist, idx

In [None]:
# Apply the permutation returned by coarsen() to the train and test data.
# The sparse matrices are densified for coarsening.perm_data and the result is
# converted back to CSR.
# NOTE(review): perm is None when compute_perm returned an empty list;
# presumably perm_data handles that case -- confirm in coarsening.
t_start = time.process_time()
train_data = scipy.sparse.csr_matrix(coarsening.perm_data(train_data.toarray(), perm))
test_data = scipy.sparse.csr_matrix(coarsening.perm_data(test_data.toarray(), perm))
print('Execution time: {:.2f}s'.format(time.process_time() - t_start))
del perm

# Classification

In [None]:
# Training set is shuffled already.
#perm = np.random.permutation(train_data.shape[0])
#train_data = train_data[perm,:]
#train_labels = train_labels[perm]

# Validation set.
# Manual toggle. The disabled branch carves the first FLAGS.val_size training
# documents out as a validation set. The active branch reuses the test set, so
# the validation figures reported during training are test-set figures.
if False:
    val_data = train_data[:FLAGS.val_size,:]
    val_labels = train_labels[:FLAGS.val_size]
    train_data = train_data[FLAGS.val_size:,:]
    train_labels = train_labels[FLAGS.val_size:]
else:
    val_data = test_data
    val_labels = test_labels

In [None]:
# Manual toggle: run the baseline from utils on the same train / test data
# that the models below use.
if True:
    utils.baseline(train_data, train_labels, test_data, test_labels)

In [None]:
# Hyper-parameters shared by every model below; each model cell copies this
# dictionary and adds its own entries.
common = dict(num_epochs=80, batch_size=100)
common.update(
    # Number of batches in one pass over the training set (true division).
    decay_steps=len(train_labels) / common['batch_size'],
    eval_frequency=5 * common['num_epochs'],
    filter='chebyshev5',
    brelu='b1relu',
    pool='mpool1',
)
C = max(train_labels) + 1  # number of classes

# Start from clean output directories; missing directories are ignored.
for stale_dir in ('summaries/20news1', 'checkpoints/20news1'):
    shutil.rmtree(stale_dir, ignore_errors=True)

In [None]:
if True:
    # Model 'softmax': empty F / K / p lists and a single output of size C.
    # NOTE(review): presumably F, K, p configure graph-convolution layers and
    # M the fully-connected layer sizes -- confirm in models.ipynb.
    params = dict(
        common,
        dir_name='20news1/softmax',
        regularization=0,
        dropout=1,
        learning_rate=1e3,
        decay_rate=0.95,
        momentum=0.9,
        F=[],
        K=[],
        p=[],
        M=[C],
    )
    model = cgcnn(L, **params)
    model.fit(train_data, train_labels, val_data, val_labels)
    # First element of evaluate()'s result, on the training and the test set.
    print('train {}'.format(model.evaluate(train_data, train_labels)[0]))
    print('test  {}'.format(model.evaluate(test_data, test_labels)[0]))

In [None]:
if True:
    # Model 'fc_softmax': empty F / K / p lists, M = [2500, C].
    # NOTE(review): presumably F, K, p configure graph-convolution layers and
    # M the fully-connected layer sizes -- confirm in models.ipynb.
    params = dict(
        common,
        dir_name='20news1/fc_softmax',
        regularization=0,
        dropout=1,
        learning_rate=0.1,
        decay_rate=0.95,
        momentum=0.9,
        F=[],
        K=[],
        p=[],
        M=[2500, C],
    )
    model = cgcnn(L, **params)
    model.fit(train_data, train_labels, val_data, val_labels)
    # First element of evaluate()'s result, on the training and the test set.
    print('train {}'.format(model.evaluate(train_data, train_labels)[0]))
    print('test  {}'.format(model.evaluate(test_data, test_labels)[0]))

In [None]:
if True:
    # Model 'fc_fc_softmax': empty F / K / p lists, M = [2500, 500, C].
    # NOTE(review): presumably F, K, p configure graph-convolution layers and
    # M the fully-connected layer sizes -- confirm in models.ipynb.
    params = dict(
        common,
        dir_name='20news1/fc_fc_softmax',
        regularization=0,
        dropout=1,
        learning_rate=0.1,
        decay_rate=0.95,
        momentum=0.9,
        F=[],
        K=[],
        p=[],
        M=[2500, 500, C],
    )
    model = cgcnn(L, **params)
    model.fit(train_data, train_labels, val_data, val_labels)
    # First element of evaluate()'s result, on the training and the test set.
    print('train {}'.format(model.evaluate(train_data, train_labels)[0]))
    print('test  {}'.format(model.evaluate(test_data, test_labels)[0]))

In [None]:
if True:
    # Model 'gconv_softmax': F = [32], K = [5], p = [1], M = [C]. This is the
    # only configuration here with a non-zero regularization.
    # NOTE(review): presumably F, K, p configure graph-convolution layers and
    # M the fully-connected layer sizes -- confirm in models.ipynb.
    params = dict(
        common,
        dir_name='20news1/gconv_softmax',
        regularization=1e-3,
        dropout=1,
        learning_rate=0.1,
        decay_rate=0.999,
        momentum=0,
        F=[32],
        K=[5],
        p=[1],
        M=[C],
    )
    model = cgcnn(L, **params)
    model.fit(train_data, train_labels, val_data, val_labels)
    # First element of evaluate()'s result, on the training and the test set.
    print('train {}'.format(model.evaluate(train_data, train_labels)[0]))
    print('test  {}'.format(model.evaluate(test_data, test_labels)[0]))

In [None]:
if True:
    # Model 'gconv_fc_softmax': F = [5], K = [15], p = [1], M = [100, C].
    # NOTE(review): presumably F, K, p configure graph-convolution layers and
    # M the fully-connected layer sizes -- confirm in models.ipynb.
    params = dict(
        common,
        dir_name='20news1/gconv_fc_softmax',
        regularization=0,
        dropout=1,
        learning_rate=0.1,
        decay_rate=0.999,
        momentum=0,
        F=[5],
        K=[15],
        p=[1],
        M=[100, C],
    )
    model = cgcnn(L, **params)
    model.fit(train_data, train_labels, val_data, val_labels)
    # First element of evaluate()'s result, on the training and the test set.
    print('train {}'.format(model.evaluate(train_data, train_labels)[0]))
    print('test  {}'.format(model.evaluate(test_data, test_labels)[0]))