In [None]:
import re
import time

import matplotlib.pyplot as plt
import numpy as np
import sklearn
import sklearn.datasets
import sklearn.feature_extraction.text
import sklearn.preprocessing
import tensorflow as tf
%matplotlib inline

import graph, coarsening

# TODO: model NFEATURES NCLASSES
#import models
%run -n models.ipynb

In [None]:
# Command-line style configuration shared by every cell below (read through FLAGS).
flags = tf.app.flags
FLAGS = flags.FLAGS

# Learning.
flags.DEFINE_float('num_epochs', 10, 'Number of training epochs.')
# 0.1 for cnn2, 0.3 for fgcnn2, 0.2 for lgcnn2
flags.DEFINE_float('learning_rate', 0.001, 'Initial learning rate.')
flags.DEFINE_float('decay_rate', 1, 'Base of exponential decay. No decay with 1.')
flags.DEFINE_float('momentum', 0, 'Momentum. 0 indicates no momentum.')

# Regularizations.
flags.DEFINE_float('regularization', 5e-4, 'L2 regularizations of weights and biases.')
# The two literals are concatenated by the parser: the trailing space keeps the sentences apart.
flags.DEFINE_float('dropout', 0.5, 'Dropout regularization (fc layers): probability to keep hidden neurons. '
                  'Deactivate with 1.')

flags.DEFINE_integer('batch_size', 100, 'Batch size. Must divide evenly into the dataset sizes.')
flags.DEFINE_integer('eval_frequency', 100, 'Number of steps between evaluations.')

# Graphs.
flags.DEFINE_integer('number_edges', 8, 'Graph: minimum number of edges per vertex.')
flags.DEFINE_string('metric', 'cosine', 'Graph: similarity measure (between features).')
# TODO: change cgcnn for combinatorial Laplacians.
flags.DEFINE_bool('normalized_laplacian', True, 'Graph Laplacian: normalized.')
flags.DEFINE_integer('coarsening_levels', 0, 'Number of coarsened graphs.')

# Directories.
flags.DEFINE_string('dir_data', 'data_20news', 'Directory to store data.')
# NOTE(review): the default path mentions "mnist" although this notebook works on 20NEWS -- confirm.
flags.DEFINE_string('dir_summaries', 'summaries/mnist1/run1', 'Directory for TensorBoard summaries.')

# Data

In [None]:
class Dataset(object):
    """Bag-of-words container for the 20 Newsgroups corpus with in-place pre-processing.

    There is no constructor; attributes appear as the methods are called:
        documents, labels, class_names: set by fetch().
        data, vocab: set by vectorize() (document-term matrix and its column words).
    All filtering / cleaning methods modify the instance in place.
    """

    # (digit, word) pairs used by clean_text(num='spell').
    _DIGIT_WORDS = (
        ('0', 'zero'), ('1', 'one'), ('2', 'two'), ('3', 'three'), ('4', 'four'),
        ('5', 'five'), ('6', 'six'), ('7', 'seven'), ('8', 'eight'), ('9', 'nine'),
    )

    def fetch(self, **params):
        """Download / load the corpus; params are forwarded to fetch_20newsgroups."""
        dataset = sklearn.datasets.fetch_20newsgroups(**params)
        self.documents = dataset.data
        self.labels = dataset.target
        self.class_names = dataset.target_names
        # Labels must be the contiguous range 0..C-1 so they can index class_names.
        assert max(self.labels) + 1 == len(self.class_names)
        N, C = len(self.documents), len(self.class_names)
        print('N = {} documents, C = {} classes'.format(N, C))

    def clean_text(self, num='substitute'):
        """Reduce every document to lowercase a-z words separated by single spaces.

        num selects how digits are treated before the a-z filter:
            'spell': each digit is replaced by its English word.
            'substitute': each run of digits becomes the token NUM (lowercased later).
            'remove': each digit is replaced by a space.
        Any other value applies no digit-specific step; digits are then dropped by
        the a-z filter.
        """
        # TODO: stemming, lemmatisation
        for i, doc in enumerate(self.documents):
            # Digits. Compare with ==: identity (`is`) on strings depends on interning.
            if num == 'spell':
                for digit, word in self._DIGIT_WORDS:
                    doc = doc.replace(digit, ' {} '.format(word))
            elif num == 'substitute':
                # All numbers are equal. Useful for embedding (countable words) ?
                doc = re.sub('(\\d+)', ' NUM ', doc)
            elif num == 'remove':
                # Numbers are uninformative (they are all over the place). Useful for bag-of-words ?
                # But maybe some kind of documents contain more numbers, e.g. finance.
                # Some documents are indeed full of numbers. At least in 20NEWS.
                doc = re.sub('[0-9]', ' ', doc)
            # Remove everything except a-z characters and single space.
            doc = doc.replace('$', ' dollar ')
            doc = doc.lower()
            doc = re.sub('[^a-z]', ' ', doc)
            # Collapse whitespace runs and strip both ends.
            doc = ' '.join(doc.split())
            self.documents[i] = doc

    def vectorize(self, **params):
        """Build the document-term count matrix; params are forwarded to CountVectorizer."""
        # TODO: count or tf-idf. Or in normalize ?
        vectorizer = sklearn.feature_extraction.text.CountVectorizer(**params)
        self.data = vectorizer.fit_transform(self.documents)
        # Recent scikit-learn only offers get_feature_names_out(); older releases only
        # get_feature_names(). Support both.
        if hasattr(vectorizer, 'get_feature_names_out'):
            vocab = vectorizer.get_feature_names_out()
        else:
            vocab = vectorizer.get_feature_names()
        # Keep a plain list: other code relies on list methods such as .index().
        self.vocab = list(vocab)
        assert len(self.vocab) == self.data.shape[1]

    def data_info(self):
        """Print the shape of the document-term matrix and its percentage of stored entries."""
        N, M = self.data.shape
        # Percentage of stored (non-zero) entries, reported under the label "sparsity".
        sparsity = self.data.nnz / N / M * 100
        print('N = {} documents, M = {} words, sparsity={:.4f}%'.format(N, M, sparsity))

    def show_document(self, i):
        """Print a summary of document i (and its non-zero word weights once vectorized).

        Returns the text of the document.
        """
        label = self.labels[i]
        name = self.class_names[label]
        text = self.documents[i]
        wc = len(text.split())
        print('document {}: label {} --> {}, {} words'.format(i, label, name, wc))
        # The bag-of-words part is only available after vectorize(); skip it otherwise
        # instead of swallowing every possible exception.
        data = getattr(self, 'data', None)
        vocab = getattr(self, 'vocab', None)
        if data is not None and vocab is not None:
            row = data[i, :]
            # Visit only the non-zero columns (in increasing order) rather than scanning
            # the whole vocabulary with one sparse lookup per word.
            for j in sorted(row.nonzero()[1]):
                print('  {:.2f} "{}" ({})'.format(row[0, j], vocab[j], j))
        return text

    def keep_documents(self, idx):
        """Keep the documents given by the index, discard the others."""
        # Accept scalars, lists and (k,1) arrays alike: always iterate a flat index array.
        idx = np.asarray(idx, dtype=np.intp).ravel()
        self.documents = [self.documents[i] for i in idx]
        self.labels = self.labels[idx]
        if hasattr(self, 'data'):
            self.data = self.data[idx, :]

    def keep_words(self, idx):
        """Keep the words (columns of data, entries of vocab) given by the index, discard the others."""
        idx = np.asarray(idx, dtype=np.intp).ravel()
        self.data = self.data[:, idx]
        self.vocab = [self.vocab[i] for i in idx]

    def remove_short_documents(self, nwords, vocab='selected'):
        """Remove every document that contains fewer than nwords words.

        vocab selects how words are counted:
            'selected': row sums of data (only words of the current vocabulary).
            'full': whitespace-separated tokens of the document text.
        Returns the word count of every document, computed before the removal.
        Raises ValueError for any other value of vocab.
        """
        if vocab == 'selected':
            # Word count with selected vocabulary.
            wc = np.asarray(self.data.sum(axis=1)).ravel()
        elif vocab == 'full':
            # Word count with full vocabulary.
            wc = np.array([len(doc.split()) for doc in self.documents], dtype=int)
        else:
            raise ValueError("vocab must be 'selected' or 'full', got {!r}".format(vocab))
        # flatnonzero always yields a 1-D index, even when a single document qualifies.
        idx = np.flatnonzero(wc >= nwords)
        self.keep_documents(idx)
        return wc

    def keep_top_words(self, M, Mprint=20):
        """Keep in the vocabulary the M words which appear most often.

        Prints the first Mprint kept words (fewer if less were kept) and returns the
        total counts of the kept words, most frequent first.
        """
        freq = np.asarray(self.data.sum(axis=0)).ravel()
        idx = np.argsort(freq)[::-1][:M]
        self.keep_words(idx)
        top = freq[idx]
        print('most frequent words')
        # Never print more rows than there are kept words.
        for i in range(min(Mprint, len(idx))):
            print('  {:3d}: {:10s} {:6d} counts'.format(i, self.vocab[i], top[i]))
        return top

    def normalize(self, norm='l1'):
        """Normalize each document (row of data) to unit length in the given norm."""
        # TODO: TF-IDF.
        data = self.data.astype(np.float64)
        self.data = sklearn.preprocessing.normalize(data, axis=1, norm=norm)

In [None]:
# Fetch dataset. Scikit-learn already performs some cleaning.
# Alternatives for `remove`: (), ('headers',) or ('headers','footers','quotes').
remove = ('headers','footers','quotes')  # (), ('headers') or ('headers','footers','quotes')
train = Dataset()
train.fetch(data_home=FLAGS.dir_data, subset='train', remove=remove)

# Pre-processing: transform everything to a-z and whitespace.
# Show the first 400 characters of document 1 before cleaning, for comparison with the print below.
print(train.show_document(1)[:400])
train.clean_text(num='substitute')

# Analyzing / tokenizing: transform documents to bags-of-words.
#stop_words = set(sklearn.feature_extraction.text.ENGLISH_STOP_WORDS)
# Or stop words from NLTK.
# Add e.g. don, ve.
train.vectorize(stop_words='english')
# Same document after cleaning and vectorization.
print(train.show_document(1)[:400])

In [None]:
# Remove short documents (fewer than 20 whitespace-separated tokens in the cleaned text).
# data_info() is printed before and after to show how many documents were dropped.
train.data_info()
wc = train.remove_short_documents(nwords=20, vocab='full')
train.data_info()
# wc holds the counts computed before the removal, so the minimum may be below 20.
print('shortest: {}, longest: {} words'.format(wc.min(), wc.max()))
plt.figure(figsize=(15,5))
plt.semilogy(wc, '.');

# Remove encoded images.
def remove_encoded_images(dataset, freq=1e3, word='ax'):
    """Drop the documents of `dataset` in which `word` occurs at least `freq` times.

    dataset: object with `vocab` (list of words), `data` (sparse document-term
        matrix) and a `keep_documents(idx)` method.
    freq: documents whose count of `word` is >= freq are removed.
    word: marker token to count.
        NOTE(review): the default 'ax' presumably marks encoded binary attachments
        in 20NEWS -- confirm.

    Returns the per-document count of `word`, computed before the removal. When
    `word` is not in the vocabulary no document is removed and zeros are returned.
    """
    # Operate on the argument, not on a notebook-level global.
    if word not in dataset.vocab:
        return np.zeros(dataset.data.shape[0], dtype=dataset.data.dtype)
    widx = dataset.vocab.index(word)
    wc = dataset.data[:, widx].toarray().ravel()
    # flatnonzero keeps the index 1-D even if a single document remains.
    idx = np.flatnonzero(wc < freq)
    dataset.keep_documents(idx)
    return wc
# Filter the training set and plot the per-document marker-word counts (log scale).
wc = remove_encoded_images(train)
train.data_info()
plt.figure(figsize=(15,5))
plt.semilogy(wc, '.');

In [None]:
# Feature selection: keep the 1000 most frequent words and print the top 20.
freq = train.keep_top_words(1000, 20)
train.data_info()
train.show_document(1)
# Total counts of the kept words, most frequent first (log scale).
plt.figure(figsize=(15,5))
plt.semilogy(freq);

# Remove documents whose signal would be the zero vector.
# The threshold is 5 words of the reduced vocabulary, so it also drops very short non-empty documents.
wc = train.remove_short_documents(nwords=5, vocab='selected')
train.data_info()

In [None]:
# L1-normalize each document (row), then show document 1 with its normalized weights.
train.normalize(norm='l1')
train.show_document(1);

In [None]:
# TODO
# Word embedding
# Further feature selection.

In [None]:
# Test dataset: same cleaning as the training set, vectorized on the training vocabulary
# so that both matrices share the same columns.
test = Dataset()
test.fetch(data_home=FLAGS.dir_data, subset='test', remove=remove)
test.clean_text(num='substitute')
test.vectorize(vocabulary=train.vocab)
test.data_info()
# Same minimum length (5 words of the selected vocabulary) as applied to the training set.
wc = test.remove_short_documents(nwords=5, vocab='selected')
print('shortest: {}, longest: {} words'.format(wc.min(), wc.max()))
test.data_info()
test.normalize(norm='l1')

# Feature graph

In [None]:
def coarsen(A, levels):
    """Coarsen the graph A and build one Laplacian per resulting graph.

    The graphs and parents come from coarsening.metis(A, levels); the permutations
    from coarsening.compute_perm(parents). Each graph has its diagonal zeroed, is
    permuted with the matching permutation (only for indices below `levels`),
    converted to CSR and stripped of explicit zeros before graph.laplacian is
    applied with FLAGS.normalized_laplacian.

    Returns (laplacians, perms[0]), or (laplacians, None) when there is no
    permutation.
    """
    graphs, parents = coarsening.metis(A, levels)
    perms = coarsening.compute_perm(parents)

    laplacians = []
    for level, adjacency in enumerate(graphs):
        _, n_before = adjacency.shape

        # No self-connections.
        adjacency = adjacency.tocoo()
        adjacency.setdiag(0)

        # Only the first `levels` graphs are reordered.
        if level < levels:
            adjacency = coarsening.perm_adjacency(adjacency, perms[level])

        adjacency = adjacency.tocsr()
        adjacency.eliminate_zeros()
        _, n_after = adjacency.shape
        print('Layer {0}: M_{0} = |V| = {1} nodes ({2} added), |E| = {3} edges'.format(
            level, n_after, n_after - n_before, adjacency.nnz))

        laplacians.append(graph.laplacian(adjacency, normalized=FLAGS.normalized_laplacian))

    first_perm = perms[0] if len(perms) > 0 else None
    return laplacians, first_perm

# Build the feature graph: vertices are words (columns of train.data, hence the transpose).
t_start = time.process_time()
A = graph.adjacency(train.data.T.toarray(), k=FLAGS.number_edges, metric=FLAGS.metric)
# Compare the stored entries of A with k * (number of words).
print("{} > {} edges".format(A.nnz, FLAGS.number_edges*train.data.shape[1]))
# L: list of Laplacians; perm: first permutation returned by coarsen (None if there is none).
L, perm = coarsen(A, FLAGS.coarsening_levels)
print('Execution time: {:.2f}s'.format(time.process_time() - t_start))
del A

# Plot the spectrum of every Laplacian (always enabled).
if True:
    for i,lap in enumerate(L):
        lamb, U = graph.fourier(lap)
        # Presumably lamb is sorted, so first / last are the extreme eigenvalues -- confirm in graph.fourier.
        print('L_{}: spectrum in [{:1.2e}, {:1.2e}]'.format(i, lamb[0], lamb[-1]))
        plt.figure(figsize=(15,5))
        plt.plot(lamb);

In [None]:
# Hold out Nval randomly chosen training documents for validation.
Nval = 200
# Keep the random document order in its own name: `perm` must still hold the vertex
# permutation returned by coarsen() when coarsening.perm_data is called below.
shuffle = np.random.permutation(train.data.shape[0])
val_data = train.data[shuffle[:Nval]].toarray()
val_labels = train.labels[shuffle[:Nval]]
train_data = train.data[shuffle[Nval:]].toarray()
train_labels = train.labels[shuffle[Nval:]]

# TODO: not a multiple of minibatch size
test_data, test_labels = test.data[:800].toarray(), test.labels[:800]

# TODO: coarsening
# Reorder the features (columns) of every split with the graph permutation.
t_start = time.process_time()
train_data = coarsening.perm_data(train_data, perm)
val_data = coarsening.perm_data(val_data, perm)
test_data = coarsening.perm_data(test_data, perm)
print('Execution time: {:.2f}s'.format(time.process_time() - t_start))
del perm

# Neural network

In [None]:
def evaluate(sess, ph_dropout, op_ncorrects, op_loss, data, labels):
    """
    Run the model over `data` in consecutive batches of FLAGS.batch_size samples.

    Besides its arguments, this function reads the notebook-level names FLAGS,
    ph_data and ph_labels.

    sess: the session in which the model has been trained.
    ph_dropout: dropout placeholder, fed with 1 for every batch.
    op_ncorrects: the Tensor that returns the number of correct predictions.
    op_loss: the Tensor that returns the loss of a batch.
    data: size N x M
        N: number of signals (samples)
        M: number of vertices (features)
    labels: size N
        N: number of signals (samples)

    Returns (string, precision, loss):
        string: formatted summary of the two values below.
        precision: percentage of correct predictions over the N samples.
        loss: sum of the batch losses scaled by batch_size / N.
    """
    size = data.shape[0]
    step = FLAGS.batch_size
    ncorrects, loss = 0, 0  # Running totals over all batches.
    for begin in range(0, size, step):
        stop = begin + step
        feed_dict = {
            ph_data: data[begin:stop,:],
            ph_labels: labels[begin:stop],
            ph_dropout: 1,
        }
        batch_ncorrects, batch_loss = sess.run([op_ncorrects, op_loss], feed_dict)
        ncorrects += batch_ncorrects
        loss += batch_loss
    precision = ncorrects / size * 100
    loss *= FLAGS.batch_size / size
    template = 'precision: {:.2f}% ({:d} / {:d}), loss: {:.2e}'
    return template.format(precision, ncorrects, size, loss), precision, loss

In [None]:
# Build the model on the list of Laplacians L (cgcnn is not defined in this file;
# presumably it comes from the `%run -n models.ipynb` at the top -- confirm).
model = cgcnn(L, F=[32], K=[25], p=[1], M=[512])

# NOTE(review): this repeats the train / validation split of an earlier cell and
# overwrites val_data, val_labels, train_data and train_labels with a new random split
# to which coarsening.perm_data has NOT been applied (test_data is left untouched).
# It also rebinds the name `perm`. Confirm that this is intended.
Nval = 200
perm = np.random.permutation(train.data.shape[0])
val_data = train.data[perm[:Nval]].toarray()
val_labels = train.labels[perm[:Nval]]
train_data = train.data[perm[Nval:]].toarray()
train_labels = train.labels[perm[Nval:]]

#train_(model, train_data, train_labels, val_data, val_labels, FLAGS)

In [None]:
#def train_(model, train_data, train_labels, val_data, val_labels, FLAGS):
# Placeholders have a fixed first dimension of FLAGS.batch_size, so every fed batch
# must contain exactly that many samples.
with tf.name_scope('inputs'):
    ph_data = tf.placeholder(tf.float32, (FLAGS.batch_size, train_data.shape[1]), 'data')
    # Note: (FLAGS.batch_size) is a plain integer, not a 1-tuple.
    ph_labels = tf.placeholder(tf.int32, (FLAGS.batch_size), 'labels')
    ph_dropout = tf.placeholder(tf.float32, (), 'dropout')

# Construct computational graph.
op_logits = model.inference(ph_data, ph_dropout)
op_loss, op_loss_average = model.loss(op_logits, ph_labels, FLAGS.regularization)
# Third argument: number of batches per epoch (a float, true division).
op_train = model.training(op_loss, FLAGS.learning_rate,
        train_data.shape[0]/FLAGS.batch_size, FLAGS.decay_rate, FLAGS.momentum)
op_ncorrects = model.evaluation(op_logits, ph_labels)

# Summaries for TensorBoard.
# NOTE(review): merge_all_summaries / SummaryWriter are names from early TensorFlow
# releases -- confirm the TensorFlow version this notebook targets.
op_summary = tf.merge_all_summaries()
sess = tf.Session()
writer = tf.train.SummaryWriter(FLAGS.dir_summaries, sess.graph)

In [None]:
# Initialize variables, i.e. weights and biases.
t_start = time.process_time()
init = tf.initialize_all_variables()
sess.run(init)

# Training.
# `indices` is a queue of sample indices, refilled with a fresh random permutation
# whenever fewer than one batch of indices remains.
indices = []
num_steps = int(FLAGS.num_epochs * train_data.shape[0] / FLAGS.batch_size)
for step in range(1, num_steps+1):

    # Be sure to have used all the samples before using one a second time.
    # TODO: queue
    if len(indices) < FLAGS.batch_size:
        new_indices = np.random.permutation(train_data.shape[0])
        indices.extend(new_indices)
    # Pop the next batch_size indices from the front of the queue.
    idx = indices[:FLAGS.batch_size]
    del indices[:FLAGS.batch_size]

    batch_data, batch_labels = train_data[idx,:], train_labels[idx]
    feed_dict = {ph_data: batch_data, ph_labels: batch_labels, ph_dropout: FLAGS.dropout}
    # Running op_train yields the value reported below as the learning rate.
    learning_rate, loss_average = sess.run([op_train, op_loss_average], feed_dict)

    # Periodical evaluation of the model.
    # Also evaluate on the very last step, whatever the evaluation frequency.
    if step % FLAGS.eval_frequency == 0 or step == num_steps:
        epoch = step * FLAGS.batch_size / train_data.shape[0]
        print('step {} / {} (epoch {:.2f} / {}):'.format(step, num_steps, epoch, FLAGS.num_epochs))
        print('  learning_rate = {:.2e}, loss_average = {:.2e}'.format(learning_rate, loss_average))
        string, precision, loss = evaluate(sess, ph_dropout, op_ncorrects, op_loss, val_data, val_labels) 
        print('  validation {}'.format(string))
        print('  time: {:.0f}s'.format(time.process_time() - t_start))

        # Summaries for TensorBoard.
        # The merged summaries are computed on the current training batch (feed_dict);
        # the validation metrics are appended to the same Summary as simple values.
        summary = tf.Summary()
        summary.ParseFromString(sess.run(op_summary, feed_dict))
        summary.value.add(tag='validation/precision', simple_value=precision)
        summary.value.add(tag='validation/loss', simple_value=loss)
        writer.add_summary(summary, step)
writer.close()

#return sess

In [None]:
# Evaluate on the test split and report the CPU time taken.
t_start = time.process_time()
# evaluate() returns (string, precision, loss); only the formatted string is printed.
print('test {}'.format(evaluate(sess, ph_dropout, op_ncorrects, op_loss, test_data, test_labels)[0]))
print('time: {:.2f}s'.format(time.process_time() - t_start))
# The session is left open (closing is commented out).
#sess.close()

In [None]:
#if __name__ == '__main__':
#    tf.app.run()