In [None]:
import time, re, collections

import numpy as np
import scipy.sparse
import sklearn, sklearn.datasets
import sklearn.naive_bayes, sklearn.linear_model, sklearn.svm, sklearn.neighbors, sklearn.ensemble
import sklearn.feature_extraction.text, sklearn.preprocessing, sklearn.metrics
import matplotlib.pyplot as plt
import tensorflow as tf

import graph, coarsening

%matplotlib inline

#import models
%run -n models.ipynb

In [None]:
# Hyper-parameters and paths of the experiment, declared as TensorFlow flags so
# that every later cell reads them from the single FLAGS object.
flags = tf.app.flags
FLAGS = flags.FLAGS

# Learning.
# NOTE(review): num_epochs is declared as a float flag; the training cell
# multiplies it by the number of batches per epoch and truncates with int().
flags.DEFINE_float('num_epochs', 40, 'Number of training epochs.')
# 0.1 for cnn2, 0.3 for fgcnn2, 0.2 for lgcnn2
flags.DEFINE_float('learning_rate', 0.1, 'Initial learning rate.')
flags.DEFINE_float('decay_rate', 1, 'Base of exponential decay. No decay with 1.')
flags.DEFINE_float('momentum', 0, 'Momentum. 0 indicates no momentum.')

# Regularizations.
flags.DEFINE_float('regularization', 0, 'L2 regularizations of weights and biases.')
# The two literals are concatenated by Python: the first one must end with a
# space, otherwise the help text reads "neurons.No dropout with 1.".
flags.DEFINE_float('dropout', 1, 'Dropout regularization (fc layers): probability to keep hidden neurons. '
                  'No dropout with 1.')

flags.DEFINE_integer('batch_size', 100, 'Batch size. Must divide evenly into the dataset sizes.')
flags.DEFINE_integer('eval_frequency', 200, 'Number of steps between evaluations.')
flags.DEFINE_integer('val_size', 200, 'Size of the validation set.')

# Graphs.
flags.DEFINE_integer('number_edges', 8, 'Graph: minimum number of edges per vertex.')
flags.DEFINE_string('metric', 'cosine', 'Graph: similarity measure (between features).')
# TODO: change cgcnn for combinatorial Laplacians.
flags.DEFINE_bool('normalized_laplacian', True, 'Graph Laplacian: normalized.')
flags.DEFINE_integer('coarsening_levels', 0, 'Number of coarsened graphs.')

# Directories.
flags.DEFINE_string('dir_data', 'data_20news', 'Directory to store data.')
# NOTE(review): the default path mentions "mnist1" although this notebook works
# on 20NEWS -- looks like a leftover from another experiment; confirm before renaming.
flags.DEFINE_string('dir_summaries', 'summaries/mnist1/run1', 'Directory for TensorBoard summaries.')

# Data

In [None]:
class Dataset(object):
    """Text corpus together with its bag-of-words representation.

    Attributes are created progressively by the methods below:
      documents   -- list of document strings (set by fetch, rewritten by clean_text)
      labels      -- array of integer class ids, one per document (set by fetch)
      class_names -- class names, indexed by class id (set by fetch)
      data        -- sparse N x M document-term matrix (set by vectorize)
      vocab       -- list of the M words, aligned with the columns of data (set by vectorize)
    """

    # Spelled-out replacement of each digit, indexed by the digit value.
    _DIGIT_WORDS = ('zero', 'one', 'two', 'three', 'four',
                    'five', 'six', 'seven', 'eight', 'nine')

    def fetch(self, **params):
        """Load the 20 newsgroups corpus.

        params: keyword arguments forwarded to sklearn.datasets.fetch_20newsgroups.
        """
        dataset = sklearn.datasets.fetch_20newsgroups(**params)
        self.documents = dataset.data
        self.labels = dataset.target
        self.class_names = dataset.target_names
        assert max(self.labels) + 1 == len(self.class_names)
        N, C = len(self.documents), len(self.class_names)
        print('N = {} documents, C = {} classes'.format(N, C))

    def clean_text(self, num='substitute'):
        """Reduce every document, in place, to lowercase a-z words separated by single spaces.

        num: how digits are handled before the non-letters are dropped.
            'spell':      each digit is replaced by its English name.
            'substitute': each run of digits is replaced by the token NUM.
            'remove':     each digit is replaced by a space.
            Any other value leaves digits to the generic non-letter removal.
        """
        # TODO: stemming, lemmatisation
        for i, doc in enumerate(self.documents):
            # Digits. Modes are compared with ==: identity (is) only works by
            # accident, when both strings happen to be interned.
            if num == 'spell':
                for digit, word in enumerate(self._DIGIT_WORDS):
                    doc = doc.replace(str(digit), ' {} '.format(word))
            elif num == 'substitute':
                # All numbers are equal. Useful for embedding (countable words) ?
                doc = re.sub('(\\d+)', ' NUM ', doc)
            elif num == 'remove':
                # Numbers are uninformative (they are all over the place). Useful for bag-of-words ?
                # But maybe some kind of documents contain more numbers, e.g. finance.
                # Some documents are indeed full of numbers. At least in 20NEWS.
                doc = re.sub('[0-9]', ' ', doc)
            # Remove everything except a-z characters and single space.
            doc = doc.replace('$', ' dollar ')
            doc = doc.lower()
            doc = re.sub('[^a-z]', ' ', doc)
            doc = ' '.join(doc.split())  # collapse runs of whitespace
            self.documents[i] = doc

    def vectorize(self, **params):
        """Build the document-term count matrix (data) and the vocabulary (vocab).

        params: keyword arguments forwarded to CountVectorizer.
        """
        # TODO: count or tf-idf. Or in normalize ?
        vectorizer = sklearn.feature_extraction.text.CountVectorizer(**params)
        self.data = vectorizer.fit_transform(self.documents)
        # get_feature_names() no longer exists in recent scikit-learn releases;
        # prefer its replacement and fall back for old installations.
        if hasattr(vectorizer, 'get_feature_names_out'):
            words = vectorizer.get_feature_names_out()
        else:
            words = vectorizer.get_feature_names()
        # Keep a plain list of str: callers rely on list methods such as index().
        self.vocab = [str(word) for word in words]
        assert len(self.vocab) == self.data.shape[1]

    def data_info(self, show_classes=False):
        """Print the size of the data matrix and, optionally, the class populations."""
        N, M = self.data.shape
        # Percentage of stored (non-zero) entries.
        sparsity = self.data.nnz / N / M * 100
        print('N = {} documents, M = {} words, sparsity={:.4f}%'.format(N, M, sparsity))
        if show_classes:
            for i in range(len(self.class_names)):
                num = int(np.sum(self.labels == i))
                print('  {:5d} documents in class {:2d} ({})'.format(num, i, self.class_names[i]))

    def show_document(self, i):
        """Print the label, word count and (if available) bag-of-words of document i.

        Returns the text of the document.
        """
        label = self.labels[i]
        name = self.class_names[label]
        text = self.documents[i]
        wc = len(text.split())
        print('document {}: label {} --> {}, {} words'.format(i, label, name, wc))
        # The bag-of-words only exists once vectorize() has run. Test for that
        # explicitly instead of swallowing every exception.
        data = getattr(self, 'data', None)
        vocab = getattr(self, 'vocab', None)
        if data is not None and vocab is not None:
            vector = data[i, :]
            # Visit the stored non-zero columns only, in increasing column order.
            _, columns = vector.nonzero()
            for j in sorted(int(c) for c in columns):
                print('  {:.2f} "{}" ({})'.format(vector[0, j], vocab[j], j))
        return text

    def keep_documents(self, idx):
        """Keep the documents given by the integer index, discard the others."""
        # Accept scalars, lists and arrays alike (a single index must still be iterable).
        idx = np.atleast_1d(np.asarray(idx, dtype=np.intp))
        self.documents = [self.documents[i] for i in idx]
        self.labels = self.labels[idx]
        self.data = self.data[idx, :]

    def keep_words(self, idx):
        """Keep the words (columns) given by the integer index, discard the others."""
        idx = np.atleast_1d(np.asarray(idx, dtype=np.intp))
        self.data = self.data[:, idx]
        self.vocab = [self.vocab[i] for i in idx]

    def remove_short_documents(self, nwords, vocab='selected'):
        """Remove the documents which contain less than nwords words.

        vocab: 'selected' counts the entries of the data matrix (current vocabulary),
               'full' counts the whitespace-separated words of the text.
        Returns the word count of every document, computed before the removal.
        Raises ValueError for any other value of vocab.
        """
        if vocab == 'selected':
            # Word count with selected vocabulary.
            # ravel() keeps a 1-D array even when there is a single document.
            wc = np.asarray(self.data.sum(axis=1)).ravel()
        elif vocab == 'full':
            # Word count with full vocabulary.
            wc = np.array([len(doc.split()) for doc in self.documents], dtype=int)
        else:
            raise ValueError("vocab must be 'selected' or 'full', got {!r}".format(vocab))
        # flatnonzero always yields a 1-D index, even for exactly one match.
        self.keep_documents(np.flatnonzero(wc >= nwords))
        return wc

    def keep_top_words(self, M, Mprint=20):
        """Keep in the vocabulary the M words which appear most often.

        Prints the first Mprint kept words (fewer if less were kept) and returns
        the total count of each kept word, most frequent first.
        """
        freq = np.asarray(self.data.sum(axis=0)).ravel()
        idx = np.argsort(freq)[::-1]
        idx = idx[:M]
        self.keep_words(idx)
        top = freq[idx]
        print('most frequent words')
        # Never print more rows than there are kept words.
        for i in range(min(Mprint, len(idx))):
            print('  {:3d}: {:10s} {:6d} counts'.format(i, self.vocab[i], top[i]))
        return top

    def normalize(self, norm='l1'):
        """Normalize each document (row of data) to unit length under the given norm."""
        # TODO: TF-IDF.
        data = self.data.astype(np.float64)
        self.data = sklearn.preprocessing.normalize(data, axis=1, norm=norm)

In [None]:
# Fetch dataset. Scikit-learn already performs some cleaning.
# `remove` lists the parts of each post that scikit-learn strips before returning it.
remove = ('headers','footers','quotes')  # (), ('headers',) or ('headers','footers','quotes')
train = Dataset()
train.fetch(data_home=FLAGS.dir_data, subset='train', remove=remove)

# Pre-processing: transform everything to a-z and whitespace.
# Print the first 400 characters of document 1 before cleaning, for comparison.
print(train.show_document(1)[:400])
train.clean_text(num='substitute')

# Analyzing / tokenizing: transform documents to bags-of-words.
#stop_words = set(sklearn.feature_extraction.text.ENGLISH_STOP_WORDS)
# Or stop words from NLTK.
# Add e.g. don, ve.
train.vectorize(stop_words='english')
# Same document after cleaning; its bag-of-words is now printed as well.
print(train.show_document(1)[:400])

In [None]:
# Remove short documents.
train.data_info(True)
# Drop the documents with fewer than 20 whitespace-separated words in their text.
# wc holds the word count of every document as it was before the removal.
wc = train.remove_short_documents(nwords=20, vocab='full')
train.data_info()
print('shortest: {}, longest: {} words'.format(wc.min(), wc.max()))
# Word count per document, on a logarithmic axis.
plt.figure(figsize=(17,5))
plt.semilogy(wc, '.');

# Remove encoded images.
def remove_encoded_images(dataset, freq=1e3):
    """Remove the documents in which the word 'ax' appears freq times or more.

    NOTE(review): presumably 'ax' is what encoded binary attachments turn into
    after clean_text, hence the name -- confirm against the raw corpus.

    dataset: object with a `vocab` list, a sparse `data` matrix whose columns
        follow `vocab`, and a `keep_documents(idx)` method.
    freq: documents whose count of 'ax' is strictly below this value are kept.

    Returns the count of 'ax' in every document, computed before the removal
    (all zeros if 'ax' is not in the vocabulary, in which case nothing is removed).
    """
    # Work on the dataset which was passed in, not on a notebook global.
    try:
        widx = dataset.vocab.index('ax')
    except ValueError:
        # Word absent from the vocabulary: no document can contain it.
        wc = np.zeros(dataset.data.shape[0], dtype=dataset.data.dtype)
    else:
        # ravel() keeps a 1-D array even when there is a single document.
        wc = dataset.data[:, widx].toarray().ravel()
    # flatnonzero always yields a 1-D index, even for exactly one match.
    dataset.keep_documents(np.flatnonzero(wc < freq))
    return wc
# Apply the filter to the training set; wc is the per-document count of the
# word 'ax' as returned by remove_encoded_images.
wc = remove_encoded_images(train)
train.data_info()
# Count of 'ax' per document, on a logarithmic axis.
plt.figure(figsize=(17,5))
plt.semilogy(wc, '.');

In [None]:
# Feature selection.
# Other options include: mutual information or document count.
# Keep the 1000 most frequent words and print the first 20 of them;
# freq holds the total count of each kept word, most frequent first.
freq = train.keep_top_words(1000, 20)
train.data_info()
train.show_document(1)
# Word frequency against rank, on a logarithmic axis.
plt.figure(figsize=(17,5))
plt.semilogy(freq);

# Remove documents whose signal would be the zero vector.
# More precisely: drop every document with fewer than 5 word occurrences
# left in the selected vocabulary.
wc = train.remove_short_documents(nwords=5, vocab='selected')
train.data_info(True)

In [None]:
# Scale every document (row) to unit l1 norm, then display document 1 again to
# inspect the normalized values. The trailing ';' hides the returned text.
train.normalize(norm='l1')
train.show_document(1);

In [None]:
# TODO
# Word embedding
# Further feature selection.

In [None]:
# Test dataset.
# Same cleaning as the training set. The test documents are vectorized with the
# training vocabulary, so the columns of both matrices refer to the same words.
test = Dataset()
test.fetch(data_home=FLAGS.dir_data, subset='test', remove=remove)
test.clean_text(num='substitute')
test.vectorize(vocabulary=train.vocab)
test.data_info()
# Same minimum length as for the training set (counted in the selected vocabulary).
wc = test.remove_short_documents(nwords=5, vocab='selected')
print('shortest: {}, longest: {} words'.format(wc.min(), wc.max()))
test.data_info(True)
test.normalize(norm='l1')

In [None]:
# Pull the matrices and labels out of the Dataset objects; the matrices are
# kept in the format stored by Dataset (no conversion to dense here).
train_data = train.data  # .toarray()
test_data = test.data  # .toarray()
train_labels = train.labels
test_labels = test.labels

# Data from which the feature graph is built: the transposed training matrix,
# as a dense array, i.e. one row per word and one column per training document.
graph_data = train_data.T.toarray()
#graph_data = train.embeddings

# The Dataset objects are not used anymore.
del train, test

# Feature graph

In [None]:
def coarsen(A, levels):
    """Coarsen the graph and compute the Laplacian of every resulting graph.

    A: adjacency matrix handed to coarsening.metis.
    levels: number of coarsening levels; the graphs with an index below
        `levels` are re-ordered with the permutation of the same index.

    Returns (laplacians, perm): the list of Laplacians, one per graph returned
    by coarsening.metis, and the first permutation (None if there is none).
    """
    graphs, parents = coarsening.metis(A, levels)
    perms = coarsening.compute_perm(parents)

    laplacians = []
    for level, adjacency in enumerate(graphs):
        _, n_before = adjacency.shape

        # No self-connections.
        adjacency = adjacency.tocoo()
        adjacency.setdiag(0)

        if level < levels:
            adjacency = coarsening.perm_adjacency(adjacency, perms[level])

        adjacency = adjacency.tocsr()
        adjacency.eliminate_zeros()
        _, n_after = adjacency.shape
        n_added = n_after - n_before
        print('Layer {0}: M_{0} = |V| = {1} nodes ({2} added), |E| = {3} edges'.format(
            level, n_after, n_added, adjacency.nnz))

        laplacians.append(graph.laplacian(adjacency, normalized=FLAGS.normalized_laplacian))

    first_perm = perms[0] if len(perms) > 0 else None
    return laplacians, first_perm

# Build the graph between features (words) from graph_data, coarsen it and
# compute the Laplacians. The CPU time of the whole step is reported.
t_start = time.process_time()
A = graph.adjacency(graph_data, k=FLAGS.number_edges, metric=FLAGS.metric)
# Compare the number of stored entries of A with number_edges times the number of features.
print("{} > {} edges".format(A.nnz, FLAGS.number_edges*train_data.shape[1]))
L, perm = coarsen(A, FLAGS.coarsening_levels)
print('Execution time: {:.2f}s'.format(time.process_time() - t_start))
graph.plot_spectrum(L)
# Only the Laplacians L and the permutation perm are needed from here on.
del graph_data, A

In [None]:
# Apply the permutation returned by coarsen() to the data: both matrices are
# densified, passed through coarsening.perm_data and stored as CSR again.
# NOTE(review): presumably perm_data re-orders (and pads) the feature columns to
# match the vertex order of the coarsened graph -- confirm in coarsening.
t_start = time.process_time()
train_data = scipy.sparse.csr_matrix(coarsening.perm_data(train_data.toarray(), perm))
test_data = scipy.sparse.csr_matrix(coarsening.perm_data(test_data.toarray(), perm))
print('Execution time: {:.2f}s'.format(time.process_time() - t_start))
del perm

# Classification

In [None]:
# Training set is shuffled already.
#perm = np.random.permutation(train_data.shape[0])
#train_data = train_data[perm,:]
#train_labels = train_labels[perm]

# Validation set.
# The first val_size training samples become the validation set and are removed
# from the training set. Re-running this cell removes another val_size samples.
val_data = train_data[:FLAGS.val_size,:]
val_labels = train_labels[:FLAGS.val_size]
train_data = train_data[FLAGS.val_size:,:]
train_labels = train_labels[FLAGS.val_size:]

In [None]:
# Baseline.
def baseline():
    """Train a set of scikit-learn classifiers and print their test scores.

    Every classifier is fitted on the notebook's train_data / train_labels and
    scored on test_data / test_labels. Three lines are printed (accuracy,
    weighted F1, CPU time), with one column per classifier in the order below.
    """
    classifiers = [
        sklearn.neighbors.KNeighborsClassifier(n_neighbors=10),
        sklearn.linear_model.LogisticRegression(),
        sklearn.naive_bayes.BernoulliNB(alpha=.01),
        sklearn.ensemble.RandomForestClassifier(),
        sklearn.naive_bayes.MultinomialNB(alpha=.01),
        sklearn.linear_model.RidgeClassifier(),
        sklearn.svm.LinearSVC(),
    ]
    column = '{:5.2f}'
    accuracy, f1, exec_time = [], [], []
    for estimator in classifiers:
        tic = time.process_time()
        estimator.fit(train_data, train_labels)
        predicted = estimator.predict(test_data)
        accuracy.append(column.format(
            100*sklearn.metrics.accuracy_score(test_labels, predicted)))
        f1.append(column.format(
            100*sklearn.metrics.f1_score(test_labels, predicted, average='weighted')))
        # The timing covers fitting, prediction and scoring.
        exec_time.append(column.format(time.process_time() - tic))
    for title, values in (('Accuracy:       {}', accuracy),
                          ('F1 (weighted):  {}', f1),
                          ('Execution time: {}', exec_time)):
        print(title.format(' '.join(values)))
baseline()

In [None]:
def evaluate(sess, ph_dropout, op_prediction, op_loss, data, labels):
    """
    Runs one evaluation against the full epoch of data.
    Batch evaluation saves memory and enables this to run on smaller GPUs.

    The data is fed in batches of exactly FLAGS.batch_size samples; the last
    batch is padded with zeros and its padded predictions are discarded.
    Relies on the notebook globals ph_data and ph_labels (input placeholders)
    and FLAGS.

    sess: the session in which the model has been trained.
    ph_dropout: placeholder of the dropout keep probability (fed with 1).
    op_prediction: the Tensor that returns the predicted class of each sample.
    op_loss: the Tensor that returns the loss of a batch.
    data: size N x M, dense array or sparse matrix
        N: number of signals (samples)
        M: number of vertices (features)
    labels: size N
        N: number of signals (samples)

    Returns (string, accuracy, f1, loss): a printable summary, the accuracy and
    weighted F1 score in percent, and the loss averaged over the batches.
    """
    loss = 0
    size = len(labels)
    predictions = np.empty(size)
    for begin in range(0, size, FLAGS.batch_size):
        end = min(begin + FLAGS.batch_size, size)
        nvalid = end - begin  # number of real samples in this batch

        # Densify the slice BEFORE copying it into the padded buffer: a sparse
        # matrix cannot be assigned into a slice of a dense array.
        chunk = data[begin:end,:]
        if not isinstance(chunk, np.ndarray):
            chunk = chunk.toarray()  # convert sparse matrices

        batch_data = np.zeros((FLAGS.batch_size, data.shape[1]))
        batch_labels = np.zeros(FLAGS.batch_size)
        batch_data[:nvalid] = chunk
        batch_labels[:nvalid] = labels[begin:end]

        feed_dict = {ph_data: batch_data, ph_labels: batch_labels, ph_dropout: 1}
        batch_pred, batch_loss = sess.run([op_prediction, op_loss], feed_dict)
        predictions[begin:end] = batch_pred[:nvalid]
        # NOTE(review): the zero-padded rows of a partial batch are fed to the
        # model too, so they presumably contribute to batch_loss -- confirm.
        loss += batch_loss
    ncorrects = int(np.sum(predictions == labels))  # number of correct predictions
    accuracy = 100 * sklearn.metrics.accuracy_score(labels, predictions)
    f1 = 100 * sklearn.metrics.f1_score(labels, predictions, average='weighted')
    loss *= FLAGS.batch_size / size
    string = 'accuracy: {:.2f} ({:d} / {:d}), f1 (weighted): {:.2f} ,loss: {:.2e}'.format(
            accuracy, ncorrects, size, f1, loss)
    return string, accuracy, f1, loss

In [None]:
# Three alternative configurations. The assignments run top to bottom, so each
# group overrides the previous one: only the values of the last group, and the
# only model line which is not commented out, are in effect.

# Logistic regression.
FLAGS.learning_rate = 1e3
FLAGS.num_epochs = 20
#model = cgcnn(L, F=[], K=[], p=[], M=[max(train_labels)+1])

# Fully connected.
FLAGS.learning_rate = 1e-1
FLAGS.num_epochs = 40
#model = cgcnn(L, F=[], K=[], p=[], M=[500,max(train_labels)+1])

# Graph convolutional.
# The last entry of M is the number of classes (largest label + 1).
# cgcnn is not defined in this notebook; it comes from `%run -n models.ipynb`.
FLAGS.learning_rate = 1e-1
FLAGS.num_epochs = 40
model = cgcnn(L, F=[32], K=[25], p=[1], M=[max(train_labels)+1])

#train_(model, train_data, train_labels, val_data, val_labels, FLAGS)

In [None]:
#TODO
#def create_comp_graph(model, train_data, train_labels, val_data, val_labels, FLAGS):

# Input placeholders. Their first dimension is fixed to batch_size, so every
# feed (training and evaluation) must contain exactly batch_size samples.
# ph_data and ph_labels are also read, as globals, by evaluate().
with tf.name_scope('inputs'):
    ph_data = tf.placeholder(tf.float32, (FLAGS.batch_size, train_data.shape[1]), 'data')
    # NOTE(review): (FLAGS.batch_size) is a plain int, not a 1-tuple.
    ph_labels = tf.placeholder(tf.int32, (FLAGS.batch_size), 'labels')
    ph_dropout = tf.placeholder(tf.float32, (), 'dropout')

# Construct computational graph.
op_logits = model.inference(ph_data, ph_dropout)
op_loss, op_loss_average = model.loss(op_logits, ph_labels, FLAGS.regularization)
# The third argument is the number of batches per epoch (not rounded).
op_train = model.training(op_loss, FLAGS.learning_rate,
        train_data.shape[0]/FLAGS.batch_size, FLAGS.decay_rate, FLAGS.momentum)
op_prediction = model.prediction(op_logits)

# Summaries for TensorBoard.
# NOTE(review): merge_all_summaries and SummaryWriter look like names from early
# TensorFlow releases -- confirm the targeted version before upgrading.
op_summary = tf.merge_all_summaries()
sess = tf.Session()
writer = tf.train.SummaryWriter(FLAGS.dir_summaries, sess.graph)

In [None]:
#def train(model, train_data, train_labels, val_data, val_labels, FLAGS):

# Initialize variables, i.e. weights and biases.
# The two clocks started here measure CPU and wall time of the whole training.
t_process, t_wall = time.process_time(), time.time()
init = tf.initialize_all_variables()
sess.run(init)

# Training.
# `indices` is a queue of sample indices: it is refilled with a fresh random
# permutation of the training set whenever fewer than batch_size indices remain.
indices = collections.deque()
num_steps = int(FLAGS.num_epochs * train_data.shape[0] / FLAGS.batch_size)
for step in range(1, num_steps+1):

    # Be sure to have used all the samples before using one a second time.
    if len(indices) < FLAGS.batch_size:
        indices.extend(np.random.permutation(train_data.shape[0]))
    idx = [indices.popleft() for i in range(FLAGS.batch_size)]

    batch_data, batch_labels = train_data[idx,:], train_labels[idx]
    if type(batch_data) is not np.ndarray:
        batch_data = batch_data.toarray()  # convert sparse matrices
    feed_dict = {ph_data: batch_data, ph_labels: batch_labels, ph_dropout: FLAGS.dropout}
    # One optimization step. The value returned for op_train is used as the
    # current learning rate in the report below.
    learning_rate, loss_average = sess.run([op_train, op_loss_average], feed_dict)

    # Periodical evaluation of the model.
    # Also done at the very last step, whatever the evaluation frequency.
    if step % FLAGS.eval_frequency == 0 or step == num_steps:
        epoch = step * FLAGS.batch_size / train_data.shape[0]
        print('step {} / {} (epoch {:.2f} / {}):'.format(step, num_steps, epoch, FLAGS.num_epochs))
        print('  learning_rate = {:.2e}, loss_average = {:.2e}'.format(learning_rate, loss_average))
        string, accuracy, f1, loss = evaluate(sess, ph_dropout, op_prediction, op_loss, val_data, val_labels) 
        print('  validation {}'.format(string))
        print('  time: {:.0f}s (wall {:.0f}s)'.format(time.process_time()-t_process, time.time()-t_wall))

        # Summaries for TensorBoard.
        # The graph summaries are evaluated on the current training batch; the
        # validation scores computed above are appended to the same record.
        summary = tf.Summary()
        summary.ParseFromString(sess.run(op_summary, feed_dict))
        summary.value.add(tag='validation/accuracy', simple_value=accuracy)
        summary.value.add(tag='validation/f1', simple_value=f1)
        summary.value.add(tag='validation/loss', simple_value=loss)
        writer.add_summary(summary, step)
writer.close()

#return sess

In [None]:
# Evaluate.
# Score the trained model on the test set and print the summary string
# (first element of the tuple returned by evaluate) with the time it took.
t_process, t_wall = time.process_time(), time.time()
print('test {}'.format(evaluate(sess, ph_dropout, op_prediction, op_loss, test_data, test_labels)[0]))
print('time: {:.0f}s (wall {:.0f}s)'.format(time.process_time()-t_process, time.time()-t_wall))
# The session is left open so that the model can still be used interactively.
#sess.close()

In [None]:
#if __name__ == '__main__':
#    tf.app.run()