In [None]:
%matplotlib inline
from matplotlib import pyplot as plt
import numpy as np
import tensorflow as tf
import glob, random, time, os, zlib

# Number of feature planes stored per board position (16 + 6 = 22); used to
# reshape the raw feature files to (-1, FEATURE_COUNT, 19, 19).
FEATURE_COUNT = 16 + 6
# Number of positions randomly sampled into the fixed cross-validation set.
CROSS_VAL_SIZE = 1000
# Number of samples gathered for each training step.
MINIBATCH_SIZE = 128
# TensorFlow device string under which the network graph is constructed.
DEVICE_TO_USE = "/gpu:0"
# Directory containing the features_NNN.z / targets_NNN.z / winners_NNN.z files.
DATA_ROOT = "/home/snp/proj/go/fastgo/massive_small_chunks"
# Chunk 039 is not part of this count; it is loaded separately as the source
# of the cross-validation samples.
TOTAL_CHUNK_COUNT = 39 # Intentionally low to leave some test data.

def to_hms(x):
    """Format a duration given in seconds as "H:MM:SS".

    The value is truncated to an integer first. Hours are space-padded to a
    width of two; minutes and seconds are zero-padded.
    """
    whole_seconds = int(x)
    whole_minutes, seconds = divmod(whole_seconds, 60)
    hours, minutes = divmod(whole_minutes, 60)
    return "%2i:%02i:%02i" % (hours, minutes, seconds)

def stream_decompress(s):
    """Decompress a zlib-compressed byte string by feeding it in fixed-size blocks.

    Args:
        s: The complete zlib-compressed payload as a byte string.

    Returns:
        The decompressed payload as a single byte string.

    Raises:
        zlib.error: If the payload is not valid zlib data.
    """
    decomp = zlib.decompressobj()
    # Feed the decompressor 8 MiB at a time instead of in one call
    # (presumably to stay clear of size limits on very large inputs).
    block_size = 2**23
    results = []
    for start in range(0, len(s), block_size):
        results.append(decomp.decompress(s[start:start + block_size]))
    results.append(decomp.flush())
    # The pieces are byte strings, so they must be joined with a bytes
    # separator; b"" is a plain str on Python 2, so this is identical there.
    return b"".join(results)

def load_chunk(features, targets, winners):
    """Load one chunk of training data from three zlib-compressed int8 files.

    Args:
        features: Path to the compressed features file.
        targets: Path to the compressed targets file.
        winners: Path to the compressed winners file.

    Returns:
        A dict with keys "features", "targets" and "winners" holding int8
        arrays of shape (N, 19, 19, FEATURE_COUNT), (N, 19, 19) and (N, 2).
        The arrays are read-only views over the decompressed buffers; copy
        them before modifying in place.

    Raises:
        AssertionError: If the three files do not hold the same number of samples.
    """
    def load_flat_array(path, shape):
        # The files hold compressed binary data, so they must be opened in
        # binary mode to avoid newline translation or text decoding.
        with open(path, "rb") as f:
            data = f.read()
        data = stream_decompress(data)
        # frombuffer replaces the deprecated binary mode of np.fromstring and
        # avoids a second full copy of the decompressed payload.
        return np.frombuffer(data, dtype=np.int8).reshape(shape)
    features = load_flat_array(features, (-1, FEATURE_COUNT, 19, 19))
    targets  = load_flat_array(targets, (-1, 19, 19))
    winners  = load_flat_array(winners, (-1, 2))
    assert len(features) == len(targets) == len(winners)
    # Take the features, and move the feature index to the end, so it has shape (-1, 19, 19, FEATURE_COUNT)
    features = np.moveaxis(features, 1, -1)
    assert features.shape == (len(features), 19, 19, FEATURE_COUNT)
    return {"features": features, "targets": targets, "winners": winners}

def load_chunk_from_fastgo_chunks_by_index(chunk_index):
    """Load the chunk whose three files under DATA_ROOT carry the given index.

    The index is zero-padded to three digits in each file name.
    """
    features_path = os.path.join(DATA_ROOT, "features_%03i.z" % chunk_index)
    targets_path = os.path.join(DATA_ROOT, "targets_%03i.z" % chunk_index)
    winners_path = os.path.join(DATA_ROOT, "winners_%03i.z" % chunk_index)
    return load_chunk(features_path, targets_path, winners_path)

# Module-level state describing which part of the (extremely large) dataset
# is currently resident. Everything except the chunk counter starts unset.
next_chunk_index = 0
chunk = cross_val = training_samples = in_sample_test = None

def chunk_index_by(index):
    """Return (features, targets) of the loaded chunk, both indexed by `index`."""
    selected_features = chunk["features"][index]
    selected_targets = chunk["targets"][index]
    assert len(selected_features) == len(selected_targets), "Different length inputs and outputs! This should never happen!"
    return selected_features, selected_targets

def load_next_chunk():
    """Replace the resident training chunk with the next one in the rotation.

    Rebinds the module-level `chunk`, `training_samples` and `in_sample_test`,
    and advances `next_chunk_index`, wrapping around after TOTAL_CHUNK_COUNT
    chunks. `cross_val` is deliberately left untouched.
    """
    global next_chunk_index, chunk, cross_val, training_samples, in_sample_test
    print("    >>> Loading chunk: %i" % next_chunk_index)
    # Free the memory from the previous chunk FIRST, if we have one loaded.
    # This is necessary to avoid running out of memory. The names are rebound
    # to None rather than deleted so that they stay defined if loading fails.
    chunk = None
    training_samples = None
    in_sample_test = None
    start = time.time()
    chunk = load_chunk_from_fastgo_chunks_by_index(next_chunk_index)
    next_chunk_index = (next_chunk_index + 1) % TOTAL_CHUNK_COUNT
    # No samples are reserved at the front of the chunk for cross-validation,
    # so training uses everything from offset zero.
    cvs = 0
    training_samples = chunk_index_by(slice(cvs, None))
    # The first 1000 training samples double as an in-sample test set.
    in_sample_test   = chunk_index_by(slice(cvs, cvs + 1000))
    stop = time.time()
    print("    >>> (In %f) Samples: %i" % (stop - start, len(chunk["features"])))

def apply_symmetry(feature_arr, target_arr):
    """Apply one random board symmetry to a single sample.

    Three independent coin flips decide whether to reverse the first board
    axis, reverse the second board axis, and swap the two board axes. The
    same transformation is applied to the features and to the target.

    Args:
        feature_arr: Array of shape (19, 19, FEATURE_COUNT).
        target_arr: Array of shape (19, 19).

    Returns:
        The transformed (feature_arr, target_arr) pair, with unchanged shapes.
    """
    feature_shape = (19, 19, FEATURE_COUNT)
    target_shape = (19, 19)
    assert feature_arr.shape == feature_shape
    assert target_arr.shape == target_shape
    # Copy first, so the result never aliases the caller's data.
    feature_arr = np.array(feature_arr)
    target_arr = np.array(target_arr)
    # Reverse the first board axis.
    if random.choice((False, True)):
        feature_arr = feature_arr[::-1]
        target_arr = target_arr[::-1]
    # Reverse the second board axis.
    if random.choice((False, True)):
        feature_arr = feature_arr[:, ::-1]
        target_arr = target_arr[:, ::-1]
    # Exchange the two board axes.
    if random.choice((False, True)):
        feature_arr = feature_arr.transpose(1, 0, 2)
        target_arr = target_arr.T
    assert feature_arr.shape == feature_shape
    assert target_arr.shape == target_shape
    return feature_arr, target_arr

In [2]:
# Load the first training chunk. This populates chunk, training_samples and
# in_sample_test; it does not set cross_val.
load_next_chunk()

('    >>> Loading chunk:', 0)
    >>> (In 14.552898) Samples: 847083


In [3]:
# Build the fixed cross-validation set from chunk 039.
# The seed is fixed so that the same positions are drawn on every run.
random.seed(123456789)
cross_val_data = load_chunk(*[
    os.path.join(DATA_ROOT, file_name)
    for file_name in ("features_039.z", "targets_039.z", "winners_039.z")
])
# Draw CROSS_VAL_SIZE distinct sample indices.
indices = random.sample(list(range(len(cross_val_data["features"]))), CROSS_VAL_SIZE)
# Indexing with a list of integers copies the chosen samples out of the chunk.
cross_val = (cross_val_data["features"][indices], cross_val_data["targets"][indices])
# Drop the full chunk; only the sampled copies are kept.
del cross_val_data

In [4]:
def product(l):
    """Return the product of all items in `l`; the empty product is 1."""
    # A plain loop avoids relying on `reduce`, which is only a builtin on Python 2.
    result = 1
    for factor in l:
        result = result * factor
    return result

# Running count of scalar parameters, incremented by weight_variable and bias_variable.
total_parameters = 0

def weight_variable(shape):
    """Create a weight variable drawn from a truncated normal distribution.

    The standard deviation is sqrt(2 / fan_in), where fan_in is the product of
    every dimension of `shape` except the last. The element count of `shape`
    is added to the global `total_parameters`.
    """
    global total_parameters
    total_parameters += product(shape)
    fan_in = product(shape[:-1])
    initial = tf.truncated_normal(shape, stddev=(2.0 / fan_in) ** 0.5)
    return tf.Variable(initial)

def bias_variable(shape):
    """Create a bias variable filled with 0.1.

    The element count of `shape` is added to the global `total_parameters`.
    """
    global total_parameters
    total_parameters += product(shape)
    return tf.Variable(tf.constant(0.1, shape=shape))

def conv2d(x, W):
    """Convolve `x` with the kernel `W` using unit strides and "SAME" padding."""
    unit_strides = [1, 1, 1, 1]
    return tf.nn.conv2d(x, W, strides=unit_strides, padding="SAME")

class GoResNet:
    """Residual convolutional network mapping 19x19 feature planes to 361 logits.

    The graph is an initial 3x3 convolution, BLOCK_COUNT residual blocks of two
    3x3 convolutions each, and a final 1x1 convolution down to one channel,
    which is flattened into one logit per board point. Training minimizes
    softmax cross-entropy plus L2 regularization with a momentum optimizer.

    The evaluation helpers call `.run` / `.eval` without an explicit session,
    so they require a default TensorFlow session to be active.
    """
    # Number of input planes; 22 equals the notebook-level FEATURE_COUNT (16 + 6).
    INPUT_FEATURE_COUNT = 22
    # One logit per point of the 19x19 board.
    OUTPUT_SOFTMAX_COUNT = 361
    # Channel count of every hidden convolution.
    FILTERS = 192
    CONV_SIZE = 3
    # Held in a list and read via index 0 in stack_nonlinearity.
    NONLINEARITY = [tf.nn.relu]
    BLOCK_COUNT = 8

    def __init__(self):
        """Build placeholders, the network graph, the loss and the training op."""
        # Construct input/output placeholders.
        self.input_ph = tf.placeholder(
            tf.float32,
            shape=[None, 19, 19, self.INPUT_FEATURE_COUNT],
            name="input_placeholder")
        self.desired_output_ph = tf.placeholder(
            tf.float32,
            shape=[None, 19, 19],
            name="desired_output_placeholder")
        self.learning_rate_ph = tf.placeholder(tf.float32, shape=[], name="learning_rate")
        self.is_training_ph = tf.placeholder(tf.bool, shape=[], name="is_training")

        # Begin constructing the data flow.
        self.parameters = []
        self.flow = self.input_ph
        # Stack an initial convolution.
        self.stack_convolution(3, self.INPUT_FEATURE_COUNT, self.FILTERS)
        self.stack_nonlinearity()
        # Stack some number of residual blocks.
        for _ in range(self.BLOCK_COUNT):
            self.stack_block()
        # Stack a final batch-unnormalized 1x1 convolution.
        self.stack_convolution(1, self.FILTERS, 1, batch_normalization=False)

        # Construct the training components.
        self.flattened = tf.reshape(self.flow, [-1, self.OUTPUT_SOFTMAX_COUNT])
        self.flattened_desired_output = tf.reshape(self.desired_output_ph, [-1, self.OUTPUT_SOFTMAX_COUNT])
        self.cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
            labels=self.flattened_desired_output,
            logits=self.flattened,
        ))
        # L2 regularization is applied to every trainable variable in the graph.
        regularizer = tf.contrib.layers.l2_regularizer(scale=0.0001)
        reg_variables = tf.trainable_variables()
        self.regularization_term = tf.contrib.layers.apply_regularization(regularizer, reg_variables)
        self.loss = self.cross_entropy + self.regularization_term

        # Associate batch normalization with training: the ops collected under
        # UPDATE_OPS run as dependencies of every training step.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            self.train_step = tf.train.MomentumOptimizer(
                learning_rate=self.learning_rate_ph, momentum=0.9).minimize(self.loss)

    def stack_convolution(self, kernel_size, old_size, new_size, batch_normalization=True):
        """Append a square convolution to the flow.

        Args:
            kernel_size: Side length of the square kernel.
            old_size: Number of input channels.
            new_size: Number of output channels.
            batch_normalization: If True, follow the convolution with batch
                normalization (without learned offset or scale); otherwise
                add a learned bias instead.
        """
        weights = weight_variable([kernel_size, kernel_size, old_size, new_size])
        self.parameters.append(weights)
        self.flow = conv2d(self.flow, weights)
        if batch_normalization:
            self.flow = tf.layers.batch_normalization(
                self.flow,
                center=False,
                scale=False,
                training=self.is_training_ph)
        else:
            bias = bias_variable([new_size])
            self.parameters.append(bias)
            self.flow = self.flow + bias # TODO: Is += equivalent?

    def stack_nonlinearity(self):
        """Append the configured non-linearity to the flow."""
        self.flow = self.NONLINEARITY[0](self.flow)

    def stack_block(self):
        """Append one residual block: two 3x3 convolutions plus a skip connection."""
        initial_value = self.flow
        # Stack the first convolution.
        self.stack_convolution(3, self.FILTERS, self.FILTERS)
        self.stack_nonlinearity()
        # Stack the second convolution.
        self.stack_convolution(3, self.FILTERS, self.FILTERS)
        # Add the skip connection.
        self.flow = self.flow + initial_value
        # Stack on the deferred non-linearity.
        self.stack_nonlinearity()

    def train(self, samples, learning_rate):
        """Run one optimizer step on `samples` at the given learning rate."""
        self.run_on_samples(self.train_step.run, samples, learning_rate=learning_rate, is_training=True)

    def get_loss(self, samples):
        """Return the mean cross-entropy (without regularization) on `samples`."""
        return self.run_on_samples(self.cross_entropy.eval, samples)

    def get_accuracy(self, samples):
        """Return the fraction of samples whose top logit matches the target's argmax.

        Args:
            samples: A (features, targets) pair of equal length.
        """
        logits = self.run_on_samples(self.flattened.eval, samples)
        predicted = np.argmax(logits, axis=1)
        sample_count = len(samples[0])
        assert predicted.shape == (sample_count,)
        # Flatten each target board and take its argmax in one vectorized
        # step; this matches np.argmax on each individual board.
        desired = np.asarray(samples[1]).reshape(sample_count, -1).argmax(axis=1)
        return np.count_nonzero(desired == predicted) / float(sample_count)

    def run_on_samples(self, f, samples, learning_rate=0.01, is_training=False):
        """Call `f` with a feed dict built from `samples` and the given settings.

        Args:
            f: A callable accepting `feed_dict`, such as a tensor's `eval` or
                an operation's `run`.
            samples: A (features, targets) pair.
            learning_rate: Value fed to the learning-rate placeholder.
            is_training: Value fed to the is-training placeholder.

        Returns:
            Whatever `f` returns.
        """
        input_tensor, output_tensor = samples
        return f(feed_dict={
            self.input_ph:          input_tensor,
            self.desired_output_ph: output_tensor,
            self.learning_rate_ph:  learning_rate,
            self.is_training_ph:    is_training,
        })

# We now build our network.
with tf.device(DEVICE_TO_USE):
    cnn = GoResNet()
    sess = tf.InteractiveSession()
    # tf.initialize_all_variables is deprecated in favour of this call.
    sess.run(tf.global_variables_initializer())

print("Total parameters: %s" % (total_parameters,))

# Training progress shared with the training loop.
total_training_steps = 0
# (training step, loss) pairs recorded for later plotting.
loss_plot = []
in_sample_loss_plot = []
# Current learning rate; lowered by the training loop when the loss stalls.
lr = 0.05

('stddev:', 0.10050378152592121)
('stddev:', 0.034020690871988585)
('stddev:', 0.034020690871988585)
('stddev:', 0.034020690871988585)
('stddev:', 0.034020690871988585)
('stddev:', 0.034020690871988585)
('stddev:', 0.034020690871988585)
('stddev:', 0.034020690871988585)
('stddev:', 0.034020690871988585)
('stddev:', 0.034020690871988585)
('stddev:', 0.034020690871988585)
('stddev:', 0.034020690871988585)
('stddev:', 0.034020690871988585)
('stddev:', 0.034020690871988585)
('stddev:', 0.034020690871988585)
('stddev:', 0.034020690871988585)
('stddev:', 0.034020690871988585)
('stddev:', 0.10206207261596575)
Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See tf.nn.softmax_cross_entropy_with_logits_v2.

Instructions for updating:
Use `tf.global_variables_initializer` instead.


In [5]:
model_save_counter = 0
def save_model():
    """Save the current values of the network's parameters to a numbered file.

    The values of `cnn.parameters` are stored, in order, as a one-dimensional
    object array via np.save, which appends ".npy" to the path. Because it is
    an object array, reading it back requires pickle support in np.load.
    """
    global model_save_counter
    model_save_counter += 1
    # GoResNet exposes its weights and biases as the flat list `parameters`.
    # NOTE(review): variables created inside tf.layers.batch_normalization are
    # not in that list, so they are not saved here; confirm whether that matters.
    values = sess.run(cnn.parameters)
    # Fill the object array element by element: letting numpy convert a list
    # of differently shaped arrays can fail when their leading dimensions match.
    snapshot = np.empty(len(values), dtype=object)
    for slot, value in enumerate(values):
        snapshot[slot] = value
    path = "MASSIVE-resnet-%03i" % (model_save_counter,)
    np.save(path, snapshot)
    print("\x1b[35mSaved model to:\x1b[0m %s" % path)

In [7]:
# Seconds spent inside cnn.train, as opposed to wall-clock time.
total_work = 0.0
start_time = time.time()
best_loss = float("inf")
# Number of consecutive evaluations without a new best cross-validation loss.
bad = 0

def lr_schedule(step):
    """Lower bound for the learning rate: 0.005, halved every 10e4 steps."""
    return 0.005 * 0.5**(step / 10e4)

for overall_step in range(10000):
    # Evaluate and report before each round of training.
    elapsed = time.time() - start_time
    in_sample_loss = cnn.get_loss(in_sample_test)
    loss = cnn.get_loss(cross_val)
    # Highlight the line when the cross-validation loss is a new best.
    color_pair = "", ""
    if loss < best_loss:
        color_pair = "\x1b[31m", "\x1b[0m"
    print("%s%6i [%s - %s] Loss: %.6f  In-sample loss: %.6f  Accuracy: %.3f%s" % (
        color_pair[0],
        total_training_steps,
        to_hms(elapsed),
        to_hms(total_work),
        loss,
        in_sample_loss,
        cnn.get_accuracy(cross_val) * 100,
        color_pair[1]
    ))
    loss_plot.append((total_training_steps, loss))
    in_sample_loss_plot.append((total_training_steps, in_sample_loss))

    # Reduce the learning rate after enough evaluations without improvement.
    # The higher the current rate, the fewer stalls are tolerated.
    if loss >= best_loss:
        bad_threshold = 3 #5 + 3
        if lr >= 0.001:
            bad_threshold = 2
        if lr >= 0.003:
            bad_threshold = 1
        bad += 1
        if bad >= bad_threshold:
            lr *= 0.75 #0.75
            # Never drop below the floor given by the schedule.
            lr = max(lr, lr_schedule(total_training_steps))
            print("\x1b[33mLearning rate reduction!\x1b[0m lr = %s" % (lr,))
            bad = 0
    else:
        bad = 0
    best_loss = min(best_loss, loss)

    for _ in range(500):
        # Draw random samples, rejecting those whose target is entirely zero.
        indices = []
        while len(indices) < MINIBATCH_SIZE:
            i = random.randrange(len(training_samples[0]))
            if np.any(training_samples[1][i]):
                indices.append(i)
        #indices = random.sample(xrange(len(training_samples[0])), MINIBATCH_SIZE)
        features = []
        targets  = []
        for i in indices:
            feat_arr = training_samples[0][i]
            targ_arr = training_samples[1][i]
            # Augment each sample with a random board symmetry.
            feat_arr, targ_arr = apply_symmetry(feat_arr, targ_arr)
            features.append(feat_arr)
            targets.append(targ_arr)
            del feat_arr
            del targ_arr
        #features = np.array([training_samples[0][i] for i in indices])
        #targets  = np.array([training_samples[1][i] for i in indices])
        minibatch = (features, targets)
        working = time.time()
        cnn.train(minibatch, lr)
        total_work += time.time() - working
        # Try really hard to not keep any views around!
        del minibatch
        del features
        del targets
        total_training_steps += 1

    # Periodically swap out the data for fresh training data.
    if (overall_step + 1) % 5 == 0:
        load_next_chunk()
    if (overall_step + 1) % 20 == 0:
        save_model()

# Measure the elapsed time afresh: the value computed inside the loop predates
# the final round of training.
print("%6i [%s] Accuracy: %7.3f%%" % (total_training_steps, to_hms(time.time() - start_time), 100.0 * cnn.get_accuracy(cross_val)))
#plt.hold(True)
#plt.plot(*zip(*loss_plot[1:]))
#plt.plot(*zip(*in_sample_loss_plot[1:]))

[31m   334 [ 0:00:00 -  0:00:00] Loss: 3.284615  In-sample loss: 3.407727  Accuracy: 28.200[0m


KeyboardInterrupt: 

## Everything from here on down is horrific cruft to be ignored.

In [None]:
# Plot both recorded loss curves against the training step, skipping the
# first recorded point of each, and label them so they can be told apart.
plt.plot(*zip(*loss_plot[1:]), label="Cross-validation loss")
plt.plot(*zip(*in_sample_loss_plot[1:]), label="In-sample loss")
plt.xlabel("Training steps")
plt.ylabel("Cross-entropy loss")
plt.legend()
# Last expression of the cell: cross-validation accuracy in percent.
cnn.get_accuracy(cross_val) * 100