# Una red neuronal convolucional para detectar objetos

In [1]:
import os 
import sys
import tarfile
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from six.moves import urllib
session = tf.Session()

In [2]:
batch_size = 128
output_every = 50
generations = 20000
eval_every = 500
image_height = 32
image_width = 32
crop_height = 24
crop_width = 24
num_channels = 3
num_targets = 10
data_folder = "cifar-10-batches-bin"

caida exponencial del ratio de aprendizaje

$$learning\ rate = 0.1\cdot 0.9^{\frac{x}{250}}$$

In [3]:
learning_rate = 0.1
lr_decay = 0.9
num_gens_to_wait = 250

In [4]:
image_vect_length = image_height*image_width*num_channels
record_length = 1+image_vect_length

## Descarga y procesamiento de CIFAR 10

In [5]:
data_dir = 'cifar-10-temp'
if not os.path.exists(data_dir):
    os.makedirs(data_dir)
cifar10_url = "http://www.cs.toronto.edu/~kriz/cifar-10-binary.tar.gz"
data_file = os.path.join(data_dir, 'cifar-10-binary.tar.gz')
if not os.path.isfile(data_file):
    filepath, _ = urllib.request.urlretrieve(cifar10_url, data_file)
    tarfile.open(filepath, 'r:gz').extractall(data_dir)

In [6]:
def read_cifar_files(filename_queue, distort_images = True):
    reader = tf.FixedLengthRecordReader(record_bytes=record_length)
    key, record_string = reader.read(filename_queue)
    #creamos fichero binario
    record_bytes = tf.decode_raw(record_string, tf.uint8)
    #extraemos la etiqueta
    image_label = tf.cast(tf.slice(record_bytes, [0], [1]), tf.int32)
    #extraemos la imagen
    image_extracted = tf.reshape(tf.slice(record_bytes, [1], [image_vect_length]), [num_channels, image_height, image_width])
    #redimensión de imagen
    reshaped_image = tf.transpose(image_extracted, [1,2,0])
    reshaped_image = tf.cast(reshaped_image, tf.float32)
    #crop aleatorio
    final_image = tf.image.resize_image_with_crop_or_pad(reshaped_image, crop_width, crop_height)
    if distort_images:
        #flip aleatorio horizontal, cambiar brillo y contraste
        final_image = tf.image.random_flip_left_right(final_image)
        final_image = tf.image.random_brightness(final_image, max_delta=63)
        final_image = tf.image.random_contrast(final_image, lower=0.2, upper=1.8)
    #estandarización por color
    final_image = tf.image.per_image_standardization(final_image)
    return final_image, image_label

In [7]:
def input_pipeline(batch_size, train_logical=True):
    if train_logical:
        files = [os.path.join(data_dir, data_folder, 'data_batch_{}.bin'.format(i)) for i in range(1,6)]
    else:
        files = [os.path.join(data_dir, data_folder, 'test_batch.bin')]
    
    filename_queue = tf.train.string_input_producer(files)
    image, label = read_cifar_files(filename_queue)
    
    min_after_dequeue = 1000
    capacity = min_after_dequeue+3*batch_size
    example_batch, label_batch = tf.train.shuffle_batch([image,label], batch_size, capacity, min_after_dequeue)
    return example_batch, label_batch

## el modelo CCN

In [8]:
def cifar_cnn_model(input_images, batch_size, train_logical=True):
    def truncated_normal_var(name, shape, dtype):
        return tf.get_variable(name=name, shape=shape, dtype=dtype, initializer=tf.truncated_normal_initializer(stddev=0.0))
    
    def zero_var(name, shape, dtype):
        return tf.get_variable(name=name, shape=shape, dtype=dtype, initializer=tf.constant_initializer(0.0))
    
    # primera capa de convolucion
    with tf.variable_scope("conv1") as scope:
        # conv con kernel 5x5 para 3 canales con un total de 64 nodos de salida
        conv1_kernel = truncated_normal_var(name="conv_kernel1", shape=[5,5,3,64], dtype=tf.float32)
        conv1 = tf.nn.conv2d(input_images, conv1_kernel, [1,1,1,1], padding="SAME")
        conv1_bias = zero_var(name="conv_bias1", shape=[64], dtype=tf.float32)
        conv1_add_bias = tf.nn.bias_add(conv1, conv1_bias)
        # capa de relu
        relu_conv1 = tf.nn.relu(conv1_add_bias)
    # capa de max pooling  con ventana de 3x3 que se mueve de a 2 lugares cada vez  
    pool1 = tf.nn.max_pool(relu_conv1, ksize=[1,3,3,1], strides=[1,2,2,1], padding="SAME", name="pool_layer1")
    
    # normalizacion local (en ingles: local response normalization(lrn)) sirve para normalizar las imagenes.
    norm1 = tf.nn.lrn(pool1, depth_radius=5, bias=2.0, alpha=1e-3, beta=0.75, name="norm1")
    
    
    
    
    # segunda capa de convolucion
    with tf.variable_scope("conv2") as scope:
        # conv con kernel 5x5 para 64 nodos de entrada con un total de 64 nodos de salida
        conv2_kernel = truncated_normal_var(name="conv_kernel2", shape=[5,5,64,64], dtype=tf.float32)
        conv2 = tf.nn.conv2d(norm1, conv2_kernel, [1,1,1,1], padding="SAME")
        conv2_bias = zero_var(name="conv_bias2", shape=[64], dtype=tf.float32)
        conv2_add_bias = tf.nn.bias_add(conv2, conv2_bias)
        # capa de relu
        relu_conv2 = tf.nn.relu(conv2_add_bias)
    # capa de max pooling  con ventana de 3x3 que se mueve de a 2 lugares cada vez   
    pool2 = tf.nn.max_pool(relu_conv1, ksize=[1,3,3,1], strides=[1,2,2,1], padding="SAME", name="pool_layer2")
    
    # normalizacion local (en ingles: local response normalization(lrn)) sirve para normalizar las imagenes.
    norm2 = tf.nn.lrn(pool2, depth_radius=5, bias=2.0, alpha=1e-3, beta=0.75, name="norm2")
    
    
    
    
    # redimensionar a una matriz para poder multiplicar en las capas totalmente conectadas
    reshaped_output = tf.reshape(norm2, [batch_size,-1]) #redimensiona la capa anterior para que quede un vector columna y se pueda multiplicar luego
    reshaped_dim = reshaped_output.get_shape()[1].value # se obtiene la cantidad de nodos de la imagen, servira para conectar el resultado de "norm2" con la capa totalmente conectada
    
    
    
    # primera capa totalmente conectada con la cantidad de nodos de entrada que devuleva"reshaped_dim" y con 384 nodos de salida
    with tf.variable_scope("full1") as scope:
        full_weight1 = truncated_normal_var(name="full_mult1", shape=[reshaped_dim,384], dtype=tf.float32)
        full_bias1 = zero_var(name="full_bias1", shape=[384], dtype=tf.float32)
        full_layer1 = tf.nn.relu(tf.add(tf.matmul(reshaped_output, full_weight1), full_bias1))
        
        
        
    # segunda capa totalmente conectada con 384 nodos de entrada y con 192 nodos de salida
    with tf.variable_scope("full2") as scope:
        full_weight2 = truncated_normal_var(name="full_mult2", shape=[384,192], dtype=tf.float32)
        full_bias2 = zero_var(name="full_bias2", shape=[192], dtype=tf.float32)
        full_layer2 = tf.nn.relu(tf.add(tf.matmul(full_layer1, full_weight2), full_bias2))    
    
    
    
    # tercera capa totalmente conectada con 192 nodos de entrada y con 10 nodos de salida (num_targets=10)
    with tf.variable_scope("full3") as scope:
        full_weight3 = truncated_normal_var(name="full_mult3", shape=[192,10], dtype=tf.float32)
        full_bias3 = zero_var(name="full_bias3", shape=[10], dtype=tf.float32)
        final_output = tf.nn.relu(tf.add(tf.matmul(full_layer2, full_weight3), full_bias3))

    return final_output

In [9]:
def cifar_loss(logits, targets):
    targets = tf.squeeze(tf.cast(targets, tf.int32)) #elimino las dimensiones inecesarias con squeeze
    cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=targets)
    cross_entropy_mean = tf.reduce_mean(cross_entropy)
    return cross_entropy_mean

In [17]:
def train_step(loss_value, generation_num):
    model_learning_rate = tf.train.exponential_decay(learning_rate, generation_num, num_gens_to_wait, lr_decay, staircase=True)
    optimizador = tf.train.GradientDescentOptimizer(model_learning_rate)
    train_step = optimizador.minimize(loss_value)
    return train_step

In [11]:
def accuracy_from_batches(logits, targets):
    targets = tf.squeeze(tf.cast(targets, tf.int32)) #elimino las dimensiones inecesarias con squeeze
    batch_predictions = tf.cast(tf.argmax(logits,1), tf.int32) # los logits contienen probabilidades de los targets, con "tf.argmax" seleccionamos la mas alta
    predicted_correctly = tf.equal(batch_predictions, targets) # si son iguales devuelve 1, si son diferentes devuelve 0
    accuracy = tf.reduce_mean(tf.cast(predicted_correctly, tf.float32)) # convertimos los V a 1 y los F a 0, y hacemos el promedio
    return accuracy

## Entrenamiento

In [15]:
images, targets = input_pipeline(batch_size, train_logical=True) # para entrenamiento
test_images, test_targets = input_pipeline(batch_size, train_logical=False) # para test

In [13]:
with tf.variable_scope("model_definition") as scope:
    model_output = cifar_cnn_model(images, batch_size)
    scope.reuse_variables()
    test_output = cifar_cnn_model(test_images, batch_size)

In [18]:
loss = cifar_loss(model_output, targets) # perdida en entrenamiento
accuracy = accuracy_from_batches(test_output, test_targets) # accuracy en test
generation_num = tf.Variable(0, trainable=False) # indica en que generacion vamos, inicia en 0
train_op = train_step(loss, generation_num) # se ejecuta el entrenamiento

In [19]:
init = tf.global_variables_initializer()
session.run(init)

In [20]:
tf.train.start_queue_runners(sess=session)

Instructions for updating:
To construct input pipelines, use the `tf.data` module.


[<Thread(QueueRunnerThread-input_producer-input_producer/input_producer_EnqueueMany, started daemon 2912)>,
 <Thread(QueueRunnerThread-shuffle_batch/random_shuffle_queue-shuffle_batch/random_shuffle_queue_enqueue, started daemon 8892)>,
 <Thread(QueueRunnerThread-input_producer_1-input_producer_1/input_producer_1_EnqueueMany, started daemon 3988)>,
 <Thread(QueueRunnerThread-shuffle_batch_1/random_shuffle_queue-shuffle_batch_1/random_shuffle_queue_enqueue, started daemon 7308)>,
 <Thread(QueueRunnerThread-input_producer_2-input_producer_2/input_producer_2_EnqueueMany, started daemon 7792)>,
 <Thread(QueueRunnerThread-shuffle_batch_2/random_shuffle_queue-shuffle_batch_2/random_shuffle_queue_enqueue, started daemon 7356)>,
 <Thread(QueueRunnerThread-input_producer_3-input_producer_3/input_producer_3_EnqueueMany, started daemon 4216)>,
 <Thread(QueueRunnerThread-shuffle_batch_3/random_shuffle_queue-shuffle_batch_3/random_shuffle_queue_enqueue, started daemon 2112)>,
 <Thread(QueueRunnerTh

In [25]:
train_loss = []
test_acc = []
for i in range(20000):
    _, loss_value = session.run([train_op, loss])
    if(i+1)%50==0:
        train_loss.append(loss_value)
        print("paso {}, perdida: {:.5f}" .format((i+1), loss_value))

    if(i+1)%500==0:
        [temp_acc] = session.run([accuracy])
        test_acc.append(temp_acc)
        print("--- precision en test: {:.2f}%. ---" .format(100*temp_acc))

paso 1, perdida: 2.30258
paso 2, perdida: 2.30258
paso 3, perdida: 2.30258
paso 4, perdida: 2.30258
paso 5, perdida: 2.30258
--- precision en test: 5.47%. ---
paso 6, perdida: 2.30258
paso 7, perdida: 2.30258
paso 8, perdida: 2.30258
paso 9, perdida: 2.30258
paso 10, perdida: 2.30258
--- precision en test: 10.16%. ---
paso 11, perdida: 2.30258
paso 12, perdida: 2.30258
paso 13, perdida: 2.30258
paso 14, perdida: 2.30258
paso 15, perdida: 2.30258
--- precision en test: 3.12%. ---
paso 16, perdida: 2.30258
paso 17, perdida: 2.30258
paso 18, perdida: 2.30258
paso 19, perdida: 2.30258
paso 20, perdida: 2.30258
--- precision en test: 10.94%. ---
paso 21, perdida: 2.30258
paso 22, perdida: 2.30258
paso 23, perdida: 2.30258
paso 24, perdida: 2.30258
paso 25, perdida: 2.30258
--- precision en test: 7.81%. ---
paso 26, perdida: 2.30258
paso 27, perdida: 2.30258
paso 28, perdida: 2.30258
paso 29, perdida: 2.30258
paso 30, perdida: 2.30258
--- precision en test: 9.38%. ---
paso 31, perdida: 2.302

KeyboardInterrupt: 