RNN
pchavanne committed Jan 31, 2017
1 parent 75fdeda commit 981804b
Showing 3 changed files with 90 additions and 124 deletions.
91 changes: 42 additions & 49 deletions examples/lstm_example.py
@@ -5,83 +5,76 @@
This example shows how to train an LSTM for text generation.
"""
import os
import numpy as np
import yadll

import logging

logging.basicConfig(level=logging.DEBUG, format='%(message)s')

# load the data
datafile = 'nietzsche.txt'
if not os.path.isfile(datafile):
import urllib
origin = 'https://s3.amazonaws.com/text-datasets/nietzsche.txt'
print 'Downloading data from %s' % origin
urllib.urlretrieve(origin, datafile)
data = yadll.data.Data(datafile)
# Create the data
alphabet = 'abcdefghijklmnopqrstuvwxyz'
number_of_chars = len(alphabet)
sequence_length = 2
sentences = [alphabet[i: i + sequence_length] for i in range(len(alphabet) - sequence_length)]
next_chars = [alphabet[i + sequence_length] for i in range(len(alphabet) - sequence_length)]

# Transform sequences and labels into 'one-hot' encoding
X = np.zeros((len(sentences), sequence_length, number_of_chars), dtype=np.bool)
y = np.zeros((len(sentences), number_of_chars), dtype=np.bool)
for i, sentence in enumerate(sentences):
for t, char in enumerate(sentence):
X[i, t, ord(char) - ord('a')] = 1
y[i, ord(next_chars[i]) - ord('a')] = 1
data = yadll.data.Data(data=[(X, y), (X, y), (X, y)])

# create the model
model = yadll.model.Model(name='lstm', data=data)

# Hyperparameters
hp = yadll.hyperparameters.Hyperparameters()
hp('batch_size', 128)
hp('n_epochs', 1000)
hp('learning_rate', 0.9)
hp('momentum', 0.5)
hp('l1_reg', 0.00)
hp('l2_reg', 0.0000)
hp('patience', 10000)
hp('batch_size', 1)
hp('n_epochs', 60)

# add the hyperparameters to the model
model.hp = hp

# Create connected layers
# Input layer
l_in = yadll.layers.InputLayer(input_shape=(hp.batch_size, 28 * 28), name='Input')
# Dropout Layer 1
l_dro1 = yadll.layers.Dropout(incoming=l_in, corruption_level=0.4, name='Dropout 1')
# Dense Layer 1
l_hid1 = yadll.layers.DenseLayer(incoming=l_dro1, n_units=100, W=yadll.init.glorot_uniform,
l1=hp.l1_reg, l2=hp.l2_reg, activation=yadll.activations.relu,
name='Hidden layer 1')
# Dropout Layer 2
l_dro2 = yadll.layers.Dropout(incoming=l_hid1, corruption_level=0.2, name='Dropout 2')
# Dense Layer 2
l_hid2 = yadll.layers.DenseLayer(incoming=l_dro2, n_units=100, W=yadll.init.glorot_uniform,
l1=hp.l1_reg, l2=hp.l2_reg, activation=yadll.activations.relu,
name='Hidden layer 2')
l_in = yadll.layers.InputLayer(input_shape=(hp.batch_size, sequence_length, number_of_chars))
# LSTM 1
l_lstm1 = yadll.layers.LSTM(incoming=l_in, n_units=16, last_only=False)
# LSTM 2
l_lstm2 = yadll.layers.LSTM(incoming=l_lstm1, n_units=16)
# Logistic regression Layer
l_out = yadll.layers.LogisticRegression(incoming=l_hid2, n_class=10, l1=hp.l1_reg,
l2=hp.l2_reg, name='Logistic regression')
l_out = yadll.layers.LogisticRegression(incoming=l_lstm2, n_class=number_of_chars)

# Create network and add layers
net = yadll.network.Network('2 layers mlp with dropout')
net = yadll.network.Network('stacked lstm')
net.add(l_in)
net.add(l_dro1)
net.add(l_hid1)
net.add(l_dro2)
net.add(l_hid2)
net.add(l_lstm1)
net.add(l_lstm2)
net.add(l_out)

# add the network to the model
model.network = net

# updates method
model.updates = yadll.updates.newton
model.updates = yadll.updates.adam

# train the model and save it to file at each best
model.train()

# saving network paramters
net.save_params('net_params.yp')

# make prediction
# We can test it on some examples from test
test_set_x = data.test_set_x.get_value()
test_set_y = data.test_set_y.eval()

predicted_values = model.predict(test_set_x[:30])
# prime the model with 'ab' sequence and let it generate the learned alphabet
sentence = alphabet[:sequence_length]
generated = sentence
for iteration in range(number_of_chars - sequence_length):
x = np.zeros((1, sequence_length, number_of_chars))
for t, char in enumerate(sentence):
x[0, t, ord(char) - ord('a')] = 1.
preds = model.predict(x)[0]
next_char = chr(np.argmax(preds) + ord('a'))
generated += next_char
sentence = sentence[1:] + next_char

print ("Model 1, predicted values for the first 30 examples in test set:")
print predicted_values
print test_set_y[:30]
# check that it did generate the alphabet correctly
assert(generated == alphabet)
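
For readers unfamiliar with the encoding and decoding used above, the following standalone NumPy sketch mirrors the example's one-hot layout and greedy generation loop. It replaces model.predict with a hypothetical predict_fn stub (a simple table lookup into the training windows), so it runs without yadll or a trained network.

import numpy as np

alphabet = 'abcdefghijklmnopqrstuvwxyz'
number_of_chars = len(alphabet)
sequence_length = 2

# One-hot encode every two-character window and the character that follows it,
# exactly as in the example above.
sentences = [alphabet[i: i + sequence_length] for i in range(len(alphabet) - sequence_length)]
next_chars = [alphabet[i + sequence_length] for i in range(len(alphabet) - sequence_length)]
X = np.zeros((len(sentences), sequence_length, number_of_chars), dtype=bool)
y = np.zeros((len(sentences), number_of_chars), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        X[i, t, ord(char) - ord('a')] = 1
    y[i, ord(next_chars[i]) - ord('a')] = 1
assert X.shape == (24, 2, 26) and y.shape == (24, 26)

# Hypothetical stand-in for model.predict: look the window up in the training
# data instead of running an LSTM forward pass.
def predict_fn(x):
    idx = int(np.argmax((X == x.astype(bool)).all(axis=(1, 2))))
    return y[idx:idx + 1].astype(float)

# Greedy generation, as in the example: prime with 'ab', then repeatedly feed
# back the most probable next character.
sentence = alphabet[:sequence_length]
generated = sentence
for _ in range(number_of_chars - sequence_length):
    x = np.zeros((1, sequence_length, number_of_chars))
    for t, char in enumerate(sentence):
        x[0, t, ord(char) - ord('a')] = 1.
    preds = predict_fn(x)[0]
    next_char = chr(int(np.argmax(preds)) + ord('a'))
    generated += next_char
    sentence = sentence[1:] + next_char
assert generated == alphabet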
6 changes: 4 additions & 2 deletions yadll/model.py
@@ -79,12 +79,13 @@ class Model(object):
"""
def __init__(self, network=None, data=None, hyperparameters=None, name='model',
updates=sgd, file=None):
updates=sgd, objective=categorical_crossentropy, file=None):
self.network = network
self.data = data # data [(train_set_x, train_set_y), (valid_set_x, valid_set_y), (test_set_x, test_set_y)]
self.name = name
self.hp = hyperparameters
self.updates = updates
self.objective = objective
self.file = file
self.save_mode = None # None, 'end' or 'each'
self.index = T.iscalar() # index to a [mini]batch
@@ -170,7 +171,8 @@ def train(self, unsupervised_training=True, save_mode=None):
n_valid_batches = self.data.valid_set_x.get_value(borrow=True).shape[0] / self.hp.batch_size
n_test_batches = self.data.test_set_x.get_value(borrow=True).shape[0] / self.hp.batch_size

cost = -T.mean(T.log(self.network.get_output(stochastic=True))[T.arange(self.y.shape[0]), self.y])
#cost = -T.mean(T.log(self.network.get_output(stochastic=True))[T.arange(self.y.shape[0]), self.y])
cost = - self.objective(prediction=self.network.get_output(stochastic=True), target=self.y)
# add regularisation
cost += self.network.reguls
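
The new objective argument (defaulting to categorical cross-entropy) turns the loss into something callers inject, instead of the negative log-likelihood formerly hard-coded in train(). Below is a minimal, framework-free sketch of the same pattern in plain NumPy, using hypothetical names (ToyModel, mse) rather than yadll's actual classes.

import numpy as np

def mse(prediction, target):
    # Same formula as mean_squared_error in yadll/objectives.py, in NumPy.
    return np.mean(np.square(prediction - target), axis=-1)

class ToyModel(object):
    # Hypothetical stand-in illustrating the pluggable-objective pattern.
    def __init__(self, objective=mse):
        self.objective = objective

    def cost(self, prediction, target):
        # The training loop only ever calls self.objective, so swapping the
        # loss never requires editing the loop itself.
        return self.objective(prediction=prediction, target=target).mean()

prediction = np.array([[0.8, 0.1, 0.1]])
target = np.array([[1.0, 0.0, 0.0]])
print(ToyModel().cost(prediction, target))                # default objective
print(ToyModel(objective=mse).cost(prediction, target))   # explicitly injected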

117 changes: 44 additions & 73 deletions yadll/objectives.py
@@ -10,122 +10,93 @@

def mean_squared_error(prediction, target):
r"""
Mean Squared Error
Mean Squared Error: MSE
.. math:: MSE_i = \frac{1}{n} \sum_{j}{(target_{i,j} - prediction_{i,j})^2}
.. math:: MSE_i = \frac{1}{n} \sum_{j}{(prediction_{i,j} - target_{i,j})^2}
Parameters
----------
prediction : Theano tensor
The predicted values
target : Theano tensor
The target values
Returns
-------
MSE
"""
return T.mean(T.square(prediction - target), axis=-1)


def root_mean_squared_error(prediction, target):
r"""
Root Mean Squared Error
Root Mean Squared Error: RMSE
.. math:: RMSE_i = \sqrt{\frac{1}{n} \sum_{j}{(target_{i,j} - prediction_{i,j})^2}}
Parameters
----------
prediction : Theano tensor
The predicted values
target : Theano tensor
The target values
Returns
-------
RMSE
"""
return T.sqrt(T.mean(T.square(prediction - target), axis=-1))


def mean_absolute_error(prediction, target):
r"""
Mean Absolute Error
Mean Absolute Error: MAE
.. math:: MAE_i = \frac{1}{n} \sum_{j}{\big|{target_{i,j} - prediction_{i,j}}}
.. math:: MAE_i = \frac{1}{n} \sum_{j}{\big|target_{i,j} - prediction_{i,j}\big|}
Parameters
----------
prediction : Theano tensor
The predicted values
target : Theano tensor
The target values
Returns
-------
MAE
"""
return T.mean(T.abs_(prediction - target), axis=-1)


def hinge(prediction, target):
def binary_hinge_error(prediction, target):
r"""
Hinge Error
Binary Hinge Error: BHE
.. math:: hinge_i = \frac{1}{n} \sum_{j}{\max(1. - target_{i,j} * prediction_{i,j}, 0.)}
"""
return T.mean(T.maximum(1. - target * prediction, 0.), axis=-1)

.. math:: MAE_i = \frac{1}{n} \sum_{j}{\max(1. - target_{i,j} * prediction_{i,j}, 0.)}

Parameters
----------
prediction : Theano tensor
The predicted values
target : Theano tensor
The target values
def categorical_hinge_error(prediction, target):
r"""
Categorical Hinge Error: CHE
.. math:: hinge_i = \frac{1}{n} \sum_{j}{\max(1. - target_{i,j} * prediction_{i,j}, 0.)}
Returns
-------
Hinge
"""
return T.mean(T.maximum(1. - target * prediction, 0.), axis=-1)


def binary_crossentropy(prediction, target):
def binary_crossentropy_error(prediction, target):
r"""
Binary Crossentropy Error
.. math:: MAE_i = \frac{1}{n} \sum_{j}{\big|{target_{i,j} - prediction_{i,j}}}
Binary Cross-entropy Error: BCE
Parameters
----------
prediction : Theano tensor
The predicted values
target : Theano tensor
The target values
.. math:: BCE_i = \frac{1}{n} \sum_{j}{-(target_{i,j} * \log(prediction_{i,j})
+ (1 - target_{i,j}) * \log(1 - prediction_{i,j}))}
Returns
-------
Binary crossentropy
"""
clip_pred = T.clip(prediction, EPSILON, 1 - EPSILON)
return T.mean(T.nnet.binary_crossentropy(clip_pred, target), axis=-1)


def categorical_crossentropy(prediction, target):
def categorical_crossentropy_error(prediction, target):
r"""
Categorical Crossentropy Error
Categorical Cross-entropy Error: CCE
.. math:: CCE_i = - \sum_{j}{target_{i,j} \log(prediction_{i,j})}
"""
prediction /= prediction.sum(axis=-1, keepdims=True)
prediction = T.clip(prediction, EPSILON, 1 - EPSILON)
return T.mean(T.nnet.categorical_crossentropy(prediction, target), axis=-1)


def kullback_leibler_divergence(prediction, target):
r"""
Kullback Leibler Divergence: KLD
Parameters
----------
prediction : Theano tensor
The predicted values
target : Theano tensor
The target values
.. math:: KLD_i = \sum_{j}{target_{i,j} \log\left(\frac{target_{i,j}}{prediction_{i,j}}\right)}
Returns
-------
Kullback-Leibler divergence
"""
prediction /= prediction.sum(axis=-1, keepdims=True)
prediction = T.clip(prediction, EPSILON, 1 - EPSILON)
return T.mean(T.nnet.categorical_crossentropy(prediction, target), axis=-1)
return T.mean(T.nnet.categorical_crossentropy(prediction, target), axis=-1)

# Aliases
mse = MSE = mean_squared_error
rmse = RMSE = root_mean_squared_error
mae = MAE = mean_absolute_error
bhe = BHE = binary_hinge_error
che = CHE = categorical_hinge_error
bce = BCE = binary_crossentropy_error
cce = CCE = categorical_crossentropy_error
kld = KLD = kullback_leibler_divergence
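
As a quick numeric illustration of the formulas documented above, here is a NumPy re-implementation of two of the losses on a toy batch. This is not the library's Theano code, only a sanity check of the docstring math; the EPSILON value is assumed.

import numpy as np

EPSILON = 1e-7  # assumed clipping constant; yadll defines its own

def mean_squared_error(prediction, target):
    # MSE_i = (1/n) * sum_j (prediction_ij - target_ij)^2
    return np.mean(np.square(prediction - target), axis=-1)

def binary_crossentropy_error(prediction, target):
    # BCE_i = (1/n) * sum_j -(t*log(p) + (1-t)*log(1-p)), with p clipped
    p = np.clip(prediction, EPSILON, 1 - EPSILON)
    return np.mean(-(target * np.log(p) + (1 - target) * np.log(1 - p)), axis=-1)

prediction = np.array([[0.9, 0.1], [0.2, 0.8]])
target = np.array([[1.0, 0.0], [0.0, 1.0]])
print(mean_squared_error(prediction, target))         # ~[0.01, 0.04]
print(binary_crossentropy_error(prediction, target))  # ~[0.105, 0.223]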
