In [1]:
import numpy
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils
import pandas as pd


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Locations are relative to the notebook's working directory.
root_path = "../"
path_data_clean = root_path + "data/clean/"

# path_data_clean already ends with a separator, so the file name is appended
# without a leading "/" (avoids building ".../clean//dfNoticiasCleanV2.p").
# NOTE(review): read_pickle can execute arbitrary code while unpickling --
# only load pickle files that come from a trusted source.
dfNoticias = pd.read_pickle(path_data_clean + "dfNoticiasCleanV2.p")

# Summarise columns, non-null counts and memory usage of the loaded frame.
dfNoticias.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 351681 entries, 0 to 367147
Data columns (total 16 columns):
Cuerpo       351681 non-null object
Fecha        351681 non-null object
Hora         351681 non-null object
ID           351681 non-null object
Resumen      350517 non-null object
Seccion_1    351681 non-null object
Seccion_2    187037 non-null object
Seccion_3    59174 non-null object
Subtema_1    255959 non-null object
Subtema_2    114189 non-null object
Subtema_3    39513 non-null object
Tema_1       349253 non-null object
Tema_2       179833 non-null object
Tema_3       57572 non-null object
Titular      351681 non-null object
Type         351680 non-null object
dtypes: object(16)
memory usage: 45.6+ MB


In [3]:
# Build the training corpus by concatenating the article bodies ("Cuerpo") of
# every row whose index *label* is below the cutoff. The filter is on the
# label, not the row position, and rows keep their order in the frame.
max_index_label = 700

# A single boolean-mask selection plus str.join replaces a Python-level
# iterrows() pass over the whole frame with repeated string concatenation.
raw_text = "".join(dfNoticias.loc[dfNoticias.index < max_index_label, "Cuerpo"])

In [4]:
# Normalise the corpus to lower case.
raw_text = raw_text.lower()

# Distinct characters in sorted order; each one is mapped to its position in
# that ordering, which serves as its integer code.
chars = sorted(set(raw_text))
char_to_int = {symbol: code for code, symbol in enumerate(chars)}

In [5]:
# Corpus length in characters and number of distinct characters.
n_chars, n_vocab = len(raw_text), len(chars)
print("Total Characters: ", n_chars)
print("Total Vocab: ", n_vocab)

Total Characters:  1118101
Total Vocab:  90


In [6]:


# Slide a fixed-length window over the corpus one character at a time.
# Each window of `seq_length` characters (as integer codes) is an input, and
# the character immediately after the window is its target.
seq_length = 100
window_starts = range(n_chars - seq_length)
dataX = [
    [char_to_int[symbol] for symbol in raw_text[begin:begin + seq_length]]
    for begin in window_starts
]
dataY = [char_to_int[raw_text[begin + seq_length]] for begin in window_starts]
n_patterns = len(dataX)
print("Total Patterns: ", n_patterns)


Total Patterns:  1118001


In [7]:


# Reshape X to [samples, time steps, features]; each time step carries a
# single feature, the character's integer code.
X = numpy.reshape(dataX, (n_patterns, seq_length, 1))
# Scale the codes by the vocabulary size (codes run from 0 to n_vocab - 1).
X = X / float(n_vocab)
# One-hot encode the targets. The width is pinned to n_vocab so the output
# layer always covers the full vocabulary, even if the highest-coded
# character never occurs as a target (to_categorical would otherwise infer
# the width from max(dataY) + 1).
y = np_utils.to_categorical(dataY, num_classes=n_vocab)


In [8]:


# Two stacked 512-unit LSTM layers, each followed by 50% dropout, feeding a
# softmax layer with one output per target class. The first LSTM returns its
# full output sequence so the second LSTM receives one vector per time step.
# (An earlier variant used a single 256-unit LSTM layer.)
model = Sequential([
    LSTM(512, input_shape=X.shape[1:], return_sequences=True),
    Dropout(0.5),
    LSTM(512),
    Dropout(0.5),
    Dense(y.shape[1], activation='softmax'),
])
model.compile(loss='categorical_crossentropy', optimizer='adam')




In [9]:
# Checkpointing: the weights are written out whenever the training loss
# reaches a new minimum; the file name embeds the epoch number and the loss.
filepath = "weights-improvement-{epoch:02d}-{loss:.4f}-test1.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    verbose=1,
    save_best_only=True,
    mode='min',
)
callbacks_list = [checkpoint]

Concatenating all pg essays over the last ~5 years, we get an approximately 1MB text file, or about 1 million characters (this is considered a very small dataset, by the way). Technical: Let’s train a 2-layer LSTM with 512 hidden nodes (approx. 3.5 million parameters), and with dropout of 0.5 after each layer. We’ll train with batches of 100 examples and truncated backpropagation through time of length 100 characters. With these settings, one batch on a TITAN Z GPU takes about 0.46 seconds (this can be cut in half with 50-character BPTT at negligible cost in performance). Without further ado, let’s see a sample from the RNN:

In [10]:
import time

print("start")
# `start` must be set before training: the elapsed-time report below
# subtracts it from `end`, and leaving it undefined raises NameError once
# fit() returns.
start = time.time()

# Train for 50 epochs in batches of 128; the checkpoint callback saves the
# weights each time the training loss improves.
model.fit(X, y, epochs=50, batch_size=128, callbacks=callbacks_list)

end = time.time()
print("Training time :" + str(end - start) )

start
Epoch 1/50

Epoch 00001: loss improved from inf to 2.61581, saving model to weights-improvement-01-2.6158-bigger.hdf5
Epoch 2/50

Epoch 00002: loss improved from 2.61581 to 2.23013, saving model to weights-improvement-02-2.2301-bigger.hdf5
Epoch 3/50

Epoch 00003: loss improved from 2.23013 to 2.03921, saving model to weights-improvement-03-2.0392-bigger.hdf5
Epoch 4/50

Epoch 00004: loss improved from 2.03921 to 1.92088, saving model to weights-improvement-04-1.9209-bigger.hdf5
Epoch 5/50

Epoch 00005: loss improved from 1.92088 to 1.84127, saving model to weights-improvement-05-1.8413-bigger.hdf5
Epoch 6/50

Epoch 00006: loss improved from 1.84127 to 1.78219, saving model to weights-improvement-06-1.7822-bigger.hdf5
Epoch 7/50

Epoch 00007: loss improved from 1.78219 to 1.73581, saving model to weights-improvement-07-1.7358-bigger.hdf5
Epoch 8/50

Epoch 00008: loss improved from 1.73581 to 1.69815, saving model to weights-improvement-08-1.6982-bigger.hdf5
Epoch 9/50

Epoch 0000

KeyboardInterrupt: 