In [1]:
import numpy as np
import codecs
import re
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils

Using TensorFlow backend.


In [43]:
filename = "../indie/ALL.txt"
# First go the file once to see how big it is and to build the vocab
token_to_idx = {}
total_size = 0
with codecs.open(filename, 'r', 'utf-8') as f:
    for line in f:
        total_size += len(line)
        words = filter(lambda a: a != '', re.split('([\.\?\n\t\(\)\s,!"]+)', line))
        for w in words:
            w = w.lower()
            if w not in token_to_idx:
                token_to_idx[w] = len(token_to_idx) + 1

In [44]:
len(token_to_idx)
#TODO clear some useless words (e.g. '')

3577

In [45]:
token_to_idx

{u'daremo': 468,
 u'assurde': 2903,
 u'unica': 2106,
 u'ciao': 1296,
 u'dovrebbe': 1852,
 u'd\xe9j\xe0': 1448,
 u'imitavo': 1748,
 u'jihad': 471,
 u'sleep': 1957,
 u'francesco': 479,
 u'coprirti': 1432,
 u'abbaia': 3573,
 u'cambiami': 3274,
 u'attimo': 1003,
 u'bastata': 879,
 u'porno': 1207,
 u'piange': 1311,
 u'presente': 2974,
 u'poco': 655,
 u"l'ultima": 2799,
 u"rock'n'roll": 1683,
 u'loro': 129,
 u'fronte': 2986,
 u'chicco': 1733,
 u'libert\xe0': 499,
 u'tremore': 3364,
 u'riso': 1734,
 u'annoiavo': 338,
 u'felpa': 2554,
 u'figa': 1654,
 u'figo': 1750,
 u'buche': 3422,
 u'legami': 3329,
 u'calcio': 847,
 u'scelto': 2855,
 u"dell'altro": 2414,
 u'chiamami': 2452,
 u'impazzire': 3136,
 u'poche': 3030,
 u'vederti': 2265,
 u'l\u2019aria': 3142,
 u'remi': 543,
 u'piccioni': 2083,
 u"d'auto": 1799,
 u'appeso': 642,
 u'pavimento': 2586,
 u'strettie': 3220,
 u'reggio-calabria': 730,
 u'dandomi': 2616,
 u'sincronizzato': 1612,
 u'andr\xf2': 1310,
 u'occhiate': 3381,
 u'valgono': 696,
 u's

In [5]:
# now tokenize the whole text file, using the dictionary created
token_text = []
with codecs.open(filename, 'r', 'utf-8') as f:
    for line in f:
        total_size += len(line)
        words = re.split('([\.\?\n\t\(\)\s,!"]+)', line)
        token_text += [token_to_idx[w.lower()] for w in words]

In [6]:
len(token_text)

67369

In [7]:
# prepare the dataset of input sequences (10 words) to output (1 word) pairs
seq_length = 10 # length of word sequences used for predictions
dataX = []
dataY = []
for i in range(0, len(token_text) - seq_length):
    seq_in = token_text[i:i + seq_length]
    seq_out = token_text[i + seq_length]
    dataX.append(seq_in)
    dataY.append(seq_out)
n_patterns = len(dataX)
print "Total Patterns: ", n_patterns



Total Patterns:  67359


In [8]:
print dataX[0], '-->', dataY[0]

[1, 2, 3, 2, 4, 2, 5, 2, 6, 2] --> 7


In [11]:
# create numpy arrays from lists
X = np.reshape(np.array(dataX), (n_patterns, seq_length, 1))
y = np.array(dataY)

In [12]:
X.shape

(67359, 10, 1)

In [23]:
n_neurons = seq_length
n_epoch = 20
n_batch = 128

In [13]:
y = np_utils.to_categorical(y)

In [20]:
# create LSTM
model = Sequential()
model.add(LSTM(n_neurons, input_shape=X[0].shape))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_3 (LSTM)                (None, 10)                480       
_________________________________________________________________
dropout_3 (Dropout)          (None, 10)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 3579)              39369     
Total params: 39,849
Trainable params: 39,849
Non-trainable params: 0
_________________________________________________________________
None


In [22]:
filepath="model-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True,mode='min')
callbacks_l = [checkpoint]

In [24]:
# train LSTM
model.fit(X, y, epochs=n_epoch, batch_size=n_batch, verbose=2, callbacks=callbacks_l)

Epoch 1/20
 - 47s - loss: 5.4965

Epoch 00001: loss improved from inf to 5.49646, saving model to model-01-5.4965.hdf5
Epoch 2/20
 - 43s - loss: 4.1986

Epoch 00002: loss improved from 5.49646 to 4.19860, saving model to model-02-4.1986.hdf5
Epoch 3/20
 - 43s - loss: 3.9364

Epoch 00003: loss improved from 4.19860 to 3.93641, saving model to model-03-3.9364.hdf5
Epoch 4/20
 - 43s - loss: 3.7706

Epoch 00004: loss improved from 3.93641 to 3.77057, saving model to model-04-3.7706.hdf5
Epoch 5/20
 - 43s - loss: 3.6900

Epoch 00005: loss improved from 3.77057 to 3.68997, saving model to model-05-3.6900.hdf5
Epoch 6/20
 - 43s - loss: 3.6492

Epoch 00006: loss improved from 3.68997 to 3.64918, saving model to model-06-3.6492.hdf5
Epoch 7/20
 - 43s - loss: 3.6248

Epoch 00007: loss improved from 3.64918 to 3.62481, saving model to model-07-3.6248.hdf5
Epoch 8/20
 - 43s - loss: 3.6105

Epoch 00008: loss improved from 3.62481 to 3.61049, saving model to model-08-3.6105.hdf5
Epoch 9/20
 - 43s - 

<keras.callbacks.History at 0x115435810>

In [25]:
idx_to_token = dict()
for k in token_to_idx.iteritems():
    idx_to_token[k[1]] = k[0]

In [28]:
len(idx_to_token)

3578

In [29]:
idx_to_token[np.argmax(y[582])]

u'di'

In [42]:
n_vocab = len(token_to_idx)
# pick a random seed
start = np.random.randint(0, len(dataX)-1)
pattern = dataX[start]
print "Seed:"
print "\"", ''.join([idx_to_token[value] for value in pattern]), "\""
# generate characters
for i in range(10):
    x = np.reshape(pattern, (1, len(pattern), 1))
    x = x / float(n_vocab)
    prediction = model.predict(x, verbose=0)
    index = np.argmax(prediction)
    result = idx_to_token[index]
    print "DBG ---- Pred:", result, "for", x
    seq_in = [idx_to_token[value] for value in pattern]
    print result,
    pattern.append(index)
    pattern = pattern[1:len(pattern)]
print "\nDone."

Seed:
" odiarmi se fisso l'arancione dei  "
DBG ---- Pred:  for [[[  9.01062046e-01]
  [  5.58971492e-04]
  [  2.23588597e-03]
  [  5.58971492e-04]
  [  1.41699273e-01]
  [  5.58971492e-04]
  [  9.01341532e-01]
  [  5.58971492e-04]
  [  7.93739519e-02]
  [  5.58971492e-04]]]
 DBG ---- Pred:  for [[[  5.58971492e-04]
  [  2.23588597e-03]
  [  5.58971492e-04]
  [  1.41699273e-01]
  [  5.58971492e-04]
  [  9.01341532e-01]
  [  5.58971492e-04]
  [  7.93739519e-02]
  [  5.58971492e-04]
  [  3.63331470e-03]]]
 DBG ---- Pred:  for [[[  2.23588597e-03]
  [  5.58971492e-04]
  [  1.41699273e-01]
  [  5.58971492e-04]
  [  9.01341532e-01]
  [  5.58971492e-04]
  [  7.93739519e-02]
  [  5.58971492e-04]
  [  3.63331470e-03]
  [  3.63331470e-03]]]
 DBG ---- Pred:  for [[[  5.58971492e-04]
  [  1.41699273e-01]
  [  5.58971492e-04]
  [  9.01341532e-01]
  [  5.58971492e-04]
  [  7.93739519e-02]
  [  5.58971492e-04]
  [  3.63331470e-03]
  [  3.63331470e-03]
  [  3.63331470e-03]]]
 DBG ---- Pred:  for [[[ 

In [37]:
value

13