In [1]:
import numpy as np
import codecs
import re
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils

Using TensorFlow backend.


In [2]:
filename = "../indie/ALL.txt"

# Splits a line on runs of punctuation/whitespace. The capturing group makes
# re.split() return the separator runs as well, so they become tokens too.
# Raw string: the regex escapes (\. \? \( \) \s) must reach the regex engine
# untouched instead of relying on Python leaving unknown string escapes alone.
token_splitter = re.compile(r'([\.\?\n\t\(\)\s,!"]+)')

# First pass over the file: measure its size and build the vocabulary.
# Ids start at 1, so id 0 is never assigned to any token.
token_to_idx = {}
total_size = 0  # number of characters read (lines are decoded as UTF-8)
with codecs.open(filename, 'r', 'utf-8') as f:
    for line in f:
        total_size += len(line)
        for piece in token_splitter.split(line):
            if piece == '':
                # re.split() yields '' when a separator sits at a line edge
                continue
            w = piece.lower()
            if w not in token_to_idx:
                token_to_idx[w] = len(token_to_idx) + 1

In [3]:
# Vocabulary size: number of distinct lower-cased tokens in token_to_idx.
len(token_to_idx)
# TODO: prune uninformative tokens. Empty strings are already dropped while
# tokenizing; separator runs such as u' ' and u', ' still get their own ids.

3577

In [4]:
# Inspect the token -> id mapping (displays the whole dictionary).
token_to_idx

{u'daremo': 468,
 u'assurde': 2903,
 u'unica': 2106,
 u'ciao': 1296,
 u'dovrebbe': 1852,
 u'd\xe9j\xe0': 1448,
 u'imitavo': 1748,
 u'jihad': 471,
 u'sleep': 1957,
 u'francesco': 479,
 u'coprirti': 1432,
 u'abbaia': 3573,
 u'cambiami': 3274,
 u'attimo': 1003,
 u'bastata': 879,
 u'porno': 1207,
 u'piange': 1311,
 u'presente': 2974,
 u'poco': 655,
 u"l'ultima": 2799,
 u"rock'n'roll": 1683,
 u'loro': 129,
 u'fronte': 2986,
 u'chicco': 1733,
 u'libert\xe0': 499,
 u'tremore': 3364,
 u'riso': 1734,
 u'annoiavo': 338,
 u'felpa': 2554,
 u'figa': 1654,
 u'figo': 1750,
 u'buche': 3422,
 u'legami': 3329,
 u'calcio': 847,
 u'scelto': 2855,
 u"dell'altro": 2414,
 u'chiamami': 2452,
 u'impazzire': 3136,
 u'poche': 3030,
 u'vederti': 2265,
 u'l\u2019aria': 3142,
 u'remi': 543,
 u'piccioni': 2083,
 u"d'auto": 1799,
 u'appeso': 642,
 u'pavimento': 2586,
 u'strettie': 3220,
 u'reggio-calabria': 730,
 u'dandomi': 2616,
 u'sincronizzato': 1612,
 u'andr\xf2': 1310,
 u'occhiate': 3381,
 u'valgono': 696,
 u's

In [17]:
# Second pass over the file: turn the whole text into a flat list of token
# ids using the vocabulary in token_to_idx. The split pattern and the
# lower-casing must stay identical to the vocabulary-building pass, otherwise
# the dictionary lookup below raises KeyError.
# total_size is deliberately NOT updated here: it was already accumulated
# during the first pass, and adding the line lengths again would double it
# (and inflate it further each time this cell is re-run).
token_pattern = re.compile(r'([\.\?\n\t\(\)\s,!"]+)')
token_text = []
with codecs.open(filename, 'r', 'utf-8') as f:
    for line in f:
        token_text.extend(
            token_to_idx[piece.lower()]
            for piece in token_pattern.split(line)
            if piece != ''
        )

In [18]:
# Total number of tokens in the corpus (separator runs are tokens too).
len(token_text)

61580

In [19]:
# prepare the dataset of input sequences (10 words) to output (1 word) pairs
seq_length = 10 # length of word sequences used for predictions
dataX = []
dataY = []
for i in range(0, len(token_text) - seq_length):
    seq_in = token_text[i:i + seq_length]
    seq_out = token_text[i + seq_length]
    dataX.append(seq_in)
    dataY.append(seq_out)
n_patterns = len(dataX)
print "Total Patterns: ", n_patterns



Total Patterns:  61570


In [20]:
# Sanity check: show the first input window and the token id it should predict.
print dataX[0], '-->', dataY[0]

[1, 2, 3, 2, 4, 2, 5, 2, 6, 2] --> 7


In [21]:
# Convert the Python lists to numpy arrays.
# X is shaped (n_patterns, seq_length, 1): one value per step, namely the raw
# integer token id (no scaling is applied here).
X = np.array(dataX).reshape((n_patterns, seq_length, 1))
y = np.array(dataY)

In [11]:
# Expect (n_patterns, seq_length, 1).
X.shape

(61570, 10, 1)

In [12]:
# Training hyperparameters.
n_neurons = seq_length  # LSTM units; tied to the window length here
n_epoch = 20  # value passed as `epochs` to model.fit
n_batch = 128  # value passed as `batch_size` to model.fit

In [13]:
# One-hot encode the targets for the softmax / categorical cross-entropy head.
# Encode from dataY rather than from y itself so that re-running this cell is
# idempotent: feeding an already one-hot y back into to_categorical would
# produce an array with the wrong shape.
# Token ids start at 1, so column 0 of the result is never set.
y = np_utils.to_categorical(np.array(dataY))

In [14]:
# Build the network: one LSTM layer over the token-id window, dropout for
# regularisation, then a softmax over every possible token id.
model = Sequential()
# Per-sample input shape is (seq_length, 1): one token id per step.
model.add(LSTM(n_neurons, input_shape=X.shape[1:]))
model.add(Dropout(0.2))
# y is one-hot, so its width is the number of output classes.
model.add(Dense(y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')
# summary() prints the table itself and returns None; wrapping it in print()
# only appends a stray "None" line to the output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 10)                480       
_________________________________________________________________
dropout_1 (Dropout)          (None, 10)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 3578)              39358     
Total params: 39,838
Trainable params: 39,838
Non-trainable params: 0
_________________________________________________________________
None


In [15]:
# Checkpointing: monitor the training loss and keep only improvements
# (save_best_only=True with mode='min').
# The file name template embeds the epoch number and that epoch's loss.
filepath="model-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True,mode='min')
callbacks_l = [checkpoint]

In [16]:
# train LSTM
# The checkpoint callback in callbacks_l writes model files during training.
model.fit(X, y, epochs=n_epoch, batch_size=n_batch, verbose=2, callbacks=callbacks_l)

Epoch 1/20
 - 44s - loss: 5.7601

Epoch 00001: loss improved from inf to 5.76015, saving model to model-01-5.7601.hdf5
Epoch 2/20
 - 52s - loss: 4.3195

Epoch 00002: loss improved from 5.76015 to 4.31954, saving model to model-02-4.3195.hdf5
Epoch 3/20
 - 46s - loss: 4.2913

Epoch 00003: loss improved from 4.31954 to 4.29127, saving model to model-03-4.2913.hdf5
Epoch 4/20
 - 44s - loss: 4.0876

Epoch 00004: loss improved from 4.29127 to 4.08764, saving model to model-04-4.0876.hdf5
Epoch 5/20
 - 40s - loss: 3.8743

Epoch 00005: loss improved from 4.08764 to 3.87427, saving model to model-05-3.8743.hdf5
Epoch 6/20
 - 40s - loss: 3.8021

Epoch 00006: loss improved from 3.87427 to 3.80207, saving model to model-06-3.8021.hdf5
Epoch 7/20
 - 40s - loss: 3.7615

Epoch 00007: loss improved from 3.80207 to 3.76150, saving model to model-07-3.7615.hdf5
Epoch 8/20
 - 40s - loss: 3.7406

Epoch 00008: loss improved from 3.76150 to 3.74055, saving model to model-08-3.7406.hdf5
Epoch 9/20
 - 40s - 

<keras.callbacks.History at 0x11cb148d0>

In [22]:
# Reverse lookup table: token id -> token string.
idx_to_token = dict(
    (idx, token) for token, idx in token_to_idx.iteritems()
)

In [25]:
# Inspect the id -> token mapping (displays the whole dictionary).
idx_to_token

{1: u'lo',
 2: u' ',
 3: u'sai',
 4: u'che',
 5: u'la',
 6: u'tachipirina',
 7: u'500',
 8: u'se',
 9: u'ne',
 10: u'prendi',
 11: u'due',
 12: u'\n',
 13: u'diventa',
 14: u'1000',
 15: u'si',
 16: u'vede',
 17: u'hai',
 18: u'provato',
 19: u'qualcosina',
 20: u'parlano',
 21: u'le',
 22: u'tue',
 23: u'pupille',
 24: u'e',
 25: u'adesso',
 26: u'mi',
 27: u'per',
 28: u'mano',
 29: u'vacci',
 30: u'piano',
 31: u'stringi',
 32: u'cos\xec',
 33: u'io',
 34: u'sento',
 35: u'il',
 36: u'cuore',
 37: u'a',
 38: u'mille',
 39: u'duomo',
 40: u'di',
 41: u'milano',
 42: u'\xe8',
 43: u'un',
 44: u'paracetamolo',
 45: u'sempre',
 46: u'pronto',
 47: u'tonsille',
 48: u'domani',
 49: u'non',
 50: u'lavoro',
 51: u'puoi',
 52: u'venire',
 53: u"po'",
 54: u'da',
 55: u'me',
 56: u'ma',
 57: u'poi',
 58: u'vieni',
 59: u'mai',
 60: u'te',
 61: u', ',
 62: u'versailles',
 63: u'canto',
 64: u'gabbiano',
 65: u'dentro',
 66: u'mia',
 67: u'siamo',
 68: u'in',
 69: u'metro',
 70: u'o',
 71: u't

In [32]:
# Spot check: the token stored under id 100.
idx_to_token[100]

u'ferro'

In [36]:
n_vocab = len(token_to_idx)
# pick a random seed
start = np.random.randint(0, len(dataX)-1)
pattern = dataX[start]
print "Seed:"
print "\"", ''.join([idx_to_token[value] for value in pattern]), "\""
# generate characters
for i in range(10):
    x = np.reshape(pattern, (1, len(pattern), 1))
    x = x / float(n_vocab)
    prediction = model.predict(x, verbose=0)
    index = np.argmax(prediction)
    result = idx_to_token[index]
    #print "DBG ---- Pred:", result, "for", x
    seq_in = [idx_to_token[value] for value in pattern]
    print result,
    pattern.append(index)
    pattern = pattern[1:len(pattern)]
print "\nDone."

Seed:
" giorno pagherò il conto
quello  "
che che che che che che che che che che 
Done.


In [37]:
# Debug leftover: `value` is the loop variable of a list comprehension in the
# generation cell, still visible here because Python 2 list comprehensions
# leak their loop variable into the enclosing scope.
value

13