In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf

import os
from keras import backend as K

# Session options: let TensorFlow grow its GPU memory use on demand instead of
# reserving the whole device up front.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
# Allow ops to fall back to another device when the requested one is unavailable
# (a later cell pins training to '/gpu:0'). NOTE(review): Keras' own default
# session is believed to enable this as well -- confirm for the installed version.
config.allow_soft_placement = True
# A ConfigProto does nothing until it is attached to a session, so register a
# session built from it as the one Keras will use.
K.set_session(tf.Session(config=config))

In [2]:
# Read the whole corpus. The context manager closes the file handle, and the
# explicit encoding stops the non-ASCII letters in the text from depending on the
# platform default (assumes the file is UTF-8 -- TODO confirm).
with open("../input/judita.txt", "r", encoding="utf-8") as corpus_file:
    data = corpus_file.read()

# Fraction of the corpus used for training. An earlier comment said 30%, but the
# slice always covered the entire text; 1.0 keeps that behaviour explicit.
sample_fraction = 1.0
sample = data[:int(len(data) * sample_fraction)]

# Report the sample size and show the first 200 characters as a sanity check.
print("Uzorak ima " + str(len(sample)) + " slova.")
print(sample[:200])

Uzorak ima 81119 slova.
Dike ter hvaljen'ja presvetoj Juditi,
  smina nje stvore(n)'ja hoću govoriti;
  zato ću moliti, Bože, tvoju svitlost,
  ne hti(j) mi kratiti u tom punu milost.
Ti s' on ki da kripost svakomu dilu nje



In [3]:
# Vocabulary: every distinct character that occurs in the sample, in sorted order.
unique_chars = sorted(set(sample))
print("Unikatnih znakova: " + str(len(unique_chars)))

# Two-way lookup between a character and its position in the vocabulary.
char_to_index = {ch: idx for idx, ch in enumerate(unique_chars)}
index_to_char = {idx: ch for idx, ch in enumerate(unique_chars)}

Unikatnih znakova: 66


In [4]:
# Length of each input window and the stride between consecutive windows.
part_len = 40
step = 3

# Start offsets of the overlapping windows: 0, step, 2*step, ... Every offset
# leaves room for one more character after the window.
starts = range(0, len(sample) - part_len, step)

# `parts` holds the windows themselves (each `part_len` characters long) and
# `next_chars` holds, for each window, the single character that follows it --
# the value the model is trained to predict.
parts = [sample[s : s + part_len] for s in starts]
next_chars = [sample[s + part_len] for s in starts]

print("Dijelova: " + str(len(parts)))
# Preview the first three (window, next character) pairs.
preview = [parts[k] + " + " + next_chars[k] for k in range(3)]
for line in preview:
    print(line)

Dijelova: 27027
Dike ter hvaljen'ja presvetoj Juditi,
   + s
e ter hvaljen'ja presvetoj Juditi,
  smi + n
er hvaljen'ja presvetoj Juditi,
  smina  + n


In [5]:
# One-hot encode the inputs and the targets.
# x[i, j, k] is True when character j of window i is vocabulary entry k.
# y[i, k] is True when the character that follows window i is vocabulary entry k.
# The builtin `bool` is used as dtype because the `np.bool` alias was deprecated
# and later removed from NumPy; `bool` works on every NumPy version.
x = np.zeros((len(parts), part_len, len(unique_chars)), dtype=bool)
y = np.zeros((len(parts), len(unique_chars)), dtype=bool)
for i, part in enumerate(parts):
    for j, char in enumerate(part):
        x[i, j, char_to_index[char]] = True
    y[i, char_to_index[next_chars[i]]] = True

# Show the first encoded window and its target as a sanity check.
print(x[:1])
print(y[:1])

[[[False False False ... False False False]
  [False False False ... False False False]
  [False False False ... False False False]
  ...
  [ True False False ... False False False]
  [False False False ... False False False]
  [False False False ... False False False]]]
[[False False False False False False False False False False False False
  False False False False False False False False False False False False
  False False False False False False False False False False False False
  False False False False False False False False False False False False
  False False False False False  True False False False False False False
  False False False False False False]]


In [6]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, LSTM
from keras.optimizers import RMSprop
from keras.callbacks import LambdaCallback, ModelCheckpoint
import random
import sys
import io

# Character-level language model: two stacked LSTM layers followed by a softmax
# over the vocabulary.
model = Sequential()
# The first LSTM returns its full output sequence so the second LSTM receives one
# vector per time step.
model.add(LSTM(256, input_shape=(part_len, len(unique_chars)), return_sequences=True))
model.add(Dropout(0.2))
# Only the first layer of a Sequential model needs `input_shape`; the second LSTM
# takes its input shape from the layer before it.
model.add(LSTM(256))
model.add(Dense(len(unique_chars), activation='softmax'))

# The model has always been compiled with Adam. An RMSprop(lr=0.01) instance used
# to be created here but was never passed to compile(), so it had no effect on
# training. `optimizer` now names the optimizer that is actually used.
optimizer = 'adam'
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

Using TensorFlow backend.


In [7]:
def sample_preds(preds, temperature=1.0):
    """Draw one index from a predicted probability distribution.

    The probabilities are re-weighted as ``p ** (1 / temperature)`` and
    renormalised before sampling, so a low temperature favours the most likely
    entries and a high temperature flattens the distribution.

    Args:
        preds: 1-D array-like of probabilities, one per vocabulary entry.
        temperature: Sampling temperature, expected to be >= 0. A value of
            exactly 0 selects the most probable entry deterministically.

    Returns:
        The selected index, as returned by ``np.argmax``.
    """
    preds = np.asarray(preds).astype('float64')
    if temperature == 0:
        # Dividing by zero below would produce NaNs; the limit of the
        # re-weighting as temperature -> 0 is a greedy pick.
        return np.argmax(preds)
    # log(0) = -inf is the intended value for impossible entries (they end up with
    # probability 0 again), so silence the divide-by-zero warning.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature
    # Shift so the largest logit is 0. Without this, a small temperature makes
    # every exp() underflow to 0 and the normalisation below yields NaN.
    logits -= np.max(logits)
    exp_preds = np.exp(logits)
    preds = exp_preds / np.sum(exp_preds)
    # A single multinomial draw gives a one-hot vector; its argmax is the index.
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [8]:
# Write the model to `filepath` after every epoch in which the training loss
# reaches a new minimum.
filepath = "model.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)

# Training settings, collected in one place so they are easy to adjust.
fit_options = dict(
    batch_size=128,
    epochs=50,
    verbose=2,
    callbacks=[checkpoint],
)

# Run training inside a device scope that requests the first GPU.
with tf.device('/gpu:0'):
    model.fit(x, y, **fit_options)

Epoch 1/50
 - 39s - loss: 3.2223

Epoch 00001: loss improved from inf to 3.22231, saving model to model.hdf5
Epoch 2/50
 - 37s - loss: 2.6190

Epoch 00002: loss improved from 3.22231 to 2.61899, saving model to model.hdf5
Epoch 3/50
 - 37s - loss: 2.3450

Epoch 00003: loss improved from 2.61899 to 2.34498, saving model to model.hdf5
Epoch 4/50
 - 37s - loss: 2.2435

Epoch 00004: loss improved from 2.34498 to 2.24347, saving model to model.hdf5
Epoch 5/50
 - 37s - loss: 2.1755

Epoch 00005: loss improved from 2.24347 to 2.17549, saving model to model.hdf5
Epoch 6/50
 - 37s - loss: 2.1354

Epoch 00006: loss improved from 2.17549 to 2.13543, saving model to model.hdf5
Epoch 7/50
 - 37s - loss: 2.0850

Epoch 00007: loss improved from 2.13543 to 2.08497, saving model to model.hdf5
Epoch 8/50
 - 37s - loss: 2.0449

Epoch 00008: loss improved from 2.08497 to 2.04488, saving model to model.hdf5
Epoch 9/50
 - 37s - loss: 2.0020

Epoch 00009: loss improved from 2.04488 to 2.00203, saving model t

In [9]:
# Pick a random seed window from anywhere in the sample. The upper bound is
# derived from the text length; bounding it by the number of training windows
# (len(x)) restricted seeds to roughly the first 1/step of the text.
start = random.randint(0, len(sample) - part_len - 1)
part = sample[start : start + part_len]
# `generated` accumulates the seed followed by every generated character.
generated = part

for i in range(1000):
    # One-hot encode the current window in the same layout used for training.
    x_pred = np.zeros((1, part_len, len(unique_chars)))
    for j, char in enumerate(part):
        x_pred[0, j, char_to_index[char]] = 1.

    # Predict the distribution of the next character and sample from it.
    preds = model.predict(x_pred, verbose=0)[0]
    next_index = sample_preds(preds, 0.8)
    next_char = index_to_char[next_index]

    generated += next_char
    # Slide the window one character forward so it ends with the new character.
    part = part[1:] + next_char

    # Stream the text as it is produced.
    sys.stdout.write(next_char)
    sys.stdout.flush()

print()

 Bog, poprezaše: "Jer sa nastoriti,
  Adi se ne vriti vazza neće se sferi.
  Bog nere postitaše zna čtu hrudu posti,
  od rezim poljim živom, žive ka na ljih.
Ni ter na ka visi, zimen'ja bustući
  i ter ju vanje visto u puka svitu,
  da je lepaši pro(j)ahu svak ga njih izdite.
Čanost ogonj da njega ke prada jast.
Da to prova žene u sčelju su zi(j)u.
  Njih kim pokora, statim i živo(j)e.
Zdi se vemari prav glavne odsteni,
  trardi se rečari vizše pitil stanje.
Toko di naj bili, kako ga da žilom sud
Ozmore po bita, kazbi se goveriti
Bogu se zrato(j)aše bi jar obraše,
  od njim projim svom mora či svojahu,
  ka Gje u nie stana varve ga predanjih
  neće s' Mrirsajim, Bog jaš na porizaje.
Kad Bog nje poje osta priv je me čili,
  krazbi kako rina, seren'jame, čamo,
  sila kako poti, poglečen'je opo(j)a
  Bog se or stoja da za pro(j)ah vi(j)i
  udimil u razih, leperi jud lipaše,
  u trapik pritivu, hame na vista vase,
  i svaka vadiru o plavce veniše,
  za njega napase dade se mena mom
  i te