In [1]:
import re
import numpy as np
import tensorflow as tf
import time
import helpers
import matplotlib.pyplot as plt
import seaborn as sns

import sys
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils

Using TensorFlow backend.


In [2]:
with open('iron_man.srt', 'r') as fopen:
    text = fopen.read().split('\n')
text = list(filter(None, text))

In [3]:
# Any first character in a string is digit or '\r' or '<', we remove the string
# but we must start from the back, or later python will throw exception because the index no longer exist
for i in reversed(range(len(text))):
    if text[i][0].isdigit() or text[i][0] == '\r' or text[i][0] == '<' or text[i][-1] == '>' or text[i][0] == '-':
        del text[i]
        
# Replace all '\r' with empty character
text = [i.replace('\r', '') for i in text]

# Strip spaces with empty character
text = [i.strip() for i in text]

# Remove non ascii from our string
text = [re.sub(r'[^\x00-\x7F]+','', i) for i in text]

In [4]:
new_text = []
for i in text:
    for j in i.split():
        new_text.append(j)

In [5]:
new_char = []
for i in new_text:
    for j in i:
        new_char.append(j)
    new_char.append(" ")

In [6]:
chars = sorted(list(set(new_text)))
char_to_int = dict((c, i) for i, c in enumerate(chars))
int_to_char = dict((i, c) for i, c in enumerate(chars))

In [7]:
n_chars = len(new_text)
n_vocab = len(new_char)

In [8]:
print("Total Characters: ", n_chars)
print("Total Vocab: ", n_vocab)

Total Characters:  7018
Total Vocab:  46486


In [9]:
seq_length = 100

In [10]:
dataX = []
dataY = []

In [11]:
for i in range(0, n_chars - seq_length, 1):
	seq_in = new_text[i:i + seq_length]
	seq_out = new_text[i + seq_length]
	dataX.append([char_to_int[char] for char in seq_in])
	dataY.append(char_to_int[seq_out])

In [12]:
n_patterns = len(dataX)
print("Total Patterns: ", n_patterns)

Total Patterns:  6918


In [13]:
X = np.reshape(dataX, (n_patterns, seq_length, 1))
X = X / float(n_vocab)
y = np_utils.to_categorical(dataY)

In [14]:
model = Sequential()
model.add(LSTM(32, input_shape=(X.shape[1], X.shape[2]), return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(64))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation='softmax'))
model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(X, y, epochs=2, batch_size=100, verbose=2)

Epoch 1/2
94s - loss: 3.5473e-04
Epoch 2/2
89s - loss: 3.5473e-04


<keras.callbacks.History at 0x128026cf8>

In [15]:
start = np.random.randint(0, len(dataX)-1)
pattern = dataX[start]
print("Seed:")
print(' '.join([int_to_char[value] for value in pattern]))

Seed:
mabuk. Aku rasa aku boleh sembuhkan kau. Itu pekerjaanku. Membaiki. Dan semua yang mengalihkan perhatianmu? Uh... Aku akan lupakan mereka seketika. Jarvis. Hei. Semuanya selamat, Tuan. Ada yang lain? Kau tahu apa yang harus dilakukan. Peraturan "Piring Bersih", Tuan? Buang masa, ini hari Natal. Ya, ya. Baik, setakat ini, kau suka? Sudah tentu. Oh! Itulah tidur Tak apa, tak apa. It's okay. Tidak, lihat. lepaskan benda itu dari dadaku,</i>, Untuk berkongsi semua perasaan... dan pengalamanku dengan seseorang, seperti berkongsi beban itu menjadi dua. Seperti ular yang menelan ekornya sendiri. Semuanya kembali seperti mula. Dan... dan fakta yang kau dah mampu...


In [19]:
x = np.reshape(pattern, (1, len(pattern), 1))

for i in range(100):
	x = x / float(n_vocab)
	prediction = model.predict(x, verbose=0)
	index = np.argmax(prediction)
	result = int_to_char[index]
	seq_in = [int_to_char[value] for value in pattern]
	sys.stdout.write(result)
	pattern.append(index)
	pattern = pattern[1:len(pattern)]
print("\nDone.")

akuakuakuakuakuakuakuakuakuakuakuakuakuakuakuakuakuakuakuakuakuakuakuakuakuakuakuakuakuakuakuakuakuakuakuakuakuakuakuakuakuakuakuakuakuakuakuakuakuakuakuakuakuakuakuakuakuakuakuakuakuakuakuakuakuakuakuakuakuakuakuakuakuakuakuakuakuakuakuakuakuakuakuakuakuakuakuakuakuakuakuakuakuakuakuakuakuakuakuaku
Done.
