In [1]:
import pandas as pd
import json
import numpy as np
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Dense, Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
MAX_WORDS = 50000
INPUT_LENGTH = 19

In [3]:
subtitles = json.loads(open('game-of-thrones-srt/season1.json').read())

In [4]:
s1 = []
df = pd.read_json('game-of-thrones-srt/season1.json')

for episode in range(len(df.columns)):
    e = df[df.columns[episode]].dropna().sort_index()
    s1 = s1 + list(e.values)

print("total lines = ", len(s1))

total lines =  6658


In [5]:
tokenizer = Tokenizer(num_words=MAX_WORDS)
tokenizer.fit_on_texts(s1)
seq = tokenizer.texts_to_sequences(s1)
print(seq[:10])

[[599, 65], [18, 21, 2, 796, 298, 710], [43, 543, 2166, 5, 1268, 51, 263, 543], [131, 2, 41, 10, 298, 2167, 544, 209, 3, 1060], [117, 79, 225, 512, 21, 5, 140, 58, 30], [4, 79, 225, 5, 140, 58, 30, 22, 168, 14, 8, 118], [60, 429, 61, 2, 80], [429, 40, 169, 54, 59, 33, 115, 219, 85, 3, 1, 155], [21, 1, 139, 711, 2], [87, 643, 84, 3, 1571, 1, 512]]


In [6]:
corpus = [subitem for item in seq for subitem in item]
print("corpus length = ", len(corpus))

corpus length =  45036


In [7]:
vocab_size = len(tokenizer.word_index)
print('vocab size = ', vocab_size)

vocab size =  3943


In [8]:
sentence_len = 20
prediction_len = 1
train_len = sentence_len - prediction_len

train_seq = []
for item in range(len(corpus) - sentence_len):
    train_seq.append(corpus[item:item + sentence_len])

In [9]:
trainX = []
trainy = []
for i in train_seq:
    trainX.append(i[:train_len])
    trainy.append(i[-1])

In [36]:
model = Sequential([
    Embedding(vocab_size + 1, 50, input_length=train_len),
    LSTM(150, return_sequences=True),
    LSTM(150),
    Dense(150, activation='relu'),
    Dense(vocab_size - 1, activation='softmax')
])

In [37]:
print(model.summary())

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 19, 50)            197200    
_________________________________________________________________
lstm_12 (LSTM)               (None, 19, 150)           120600    
_________________________________________________________________
lstm_13 (LSTM)               (None, 150)               180600    
_________________________________________________________________
dense_11 (Dense)             (None, 150)               22650     
_________________________________________________________________
dense_12 (Dense)             (None, 3942)              595242    
Total params: 1,116,292
Trainable params: 1,116,292
Non-trainable params: 0
_________________________________________________________________
None


In [38]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [39]:
print(np.asarray(trainX).shape)
print(pd.get_dummies(np.asarray(trainy).shape))

(45016, 19)
   45016
0      1


In [None]:
model.fit(np.asarray(trainX), pd.get_dummies(np.asarray(trainy)), batch_size=64, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50

In [365]:
model.save('model_weights.hdf5')
model.load_weights('model_weights.hdf5')

In [366]:
token_to_word_map = dict(map(reversed, tokenizer.word_index.items()))

def generate_text(input_text, prediction_length):
    tokens = tokenizer.texts_to_sequences([input_text])

    while len(tokens[0]) < prediction_length:
        if len(tokens[0]) <= INPUT_LENGTH:
            padded_tokens = pad_sequences(tokens[-INPUT_LENGTH:], maxlen=INPUT_LENGTH)
        else:
            padded_tokens = [tokens[0][-INPUT_LENGTH:]]

        prediction = model.predict(np.asarray(padded_tokens).reshape(1,-1))
        tokens[0].append(prediction.argmax())
        
    tokens[0] = [134 if x==0 else x for x in tokens[0]]

    generated_text = " ".join(map(lambda x : token_to_word_map[x], tokens[0]))

    return generated_text

In [367]:
generate_text("What do you know about warfare? - Nothing.", 50)

"what do you know about nothing the gives are your brother and your talking feasting oh been in his sister leave the tongue to oh oh oh oh i rescue be understand have the orders when oh they in it it's they only will and name the please had oh"

In [368]:
generate_text("kneel before me", 50)

"before me a things you you eddard in oh a if to so i our there's it it's i end i drogo i drogo i drogo i drogo i drogo i drogo i drogo i drogo i drogo i drogo i drogo i drogo i drogo i drogo i drogo"