In [0]:
import pandas as pd
import json
import numpy as np
import matplotlib.pyplot as plt
from google.colab import drive

from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Dense, Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
MAX_WORDS = 50000
INPUT_LENGTH = 19


drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
subtitles = json.loads(open('/content/gdrive/My Drive/Colab Notebooks/game-of-thrones-srt/season1.json').read())

In [4]:
s1 = []
df = pd.read_json('/content/gdrive/My Drive/Colab Notebooks/game-of-thrones-srt/season1.json')

filter = '@'

for episode in range(len(df.columns)):
    e = df[df.columns[episode]].dropna().sort_index()
    dialogs = list(e.values)
    dialogs = [x + ' <NEWLINE>' for x in dialogs]
    s1 = s1 + dialogs

print("total lines = ", len(s1))
print(s1)

total lines =  6658


In [5]:
tokenizer = Tokenizer(filters=filter)
tokenizer.fit_on_texts(s1)
seq = tokenizer.texts_to_sequences(s1)
print(seq[:10])
print(tokenizer.word_index)

[[1911, 171, 1], [19, 22, 3, 2954, 247, 1912, 1], [48, 658, 2955, 7, 1913, 47, 269, 1914, 1], [130, 3, 49, 16, 247, 2956, 529, 219, 5, 1149, 1], [102, 70, 224, 720, 22, 7, 184, 53, 339, 1], [6, 70, 224, 7, 184, 53, 823, 21, 154, 14, 10, 270, 1], [52, 477, 61, 3, 1434, 1], [4, 477, 38, 149, 78, 1435, 4, 31, 97, 313, 111, 5, 2, 370, 1], [22, 2, 248, 721, 89, 1], [83, 722, 75, 5, 1915, 2, 2957, 1]]


In [6]:
corpus = [subitem for item in seq for subitem in item]
print("corpus word length = ", len(corpus))

corpus word length =  52886


In [7]:
vocab_size = len(tokenizer.word_index)
print('vocab size = ', vocab_size)

vocab size =  6756


In [0]:
sentence_len = 20
prediction_len = 1
train_len = sentence_len - prediction_len

train_seq = []
for item in range(len(corpus) - sentence_len):
    train_seq.append(corpus[item:item + sentence_len])

In [0]:
trainX = []
trainy = []
for i in train_seq:
    trainX.append(i[:train_len])
    trainy.append(i[-1])

In [0]:
model = Sequential([
    Embedding(vocab_size + 1, 50, input_length=train_len),
    LSTM(512),
    # LSTM(150),
    Dropout(0.1),
    # Dense(150, activation='relu'),
    Dense(6754, activation='softmax')
])

In [11]:
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 19, 50)            337850    
_________________________________________________________________
lstm (LSTM)                  (None, 512)               1153024   
_________________________________________________________________
dropout (Dropout)            (None, 512)               0         
_________________________________________________________________
dense (Dense)                (None, 6754)              3464802   
Total params: 4,955,676
Trainable params: 4,955,676
Non-trainable params: 0
_________________________________________________________________
None


In [0]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [13]:
print(np.asarray(trainX).shape)
print(pd.get_dummies(np.asarray(trainy).shape))

(52866, 19)
   52866
0      1


In [14]:
model.fit(np.asarray(trainX), pd.get_dummies(np.asarray(trainy)), batch_size=64, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7f46603022b0>

In [0]:
model.save('/content/gdrive/My Drive/Colab Notebooks/game-of-thrones-srt/model_weights.hdf5')
model.load_weights('/content/gdrive/My Drive/Colab Notebooks/game-of-thrones-srt/model_weights.hdf5')

In [0]:
token_to_word_map = dict(map(reversed, tokenizer.word_index.items()))

def generate_text(input_text, prediction_length):
    tokens = tokenizer.texts_to_sequences([input_text])

    while len(tokens[0]) < prediction_length:
        if len(tokens[0]) <= INPUT_LENGTH:
            padded_tokens = pad_sequences(tokens[-INPUT_LENGTH:], maxlen=INPUT_LENGTH)
        else:
            padded_tokens = [tokens[0][-INPUT_LENGTH:]]

        prediction = model.predict(np.asarray(padded_tokens).reshape(1,-1))
        tokens[0].append(prediction.argmax())
        
    tokens[0] = [134 if x==0 else x for x in tokens[0]]

    generated_text = " ".join(map(lambda x : token_to_word_map[x], tokens[0]))
    generated_text = generated_text.replace('<newline>', '\n')

    return generated_text

In [18]:
print(generate_text("What do you know about warfare? - Nothing. <NEWLINE>", 50))

what do you know about - nothing. 
 be could always maester hand how at 
 you'll stark. you'll i'm that save gods needs - you'll in 
 hit don't well, than is be 
 you night's gods he - you'll you'll be limp you'll want - from 
 be


In [19]:
print(generate_text("", 100))

throat. you'll for 
 what not shaggydog. you'll 
 
 
 me a could taste you. you'll want there you. you'll so... you'll this your years you'll we know. you'll no from 
 are. you'll my see she time. you'll from the hear. you'll you why i'm 
 she on my seen will. you'll seen of men on 
 no from 
 you'll tell hand you'll a home and when is more from 
 a poor riding - you'll made i'm over you. you'll all 
 they ever heard i ever ever i some by even first on there are


In [20]:
print(generate_text("king in the north", 100))

king in the north you'll was i final swore you'll first and can. you'll you'll it bring lord it? you'll i'm 
 
 go. you'll i'm quick say. you'll hang you'll throat. you'll my put all when to our maester leagues he first you'll you'll his took - have you're you're he leave - you'll a day how i'll go! for i eddard came - when the very right? you'll you am from 
 
 if it's that boy, a father know you're two his new did. you'll is but him for when the can spear with i join there
