# Notebook

In [None]:
import json
import numpy as np
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import GRU, Dropout, Dense, Activation, CuDNNGRU, Embedding
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
import keras.utils as ku
from sklearn.model_selection import train_test_split

## Opening the dataset

In [None]:
# Read data/eco.json as UTF-8 text, then decode the JSON document into `obj`.
with open('data/eco.json', 'r', encoding='utf-8') as fh:
    data = fh.read()

obj = json.loads(data)

## Filtering the dataset to keep only the non-null answers

In [None]:
# Text that precedes the free-text answer inside a serialized response value.
marker = '{"labels":[],"other":'

dataset = []

for entry in obj:
    for response in entry['responses']:
        raw_value = response['value']

        # Skip unanswered questions and values that lack the marker.
        if raw_value is None or marker not in raw_value:
            continue

        # Keep what lies between the marker and the next closing brace.
        answer = raw_value.split(marker)[1].split('}')[0]

        if answer != "null":
            # Drop the first and last characters (presumably the quotes
            # surrounding the serialized string).
            dataset.append(answer[1:-1])

## Transform the sentences into lists of integers representing the words

In [None]:
# Lower-case every answer and replace each period with a standalone " END "
# marker so that it is tokenized as a word of its own.
corpus = [answer.lower().replace(".", " END ") for answer in dataset]

# Build the word -> integer index vocabulary from the whole corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

# Length (in tokens) that every training sequence is padded/truncated to.
max_sequence_len = 100

# Keras' Tokenizer numbers words from 1, so one extra slot is needed for
# index 0, which pad_sequences uses as the padding value.
total_words = 1 + len(tokenizer.word_index)

## Batch generator

In [None]:
def generator(batch_size):
    """Yield an endless stream of random next-word training batches.

    For every batch, ``batch_size`` lines are drawn (with replacement) from
    the module-level ``corpus``. Each line is tokenized and expanded into
    its growing prefixes of 2 up to ``max_sequence_len + 1`` tokens; tokens
    beyond that position in a line are not used. Prefixes are left-padded /
    left-truncated to ``max_sequence_len`` tokens, and the last token of
    each padded row becomes the target.

    Args:
        batch_size: Number of corpus lines sampled per batch. The number of
            rows yielded is the total number of prefixes, not ``batch_size``.

    Yields:
        A ``(predictors, label)`` tuple where ``predictors`` has shape
        ``(n_prefixes, max_sequence_len - 1)`` and ``label`` is the one-hot
        encoding of the next word with shape ``(n_prefixes, total_words)``.
    """
    while True:
        idx = np.random.randint(0, len(corpus), batch_size)
        # Index the list directly instead of converting the whole corpus to
        # a NumPy string array on every batch.
        batch_lines = [corpus[j] for j in idx]

        input_sequences = []
        for token_list in tokenizer.texts_to_sequences(batch_lines):
            # Longest prefix taken from this line; lines with fewer than two
            # tokens contribute nothing.
            longest = min(max_sequence_len + 1, len(token_list))
            input_sequences.extend(
                token_list[:end] for end in range(2, longest + 1)
            )

        # pad_sequences already returns an ndarray; prefixes that are one
        # token too long lose their first token (default 'pre' truncation).
        input_sequences = pad_sequences(
            input_sequences, maxlen=max_sequence_len, padding="pre"
        )

        predictors, label = input_sequences[:, :-1], input_sequences[:, -1]
        label = to_categorical(label, num_classes=total_words)

        yield predictors, label

## Creating the model

In [None]:
# Next-word predictor: embedded token ids -> GRU -> softmax over the vocabulary.
model = Sequential([
    # Inputs are the padded sequences without their final (target) token.
    Embedding(total_words, 10, input_length=max_sequence_len - 1),
    CuDNNGRU(256),
    Dropout(0.1),
    # One output probability per vocabulary index.
    Dense(total_words, activation='softmax'),
])

model.compile(optimizer='adam', loss='categorical_crossentropy')

## Setting hyperparameters

In [1]:
# Number of corpus lines sampled for each training batch; passed to
# generator() and used to derive steps_per_epoch when fitting.
batch_size = 128

## Fitting the model

In [None]:
# Save the weights (not the full model) every 10 epochs; the epoch number is
# zero-padded to 8 digits and appended to the 'weights1' prefix.
mc = ModelCheckpoint('weights1{epoch:08d}.h5', save_weights_only=True, period=10)
# model.load_weights('weights.h5') # In case of fine tuning

model.fit_generator(
  generator(batch_size),
  # Keras expects an integer step count; np.ceil alone returns a NumPy float.
  steps_per_epoch=int(np.ceil(len(corpus) / batch_size)),
  epochs=200,
  callbacks=[mc]
)

## Loading a trained model

In [None]:
# Restore previously trained weights into the model defined above.
# NOTE(review): the ModelCheckpoint in the training cell writes files named
# 'weights1{epoch:08d}.h5', so 'weights1.h5' is not produced by it directly;
# confirm this path points at an existing (e.g. renamed) checkpoint.
model.load_weights('weights1.h5')

## Text generation

In [None]:
# Number of words to append to the seed, and the text generation starts from.
next_words = 100
seed_text = "ma liberté"

# Reverse vocabulary lookup, built once instead of scanning word_index for
# every generated word.
index_to_word = {index: word for word, index in tokenizer.word_index.items()}

for j in range(next_words):
    # Encode the text generated so far exactly like the training predictors:
    # token ids, left-padded/truncated to max_sequence_len - 1.
    token_list = tokenizer.texts_to_sequences([seed_text.lower()])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    predicted = model.predict_classes(token_list, verbose=0)

    # A single sequence was passed in, so take the first predicted class.
    # An index missing from the vocabulary (e.g. 0) yields an empty word.
    output_word = index_to_word.get(int(predicted[0]), "")
    seed_text += " " + output_word

print(seed_text)