In [0]:
!pip install tensorflow==2.1.0

In [0]:
import json
import numpy as np
import os
from tensorflow.keras.layers import Dense, Dropout, GlobalMaxPooling1D
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Embedding
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import  BatchNormalization
from tensorflow.keras import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf

In [0]:
# Load the raw training rows. The context manager closes the file handle as
# soon as parsing finishes instead of leaving it open until garbage collection.
with open('/content/drive/My Drive/training_data_sample.json') as handle:
    training_data = json.load(handle)

In [0]:
# Preview only the first rows; echoing the whole dataset bloats the notebook.
# NOTE(review): assumes the JSON top level is a list of rows (it is iterated
# row by row when the samples are built) - confirm.
training_data[:5]

In [0]:
# Hyperparameters of the run; all four active values are written to params.json.
SEQUENCE_LENGTH = 128  # stored as 'sequence_length' and read back before text generation
EMBEDDING_DIM = 16  # stored as 'embedding_dim'
# ROWS_TO_SCAN = 2000000
NUM_EPOCHS = 20  # number of epochs passed to model.fit
BATCH_SIZE = 64  # stored as 'batch_size'

In [0]:
# Build the next-character training set: for every position j of every text,
# the input is the prefix text[:j] and the target is the id of text[j].
texts = []  # list of text samples (one prefix per character position)
labels_index = {}  # dictionary mapping each character to its numeric id
labels = []  # list of label ids: the character following the matching prefix
label_id_counter = 1  # ids start at 1, so id 0 is never assigned to a character
for row in training_data:
    # Column 1 of a row holds the text; ' >' is appended so that '>' acts as the
    # end marker the generation loop stops on.
    text = row[1].lower().strip('|') + ' >'
    for j, char in enumerate(text):
        texts.append(text[:j])
        # Characters get their id the first time they are seen.
        if char not in labels_index:
            labels_index[char] = label_id_counter
            label_id_counter += 1
        labels.append(labels_index[char])


In [0]:
def char_to_num(text, index=None):
  """Convert a string into the list of integer ids of its characters.

  Args:
    text: string (or any iterable of characters) to encode.
    index: optional dict mapping character -> id. When omitted, the
      notebook-level `labels_index` is used, exactly as before.

  Returns:
    A list with one id per character of `text`, in order.

  Raises:
    KeyError: if a character has no entry in the mapping.
  """
  mapping = labels_index if index is None else index
  return [mapping[ch] for ch in text]

In [0]:
# def map_char_to_int(texts):
#     char_counts = {}
#     for text in texts:
#         for char in text:
#             char_counts[char] = char_counts[char] + 1 if char in char_counts else 1
#     char_counts_sorted = sorted(char_counts.items(), reverse=True)
#     char_to_int = {}
#     for i, row in enumerate(char_counts_sorted):
#         char_to_int[row[0]] = i + 1
#     return char_to_int


# for some reason these hand-rolled helpers (map_char_to_int, now commented out above, and texts_to_sequences) are way faster than the keras char-level tokenizer
def texts_to_sequences(texts, labels_index):
    """Encode each text as a list of integer character ids.

    `labels_index` maps a character to its id. The result holds one inner
    list per entry of `texts`, in the same order; a character missing from
    `labels_index` raises KeyError.
    """
    def encode(text):
        return list(map(labels_index.__getitem__, text))

    return [encode(text) for text in texts]


In [0]:
# char_to_int = map_char_to_int(texts)
# Encode every prefix in `texts` as a list of character ids.
sequences = texts_to_sequences(texts, labels_index)

In [0]:
# Bring every encoded prefix to exactly SEQUENCE_LENGTH ids, using the shared
# constant so the padded width cannot drift from the value stored in
# params.json. With its default arguments pad_sequences pads with 0 and both
# pads and truncates at the front, so the most recent characters are kept.
data = pad_sequences(sequences, maxlen=SEQUENCE_LENGTH)
labels = np.asarray(labels)

In [0]:
# Shuffle samples and labels with one shared, seeded permutation so that the
# train/validation split is reproducible across kernel restarts.
SPLIT_SEED = 42
indices = np.random.RandomState(SPLIT_SEED).permutation(data.shape[0])
data = data[indices]
labels = labels[indices]
# validation set can be much smaller if we use a lot of data (source: andrew ng on coursera video)
# The ratios were previously swapped (0.1% for small data, 2% for large data),
# which contradicts the rule above: hold out 2% of a small dataset and only
# 0.1% once there are at least a million samples.
validation_ratio = 0.02 if data.shape[0] < 1000000 else 0.001
# Hold out at least one sample: with a count of 0, `data[:-0]` is an empty
# training set and `data[-0:]` is the whole dataset.
num_validation_samples = max(1, int(validation_ratio * data.shape[0]))

x_train = data[:-num_validation_samples]
y_train = labels[:-num_validation_samples]
x_val = data[-num_validation_samples:]
y_val = labels[-num_validation_samples:]

assert len(x_train) > 0 and len(x_val) > 0, "need at least two samples to split"

In [0]:
# Persist the run settings and the character vocabulary next to the model so
# they can be reloaded later (the text-generation cells read this file back).
with open('/content/drive/My Drive/params.json', 'w') as handle:
    handle.write(json.dumps(dict(
        sequence_length=SEQUENCE_LENGTH,
        embedding_dim=EMBEDDING_DIM,
        num_rows_used=len(sequences),
        num_epochs=NUM_EPOCHS,
        batch_size=BATCH_SIZE,
        labels_index=labels_index,
    )))

In [0]:
import tensorflow as tf

In [0]:
from tensorflow.keras.layers import LSTM

In [0]:
# Character-level next-character model: embedding -> two stacked LSTMs -> softmax.
# Character ids start at 1 and 0 is the padding value, hence the "+ 1". The
# embedding previously hard-coded 57 rows, which fails on lookup as soon as the
# training data contains more than 56 distinct characters.
vocab_size = len(labels_index) + 1

model = Sequential()
# NOTE(review): the embedding width used here is 128, whereas the
# EMBEDDING_DIM constant (the value stored in params.json) is 16 - confirm
# which one is intended.
model.add(Embedding(vocab_size, 128, input_shape=(SEQUENCE_LENGTH,)))
model.add(LSTM(256, return_sequences=True))  # pass the full sequence to the next LSTM
model.add(LSTM(256))  # only the final state feeds the classifier
model.add(Dense(vocab_size, activation='softmax'))

# Targets are integer ids, so the sparse variant of cross-entropy is used.
model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(), optimizer='rmsprop', metrics=['acc'])

In [0]:
# Print the layer-by-layer summary of the model built above.
model.summary()


In [0]:
# Base directory used when building the path of saved model files.
MODEL_PATH = '/content/drive/My Drive'

In [0]:
# With save_best_only=True the checkpoint file is only overwritten when the
# monitored quantity improves.
checkpointer = ModelCheckpoint(filepath=os.path.join(MODEL_PATH, 'model_2.h5'), verbose=1, save_best_only=True)
# Use the shared BATCH_SIZE constant instead of a separate literal (was 128) so
# the batch size stored in params.json is the one actually used for training.
history = model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    epochs=NUM_EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=[checkpointer],
)

In [0]:
# Preview only the first rows; echoing the whole dataset bloats the notebook.
# NOTE(review): assumes the JSON top level is a list of rows - confirm.
training_data[:5]

In [0]:
# Reload the saved run parameters; the context manager closes the file handle
# as soon as parsing finishes.
with open('/content/drive/My Drive/params.json') as handle:
    params = json.load(handle)
SEQUENCE_LENGTH = params['sequence_length']
# Inverse vocabulary (id -> character) used to decode model predictions.
int_to_char = {v: k for k, v in params['labels_index'].items()}

In [0]:
# Seed text that the generation loop extends one character at a time.
sample = 'coderschool'

In [0]:
# Greedy character-by-character generation: repeatedly feed the text so far to
# the model and append its most likely next character.
MAX_GENERATED_CHARS = 300  # hard stop in case the end marker is never predicted
res = sample.lower()
for _ in range(MAX_GENERATED_CHARS):
  encoded = char_to_num(res)
  # pad_sequences on a one-element list already yields a batch of one row, so
  # no indexing / expand_dims round trip is needed. A fresh name is used so the
  # training array `data` is not overwritten.
  window = pad_sequences([encoded], maxlen=SEQUENCE_LENGTH)
  probabilities = model.predict(window)
  next_char = int_to_char[int(np.argmax(probabilities))]
  if next_char == '>':  # end marker appended to every training text
    break
  res += next_char
res

In [0]:
# Export in SavedModel format. MODEL_PATH holds the same directory that was
# previously repeated here as a literal.
tf.saved_model.save(model, MODEL_PATH)

In [0]:
# Save the final model as an HDF5 file under MODEL_PATH (same location as the
# previously hard-coded literal path).
model.save(os.path.join(MODEL_PATH, 'model_final_2.h5'))