In [1]:
import pandas as pd
import numpy as np
import os
import json
import tensorflow as tf
import datetime
import time
import matplotlib.pyplot as plt

In [2]:
# Relative path to the News Category dataset file; it is read line by line
# below, with each line parsed as one JSON record.
file_path = "../input/news-category-dataset/News_Category_Dataset_v2.json"

In [3]:
# Read the raw dataset as a list of text lines (one JSON document per line).
# The encoding is pinned to UTF-8 so the headlines decode identically on every
# platform instead of depending on the locale's default codec.
with open(file_path, "r", encoding="utf-8") as f:
    data = f.readlines()

In [4]:
# Pull the "headline" field out of every JSON line read from the dataset file.
headlines = [json.loads(raw_line)["headline"] for raw_line in data]

In [5]:
# Special tokens. PAD is inserted at index 0 because the embedding layer is
# created with mask_zero=True, i.e. id 0 is the value that gets masked.
PAD = "<PAD>"
UNK = "<UNK>"
EOS = '<EOS>'

# Character inventory: the sorted unique symbols of the literal below, framed by
# PAD (first) and UNK / EOS (last two entries).
# The backslash is spelled as an explicit "\\" escape: the former "\|" was an
# invalid escape sequence that Python only tolerates with a deprecation warning.
# The resulting set of characters is unchanged.
# NOTE(review): "ˆ" and "˜" look like typographic modifier characters rather
# than ASCII "^" and "~" -- confirm that this is intended.
chars = sorted(set("abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:'''/\\|_@#$%ˆ&*˜'+-=()[]{}' ABCDEFGHIJKLMNOPQRSTUVWXYZ"))
chars.insert(0, PAD)
chars.append(UNK)
chars.append(EOS)

In [6]:
# Forward lookup (symbol -> integer id) and its inverse (id -> symbol, as an array).
char2idx = dict(zip(chars, range(len(chars))))
idx2char = np.array(chars)

In [7]:
def char_idx(c):
    """Return the integer id of character ``c``.

    Characters that are not part of the vocabulary map to the id of the ``UNK``
    token. The lookup goes straight through the ``char2idx`` dict instead of
    first scanning the ``chars`` list, so each call no longer costs a linear
    pass over the vocabulary; the returned ids are the same.
    """
    return char2idx.get(c, char2idx[UNK])

In [8]:
# Encode every headline as exactly `maxlen` integer ids:
# character ids first, then a single EOS id, then PAD ids up to the fixed length.
data = []
maxlen = 75
eos_id = char2idx[EOS]
pad_id = char2idx[PAD]
for headline in headlines:
    # Keep at most maxlen - 1 characters so that the EOS marker always fits.
    encoded = [char_idx(symbol) for symbol in headline[:maxlen - 1]]
    encoded.append(eos_id)
    # Right-pad short headlines; adds nothing when the row is already full.
    encoded.extend([pad_id] * (maxlen - len(encoded)))
    data.append(encoded)

In [9]:
# Hyperparameters
BATCH_SIZE = 1024        # sequences per training batch
embedding_dim = 256      # width of each character embedding vector
rnn_units = 1024         # number of units in the GRU layer
vocab_size = len(chars)  # one id per vocabulary entry, special tokens included

In [10]:
# Stack the equal-length encoded headlines into one 2-D array (one row per headline).
data_array = np.array(data)

In [11]:
# Next-character prediction pairs: the input drops the last position and the
# target drops the first one, so Y[:, t] is the symbol that follows X[:, t].
X, Y = data_array[:, :-1], data_array[:, 1:]

In [12]:
# Wrap the (input, target) arrays in a tf.data dataset, sliced along the first axis
# so that each element is one (input row, target row) pair.
x = tf.data.Dataset.from_tensor_slices((X, Y))

In [13]:
# Shuffle, then batch. The shuffle buffer spans the whole dataset so that every
# epoch is a uniform shuffle; a fixed 100000-element buffer only shuffles
# partially as soon as more rows than that are loaded. max(..., 1) keeps the
# buffer size valid for an empty dataset. Incomplete final batches are dropped
# because the model is built with a fixed batch size.
x_train = (
    x.shuffle(buffer_size=max(len(X), 1), reshuffle_each_iteration=True)
    .batch(BATCH_SIZE, drop_remainder=True)
)

## Create model

In [14]:
def create_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Assemble the character-level sequence model.

    Layers, in order: an embedding with zero-masking, a stateful GRU that emits
    an output at every time step, dropout with rate 0.2, and a dense projection
    (no activation) with one output per vocabulary entry.

    Args:
        vocab_size: number of distinct token ids (embedding rows and output width).
        embedding_dim: size of each embedding vector.
        rnn_units: number of GRU units.
        batch_size: fixed batch dimension baked into the input shape.

    Returns:
        An uncompiled ``tf.keras.Sequential`` model.
    """
    embedding = tf.keras.layers.Embedding(
        vocab_size,
        embedding_dim,
        mask_zero=True,  # id 0 is masked for the layers that follow
        batch_input_shape=[batch_size, None],  # None = variable sequence length
    )
    recurrent = tf.keras.layers.GRU(rnn_units, return_sequences=True, stateful=True)
    regulariser = tf.keras.layers.Dropout(0.2)
    projection = tf.keras.layers.Dense(vocab_size)

    return tf.keras.Sequential([embedding, recurrent, regulariser, projection])

In [15]:
# Build the network from the hyperparameters defined earlier; the model's batch
# dimension is fixed to BATCH_SIZE.
model = create_model(
    vocab_size = vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
    batch_size=BATCH_SIZE)

In [16]:
# Print the layer-by-layer summary of the model built above.
model.summary()

In [17]:
# The final Dense layer has no activation, so the loss is told to expect logits.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(loss=loss, optimizer="adam")

In [20]:
# Weights-only checkpoints go into a directory named after the time this cell
# ran; "{epoch}" in the file prefix is filled in by the callback.
dt = datetime.datetime.today().strftime("%Y-%b-%d-%H-%M-%S")
checkpoint_dir = f"./{dt}"
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True,
)

In [21]:
# Stop training once the training loss has failed to improve for 3 epochs in a row.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="loss",
    patience=3,
)

def scheduler(epoch, lr):
    """Learning-rate schedule: constant for the first 10 epochs, then exponential decay.

    Args:
        epoch: index of the epoch about to start, as passed by the
            LearningRateScheduler callback.
        lr: learning rate currently in use.

    Returns:
        ``lr`` unchanged while ``epoch < 10``; afterwards ``lr * exp(-0.1)`` as a
        plain Python float. The decay factor is computed with numpy rather than
        ``tf.math.exp`` so the schedule never hands a tensor back to the
        callback, which expects a float.
    """
    if epoch < 10:
        return lr
    # Constant multiplicative decay of e^-0.1 (roughly -9.5%) per epoch.
    return float(lr * np.exp(-0.1))
    

# Callback that asks `scheduler` for the learning rate to use in each epoch.
le_scheduler = tf.keras.callbacks.LearningRateScheduler(scheduler)


In [22]:
# Train for at most EPOCHS epochs with checkpointing, early stopping and the
# learning-rate schedule attached, and report the wall-clock duration.
EPOCHS = 30
training_callbacks = [checkpoint_callback, early_stopping, le_scheduler]

print("**** Start Training ****")
start = time.time()
history = model.fit(x_train, epochs=EPOCHS, callbacks=training_callbacks)
print("**** End Training ****")
print("Training time: ", time.time() - start)

In [38]:
# Training-loss curve. It is drawn on an explicitly created figure so the saved
# PDF contains only this curve, even when another figure is still current
# (e.g. when the code runs outside an inline notebook backend).
loss_fig, loss_ax = plt.subplots()
loss_ax.plot(history.history["loss"])
loss_ax.set_ylabel('Loss')
loss_ax.set_xlabel('Epochs')
loss_fig.savefig('Loss.pdf')

In [24]:
# Persist the trained model to "model.h5" in the working directory.
model.save("model.h5")

In [39]:
# Learning rate recorded per epoch in the training history. It is drawn on its
# own explicitly created figure so it can never be overlaid on a previously
# plotted curve that is still the current pyplot figure.
lr_fig, lr_ax = plt.subplots()
lr_ax.plot(history.history["lr"])
lr_ax.set_ylabel('Learning Rate')
lr_ax.set_xlabel('Epochs')
lr_fig.savefig('Learning_Rate.pdf')