# reddit_tifu/short Score Regression

Dataset: tensorflow/datasets/reddit_tifu/short

In [None]:
from time import time

class DC:
    """Data configuration: which dataset to load and how to slice and batch it."""

    # Dataset name handed to `tfds.load`.
    dataset = 'reddit_tifu/short'

    # Three slices of the `train` split; the loader unpacks them, in this
    # order, as (train, val, test).
    split = [
        'train[:50%]',
        'train[50%:70%]',
        'train[70%:]',
    ]

    # Number of examples per batch.
    batch_size = 256

class MC:
    """Model configuration: sizes shared by the vectorizer and the embedding."""

    # Length of every token sequence produced by the vectorizer.
    sequence_length = 256

    # Maximum vocabulary size; also the number of rows in the embedding table.
    vocab_size = 4096

    # Width of each embedding vector.
    embedding_features = 128

class TC:
    """Training configuration: optimizer settings, schedules and log location."""

    # Optimizer hyper-parameters and the maximum number of epochs.
    lr = 1e-3
    momentum = 0.0
    epochs = 100

    # Learning-rate reduction on plateau, and early stopping.
    reduce_lr_factor = 0.5
    reduce_lr_patience = 10
    early_stop_patience = 30

    # Run directory; its name encodes the main hyper-parameters so that runs
    # with different settings land in different folders.
    logs = ('/tf/logs/reddit_tifu/'
            f'v:{MC.vocab_size} f:{MC.embedding_features} s:{MC.sequence_length} '
            f'lr:{lr} e:{epochs}')
    
class Config:
    """Single access point grouping the data, model and training settings."""

    data, model, training = DC, MC, TC

## Setup

In [None]:
import io, os, re, shutil, string
from datetime import datetime
import numpy as np
import pandas as pd
import requests
import tensorflow as tf

import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Apply seaborn's default theme to all matplotlib figures drawn below.
sns.set()

In [None]:
def extract_sample_fn(s):
    """Turn a dataset record into an ``(input, target)`` pair.

    Args:
        s: Mapping of feature name to tensor; must contain ``'documents'``
           and ``'score'``.

    Returns:
        Tuple ``(documents, log_score)`` where ``log_score`` is the natural
        log of the score after adding the Keras epsilon.
    """
    documents = s['documents']
    # The epsilon offset keeps a zero score from producing log(0).
    shifted_score = s['score'] + tf.keras.backend.epsilon()
    return documents, tf.math.log(shifted_score)

def standardize_fn(x):
    """Lower-case the text and delete every ``string.punctuation`` character.

    Args:
        x: String tensor.

    Returns:
        String tensor with the same layout as ``x``, normalized as described.
    """
    # Character class matching any single punctuation mark.
    punctuation_class = f'[{re.escape(string.punctuation)}]'
    lowered = tf.strings.lower(x)
    return tf.strings.regex_replace(lowered, punctuation_class, '')

def prepare(ds):
    """Build the input pipeline for one dataset split.

    Stages, in order: batch -> extract (text, log-score) pairs -> cache ->
    prefetch.

    Args:
        ds: Dataset of raw records as accepted by ``extract_sample_fn``.

    Returns:
        Dataset yielding batched ``(documents, log_score)`` pairs.
    """
    # Filter left disabled by the author: .filter(lambda r: r['tldr'] != '')
    batched = ds.batch(Config.data.batch_size)
    # Extraction runs on whole batches, after batching.
    samples = batched.map(extract_sample_fn,
                          num_parallel_calls=tf.data.AUTOTUNE)
    cached = samples.cache()
    return cached.prefetch(tf.data.AUTOTUNE)

## Dataset

In [None]:
import tensorflow_datasets as tfds

class Data:
    """Namespace holding the prepared train/val/test pipelines and TFDS info."""

    # Load the three configured slices together with the dataset metadata.
    (train, val, test), info = tfds.load(
        Config.data.dataset,
        split=Config.data.split,
        with_info=True,
        shuffle_files=True,
    )

    # Replace each raw split with its batched/cached/prefetched pipeline.
    train, val, test = prepare(train), prepare(val), prepare(test)

In [None]:
# Display the dataset metadata returned by `tfds.load(..., with_info=True)`.
Data.info

## Defining Model

In [None]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

# Text-to-token-id layer: standardizes input with `standardize_fn`, is capped
# at `vocab_size` tokens and is configured for integer output sequences of
# length `sequence_length`.
vectorize_layer = TextVectorization(
    standardize=standardize_fn,
    max_tokens=Config.model.vocab_size,
    output_mode='int',
    output_sequence_length=Config.model.sequence_length,
    name='vec')

# Fit the vocabulary on the training texts only (targets are dropped).
text_ds = Data.train.map(lambda x, y: x)
vectorize_layer.adapt(text_ds)
# NOTE(review): renames the layer's first weight after adapting; presumably a
# workaround for a weight-naming problem when saving checkpoints — confirm
# against the TensorFlow version in use.
vectorize_layer.weights[0].name = 'vec/encodings:0'

In [None]:
from tensorflow.keras.layers import (Embedding, Dense, GlobalAveragePooling1D, LSTM,
                                     Bidirectional)

# Token-embedding table: one `embedding_features`-wide vector per vocabulary id.
em = Embedding(
    input_dim=Config.model.vocab_size,
    output_dim=Config.model.embedding_features,
    name='em')

# Score regressor: raw text -> token ids -> embeddings -> bidirectional LSTM
# -> dense hidden layer -> single linear output (the predicted log-score).
score_model = tf.keras.Sequential(name='score_reg')
score_model.add(vectorize_layer)
score_model.add(em)
score_model.add(Bidirectional(LSTM(128, name='r1/lstm'), name='r1/bi'))
score_model.add(Dense(128, activation='relu', name='fc1'))
score_model.add(Dense(1, name='predictions'))

In [None]:
# Train with RMSprop on mean squared error; MSE and MAE are also tracked as
# metrics so they show up in the training history.
score_model.compile(
    optimizer=tf.keras.optimizers.RMSprop(
        learning_rate=Config.training.lr,
        momentum=Config.training.momentum),
    loss='mse',
    metrics=['mse', 'mae'])

In [None]:
import os
from tensorflow.keras import callbacks

# Start every run from an empty log directory so TensorBoard events and
# checkpoints from a previous run with the same settings do not mix in.
shutil.rmtree(Config.training.logs, ignore_errors=True)
os.makedirs(Config.training.logs, exist_ok=True)

try:
    score_model.fit(
        Data.train,
        epochs=Config.training.epochs,
        validation_data=Data.val,
        callbacks=[
            # Abort as soon as the loss becomes NaN.
            callbacks.TerminateOnNaN(),
            # Stop when the monitored quantity (default: `val_loss`) has not
            # improved for `early_stop_patience` epochs.
            callbacks.EarlyStopping(
                patience=Config.training.early_stop_patience,
                verbose=1),
            # Keep only the best weights seen so far. `save_format` is not a
            # ModelCheckpoint argument, so it is no longer passed; the
            # suffix-less path already selects the TensorFlow checkpoint
            # format.
            callbacks.ModelCheckpoint(
                Config.training.logs + '/weights',
                save_weights_only=True,
                save_best_only=True,
                verbose=1),
            # Scale the learning rate by `reduce_lr_factor` after
            # `reduce_lr_patience` epochs without improvement.
            callbacks.ReduceLROnPlateau(
                factor=Config.training.reduce_lr_factor,
                patience=Config.training.reduce_lr_patience,
                verbose=1),
            callbacks.TensorBoard(
                Config.training.logs,
                # histogram_freq=max(Config.training.epochs // 10, 5),
                profile_batch=(10, 20)),
        ])
except KeyboardInterrupt:
    # Allow stopping training by hand without failing the notebook cell.
    print('stopped')

In [None]:
# History object recorded by the last `fit` call.
h = score_model.history

# One entry per panel: (subplot position, ((history key, legend label), ...)).
panels = (
    (221, (('mse', 'train MSE'), ('val_mse', 'val MSE'))),
    (222, (('loss', 'train loss'), ('val_loss', 'val loss'))),
    (223, (('mae', 'train MAE'), ('val_mae', 'val MAE'))),
    (224, (('lr', 'learning rate'),)),
)

plt.figure(figsize=(16, 8))
for position, curves in panels:
    plt.subplot(position)
    for key, label in curves:
        plt.plot(h.history[key], label=label)
    plt.legend()

In [None]:
# Render the model architecture to `score.png`, annotated with layer names
# and tensor shapes (dtypes omitted).
tf.keras.utils.plot_model(
    score_model,
    to_file='score.png',
    show_shapes=True,
    show_dtype=False,
    show_layer_names=True)

## Testing

In [None]:
# Restore the weights written by the ModelCheckpoint callback during training
# (only the best epoch is kept, since it was created with save_best_only=True).
score_model.load_weights(Config.training.logs + '/weights');

In [None]:
# Evaluate on each split; the table has one row per metric and one column
# per split.
r = pd.DataFrame(
    {split_name: score_model.evaluate(split_ds, verbose=0)
     for split_name, split_ds in (('train', Data.train),
                                  ('val', Data.val),
                                  ('test', Data.test))},
    index=score_model.metrics_names)

r.round(2)

In [None]:
# Number of test batches to sample for the plots below.
SAMPLED = 8

# Collect (true log-score, predicted log-score) arrays batch by batch.
# `predict_on_batch` runs one forward pass per batch; calling `predict` inside
# a Python loop sets up a fresh prediction loop on every iteration.
batch_pairs = [
    (targets.numpy(), score_model.predict_on_batch(texts).ravel())
    for texts, targets in Data.test.take(SAMPLED)
]

# y: true log-scores, p: predicted log-scores, both flattened over all
# sampled batches and aligned element by element.
y, p = (np.concatenate(parts) for parts in zip(*batch_pairs))

In [None]:
# (values, legend label, colour) pairs are drawn on top of each other so the
# true and predicted distributions can be compared in each panel.
plt.figure(figsize=(16, 4))

# Left panel: log-space values, restricted to samples whose TRUE value lies
# strictly between the 0.1th and 99.9th percentiles.
plt.subplot(121, title='log p')
a, b = np.percentile(y, (0.1, 99.9))
m1 = (y > a) & (y < b)
for values, label, color in ((y, 'true likes', 'crimson'),
                             (p, 'pred likes', 'orange')):
    sns.histplot(values[m1], label=label, color=color)
plt.legend()

# Right panel: values mapped back through exp, restricted to samples whose
# TRUE value lies within the 0th-90th percentile range (inclusive).
plt.subplot(122, title='p')
ey, ep = np.exp(y), np.exp(p)
a, b = np.percentile(ey, (0, 90))
m2 = (ey >= a) & (ey <= b)
for values, label, color in ((ey, 'true likes', 'crimson'),
                             (ep, 'pred likes', 'orange')):
    sns.histplot(values[m2], label=label, color=color)
plt.legend();

In [None]:
# Pairwise relationships between true and predicted values (log and linear
# scale), limited to the rows selected by the log-space mask `m1`.
r = pd.DataFrame({
    'likes_log': y,
    'likes': ey,
    'pred_log': p,
    'pred': ep,
})
sns.pairplot(r.loc[m1]);