# Training Pipeline
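This notebook walks through the entire process of loading, cleaning, and encoding TED Talk transcripts, followed by training, testing, and saving a Transformer model that predicts the audience reaction (none, applause, or laughter) following a passage.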

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from nltk.tokenize import sent_tokenize
from nltk import word_tokenize
from tensorflow.keras import layers
from tensorflow import keras
import matplotlib.pyplot as plt
import tensorflow as tf
import seaborn as sns
import pandas as pd
import numpy as np
import collections
import random
import json
import re

## Data loading and cleaning

In [None]:
# Marker strings inserted into the sentence stream during cleaning:
# NEXT separates talks, APPLAUSE / LAUGHTER stand in for audience reactions.
NEXT = "NNNNN"
APPLAUSE = "AAAAA"
LAUGHTER = "LLLLL"
# One display glyph per reaction
reactions = ["😐", "👏", "😂"]

In [None]:
# This CSV can be downloaded from https://www.kaggle.com/miguelcorraljr/ted-ultimate-dataset
# It is read from the notebook's working directory; only its 'transcript' column is used below.
df = pd.read_csv('ted_talks_en.csv')

In [None]:
# Keep only the talks that actually have a transcript.
# The index is reset so that labels run 0..len-1 with no gaps left by the dropped
# rows; downstream code can then safely address transcripts by integer position.
transcripts = df['transcript'].dropna().reset_index(drop=True)
transcripts

In [None]:
def sentencize(text):
    ''' Split one transcript into sentences, replacing audience reactions with markers.

    Parenthesised stage directions such as "(Laughter)" are moved onto their own
    line as "{Laughter}.". Laughter lines become the LAUGHTER marker, applause and
    cheering variants become the APPLAUSE marker, and any other line that starts
    with "{" or ends with "}." is dropped. Every remaining line is split into
    sentences with sent_tokenize.

    Args:
        text: raw transcript string.

    Returns:
        List of strings in transcript order: sentences interleaved with
        LAUGHTER / APPLAUSE markers.
    '''
    # Rewritten stage-direction lines that count as applause
    applause_lines = {
        '{Applause}.', '{Cheers}.', '{Cheers and applause}.', '{Applause and cheers}.',
        '{Applause, cheers}.', '{Audience cheers}.', '{Cheering}.',
    }
    # Audience interactions: "(X)" -> "{X}." isolated on its own line
    transcript = re.sub(r"\((.*?)\)", r"\n{\1}.\n", text)
    # Replace funky punctuation so it does not confuse the sentence splitter
    for pattern, replacement in (
        (r" \.\.\.", ","),
        (r" —", ","),
        (r":", ","),
        (r";", ","),
        (r"!", "."),
    ):
        transcript = re.sub(pattern, replacement, transcript)
    # Go through each non-empty line
    sentences = []
    for line in transcript.splitlines():
        line = line.strip()
        if line == '':
            continue
        if line == '{Laughter}.':
            sentences.append(LAUGHTER)
        elif line in applause_lines:
            sentences.append(APPLAUSE)
        # Anything else that looks like a stage direction (or is a single character) is skipped
        elif len(line) >= 2 and line[0] != '{' and line[-2:] != '}.':
            sentences += sent_tokenize(line)
    return sentences

def full_sentencize(texts):
    ''' Sentencize every transcript and concatenate the results into one flat list.

    The sentences of each transcript are preceded by a NEXT marker so that later
    stages can tell where one talk ends and the next begins.

    Args:
        texts: iterable of transcript strings (e.g. a list or a pandas Series).

    Returns:
        Flat list of sentences and marker tokens, in input order.
    '''
    sentences = []
    # Iterate over the values themselves. Indexing texts[i] would be label-based on
    # a pandas Series and fails when the index has gaps (e.g. after dropping rows).
    for text in texts:
        sentences.append(NEXT)
        sentences.extend(sentencize(text))
    return sentences

In [None]:
# Clean and split sentences: one flat list covering all talks, where each talk's
# sentences are preceded by an 'NNNNN' boundary marker
sentences = full_sentencize(transcripts)

## Text encoding

In [None]:
# Only consider the top 20k words
VOCAB_SIZE = 20000  
# Maximum number of tokens per sample.
# NOTE(review): pad_sequences is called below without a `truncating` argument; the
# Keras default is documented as 'pre', which would keep the LAST 200 tokens of a
# longer sample rather than the first 200 — confirm.
MAXLEN = 200

In [None]:
# Quote tokens that should not take up vocabulary slots
dont_count = ["``", "''"]

def get_encodings(text, vocab_size):
    ''' Get word -> index based on token frequency.

    The text is lower-cased and tokenized, the tokens listed in `dont_count` are
    discarded, and the `vocab_size` most frequent remaining tokens are numbered
    from 1 (most frequent) upwards. Index 0 is never assigned here; encode_sentence
    uses it for words missing from the mapping.

    Args:
        text: the whole corpus as a single string.
        vocab_size: maximum number of words to keep.

    Returns:
        dict mapping each kept word to its 1-based frequency rank.
    '''
    tokens = (t for t in word_tokenize(text.lower()) if t not in dont_count)
    # most_common orders by descending count; equal counts keep first-seen order
    most_frequent = collections.Counter(tokens).most_common(vocab_size)
    return {word: rank for rank, (word, _) in enumerate(most_frequent, start=1)}

def encode_sentence(text, word_to_index):
    ''' Turn a sentence into a list of word indices.

    The sentence is lower-cased and tokenized; every token is looked up in
    `word_to_index`, and tokens missing from the mapping are encoded as 0.
    '''
    encoded = []
    for token in word_tokenize(text.lower()):
        encoded.append(word_to_index.get(token, 0))
    return encoded

In [None]:
# Drop the talk-boundary markers so NEXT does not end up in the vocabulary
sentences_no_next = [sentence for sentence in sentences if sentence != NEXT]

In [None]:
# Get encodings: build the word -> index vocabulary from the whole corpus,
# joined into a single space-separated string
word_to_index = get_encodings(' '.join(sentences_no_next), vocab_size=VOCAB_SIZE)

Now we create our X and y data using a sliding window of up to three consecutive sentences; each window is labelled with the reaction that follows it (none, applause, or laughter).

In [None]:
# Reaction markers. A marker's position in this list plus one is its class label
# (create_dataset uses special.index(...) + 1); label 0 means "no reaction".
special = [APPLAUSE, LAUGHTER]

def create_dataset(sentences, num_sentences=3):
    ''' Build sliding-window samples (X) and reaction labels (y).

    One sample is produced for every real sentence (markers are never the last
    sentence of a window). A sample is the encoded sentence preceded by up to
    `num_sentences - 1` earlier sentences of the same talk; reaction markers in
    between are skipped and a NEXT marker stops the look-back. The label is
    1 or 2 when the sentence is immediately followed by a reaction marker
    (its position in `special` plus one), otherwise 0.

    Returns:
        (X, y): X is a list of variable-length lists of word indices,
        y is a numpy array of labels aligned with X.
    '''
    markers = special + [NEXT]
    last_position = len(sentences) - 1
    X, y = [], []
    for i, sentence in enumerate(sentences):
        # A window never ends on a marker
        if sentence in markers:
            continue
        window = encode_sentence(sentence, word_to_index)
        gathered = 1
        back = i - 1
        # Walk backwards, prepending context until the window is full or the talk begins
        while gathered < num_sentences and back >= 0 and sentences[back] != NEXT:
            if sentences[back] not in special:
                window = encode_sentence(sentences[back], word_to_index) + window
                gathered += 1
            back -= 1
        X.append(window)
        # Label: the reaction directly after this sentence, or 0 if there is none
        follower = sentences[i + 1] if i < last_position else None
        y.append(special.index(follower) + 1 if follower in special else 0)
    return X, np.array(y)

In [None]:
# Build the encoded windows X (variable-length lists) and their reaction labels y
X, y = create_dataset(sentences)

In [None]:
# Check the distribution of reactions (0 = none, 1 = applause, 2 = laughter)
sns.histplot(y, discrete=True)

This is highly imbalanced, so we'll randomly undersample the "no reaction" class (label 0), dropping most of its samples from both X and y.

In [None]:
def downsample_zeros(X, y, rate=(1/8)):
    ''' Randomly thin out class 0.

    Samples whose label is not 0 are always kept. For each class-0 sample one
    number is drawn with random.random(); the sample is dropped when the draw
    exceeds `rate`, so about a `rate` fraction of class 0 survives.

    Returns:
        (X_keep, y_keep): the kept samples as a list and their labels as a numpy array.
    '''
    X_keep, y_keep = [], []
    for i, sample in enumerate(X):
        label = y[i]
        # Short-circuit: the random draw only happens for class-0 samples
        if label == 0 and random.random() > rate:
            continue
        X_keep.append(sample)
        y_keep.append(label)
    return X_keep, np.array(y_keep)

In [None]:
# Somewhat arbitrary: keep roughly 3/32 of the class-0 samples.
# NOTE: this overwrites X and y, so re-running this cell downsamples the already
# downsampled data again; re-run the create_dataset cell first to start over.
X, y = downsample_zeros(X, y, rate=(3/32))

In [None]:
# Re-check the class distribution after downsampling
sns.histplot(y, discrete=True)

## Training and testing
First we define the Transformer model.

In [None]:
class TransformerBlock(layers.Layer):
    ''' Transformer encoder block: self-attention followed by a feed-forward network.

    The output of each of the two sub-layers goes through dropout, is added to
    that sub-layer's input (residual connection) and is then layer-normalised.

    Args:
        embed_dim: embedding size; used as the attention key_dim and as the
            output size of the feed-forward network.
        num_heads: number of attention heads.
        ff_dim: hidden size of the feed-forward network.
        rate: dropout rate used after attention and after the feed-forward network.
    '''
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=None):
        ''' Apply the block to `inputs`.

        `training` is forwarded to the dropout layers. It defaults to None so the
        layer can be called without it (as defineTransformer does), leaving the
        decision to Keras.
        '''
        # Self-attention: the sequence attends to itself (query = value = inputs)
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

In [None]:
class TokenAndPositionEmbedding(layers.Layer):
    ''' Sum of a learned token embedding and a learned position embedding.

    Args:
        maxlen: number of distinct positions the position embedding can represent.
        vocab_size: number of distinct token ids the token embedding can represent.
        embed_dim: size of both embeddings.
    '''
    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        ''' Embed token ids `x`; positions 0..n-1 are taken from the size of x's last axis. '''
        # Position embedding is looked up first, then the token embedding
        position_vectors = self.pos_emb(
            tf.range(start=0, limit=tf.shape(x)[-1], delta=1)
        )
        token_vectors = self.token_emb(x)
        return token_vectors + position_vectors

In [None]:
def defineTransformer(embed_dim=128, num_heads=2, ff_dim=64):
    ''' Build and return the (uncompiled) Transformer classifier.

    Architecture: token + position embedding -> one TransformerBlock -> average
    pooling over the sequence -> dropout -> Dense(20, relu) -> dropout ->
    Dense(3, softmax). Input length and vocabulary size come from the
    module-level MAXLEN and VOCAB_SIZE.

    Args:
        embed_dim: embedding size.
        num_heads: number of attention heads in the Transformer block.
        ff_dim: hidden size of the block's feed-forward network.
    '''
    inputs = layers.Input(shape=(MAXLEN,))
    # VOCAB_SIZE + 1 ids: word indices start at 1 and 0 is used for unknown words
    x = TokenAndPositionEmbedding(MAXLEN, VOCAB_SIZE + 1, embed_dim)(inputs)
    x = TransformerBlock(embed_dim, num_heads, ff_dim)(x)
    # Tried two configurations: average pooling and, alternatively, last timestep only
    x = layers.GlobalAveragePooling1D()(x)
    # x = layers.Lambda(lambda x: x[:,-1])(x)
    x = layers.Dropout(0.1)(x)
    x = layers.Dense(20, activation="relu")(x)
    x = layers.Dropout(0.1)(x)
    # One probability per reaction class
    outputs = layers.Dense(3, activation="softmax")(x)
    return keras.Model(inputs=inputs, outputs=outputs)

In [None]:
# Build the model with the default hyperparameters and print its layer summary
model = defineTransformer()
model.summary()

### Data preparation

In [None]:
# Split data into training and validation sets (10% held out).
# stratify=y keeps the class proportions equal in both sets, which matters because
# the classes are imbalanced; random_state fixes the shuffle so reruns are comparable.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.1, stratify=y, random_state=42
)

In [None]:
# Pad sequences to a constant length
# NOTE(review): no padding/truncating arguments are passed, so the Keras defaults apply
# (documented as 'pre' for both: zeros are added, and tokens are cut, at the front) — confirm.
X_train = keras.preprocessing.sequence.pad_sequences(X_train, maxlen=MAXLEN)
X_val = keras.preprocessing.sequence.pad_sequences(X_val, maxlen=MAXLEN)

### Training and testing

In [None]:
N_EPOCHS = 4
# Weights for minority class enforcing
# (currently every class has weight 1, i.e. all classes are weighted equally)
weights = {0: 1, 1: 1, 2: 1}

# Labels are integer class ids, hence the sparse variant of categorical cross-entropy
model.compile("adam", "sparse_categorical_crossentropy", metrics=["accuracy"])
history = model.fit(
    X_train, y_train, batch_size=32, epochs=N_EPOCHS, 
    validation_data=(X_val, y_val), 
    class_weight=weights,
    # Early stopping
    # NOTE(review): with patience=3 and N_EPOCHS=4 the callback can fire no earlier
    # than the final epoch, so it cannot shorten training as configured — confirm intent.
    callbacks=[keras.callbacks.EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3)]
)

In [None]:
def plot_loss_history(history):
    ''' Draw the training and validation loss curves of a fit.

    Args:
        history: object whose `.history` dict has 'loss' and 'val_loss' lists
            (one value per epoch), such as the return value of model.fit.
    '''
    # (history key, matplotlib format string, legend label) per curve
    curves = (
        ('loss', 'g', 'Training loss'),
        ('val_loss', 'b', 'validation loss'),
    )
    # Read both series up front so a missing key fails before anything is drawn
    losses = {key: history.history[key] for key, _, _ in curves}
    epochs = range(0, len(losses['loss']))
    for key, fmt, legend_label in curves:
        plt.plot(epochs, losses[key], fmt, label=legend_label)
    plt.title('Training and Validation loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()

In [None]:
# Plot the training and validation loss recorded by model.fit
plot_loss_history(history)

In [None]:
# Make predictions
# The model ends in a 3-unit softmax, so each prediction row holds one score per
# class; argmax over the last axis picks the predicted class id
predictions = model.predict(X_val)
y_pred = np.argmax(predictions, axis=-1)

In [None]:
# Check confusion matrix on the validation set
# (true labels are passed first, predictions second; the axes are labelled accordingly)
sns.heatmap(confusion_matrix(y_val, y_pred), annot=True)
plt.xlabel("predicted")
plt.ylabel("truth")
plt.show()

### Qualitative test

In [None]:
# A made-up talk in the same format as the cleaned training data: a list of
# sentences starting with the "NNNNN" talk-boundary marker
toy_text = [
    "NNNNN",
    "Hello everyone, and welcome to my TED Talk.",
    "These past few weeks, I've been working on a project that I'm very excited to show you.",
    "This program will listen to you as you speak, and determine whether what you just said was funny, impressive, or neither.",
    "Then, it laughs or applauds accordingly, ensuring your genius never goes unnoticed.",
    "With this application, people from around the world will finally be able to deliver a TED Talk from the comfort of their own home, and receive the attention they deserve.",
    "Thank you very much."
]

In [None]:
# Create windows and pad
# (the labels are discarded: the toy text contains no reaction markers)
toy_X, _ = create_dataset(toy_text)
toy_X = keras.preprocessing.sequence.pad_sequences(toy_X, maxlen=MAXLEN)

In [None]:
# Make predictions
# toy_pred first holds the per-class scores, then is overwritten with the
# predicted class id of each window
toy_pred = model.predict(toy_X)
toy_pred = np.argmax(toy_pred, axis=-1)

In [None]:
# Predicted class id per toy sentence window (0 = none, 1 = applause, 2 = laughter)
toy_pred

### Model persistence

In [None]:
# Suffix used in the names of the saved model and vocabulary files below
SAVE_INDEX = 0

In [None]:
# Save model
# NOTE(review): the path has no file extension; which on-disk format this produces
# (and whether it is accepted) depends on the installed Keras/TensorFlow version — confirm.
model.save(f'transformer_{SAVE_INDEX}')

In [None]:
# Save encodings: the word -> index vocabulary, written as JSON next to the model
# so text can be encoded the same way when the model is used later
with open(f'word_to_index_{SAVE_INDEX}.json', 'w') as f:
    json.dump(word_to_index, f)