# Intro to Language Modeling

## Bigram model using empirical frequencies

In [10]:
import gradio as gr
import matplotlib.pyplot as plt
from collections import Counter
import torch


def character_tokenizer(names_str):
    """Build the character-level lookup tables for a corpus of names.

    The vocabulary is every distinct character of ``names_str`` plus the
    special ``"<S>"`` marker, ordered with Python's default string sort.

    Args:
        names_str: All names concatenated into one string.

    Returns:
        A ``(s_to_i, i_to_s)`` pair: symbol -> integer id and its inverse.
    """
    symbols = list(set(names_str))
    symbols.append("<S>")
    symbols.sort()

    i_to_s = dict(enumerate(symbols))
    s_to_i = {symbol: index for index, symbol in i_to_s.items()}
    return s_to_i, i_to_s


def tokenize_name(name, s_to_i):
    """Convert a name into integer tokens wrapped in ``<S>`` markers.

    Args:
        name: The name to encode, one character per token.
        s_to_i: Mapping from symbol to integer id; must contain ``"<S>"``.

    Returns:
        ``[<S>, c_1, ..., c_n, <S>]`` as a list of integer ids.

    Raises:
        KeyError: If ``"<S>"`` or a character of ``name`` is not in ``s_to_i``.
    """
    # the same special token marks both the start and the end of a name
    boundary = s_to_i["<S>"]
    return [boundary, *(s_to_i[character] for character in name), boundary]


def visualize_bigram_counts_frequencies(i_to_s, counts_matrix, freqs_matrix):
    """Draw the bigram table as a heat map annotated with counts and frequencies.

    Args:
        i_to_s: Mapping from integer token to its symbol.
        counts_matrix: 2-D table of bigram counts (row = current token,
            column = next token).
        freqs_matrix: Table of the same layout holding the row-normalized
            frequencies; it drives the cell colours.

    Returns:
        The Matplotlib figure holding the plot.
    """
    fig, ax = plt.subplots(figsize=(8, 8))
    ax.imshow(freqs_matrix, interpolation="nearest", cmap="Blues")

    # write the bigram, its count and its frequency inside every cell
    text_style = dict(ha="center", va="top", color="black", fontsize=8)
    for row, count_row in enumerate(counts_matrix):
        for col, count in enumerate(count_row):
            freq = freqs_matrix[row][col]
            label = (
                f"Bigram = {i_to_s[row]} → {i_to_s[col]}\n"
                f"count = {count:.2f}\n"
                f"freq = {freq:.2f}\n"
            )
            ax.text(col, row, label, **text_style)

    # one tick per integer token on both axes
    tick_positions = range(len(i_to_s))
    ax.set_xticks(tick_positions)
    ax.set_yticks(tick_positions)

    return fig


def sample_name_from_prob_dist(prob_dist, s_to_i, i_to_s):
    """Sample one name by walking the bigram transition table.

    Args:
        prob_dist: 2-D tensor where row ``t`` is the distribution over the
            token that follows token ``t``.
        s_to_i: Mapping from symbol to integer token (must contain ``"<S>"``).
        i_to_s: Mapping from integer token to symbol.

    Returns:
        The sampled name, without the ``<S>`` boundary markers.
    """
    # generation always starts from the <S> start-of-name token
    current = s_to_i["<S>"]
    characters = []

    while True:
        # draw the next token from the distribution conditioned on the current one
        sampled = torch.multinomial(input=prob_dist[current], num_samples=1).item()
        symbol = i_to_s[sampled]

        # drawing <S> again means the name is finished
        if symbol == "<S>":
            return "".join(characters)

        characters.append(symbol)
        current = sampled


def bigram_output(names_str, n_names_to_sample=10, seed=0):
    """Fit a count-based bigram model on the given names and sample from it.

    Args:
        names_str: Newline-separated names to train on.
        n_names_to_sample: How many names to draw from the fitted model and
            from the uniform baseline.
        seed: Seed passed to ``torch.manual_seed`` so sampling is repeatable.

    Returns:
        A 6-tuple, in the order expected by the UI outputs:
        the symbol -> token mapping, one ``"name → tokens"`` line per name,
        one line per bigram prediction task, the counts/frequencies figure,
        the names sampled from the bigram model, and the names sampled from
        a uniform distribution (the last four strings are newline-joined).
    """
    torch.manual_seed(seed)

    # Parse the names from the submitted text itself so the
    # "name → tokens" listing always matches what was tokenized.
    names = names_str.split("\n")

    # -------------------- Character-level Tokenization --------------------
    s_to_i, i_to_s = character_tokenizer(names_str.replace("\n", ""))

    # -------------------- Tokenized Names --------------------
    tokenized_names = [tokenize_name(name, s_to_i) for name in names]

    # every (current token, next token) pair observed in the dataset;
    # built once and shared by the task listing and the counts below
    bigrams = [
        (token, next_token)
        for tokenized_name in tokenized_names
        for token, next_token in zip(tokenized_name[:-1], tokenized_name[1:])
    ]

    # -------------------- Bigram Prediction Task Examples --------------------
    bigram_prediction_tasks = [
        f"Given {token} ({i_to_s[token]}) predict {next_token} ({i_to_s[next_token]})"
        for token, next_token in bigrams
    ]

    # -------------------- Observed counts and frequencies --------------------
    counts = Counter(bigrams)

    # produce a matrix of the counts of size (vocab_size, vocab_size)
    counts_matrix = torch.Tensor([[counts[(i, j)] for j in i_to_s] for i in i_to_s])

    # produce a matrix of the frequencies by normalizing each row of counts
    freqs_matrix = counts_matrix / counts_matrix.sum(dim=1, keepdim=True)

    # produce a plot of the counts and frequencies matrices
    fig = visualize_bigram_counts_frequencies(i_to_s, counts_matrix, freqs_matrix)

    # -------------------- Sampled Output --------------------
    sampled_names = [
        sample_name_from_prob_dist(freqs_matrix, s_to_i, i_to_s)
        for _ in range(n_names_to_sample)
    ]

    # -------------------- Random Sampled Output --------------------
    # baseline: every next token is equally likely
    prob_uniform = torch.ones_like(freqs_matrix)
    prob_uniform /= prob_uniform.sum(dim=1, keepdim=True)
    random_sampled_names = [
        sample_name_from_prob_dist(prob_uniform, s_to_i, i_to_s)
        for _ in range(n_names_to_sample)
    ]

    return (
        s_to_i,
        "\n".join(
            f"{name} → {tokenized_name}"
            for name, tokenized_name in zip(names, tokenized_names)
        ),
        "\n".join(bigram_prediction_tasks),
        fig,
        "\n".join(sampled_names),
        "\n".join(random_sampled_names),
    )


# Default dataset pre-filled in the "Names Dataset" textbox.
names = ["emma", "ava"]

# Gradio UI for the empirical bigram model: a title, one input textbox with a
# submit button, and six output widgets laid out in rows.
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            title = gr.Markdown(
                """
                # Bigram Language Model using empirical frequencies
                Shows how to build a simple probabilistic language model using bi-grams.
                """
            )
    # input: one name per line
    with gr.Row():
        with gr.Column():
            text1 = gr.Textbox(label="Names Dataset", value="\n".join(names))
            inbtw = gr.Button("Submit", variant="primary")

    # outputs 1-3: vocabulary, tokenized names and the prediction tasks
    with gr.Row():
        with gr.Column(scale=1, min_width=100):
            out1 = gr.Textbox(label="Character-level Tokenization")
        with gr.Column(scale=1, min_width=100):
            out2 = gr.Textbox(label="Tokenized Names")
        with gr.Column(scale=1, min_width=100):
            out3 = gr.Textbox(label="Bigram Prediction Task Examples")

    # output 4: heat map of the bigram counts / frequencies
    with gr.Row():
        with gr.Column():
            out4 = gr.Plot(label="Observed counts and frequencies")

    # outputs 5-6: names sampled from the model vs. from a uniform baseline
    with gr.Row():
        with gr.Column(scale=1, min_width=100):
            out5 = gr.Textbox(label="Sampled Output")
        with gr.Column(scale=1, min_width=100):
            out6 = gr.Textbox(label="Random Sampled Output")

    # Only the textbox is wired as an input, so bigram_output runs with its
    # default n_names_to_sample and seed; its six return values fill out1..out6
    # in order.
    inbtw.click(
        bigram_output, inputs=text1, outputs=[out1, out2, out3, out4, out5, out6]
    )

demo.launch(quiet=True)



## How do we evaluate our bigram model? Negative log likelihood


Take a dataset $\text{data}$ that we want to evaluate our model against.

An ideal model would assign $$P(\text{data}) = 1$$

In plain English, this means that the model would be able to predict all the characters in the dataset with 100% certainty.

Given data is a sequence of characters, we can calculate the probability of the dataset as follows

$$P(\text{data}) = \prod_{i=1}^{n} P(\text{data}_i | \text{data}_{i-1})$$ 

i.e. a joint probability of all the characters in the dataset

where $n$ is the length of the dataset and $P(\text{data}_i | \text{data}_{i-1})$ is the conditional probability of the $i$-th character in the dataset given the preceding characters (for a bigram model, only the immediately preceding character).

We choose to apply a log transform to the probability to avoid underflow errors, given that we are multiplying many probabilities between 0 and 1. This is acceptable because the log function is monotonically increasing, so a higher probability always gives a higher log-probability. We call this our log-likelihood:

$$\log(P(\text{data})) = \log\left(\prod_{i=1}^{n} P(\text{data}_i | \text{data}_{i-1})\right)$$

And given $\log(a \cdot b) = \log(a) + \log(b)$, we can further simplify the log-likelihood as follows:

$$\log(P(\text{data})) = \sum_{i=1}^{n} \log(P(\text{data}_i | \text{data}_{i-1}))$$

We then normalize the score by taking a mean and as such we compute a `mean_log_likelihood`

$$\bar{ll} = \frac{1}{n} \sum_{i=1}^{n} \log(P(\text{data}_i | \text{data}_{i-1}))$$

Note that in order to frame this as a minimization problem, we take the negative of the log-likelihood as the loss value to minimize.

$$\bar{nll} = -\frac{1}{n} \sum_{i=1}^{n} \log(P(\text{data}_i | \text{data}_{i-1}))$$

In [23]:
import gradio as gr
from collections import Counter
import torch


def character_tokenizer(names_str):
    """Build the character-level lookup tables for a corpus of names.

    The vocabulary is every distinct character of ``names_str`` plus the
    special ``"<S>"`` marker, ordered with Python's default string sort.

    Args:
        names_str: All names concatenated into one string.

    Returns:
        A ``(s_to_i, i_to_s)`` pair: symbol -> integer id and its inverse.
    """
    symbols = list(set(names_str))
    symbols.append("<S>")
    symbols.sort()

    i_to_s = dict(enumerate(symbols))
    s_to_i = {symbol: index for index, symbol in i_to_s.items()}
    return s_to_i, i_to_s


def tokenize_name(name, s_to_i):
    """Convert a name into integer tokens wrapped in ``<S>`` markers.

    Args:
        name: The name to encode, one character per token.
        s_to_i: Mapping from symbol to integer id; must contain ``"<S>"``.

    Returns:
        ``[<S>, c_1, ..., c_n, <S>]`` as a list of integer ids.

    Raises:
        KeyError: If ``"<S>"`` or a character of ``name`` is not in ``s_to_i``.
    """
    # the same special token marks both the start and the end of a name
    boundary = s_to_i["<S>"]
    return [boundary, *(s_to_i[character] for character in name), boundary]


def bigram_evaluation(training_dataset, evaluation_dataset, apply_smoothing):
    """Fit a count-based bigram model and score it on an evaluation dataset.

    Args:
        training_dataset: Newline-separated names used to build the
            vocabulary and the bigram counts.
        evaluation_dataset: Newline-separated names to score.
        apply_smoothing: When truthy, one is added to every bigram count so
            that no transition has zero probability.

    Returns:
        Two strings formatted to four decimals: the mean negative log
        likelihood of the bigram model on the evaluation names, and the same
        metric for a model that picks every token uniformly at random.
        Without smoothing, a bigram never seen in training has probability
        zero and the first value is ``inf``.

    Raises:
        KeyError: If an evaluation name contains a character that does not
            occur in the training dataset.
    """
    # create a character-level tokenizer
    s_to_i, i_to_s = character_tokenizer(training_dataset.replace("\n", ""))
    vocab_size = len(s_to_i)

    # tokenize each name in the dataset into a sequence of integers
    tokenized_names = [
        tokenize_name(name, s_to_i) for name in training_dataset.split("\n")
    ]

    # build counts of each (current token, next token) bigram
    counts = Counter(
        bigram
        for tokenized_name in tokenized_names
        for bigram in zip(tokenized_name[:-1], tokenized_name[1:])
    )

    # produce a matrix of the counts
    counts_matrix = torch.Tensor([[counts[(i, j)] for j in i_to_s] for i in i_to_s])

    # ---------------------------- Apply Smoothing ----------------------------
    if apply_smoothing:
        # ensure that no count is zero
        counts_matrix += 1

    # produce a matrix of the frequencies
    freqs_matrix = counts_matrix / counts_matrix.sum(dim=1, keepdim=True)

    # ---------------------------- Evaluate Model ----------------------------

    # ---------------------------- Mean Negative Log Likelihood - Bigram Model ----------------------------
    log_likelihood = 0
    n = 0
    for name in evaluation_dataset.split("\n"):
        tokenized_name = tokenize_name(name, s_to_i)

        # Walk consecutive token pairs. The first pair starts at the real <S>
        # token id taken from the tokenized name; it is not assumed to be 0,
        # because characters that sort before "<" take the lower ids.
        for token, next_token in zip(tokenized_name[:-1], tokenized_name[1:]):
            # log of the estimated probability of the observed next token
            log_likelihood += torch.log(freqs_matrix[token][next_token])

            # keep track of the number of predicted tokens
            n += 1

    # compute the mean negative log likelihood
    negative_mean_log_likelihood = -1 / n * log_likelihood

    # ---------------------------- Mean Negative Log Likelihood - Random Model ----------------------------
    # This is summarized as -1 / n * (n * log(1 / vocab_size)) = -1 * log(1 / vocab_size)
    random_negative_mean_log_likelihood = -1 * torch.log(torch.tensor(1 / vocab_size))

    return (
        f"{negative_mean_log_likelihood.item():.4f}",
        f"{random_negative_mean_log_likelihood.item():.4f}",
    )


# Default names pre-filled in the "Training Dataset" textbox.
training_dataset = [
    "emma",
    "evan",
    "ava",
]

# Default names pre-filled in the "Evaluation Dataset" textbox.
evaluation_dataset = ["eva"]


# Gradio UI for the evaluation demo: two dataset textboxes, a smoothing
# checkbox, and two textboxes showing the resulting loss values.
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            title = gr.Markdown(
                """
                # Bigram Model Language Model Evaluation - Log Likelihood
                """
            )

    # inputs: one name per line in each textbox
    with gr.Row():
        with gr.Column():
            text1 = gr.Textbox(
                label="Training Dataset", value="\n".join(training_dataset)
            )
            text2 = gr.Textbox(
                label="Evaluation Dataset", value="\n".join(evaluation_dataset)
            )
            checkbox = gr.Checkbox(label="Apply Smoothing")
            inbtw = gr.Button("Evaluate Model", variant="primary")
    # outputs: loss of the bigram model next to the uniform baseline
    with gr.Row():
        with gr.Column(scale=1, min_width=100):
            out1 = gr.Textbox(label="Mean Negative Log Likelihood - Bigram Model")
        with gr.Column(scale=1, min_width=100):
            out2 = gr.Textbox(label="Mean Negative Log Likelihood - Random Model")

    # The three inputs map positionally onto bigram_evaluation's parameters
    # and its two return values fill out1 and out2.
    inbtw.click(
        bigram_evaluation, inputs=[text1, text2, checkbox], outputs=[out1, out2]
    )

demo.launch(quiet=True)

Running on local URL:  http://127.0.0.1:7878




## Bigram model using a linear model built as a neural network minimizing negative log-likelihood

In [24]:
import gradio as gr
import matplotlib.pyplot as plt
import torch


def character_tokenizer(names_str):
    """Build the character-level lookup tables for a corpus of names.

    The vocabulary is every distinct character of ``names_str`` plus the
    special ``"<S>"`` marker, ordered with Python's default string sort.

    Args:
        names_str: All names concatenated into one string.

    Returns:
        A ``(s_to_i, i_to_s)`` pair: symbol -> integer id and its inverse.
    """
    symbols = list(set(names_str))
    symbols.append("<S>")
    symbols.sort()

    i_to_s = dict(enumerate(symbols))
    s_to_i = {symbol: index for index, symbol in i_to_s.items()}
    return s_to_i, i_to_s


def tokenize_name(name, s_to_i):
    """Convert a name into integer tokens wrapped in ``<S>`` markers.

    Args:
        name: The name to encode, one character per token.
        s_to_i: Mapping from symbol to integer id; must contain ``"<S>"``.

    Returns:
        ``[<S>, c_1, ..., c_n, <S>]`` as a list of integer ids.

    Raises:
        KeyError: If ``"<S>"`` or a character of ``name`` is not in ``s_to_i``.
    """
    # the same special token marks both the start and the end of a name
    boundary = s_to_i["<S>"]
    return [boundary, *(s_to_i[character] for character in name), boundary]


def sample_name_from_prob_dist(prob_dist, s_to_i, i_to_s):
    """Sample one name by walking the bigram transition table.

    Args:
        prob_dist: 2-D tensor where row ``t`` is the distribution over the
            token that follows token ``t``.
        s_to_i: Mapping from symbol to integer token (must contain ``"<S>"``).
        i_to_s: Mapping from integer token to symbol.

    Returns:
        The sampled name, without the ``<S>`` boundary markers.
    """
    # generation always starts from the <S> start-of-name token
    current = s_to_i["<S>"]
    characters = []

    while True:
        # draw the next token from the distribution conditioned on the current one
        sampled = torch.multinomial(input=prob_dist[current], num_samples=1).item()
        symbol = i_to_s[sampled]

        # drawing <S> again means the name is finished
        if symbol == "<S>":
            return "".join(characters)

        characters.append(symbol)
        current = sampled


class Linear:
    """Bias-free linear layer whose weights are trained by hand."""

    def __init__(self, input_size, output_size):
        # small random initial weights, tracked by autograd
        self.W = (torch.randn(input_size, output_size) * 0.01).requires_grad_()

    def forward(self, X):
        """Project ``X`` of shape (B, input_size) to (B, output_size)."""
        return torch.matmul(X, self.W)


def fit_model(model, X_one_hot, Y, n_steps=10_000, learning_rate=0.1):
    """Train ``model`` with full-batch gradient descent on the mean NLL.

    Args:
        model: Object exposing a weight tensor ``W`` (requiring grad) and a
            ``forward`` method that maps the inputs to logits.
        X_one_hot: One-hot encoded input tokens, one row per example.
        Y: Target token for each example; converted with ``.long()``.
        n_steps: Number of gradient-descent updates to perform.
        learning_rate: Step size applied to the gradient of ``model.W``.

    Returns:
        The loss tensor computed in the last step, i.e. before the final
        weight update.

    Raises:
        ValueError: If ``n_steps`` is smaller than 1 (there would be no loss
            to return).
    """
    if n_steps < 1:
        raise ValueError("n_steps must be at least 1")

    # these do not change between steps, so convert them once
    inputs = X_one_hot.float()
    targets = Y.long()
    n_examples = targets.shape[0]
    rows = torch.arange(n_examples)

    for _ in range(n_steps):
        # we call these logits
        log_counts = model.forward(inputs)

        # perform a softmax
        counts = torch.exp(log_counts)
        probs = counts / counts.sum(dim=1, keepdim=True)

        # negative mean log likelihood of the observed next tokens
        loss = -1 * torch.log(probs[rows, targets]).sum() / n_examples

        model.W.grad = None
        loss.backward()

        # plain gradient-descent step, kept out of the autograd graph
        with torch.no_grad():
            model.W -= learning_rate * model.W.grad

    return loss


def predict_proba(model, X):
    """Return the model's next-token probabilities for every row of ``X``.

    The logits from ``model.forward`` are exponentiated and each row is
    normalized to sum to one (a softmax over dimension 1).
    """
    unnormalized = model.forward(X).exp()
    row_totals = unnormalized.sum(dim=1, keepdim=True)
    return unnormalized / row_totals


def plot_prob_dist(probs, i_to_s):
    """Draw the fitted transition probabilities as an annotated heat map.

    Args:
        probs: 2-D table of probabilities (row = current token,
            column = next token).
        i_to_s: Mapping from integer token to its symbol.

    Returns:
        The Matplotlib figure holding the plot.
    """
    fig, ax = plt.subplots(figsize=(8, 8))
    ax.imshow(probs, interpolation="nearest", cmap="Blues")

    # label every cell with its bigram and fitted probability
    for row, prob_row in enumerate(probs):
        for col, prob in enumerate(prob_row):
            label = f"Bigram = {i_to_s[row]} → {i_to_s[col]} \n prob = {prob:.2f}"
            ax.text(
                col,
                row,
                label,
                ha="center",
                va="top",
                color="black",
                fontsize=8,
            )

    return fig


@torch.no_grad()
def get_prob_dist(model, vocab_size, i_to_s):
    """Return the model's full next-token table, one row per vocabulary token.

    Every token id from 0 to ``vocab_size - 1`` is one-hot encoded and passed
    through ``predict_proba``. ``i_to_s`` is accepted but not used.
    """
    every_token = torch.arange(vocab_size)
    one_hot_rows = torch.nn.functional.one_hot(
        input=every_token, num_classes=vocab_size
    ).float()
    return predict_proba(model, one_hot_rows)


def bigram_via_linear_model(names_str, n_names_to_sample=10, seed=0):
    """Fit the bigram model as a single linear layer and sample from it.

    Args:
        names_str: Newline-separated names to train on.
        n_names_to_sample: How many names to draw from the fitted model and
            from the uniform baseline.
        seed: Seed passed to ``torch.manual_seed`` so weight initialization
            and sampling are repeatable.

    Returns:
        A 9-tuple, in the order expected by the UI outputs:
        the symbol -> token mapping, one ``"name → tokens"`` line per name,
        the input tokens, the target tokens, the one-hot input figure, the
        final training loss formatted to four decimals, the fitted
        probability figure, the names sampled from the model, and the names
        sampled from a uniform distribution.
    """
    torch.manual_seed(seed)

    # Parse the names from the submitted text itself so the
    # "name → tokens" listing always matches what was tokenized.
    names = names_str.split("\n")

    # -------------------- Character-level Tokenization --------------------
    s_to_i, i_to_s = character_tokenizer(names_str.replace("\n", ""))
    vocab_size = len(s_to_i)

    # -------------------- Tokenized Names --------------------
    tokenized_names = [tokenize_name(name, s_to_i) for name in names]

    # -------------------- Model input (X) --------------------
    # every token except the last one of each name is an input ...
    xs = [token for tokenized_name in tokenized_names for token in tokenized_name[:-1]]
    X = torch.tensor(xs)

    # -------------------- Model targets (Y) --------------------
    # ... and the token that follows it is the target
    ys = [token for tokenized_name in tokenized_names for token in tokenized_name[1:]]
    Y = torch.tensor(ys)

    # -------------------- One-hot encoded input --------------------
    X_one_hot = torch.nn.functional.one_hot(input=X, num_classes=vocab_size)
    fig_one_hot, ax = plt.subplots(figsize=(8, 8))
    ax.imshow(X_one_hot, interpolation="nearest", cmap="Greys")

    # -------------------- Linear Model --------------------
    model = Linear(input_size=vocab_size, output_size=vocab_size)

    # -------------------- Training Loss --------------------
    loss = fit_model(model, X_one_hot, Y)

    # -------------------- Model-Fitted Probabilities --------------------
    prob_dist = get_prob_dist(model, vocab_size, i_to_s)
    fig_probs = plot_prob_dist(prob_dist, i_to_s)

    # -------------------- Sampled Output --------------------
    sampled_names = [
        sample_name_from_prob_dist(prob_dist, s_to_i, i_to_s)
        for _ in range(n_names_to_sample)
    ]

    # -------------------- Random Sampled Output --------------------
    # baseline: every next token is equally likely
    prob_uniform = torch.ones_like(prob_dist)
    prob_uniform /= prob_uniform.sum(dim=1, keepdim=True)
    random_sampled_names = [
        sample_name_from_prob_dist(prob_uniform, s_to_i, i_to_s)
        for _ in range(n_names_to_sample)
    ]

    return (
        s_to_i,
        "\n".join(
            f"{name} → {tokenized_name}"
            for name, tokenized_name in zip(names, tokenized_names)
        ),
        xs,
        ys,
        fig_one_hot,
        f"{loss.item():.4f}",
        fig_probs,
        "\n".join(sampled_names),
        "\n".join(random_sampled_names),
    )


# Default dataset pre-filled in the "Names Dataset" textbox.
names = [
    "emma",
    "ava",
]

# Gradio UI for the linear-model bigram demo: one input textbox with a submit
# button and nine output widgets laid out in rows.
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            title = gr.Markdown(
                """
                # Bigram Language Model via Linear Model
                """
            )

    # input: one name per line
    with gr.Row():
        with gr.Column():
            text1 = gr.Textbox(label="Names Dataset", value="\n".join(names))
            inbtw = gr.Button("Submit", variant="primary")

    # outputs 1-2: vocabulary and tokenized names
    with gr.Row():
        with gr.Column(scale=1, min_width=100):
            out1 = gr.Textbox(label="Character-level Tokenization")
        with gr.Column(scale=1, min_width=100):
            out2 = gr.Textbox(label="Tokenized Input")

    # outputs 3-5: training inputs / targets and their one-hot encoding
    with gr.Row():
        with gr.Column(scale=1, min_width=100):
            out3 = gr.Textbox(label="Model input (X)")
            out4 = gr.Textbox(label="Model targets (Y)")

        with gr.Column(scale=1, min_width=100):
            out5 = gr.Plot(label="One-hot encoded input")

    # outputs 6-7: final loss and the fitted probability table
    with gr.Row():
        with gr.Column():
            out6 = gr.Textbox(label="Training Loss (Negative Mean Log Likelihood)")
            out7 = gr.Plot(label="Model-Fitted Probabilities")

    # outputs 8-9: names sampled from the model vs. from a uniform baseline
    with gr.Row():
        with gr.Column(scale=1, min_width=100):
            out8 = gr.Textbox(label="Sampled Output")
        with gr.Column(scale=1, min_width=100):
            out9 = gr.Textbox(label="Random Sampled Output")

    # Only the textbox is wired as an input, so bigram_via_linear_model runs
    # with its default n_names_to_sample and seed; its nine return values fill
    # out1..out9 in order.
    inbtw.click(
        bigram_via_linear_model,
        inputs=text1,
        outputs=[out1, out2, out3, out4, out5, out6, out7, out8, out9],
    )

demo.launch(quiet=True)

Running on local URL:  http://127.0.0.1:7879




## The Curse of Dimensionality - Embeddings to the Rescue

**Exponential Growth in dimensions**
If we wanted to extend our context length to use a sequence of two characters instead of just one to predict the next character, our probability distribution matrix would need `vocab_size^2` rows (one per possible two-character context) by `vocab_size` columns. If we extend this to three characters, we would need `vocab_size^3` rows (an exponential growth in the number of rows of our matrix) - this means we will quickly:

- have a very large matrix that is very sparse (i.e. most of the values will be zero)
- need a lot of data to ensure we have enough examples of each combination of the characters. 


**Embedding into a vector space**
Instead, what Bengio et al. (2003) proposed was to embed each character into a vector space of a fixed size that is smaller than the vocabulary size. By compressing each character into a vector space, we can:

- learn the relationships between characters enabling us to transfer knowledge between similar characters.
- reduce the amount of data required to train the model.

In the paper, Bengio et al. (2003) used a 30-dimensional vector space to embed each word from a vocab size of 17,000. They then provide the following example of how the embedding can be used to learn the relationship between words:

"The cat is walking in the bedroom" can be used to generalize to "A dog was running in a room"

In [20]:
from pathlib import Path
from collections import deque

import gradio as gr
import matplotlib.pyplot as plt
import torch


def character_tokenizer(names_str):
    """Build the character-level lookup tables for a corpus of names.

    The vocabulary is every distinct character of ``names_str`` plus the
    special ``"<S>"`` marker, ordered with Python's default string sort.

    Args:
        names_str: All names concatenated into one string.

    Returns:
        A ``(s_to_i, i_to_s)`` pair: symbol -> integer id and its inverse.
    """
    symbols = list(set(names_str))
    symbols.append("<S>")
    symbols.sort()

    i_to_s = dict(enumerate(symbols))
    s_to_i = {symbol: index for index, symbol in i_to_s.items()}
    return s_to_i, i_to_s


def tokenize_name(name, s_to_i):
    """Convert a name into integer tokens wrapped in ``<S>`` markers.

    Args:
        name: The name to encode, one character per token.
        s_to_i: Mapping from symbol to integer id; must contain ``"<S>"``.

    Returns:
        ``[<S>, c_1, ..., c_n, <S>]`` as a list of integer ids.

    Raises:
        KeyError: If ``"<S>"`` or a character of ``name`` is not in ``s_to_i``.
    """
    # the same special token marks both the start and the end of a name
    boundary = s_to_i["<S>"]
    return [boundary, *(s_to_i[character] for character in name), boundary]


class Embedding:
    """Lookup table mapping each token id to a trainable vector."""

    def __init__(self, vocab_size, embedding_dim):
        # one randomly initialized row per token, tracked by autograd
        self.E = torch.randn(vocab_size, embedding_dim).requires_grad_()

    def forward(self, x):
        """Return the embedding rows selected by the token ids in ``x``."""
        token_ids = x.long()
        return self.E[token_ids]

    def parameters(self):
        """Return the list of trainable tensors of this layer."""
        return [self.E]


class Flatten:
    """Layer that merges the last two dimensions of a 3-D tensor."""

    def forward(self, x):
        """Reshape ``x`` from (B, T, C) to (B, T * C)."""
        batch, steps, channels = x.shape
        return x.view(batch, steps * channels)

    def parameters(self):
        """Return the trainable tensors of this layer (there are none)."""
        return []


class Linear:
    """Fully connected layer computing ``X @ W`` plus an optional bias.

    Args:
        input_size: Number of input features.
        output_size: Number of output features.
        bias: When true (the default) a trainable bias vector is added to
            the output; when false the layer is a pure matrix product.
    """

    def __init__(self, input_size, output_size, bias=True):
        self.W = torch.randn(input_size, output_size)
        self.W.requires_grad_()

        # always define the attribute so forward() can test it safely
        self.b = None
        if bias:
            self.b = torch.randn(output_size)
            self.b.requires_grad_()

    def forward(self, X):
        """Map ``X`` of shape (B, input_size) to (B, output_size)."""
        out = X @ self.W
        if self.b is not None:
            out = out + self.b
        return out

    def parameters(self):
        """Return the trainable tensors: the weights, then the bias if any."""
        if self.b is None:
            return [self.W]
        return [self.W, self.b]


class Sequential:
    """Container that applies a list of layers one after the other."""

    def __init__(self, layers):
        self.layers = layers

    def forward(self, x):
        """Feed ``x`` through every layer in order and return the result."""
        out = x
        for layer in self.layers:
            out = layer.forward(out)
        return out

    def __call__(self, x):
        """Allow the model to be called like a function."""
        return self.forward(x)

    def parameters(self):
        """Return the trainable tensors of all layers, in layer order."""
        return [param for layer in self.layers for param in layer.parameters()]


class BatchedDataset(torch.utils.data.Dataset):
    """Dataset pairing each input example with its target."""

    def __init__(self, x, y):
        # inputs and targets are stored as given and indexed in lockstep
        self.x, self.y = x, y

    def __len__(self):
        """Return the number of examples, taken from the inputs."""
        return len(self.x)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` pair stored at ``idx``."""
        example = self.x[idx]
        target = self.y[idx]
        return example, target


def fit_model(model, X, Y, n_epochs=10000, batch_size=32, learning_rate=0.1):
    """Train ``model`` with mini-batch gradient descent on cross-entropy loss.

    Args:
        model: Object exposing ``forward(x)`` returning logits and
            ``parameters()`` returning the tensors to update.
        X: Input examples, one row per example.
        Y: Integer class targets, one per example.
        n_epochs: Number of passes over the dataset.
        batch_size: Maximum number of examples per update; capped at the
            dataset size.
        learning_rate: Step size applied to every parameter's gradient.

    Returns:
        The loss tensor of the last mini-batch processed.

    Raises:
        ValueError: If the dataset is empty or ``n_epochs`` is smaller than 1
            (there would be no loss to return).
    """
    X = torch.as_tensor(X)
    Y = torch.as_tensor(Y)
    if len(X) == 0:
        raise ValueError("cannot fit a model on an empty dataset")
    if n_epochs < 1:
        raise ValueError("n_epochs must be at least 1")

    # The batch size is fixed once from the full dataset. The batches get
    # their own names so they cannot shadow X / Y and shrink the batch size
    # on later epochs.
    dataset = torch.utils.data.TensorDataset(X, Y)
    loader = torch.utils.data.DataLoader(
        dataset, batch_size=min(batch_size, len(dataset))
    )
    params = model.parameters()

    for _ in range(n_epochs):
        for x_batch, y_batch in loader:
            logits = model.forward(x_batch.float())

            loss = torch.nn.functional.cross_entropy(logits, y_batch)

            for param in params:
                param.grad = None

            loss.backward()

            # plain gradient-descent step, kept out of the autograd graph
            with torch.no_grad():
                for param in params:
                    param -= learning_rate * param.grad

    return loss


def build_dataset(tokenized_names, context_length):
    """Turn tokenized names into (context, next token) training examples.

    For every name, a sliding window of the ``context_length`` most recent
    tokens is paired with the token that follows it. The window starts out
    filled with the name's own start token.

    Args:
        tokenized_names: Token sequences that begin with the start token,
            as produced by ``tokenize_name``.
        context_length: Number of previous tokens used to predict the next.

    Returns:
        ``(xs, ys)`` where ``xs[k]`` is the list of context tokens and
        ``ys[k]`` the token to predict from it.
    """
    xs = []
    ys = []
    for tokenized_name in tokenized_names:
        if not tokenized_name:
            continue

        # Pad the context with the real <S> token taken from the sequence.
        # Its id is not guaranteed to be 0, because characters that sort
        # before "<" take the lower ids.
        start_token = tokenized_name[0]
        context = deque([start_token] * context_length, maxlen=context_length)
        for token in tokenized_name[1:]:
            xs.append(list(context))
            ys.append(token)
            # slide the window: the oldest token drops out automatically
            context.append(token)

    return xs, ys


@torch.no_grad()
def plot_embeddings(layer, i_to_s):
    """Scatter-plot the first two embedding dimensions, one point per token.

    Args:
        layer: Embedding layer whose ``E`` matrix holds one row per token
            and at least two columns.
        i_to_s: Mapping from integer token to the symbol drawn on its point.

    Returns:
        The Matplotlib figure holding the plot.
    """
    # plain Python floats for Matplotlib, detached from autograd
    coords = layer.E.detach()[:, :2].tolist()

    fig, ax = plt.subplots(figsize=(8, 8))
    # draw on the axes created above rather than on pyplot's implicit
    # "current" axes, so points and labels always share one figure
    ax.scatter(
        [point[0] for point in coords],
        [point[1] for point in coords],
        color="blue",
        s=800,
    )
    for i in range(len(i_to_s)):
        ax.text(
            coords[i][0],
            coords[i][1],
            i_to_s[i],
            ha="center",
            va="center",
            color="white",
            fontsize=12,
        )
    return fig


@torch.no_grad()
def predict_proba(model, X):
    """Return the model's class probabilities for every row of ``X``.

    The model is called on ``X`` and a softmax over dimension 1 turns its
    logits into probabilities. Runs without gradient tracking.
    """
    return model(X).softmax(dim=1)


def sample_name_from_prob_dist(prob_gen, s_to_i, i_to_s, context_length):
    """Sample one name from a model that conditions on a token window.

    Args:
        prob_gen: Callable taking a ``(1, context_length)`` tensor of tokens
            and returning the distribution over the next token.
        s_to_i: Mapping from symbol to integer token (must contain ``"<S>"``).
        i_to_s: Mapping from integer token to symbol.
        context_length: Number of previous tokens fed to ``prob_gen``.

    Returns:
        The sampled name, without the ``<S>`` boundary markers.
    """
    # the window starts out filled with the <S> start-of-name token
    window = deque([s_to_i["<S>"]] * context_length, maxlen=context_length)
    characters = []

    while True:
        # distribution over the next token given the current window
        next_token_probs = prob_gen(torch.tensor([list(window)]))

        # draw an integer token from that distribution
        sampled = torch.multinomial(input=next_token_probs, num_samples=1).item()
        symbol = i_to_s[sampled]

        # drawing <S> means the name is finished
        if symbol == "<S>":
            return "".join(characters)

        characters.append(symbol)

        # slide the window: the oldest token drops out automatically
        window.append(sampled)


def mlp_with_embedding(names_str, context_length="3", n_names_to_sample=10, seed=0):
    """Train an embedding + linear model on the names and sample from it.

    Args:
        names_str: Newline-separated names to train on.
        context_length: Number of previous characters used to predict the
            next one; converted with ``int``.
        n_names_to_sample: How many names to draw from the trained model and
            from the uniform baseline.
        seed: Seed passed to ``torch.manual_seed`` before the layers are
            created.

    Returns:
        An 8-tuple, in the order expected by the UI outputs:
        the symbol -> token mapping, one ``"name → tokens"`` line per name,
        one context per line, the list of targets, the embedding figure, the
        final training loss formatted to four decimals, the names sampled
        from the model, and the names sampled from a uniform distribution.
    """
    # -------------------- Preprocessing arguments --------------------
    embedding_size = 2
    context_length = int(context_length)
    names = names_str.split("\n")

    torch.manual_seed(seed)

    # -------------------- Character-Level Tokenization --------------------
    s_to_i, i_to_s = character_tokenizer("".join(names))
    vocab_size = len(s_to_i)

    # -------------------- Tokenized Input --------------------
    tokenized_names = [tokenize_name(name, s_to_i) for name in names]

    # -------------------- Model input (X) and targets (Y) --------------------
    xs, ys = build_dataset(tokenized_names, context_length)

    # -------------------- Training --------------------
    # layers are created in pipeline order so the random initialization
    # consumes the seeded generator in the same sequence
    embedding = Embedding(vocab_size, embedding_size)
    flatten = Flatten()
    head = Linear(context_length * embedding_size, vocab_size)
    model = Sequential([embedding, flatten, head])
    loss = fit_model(model, torch.tensor(xs), torch.tensor(ys))

    #  -------------------- Learned Embedding --------------------
    fig_embed = plot_embeddings(embedding, i_to_s)

    # -------------------- Sampled Output --------------------
    def model_probs(tokens):
        return predict_proba(model, tokens.float())

    sampled_names = [
        sample_name_from_prob_dist(model_probs, s_to_i, i_to_s, context_length)
        for _ in range(n_names_to_sample)
    ]

    # -------------------- Random Sampled Output --------------------
    # baseline: every next token is equally likely, whatever the context
    ones = torch.ones((1, vocab_size))
    prob_uniform = ones / ones.sum(dim=1, keepdim=True)

    def uniform_probs(tokens):
        return prob_uniform

    random_sampled_names = [
        sample_name_from_prob_dist(uniform_probs, s_to_i, i_to_s, 1)
        for _ in range(n_names_to_sample)
    ]

    tokenized_lines = [
        f"{name} → {tokenized_name}"
        for name, tokenized_name in zip(names, tokenized_names)
    ]
    context_lines = [str(x) for x in xs]

    return (
        s_to_i,
        "\n".join(tokenized_lines),
        "\n".join(context_lines),
        ys,
        fig_embed,
        f"{loss:.4f}",
        "\n".join(sampled_names),
        "\n".join(random_sampled_names),
    )


# Default dataset pre-filled in the "Names Dataset" textbox.
names = [
    "emma",
    "ava",
]
# Alternative (disabled): load the names from a "names.txt" file instead.
# names = Path("names.txt").read_text().splitlines()


# Gradio UI for the embedding demo: a names textbox, a context-length textbox,
# a submit button and eight output widgets laid out in rows.
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            title = gr.Markdown(
                """
                # Multi-Layer Perceptron with Embedding
                """
            )
    # inputs: one name per line, plus the context length as text
    with gr.Row():
        with gr.Column(scale=1, min_width=100):
            text1 = gr.Textbox(label="Names Dataset", value="\n".join(names))
        with gr.Column(scale=1, min_width=100):
            text2 = gr.Textbox(label="Context Length", value=3)

    with gr.Row():
        inbtw = gr.Button("Submit", variant="primary")

    # outputs 1-2: vocabulary and tokenized names
    with gr.Row():
        with gr.Column(scale=1, min_width=100):
            out1 = gr.Textbox(label="Character-level Tokenization")
        with gr.Column(scale=1, min_width=100):
            out2 = gr.Textbox(label="Tokenized Input")

    # outputs 3-5: training contexts / targets and the learned embedding plot
    with gr.Row():
        with gr.Column(scale=1, min_width=100):
            out3 = gr.Textbox(label="Model input (X)")
            out4 = gr.Textbox(label="Model targets (Y)")

        with gr.Column(scale=1, min_width=100):
            out5 = gr.Plot(label="Learned Embedding")

    # output 6: final training loss
    with gr.Row():
        with gr.Column():
            out6 = gr.Textbox(label="Training Loss (mean negative log-likelihood)")

    # outputs 7-8: names sampled from the model vs. from a uniform baseline
    with gr.Row():
        with gr.Column(scale=1, min_width=100):
            out7 = gr.Textbox(label="Sampled Output")
        with gr.Column(scale=1, min_width=100):
            out8 = gr.Textbox(label="Random Sampled Output")

    # The two textboxes map onto names_str and context_length; the remaining
    # parameters of mlp_with_embedding keep their defaults. Its eight return
    # values fill out1..out8 in order.
    inbtw.click(
        mlp_with_embedding,
        inputs=[text1, text2],
        outputs=[out1, out2, out3, out4, out5, out6, out7, out8],
    )

demo.launch(quiet=True)

Running on local URL:  http://127.0.0.1:7875




## GPT model - Attention is all you need

GPT is a transformer model: it uses the attention mechanism to learn contextual relations between the tokens of a text.

Attention function: $ \text{Attention}(Q, K, V) = \text{softmax}(\frac{QK^T}{\sqrt{d_k}}) V = \text{weights} V $

**Explanation**:
For a given token, we perform three linear transformations to obtain:

- the query vector Q
- the key vector K 
- the value vector V

The matrix product $QK^T$, scaled by $\frac{1}{\sqrt{d_k}}$ and passed through a softmax function, can be interpreted as a matrix of weights expressing the importance of each token in the sequence with respect to the given token.

The weighted matrix is then multiplied by the value vector V to obtain the contextualized representation of the given token.

As a first step - let's wrap our minds around how a weighted aggregation can be expressed as a matrix multiplication.


### Weighted aggregations as a matrix multiplication - a mathematical trick in self-attention

This shows how we can perform weighted aggregations using a matrix multiplication. 

In [25]:
import torch
import gradio as gr


def display_tensor(t):
    """Render a 2-D tensor as text, one row per line.

    Every element is formatted with two decimals and the elements of a row
    are separated by three spaces.
    """
    lines = []
    for row in t.tolist():
        cells = [format(elem, ".2f") for elem in row]
        lines.append("   ".join(cells))
    return "\n".join(lines)


def aggergate_via_matrix_mult(sequence_length=3, embedding_size=2, seed=0):
    """Show how a causal running average can be written as ``weights @ X``.

    The loop this demo replaces (B = batch size, T = sequence length /
    time steps, C = embedding size / channels):

        for b in range(B):
            for t in range(T):
                # select the batch and keep tokens 0..t
                xprev = x[b, : t + 1]
                # average those tokens, channel by channel
                xbow[b, t] = xprev.mean(dim=0)

    Args:
        sequence_length: T; converted with ``int`` so a textbox string works.
        embedding_size: C; converted with ``int`` so a textbox string works.
        seed: seed passed to ``torch.manual_seed`` before sampling ``X``.

    Returns:
        An 11-tuple of strings produced by ``display_tensor``:
        (ones weights, X, ones @ X,
         lower-triangular weights, X, tril @ X,
         row-normalized lower-triangular weights, X, normalized @ X,
         zeros / -inf mask, softmax of that mask).
    """
    torch.manual_seed(seed)
    T = int(sequence_length)
    C = int(embedding_size)
    square = (T, T)

    # Random integer-valued data, stored as floats so it can be multiplied.
    X = torch.randint(0, 10, (T, C)).float()

    # 1) All-ones weights: every output row is the sum over all tokens.
    w_ones = torch.ones(square)
    out_ones = w_ones @ X

    # 2) Lower-triangular ones: row t sums tokens 0..t only.
    w_tril = torch.ones(square).tril()
    out_tril = w_tril @ X

    # 3) Each row divided by its own sum: row t averages tokens 0..t.
    row_totals = w_tril.sum(dim=1, keepdim=True)
    w_tril_norm = w_tril / row_totals
    out_tril_norm = w_tril_norm @ X

    # 4) Zeros on and below the diagonal, -inf above it; the row-wise softmax
    #    of this mask is computed for display next to the mask itself.
    w_zeroes = torch.zeros(square).masked_fill(w_tril == 0, float("-inf"))
    softmaxed = torch.softmax(w_zeroes, dim=1)

    x_text = display_tensor(X)
    return (
        display_tensor(w_ones),
        x_text,
        display_tensor(out_ones),
        display_tensor(w_tril),
        x_text,
        display_tensor(out_tril),
        display_tensor(w_tril_norm),
        x_text,
        display_tensor(out_tril_norm),
        display_tensor(w_zeroes),
        display_tensor(softmaxed),
    )


# Gradio UI for the weighted-aggregation demo: two inputs (T and C), a submit
# button, and eleven read-only text boxes that receive the 11-tuple returned
# by aggergate_via_matrix_mult.
with gr.Blocks() as demo:
    # Header: explains the loop that the matrix multiplication replaces.
    with gr.Row():
        with gr.Column():
            title = gr.Markdown(
                """
                # Self attention trick: Weigthed-aggregation using matrix multiplication

                In a nutshell - We want to replace the following code with a matrix multiplication:

                ```python
                x = torch.rand((T, C))
                x_bag_of_words = torch.rand((T, C))
                for t in range(T):
                    # keep all past tokens up to t
                    xprev = x[: t + 1]  # t x C
                    # average over the channels across the past tokens
                    x_bag_of_words[t] = xprev.mean(dim=0)
                ```
                """
            )

    # Inputs: the callback converts both values with int().
    with gr.Row():
        with gr.Column(scale=1, min_width=100):
            text1 = gr.Textbox(
                label="Sequence Length or Block/Context size (T)", value=3
            )
        with gr.Column(scale=1, min_width=100):
            text2 = gr.Textbox(label="Channel size or Embedding Size (C)", value=2)

    with gr.Row():
        inbtw = gr.Button("Submit", variant="primary")

    # Row 1: all-ones weights, X, and their product.
    with gr.Row():
        with gr.Column(scale=1, min_width=100):
            out1 = gr.Textbox(label="Weight: Ones matrix (T, T)")
        with gr.Column(scale=1, min_width=100):
            out2 = gr.Textbox(label="X matrix (T, C)")
        with gr.Column(scale=1, min_width=100):
            out3 = gr.Textbox(label="Weight @ X (T, C)")

    # Row 2: lower-triangular weights, X, and their product.
    with gr.Row():
        with gr.Column(scale=1, min_width=100):
            out4 = gr.Textbox(label="Weights: lower triangluar matrix (T, T)")
        with gr.Column(scale=1, min_width=100):
            out5 = gr.Textbox(label="X matrix (T, C)")
        with gr.Column(scale=1, min_width=100):
            out6 = gr.Textbox(label="Weight @ X (T, C)")

    # Row 3: row-normalized lower-triangular weights, X, and their product.
    with gr.Row():
        with gr.Column(scale=1, min_width=100):
            out7 = gr.Textbox(
                label="Weights: normalized lower triangluar matrix (T, T)"
            )
        with gr.Column(scale=1, min_width=100):
            out8 = gr.Textbox(label="X matrix (T, C)")
        with gr.Column(scale=1, min_width=100):
            out9 = gr.Textbox(label="Weight @ X (T, C)")

    # Row 4: the zeros / -inf mask and its softmax.
    with gr.Row():
        with gr.Column(scale=1, min_width=100):
            out10 = gr.Textbox(label="Weights: lower triangluar zeros and infinities")
        with gr.Column(scale=1, min_width=100):
            out11 = gr.Textbox(label="Softmaxed weights")

    # Only T and C are wired as inputs, so the callback's `seed` parameter
    # keeps its default value of 0 on every click.
    inbtw.click(
        aggergate_via_matrix_mult,
        inputs=[text1, text2],
        outputs=[out1, out2, out3, out4, out5, out6, out7, out8, out9, out10, out11],
    )

demo.launch(quiet=True)

Running on local URL:  http://127.0.0.1:7880




In [27]:
from collections import deque
from pathlib import Path

import torch
import gradio as gr


class ScaledDotProductAttention(torch.nn.Module):
    """One head of causal (masked) scaled dot-product self-attention.

    Computes ``softmax(mask(Q @ K^T / sqrt(H))) @ V`` where Q, K and V are
    bias-free linear projections of the input and the mask hides every
    position to the right of the current one.

    Args:
        block_size: largest sequence length supported; sets the size of the
            lower-triangular mask buffer.
        num_embed: size C of the last dimension of the input.
        head_size: size H of the query/key/value projections.
    """

    def __init__(self, block_size, num_embed, head_size):
        super().__init__()
        self.head_size = head_size
        self.key = torch.nn.Linear(num_embed, head_size, bias=False)
        self.query = torch.nn.Linear(num_embed, head_size, bias=False)
        self.value = torch.nn.Linear(num_embed, head_size, bias=False)
        # Registered as a buffer: it follows the module across devices and is
        # part of the state dict, but it is not a trainable parameter.
        self.register_buffer(
            "tril_mask", torch.tril(torch.ones(block_size, block_size))
        )

    def forward(self, x):
        """Attend over ``x``.

        Args:
            x: tensor of shape (B, T, C) with T no larger than ``block_size``.

        Returns:
            Tensor of shape (B, T, H); row t only depends on rows 0..t of x.
        """
        _, T, _ = x.shape
        H = self.head_size

        # x (B, T, C) projected by (C, H) weights -> (B, T, H)
        K = self.key(x)
        Q = self.query(x)
        V = self.value(x)

        # Q (B, T, H) @ K^T (B, H, T) -> (B, T, T); W[b, t, s] = q_t . k_s.
        # The key matrix has to be *transposed*: reshaping it with
        # view(B, H, T) only reinterprets the memory, which mixes values from
        # different (including future) time steps into every score.
        W = Q @ K.transpose(-2, -1)
        W = W / H**0.5

        # W (B, T, T) masked with tril_mask[:T, :T] (T, T) -> (B, T, T);
        # -inf entries receive zero weight after the softmax.
        W = W.masked_fill(self.tril_mask[:T, :T] == 0, float("-inf"))
        W = torch.softmax(W, dim=-1)

        # W (B, T, T) @ V (B, T, H) -> (B, T, H)
        return W @ V


class MultiHeadAttention(torch.nn.Module):
    """Several attention heads run side by side on the same input.

    ``num_embed`` is split evenly across ``num_heads`` heads, so the
    concatenated head outputs have ``num_embed`` features again.
    """

    def __init__(self, block_size, num_embed, num_heads):
        super().__init__()
        assert num_embed % num_heads == 0
        per_head_size = num_embed // num_heads

        heads = []
        for _ in range(num_heads):
            heads.append(
                ScaledDotProductAttention(
                    block_size=block_size,
                    num_embed=num_embed,
                    head_size=per_head_size,
                )
            )
        self.attention_heads = torch.nn.ModuleList(heads)

    def forward(self, x):
        """Apply every head to ``x`` and concatenate along the last dim."""
        # each head contributes num_embed // num_heads features on dim -1
        head_outputs = [head(x) for head in self.attention_heads]
        return torch.cat(head_outputs, dim=-1)


class TransformerBlock(torch.nn.Module):
    """A Transformer block (with pre-normalization).

    Two residual sub-layers: multi-head self-attention followed by a
    two-layer MLP (``num_embed -> 4 * num_embed -> num_embed`` with GELU).
    LayerNorm is applied to the input of each sub-layer.

    Args:
        block_size: maximum sequence length, forwarded to the attention.
        num_embed: feature size of the input and output.
        num_heads: number of attention heads.
    """

    def __init__(self, block_size, num_embed, num_heads):
        super().__init__()
        self.ln_1 = torch.nn.LayerNorm(num_embed)
        self.attn = MultiHeadAttention(
            block_size=block_size, num_embed=num_embed, num_heads=num_heads
        )
        self.ln_2 = torch.nn.LayerNorm(num_embed)
        self.mlp = torch.nn.ModuleDict(
            dict(
                c_fc=torch.nn.Linear(num_embed, 4 * num_embed),
                c_proj=torch.nn.Linear(4 * num_embed, num_embed),
            )
        )

    def mlpf(self, x):
        """MLP forward: ``c_proj(gelu(c_fc(x)))``.

        This is a real method rather than a lambda stored on the instance.
        A lambda attribute cannot be pickled, and because it closes over the
        ``mlp`` object that existed at construction time, a ``copy.deepcopy``
        of the block would keep running the *original* block's MLP weights.
        """
        return self.mlp.c_proj(torch.nn.functional.gelu(self.mlp.c_fc(x)))

    def forward(self, x):
        """Return ``x`` after the attention and MLP residual updates."""
        x = x + self.attn(self.ln_1(x))
        x = x + self.mlpf(self.ln_2(x))
        return x


class TransformerModel(torch.nn.Module):
    """Transformer Language Model, GPT-2 like.

    Token and position embeddings are summed, passed through ``num_layers``
    transformer blocks and a final LayerNorm, then flattened over time and
    projected to one logit vector per sequence.
    """

    def __init__(self, vocab_size, num_embed, num_heads, num_layers, block_size):
        super().__init__()
        self.block_size = block_size
        self.vocab_size = vocab_size
        self.num_embed = num_embed

        token_embedding = torch.nn.Embedding(vocab_size, num_embed)
        position_embedding = torch.nn.Embedding(block_size, num_embed)

        blocks = torch.nn.ModuleList()
        for _ in range(num_layers):
            blocks.append(
                TransformerBlock(
                    block_size=block_size,
                    num_embed=num_embed,
                    num_heads=num_heads,
                )
            )

        final_norm = torch.nn.LayerNorm(num_embed)

        # keys are kept as wte / wpe / h / ln_f: they name the sub-modules
        self.transformer = torch.nn.ModuleDict(
            {
                "wte": token_embedding,
                "wpe": position_embedding,
                "h": blocks,
                "ln_f": final_norm,
            }
        )

        # The head consumes all positions at once, so its input size is
        # num_embed * block_size.
        self.lm_head = torch.nn.Linear(num_embed * block_size, vocab_size, bias=False)

    def forward(self, x):
        """Map token ids of shape (B, T) to logits of shape (B, vocab_size).

        Because ``lm_head`` expects ``num_embed * block_size`` features, T
        has to equal ``block_size``.
        """
        batch, steps = x.shape

        # (B, T, num_embed) token embeddings + (T, num_embed) position
        # embeddings, added by broadcasting over the batch dimension
        positions = torch.arange(steps, dtype=torch.long, device=x.device)
        hidden = self.transformer.wte(x) + self.transformer.wpe(positions)

        for block in self.transformer.h:
            hidden = block(hidden)
        hidden = self.transformer.ln_f(hidden)

        # flatten time and channels: (B, T, num_embed) -> (B, T * num_embed)
        flat = hidden.view(batch, steps * self.num_embed)
        return self.lm_head(flat)


def character_tokenizer(names_str):
    """Build character <-> integer-id lookup tables.

    The special start/end token ``"<S>"`` always gets id 0 and the distinct
    characters of ``names_str`` follow in sorted order. Putting ``"<S>"``
    first explicitly (rather than sorting it together with the characters)
    keeps its id at 0 even when the text contains characters that sort
    before ``"<"``, such as spaces, hyphens, apostrophes or digits.

    Args:
        names_str: all training text concatenated into one string.

    Returns:
        ``(s_to_i, i_to_s)``: token -> id and id -> token dictionaries.
    """
    vocab = ["<S>"] + sorted(set(names_str))
    s_to_i = {s: i for i, s in enumerate(vocab)}
    i_to_s = dict(enumerate(vocab))
    return s_to_i, i_to_s


def tokenize_name(name, s_to_i):
    """Convert ``name`` to token ids wrapped in the ``"<S>"`` token.

    Args:
        name: string to tokenize, one token per character.
        s_to_i: mapping from token (character or ``"<S>"``) to integer id.

    Returns:
        ``[<S>, id(c1), ..., id(cn), <S>]``; the same special token marks
        both the start and the end of the name.

    Raises:
        KeyError: if ``"<S>"`` or a character of ``name`` is not in
            ``s_to_i``.
    """
    boundary = s_to_i["<S>"]
    return [boundary, *(s_to_i[ch] for ch in name), boundary]


def build_dataset(tokenized_names, context_length):
    """Turn tokenized names into (context, next-token) training pairs.

    For every token after the first one, the example input is the
    ``context_length`` tokens that precede it and the target is the token
    itself. The context is padded on the left with the sequence's own first
    token (the ``"<S>"`` start marker added by ``tokenize_name``), so the
    padding is correct whatever id the tokenizer assigned to ``"<S>"``.

    Args:
        tokenized_names: iterable of token-id lists, each starting with the
            start token.
        context_length: number of tokens in every input context.

    Returns:
        ``(xs, ys)``: a list of contexts (lists of ``context_length`` ids)
        and the list of matching target ids.
    """
    xs = []
    ys = []
    for tokenized_name in tokenized_names:
        if not tokenized_name:
            # nothing to predict, and no start token to pad with
            continue
        start_token = tokenized_name[0]
        # sliding window over the name: maxlen drops the oldest token
        context = deque([start_token] * context_length, maxlen=context_length)
        for token in tokenized_name[1:]:
            xs.append(list(context))
            ys.append(token)
            context.append(token)

    return xs, ys


class BatchedDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs ``x[i]`` with ``y[i]``."""

    def __init__(self, x, y):
        # both containers are stored as given; nothing is copied or checked
        self.x, self.y = x, y

    def __len__(self):
        """Number of examples, taken from ``x``."""
        return len(self.x)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` pair stored at ``idx``."""
        features = self.x[idx]
        label = self.y[idx]
        return features, label


def fit_model(model, x, targets, num_epochs=2000, lr=1e-3, batch_size=64):
    """Train ``model`` with Adam on a cross-entropy objective.

    The data is visited in order (no shuffling) in mini-batches. The data
    loader is built once, from the full dataset, so the batch size stays the
    same in every epoch. Deriving it from a variable that the inner loop
    rebinds to the current batch would shrink it after the first epoch.

    Args:
        model: module mapping a batch of inputs to logits of shape
            (batch, num_classes).
        x: inputs, indexable along the first dimension (tensor or anything
            ``torch.as_tensor`` accepts).
        targets: class indices, one per row of ``x``.
        num_epochs: number of passes over the data.
        lr: Adam learning rate.
        batch_size: maximum mini-batch size; capped at the dataset size.

    Returns:
        The loss tensor of the last mini-batch processed.

    Raises:
        ValueError: if the dataset is empty or ``num_epochs`` is below 1.
    """
    x = torch.as_tensor(x)
    targets = torch.as_tensor(targets)
    if len(x) == 0:
        raise ValueError("cannot fit a model on an empty dataset")
    if num_epochs < 1:
        raise ValueError("num_epochs must be at least 1")

    # TensorDataset yields the pairs (x[i], targets[i]).
    dataset = torch.utils.data.TensorDataset(x, targets)
    loader = torch.utils.data.DataLoader(
        dataset, batch_size=min(batch_size, len(dataset))
    )

    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    for _ in range(num_epochs):
        for batch_x, batch_targets in loader:
            logits = model(batch_x)
            loss = torch.nn.functional.cross_entropy(logits, batch_targets)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    return loss


def transformer_model(
    names,
    block_size,
    num_embed,
    num_heads,
    num_layers,
):
    """Build a character-level transformer, train it on ``names``, report it.

    Args:
        names: newline-separated names (the raw textbox content).
        block_size: context length T; converted with ``int``.
        num_embed: embedding size C; converted with ``int``.
        num_heads: number of attention heads; converted with ``int``.
        num_layers: number of transformer blocks; converted with ``int``.

    Returns:
        ``(model_summary, n_params, loss_text)``: the ``repr`` of the
        model, the parameter count of ``model.transformer``, and the loss of
        the last training batch formatted with two decimals.

    Raises:
        ValueError: if ``names`` contains no non-blank line.
    """
    torch.manual_seed(0)

    # splitlines() copes with "\r\n" line endings; blank lines are dropped
    # so they neither become empty names nor add "\r" to the vocabulary.
    names = [line.strip() for line in names.splitlines()]
    names = [name for name in names if name]
    if not names:
        raise ValueError("at least one non-empty name is required")

    block_size, num_embed, num_heads, num_layers = map(
        int, [block_size, num_embed, num_heads, num_layers]
    )

    s_to_i, _ = character_tokenizer("".join(names))
    vocab_size = len(s_to_i)

    tokenized_names = [tokenize_name(name, s_to_i) for name in names]

    xs, ys = build_dataset(tokenized_names, context_length=block_size)

    model = TransformerModel(
        block_size=block_size,
        vocab_size=vocab_size,
        num_embed=num_embed,
        num_layers=num_layers,
        num_heads=num_heads,
    )

    model_summary = repr(model)

    # report number of parameters (note we don't count the decoder parameters in lm_head)
    n_params = sum(p.numel() for p in model.transformer.parameters())

    x = torch.tensor(xs)
    targets = torch.tensor(ys)
    loss = fit_model(model, x, targets)

    return (model_summary, n_params, f"{loss.item():.2f}")


# Default training names shown in the UI.
names = ["emma", "ava"]
# names = Path("names.txt").read_text().splitlines()

# Gradio UI for the transformer demo: a names box, four hyperparameter
# inputs, a submit button, and three outputs fed by transformer_model.
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            title = gr.Markdown(
                """
                # Transformer Model: Attention is all you need
                """
            )

    with gr.Row():
        with gr.Column():
            # NOTE: this rebinds the module-level `names` from the list above
            # to the Textbox component; the list is only used for the
            # initial value.
            names = gr.Textbox(label="Names", value="\n".join(names))

    # Hyperparameters: the callback converts all four values with int().
    with gr.Row():
        with gr.Column(scale=1, min_width=100):
            text1 = gr.Textbox(label="Block size (T)", value=3)
        with gr.Column(scale=1, min_width=100):
            text2 = gr.Textbox(label="Embedding Size (C)", value=4)

        with gr.Column(scale=1, min_width=100):
            text3 = gr.Textbox(label="Number of Heads (N)", value=2)

        with gr.Column(scale=1, min_width=100):
            text4 = gr.Textbox(label="Number of Layers (L)", value=2)

    with gr.Row():
        inbtw = gr.Button("Submit", variant="primary")

    # Outputs, in the order of the tuple returned by transformer_model:
    # model repr, parameter count, final training loss.
    with gr.Row():
        with gr.Column():
            out1 = gr.Textbox(label="Model Summary", max_lines=50)

    with gr.Row():
        with gr.Column():
            out2 = gr.Textbox(label="Number of Parameters")
        with gr.Column():
            out3 = gr.Textbox(label="Training Loss - Negative mean Log Likelihood")

    # The input order must match transformer_model's positional parameters:
    # names, block_size, num_embed, num_heads, num_layers.
    inbtw.click(
        transformer_model,
        inputs=[
            names,
            text1,
            text2,
            text3,
            text4,
        ],
        outputs=[out1, out2, out3],
    )

demo.launch(quiet=True)

Running on local URL:  http://127.0.0.1:7882




## Hyperparameter tuning with Ray Tune

### Ray cluster creation

In [14]:
import re
import yaml
import subprocess
import gradio as gr

cluster_file = "cluster.yaml"


def create_cluster(cluster_spec):
    """Write the cluster spec to ``cluster_file`` and run ``ray up`` on it.

    Args:
        cluster_spec: YAML text describing the cluster.

    Returns:
        The standard output of ``ray up -y``, decoded as UTF-8 and with ANSI
        escape sequences removed.

    Raises:
        subprocess.CalledProcessError: if ``ray up`` exits with a non-zero
            status (``check=True``).
    """
    # Round-trip through the safe loader so only plain YAML reaches the file.
    parsed_spec = yaml.safe_load(cluster_spec)
    with open(cluster_file, "w") as handle:
        yaml.dump(parsed_spec, handle)

    command = ["ray", "up", "-y", cluster_file]
    completed = subprocess.run(command, check=True, capture_output=True)
    raw_log = completed.stdout.decode("utf-8")

    # Remove ANSI color codes from captured stdout
    return re.sub(r"\x1B\[[0-?]*[ -/]*[@-~]", "", raw_log)


# Default cluster specification; it is dumped to YAML below and offered as
# the editable initial value of the "Cluster Specification" textbox.
cluster_spec = {
    "cluster_name": "ray-tune-cluster",
    "provider": {
        "type": "aws",
        "region": "us-west-2",
    },
    "auth": {
        "ssh_user": "ubuntu",
    },
    "min_workers": 0,
    "max_workers": 3,
    # Head and worker nodes use the same instance type and image.
    "available_node_types": {
        "head_node": {
            "node_config": {
                "InstanceType": "c5.xlarge",
                "ImageId": "ami-032a22dd6280fbf04",
            }
        },
        "worker_nodes": {
            "node_config": {
                "InstanceType": "c5.xlarge",
                "ImageId": "ami-032a22dd6280fbf04",
            },
        },
    },
    "head_node_type": "head_node",
    # Shell commands that install Python tooling, Ray and the training
    # dependencies.
    "setup_commands": [
        "sudo apt-get update",
        "sudo apt-get install python-is-python3",
        "pip3 install pip --upgrade",
        "pip3 install ray[air] torch pandas tensorboard",
    ],
}

# Gradio UI: an editable YAML spec, a button that calls create_cluster with
# the textbox content, and a textbox that shows the returned log.
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            title = gr.Markdown(
                """
                # Ray Cluster Setup on AWS (EC2)
                """
            )

    with gr.Row():
        with gr.Column():
            input_ = gr.Textbox(
                label="Cluster Specification",
                # sort_keys=False keeps the key order of the dict above
                value=yaml.dump(cluster_spec, sort_keys=False),
                max_lines=30,
            )

    with gr.Row():
        with gr.Column():
            inbtw = gr.Button("Create Cluster", variant="primary")

    with gr.Row():
        with gr.Column():
            output = gr.Textbox(label="Logs", value="")

    inbtw.click(
        create_cluster,
        inputs=[input_],
        outputs=[output],
    )

demo.launch(quiet=True)

Running on local URL:  http://127.0.0.1:7873




In [16]:
%%writefile simple_tuner.py

import torch
import time
import torch.nn as nn
import argparse
import ray
from ray import train, tune
from ray.tune.tuner import Tuner, TuneConfig
from ray.air import session, Checkpoint
from ray.train.torch import TorchTrainer
from ray.air.config import ScalingConfig


class NeuralNetwork(nn.Module):
    """Fully connected network with one hidden layer: Linear -> ReLU -> Linear."""

    def __init__(self, input_size, layer_size, output_size):
        super().__init__()
        # attribute names double as state-dict keys
        self.layer1 = nn.Linear(input_size, layer_size)
        self.relu = nn.ReLU()
        self.layer2 = nn.Linear(layer_size, output_size)

    def forward(self, input):
        """Return ``layer2(relu(layer1(input)))``."""
        hidden = self.layer1(input)
        activated = self.relu(hidden)
        return self.layer2(activated)


def train_loop_per_worker(config):
    """Training function executed on every Ray Train worker.

    Fits a ``NeuralNetwork`` to the ``"train"`` dataset shard with SGD and a
    mean-squared-error loss, reporting the last batch loss after each epoch.

    Args:
        config: dict with the keys ``input_size``, ``layer_size``,
            ``output_size``, ``lr`` and ``num_epochs``.
    """
    dataset_shard = session.get_dataset_shard("train")
    model = NeuralNetwork(
        input_size=config["input_size"],
        layer_size=config["layer_size"],
        output_size=config["output_size"],
    )
    # Prepare the model first so the optimizer is built from the parameters
    # of the model that is actually trained.
    model = train.torch.prepare_model(model)

    loss_fn = nn.MSELoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=config["lr"])

    for epoch in range(config["num_epochs"]):
        # stays NaN when this worker's shard yields no batch in the epoch
        last_loss = float("nan")
        for batches in dataset_shard.iter_torch_batches(
            batch_size=32, dtypes=torch.float
        ):
            # Give inputs AND labels a trailing feature dimension, (B,) ->
            # (B, 1). With (B,) labels against a (B, 1) output, MSELoss
            # broadcasts both to (B, B) and compares every prediction with
            # every label.
            inputs = torch.unsqueeze(batches["x"], 1)
            labels = torch.unsqueeze(batches["y"], 1)
            output = model(inputs)
            loss = loss_fn(output, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            last_loss = loss.item()

        session.report(
            {"loss": last_loss},
            # note checkpointing requires s3 storage path
            # checkpoint=Checkpoint.from_dict(
            #     dict(epoch=epoch, model=model.state_dict())
            # ),
        )


# Script entry point: connect to a Ray cluster, build a toy regression
# dataset, and tune the learning rate of the TorchTrainer defined above.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # Address passed to ray.init; None when the flag is omitted.
    parser.add_argument("--address")
    args = parser.parse_args()
    ray.init(address=args.address)

    # 200 samples of the line y = 2x + 1.
    train_dataset = ray.data.from_items([{"x": x, "y": 2 * x + 1} for x in range(200)])
    scaling_config = ScalingConfig(num_workers=3, use_gpu=False)
    
    # Trainer with default config
    trainer = TorchTrainer(
        train_loop_per_worker=train_loop_per_worker,
        train_loop_config={
            "input_size": 1,
            "layer_size": 15,
            "output_size": 1,
            "num_epochs": 3,
            "lr": 1e-3,
        },
        scaling_config=scaling_config,
        datasets={"train": train_dataset},
    )

    # Define the search space.
    # Only "lr" is listed here; presumably the remaining train_loop_config
    # keys come from the trainer's defaults above (verify against the Ray
    # Tune docs).
    param_space = {"train_loop_config": {"lr": tune.loguniform(0.0001, 0.01)}}

    tuner = Tuner(
        trainer,
        param_space=param_space,
        # 5 sampled configurations, compared on the reported "loss" (lower
        # is better)
        tune_config=TuneConfig(num_samples=5, metric="loss", mode="min"),
    )

    # Execute tuning.
    result_grid = tuner.fit()

    # Fetch the best result.
    best_result = result_grid.get_best_result()
    print("Best Result:", best_result)

Overwriting simple_tuner.py


In [18]:
!ray down -y cluster.yaml

2023-08-21 12:09:14,768	INFO util.py:375 -- setting max workers for head node type to 0
2023-08-21 12:09:14,768	INFO util.py:379 -- setting max workers for worker_nodes to 3
[33mLoaded cached provider configuration[39m
[33mIf you experience issues with the cloud provider, try re-running the command with [1m--no-config-cache[22m[26m.[39m
Destroying cluster. [4mConfirm [y/N]:[24m y [2m[automatic, due to --yes][22m
2023-08-21 12:09:14,847	INFO util.py:375 -- setting max workers for head node type to 0
2023-08-21 12:09:14,847	INFO util.py:379 -- setting max workers for worker_nodes to 3
[37mFetched IP[39m: [1m35.84.181.167[22m
[33mStopped only 7 out of 8 Ray processes within the grace period 16 seconds. Set `[1m-v[22m[33m` to see more details. Remaining processes [psutil.Process(pid=9587, name='gcs_server', status='terminated', started='16:01:01')] will be forcefully terminated.[39m
[33mYou can also use `[1m--force[22m[33m` to forcefully terminate processes or set h

## A note on tokenizers

In [29]:
import gradio as gr
from transformers import AutoTokenizer
from string import ascii_lowercase, ascii_uppercase, punctuation, digits, whitespace


# Tokenizers already loaded, keyed by model name, so that repeated clicks do
# not reload them.
_TOKENIZER_CACHE = {}


def _get_tokenizer(model_name):
    """Return the pretrained tokenizer for ``model_name``, loading it once."""
    if model_name not in _TOKENIZER_CACHE:
        _TOKENIZER_CACHE[model_name] = AutoTokenizer.from_pretrained(model_name)
    return _TOKENIZER_CACHE[model_name]


def tokenize_input(input_text="Hello tokenization!"):
    """Tokenize ``input_text`` at character level and with two sub-word tokenizers.

    The character vocabulary is the fixed string of ASCII letters,
    punctuation, digits and whitespace; a character's id is its position in
    that string. Characters outside the vocabulary get the id ``-1`` instead
    of raising, so arbitrary text (accents, emoji, ...) can be tokenized.

    Args:
        input_text: text to tokenize.

    Returns:
        A 12-tuple: tokens (char, GPT-2, BERT), token ids (char, GPT-2,
        BERT), vocabulary sizes (char, GPT-2, BERT; the last two formatted
        with thousands separators) and sequence lengths (char, GPT-2, BERT).
    """
    # -------------- Character tokenization ---------------
    vocab = ascii_lowercase + ascii_uppercase + punctuation + digits + whitespace
    # dict lookup instead of a linear vocab.index scan per character
    char_to_id = {char: idx for idx, char in enumerate(vocab)}
    char_tokens = list(input_text)
    char_token_ids = [char_to_id.get(char, -1) for char in char_tokens]
    char_vocab_size = len(vocab)
    char_seq_len = len(char_tokens)

    # --------------- Sub-word tokenization ---------------
    gpt2_tokenizer = _get_tokenizer("gpt2")
    gpt2_tokens = gpt2_tokenizer.tokenize(input_text)
    gpt2_token_ids = gpt2_tokenizer.convert_tokens_to_ids(gpt2_tokens)
    gpt2_vocab_size = gpt2_tokenizer.vocab_size
    gpt2_seq_len = len(gpt2_tokens)

    bert_tokenizer = _get_tokenizer("bert-base-cased")
    bert_tokens = bert_tokenizer.tokenize(input_text)
    bert_token_ids = bert_tokenizer.convert_tokens_to_ids(bert_tokens)
    bert_vocab_size = bert_tokenizer.vocab_size
    bert_seq_len = len(bert_tokens)

    return (
        char_tokens,
        gpt2_tokens,
        bert_tokens,
        char_token_ids,
        gpt2_token_ids,
        bert_token_ids,
        char_vocab_size,
        f"{gpt2_vocab_size:,}",
        f"{bert_vocab_size:,}",
        char_seq_len,
        gpt2_seq_len,
        bert_seq_len,
    )


# Gradio UI for the tokenizer comparison: one text input and a 4 x 3 grid of
# outputs (rows: tokens, ids, vocabulary size, sequence length; columns:
# character level, GPT-2, BERT).
# The BERT columns are labelled "WordPiece": BERT's tokenizer is based on
# WordPiece, not SentencePiece.
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            text1 = gr.Textbox(label="Text", value="hello tokenization!")
            inbtw = gr.Button("Tokenize", variant="primary")

    # Row 1: tokens
    with gr.Row():
        with gr.Column(scale=1, min_width=100):
            out1 = gr.Textbox(label="Character level Tokens")
        with gr.Column(scale=1, min_width=100):
            out2 = gr.Textbox(label="Sub-word level Tokens (BPE/GPT2)")
        with gr.Column(scale=1, min_width=100):
            out3 = gr.Textbox(label="Sub-word level Tokens (WordPiece/BERT)")

    # Row 2: token ids
    with gr.Row():
        with gr.Column(scale=1, min_width=100):
            out4 = gr.Textbox(label="Character level Token Ids")
        with gr.Column(scale=1, min_width=100):
            out5 = gr.Textbox(label="Sub-word level Token Ids (BPE/GPT2)")
        with gr.Column(scale=1, min_width=100):
            out6 = gr.Textbox(label="Sub-word level Token Ids (WordPiece/BERT)")

    # Row 3: vocabulary sizes
    with gr.Row():
        with gr.Column(scale=1, min_width=100):
            out7 = gr.Textbox(label="Character level Tokenizer Vocab Size")
        with gr.Column(scale=1, min_width=100):
            out8 = gr.Textbox(label="Sub-word level Tokenizer Vocab Size (BPE/GPT2)")
        with gr.Column(scale=1, min_width=100):
            out9 = gr.Textbox(
                label="Sub-word level Tokenizer Vocab Size (WordPiece/BERT)"
            )

    # Row 4: sequence lengths
    with gr.Row():
        with gr.Column(scale=1, min_width=100):
            out10 = gr.Textbox(label="Character level Tokenizer Sequence Length")
        with gr.Column(scale=1, min_width=100):
            out11 = gr.Textbox(
                label="Sub-word level Tokenizer Sequence Length (BPE/GPT2)"
            )
        with gr.Column(scale=1, min_width=100):
            out12 = gr.Textbox(
                label="Sub-word level Tokenizer Sequence Length (WordPiece/BERT)"
            )

    # The output order matches the 12-tuple returned by tokenize_input.
    inbtw.click(
        tokenize_input,
        inputs=text1,
        outputs=[
            out1,
            out2,
            out3,
            out4,
            out5,
            out6,
            out7,
            out8,
            out9,
            out10,
            out11,
            out12,
        ],
    )

demo.launch(quiet=True)

Running on local URL:  http://127.0.0.1:7884


