In [1]:
import tensorflow as tf

tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

config = tf.compat.v1.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.95 # Change this value as per requirement
tf.compat.v1.keras.backend.set_session(tf.compat.v1.Session(config=config))

2024-02-07 17:18:47.732425: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-02-07 17:18:47.796438: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-07 17:18:47.796474: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-07 17:18:47.798215: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-02-07 17:18:47.809049: I tensorflow/core/platform/cpu_feature_guar

In [2]:
import pandas as pd
import numpy as np
import transformers
import datasets
import pickle as pkl
import keras_nlp
import transformers
import matplotlib.pyplot as plt

from tensorflow.keras.layers import Dense, Lambda, Input, Softmax, Add, Concatenate
from tensorflow.keras import Model
from random import randint
from tqdm import tqdm
from datasets import load_dataset
from tensorflow import keras
from transformers import RobertaTokenizerFast, TFRobertaForCausalLM, TFRobertaModel, TFRobertaForMaskedLM, get_scheduler


Using TensorFlow backend


In [3]:
L = 50
C = 20
B = L-C
K = 5


### Dataset

In [4]:
# dataset = load_dataset("Skylion007/openwebtext", streaming=True, split='train')

# num_rows_to_read = 500000
# texts = []

# for i, item in enumerate(tqdm(dataset, total=num_rows_to_read)):
#     text = item['text']
#     if i < num_rows_to_read:
#         texts.append(text)
#     else:
#         break

In [5]:
texts = pkl.load(open('./data/500k-row.pkl', 'rb'))

### Tokenizer and endless datagenerator

In [6]:
roberta_path = 'roberta-large'
tf_hub_roberta_path = 'roberta_base_en'

In [7]:
tokenizer = RobertaTokenizerFast.from_pretrained(roberta_path)

vocab_size = tokenizer.vocab_size

In [8]:
def sample_text():
    while True:
        text = texts[randint(0, len(texts))]

        tokens = tokenizer.encode_plus(text, return_tensors='np')
        
        start_index = np.random.randint(0, tokens['input_ids'].shape[-1]-L)
        
        input_ids = tokens['input_ids'][0][start_index:start_index+L]
        attention_mask = tokens['attention_mask'][0][start_index:start_index+L]

        yield input_ids, attention_mask

def dict_map(input_ids, attention_mask):
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask
    }

### Diffusion stuff

In [9]:
timesteps = 2500
beta = np.linspace(0.0001, 0.02, timesteps)

alpha = 1 - beta
alpha_bar = np.cumprod(alpha, 0)
alpha_bar = np.concatenate((np.array([1.]), alpha_bar[:-1]), axis=0)
sqrt_alpha_bar = np.sqrt(alpha_bar)
one_minus_sqrt_alpha_bar = np.sqrt(1-alpha_bar)

def set_key(key):
    np.random.seed(key)

def forward_noise(x_0, t, key=0):
    set_key(key)
    noise = np.random.normal(size=x_0.shape)
    reshaped_sqrt_alpha_bar_t = np.reshape(np.take(sqrt_alpha_bar, t), (-1, 1, 1))
    reshaped_one_minus_sqrt_alpha_bar_t = np.reshape(np.take(one_minus_sqrt_alpha_bar, t), (-1, 1, 1))
    noisy_image = reshaped_sqrt_alpha_bar_t  * x_0 + reshaped_one_minus_sqrt_alpha_bar_t  * noise
    return noisy_image, noise

def generate_timestamp(num, key=0):
    set_key(key)
    return tf.random.uniform(shape=[num], minval=0, maxval=timesteps, dtype=tf.int32)

def tokens_to_logits(toks):
    toks = tf.cast(toks, dtype=tf.int32)
    return tf.one_hot(toks, vocab_size, K, -K, dtype=tf.float16)

In [10]:
tokens = tokenizer.batch_encode_plus(["This is one", "This is two"], return_tensors='tf')['input_ids']
logits = tokens_to_logits(tokens)
noised, noise = forward_noise(logits, 20)

2024-02-07 17:19:01.538881: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-02-07 17:19:01.541153: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-02-07 17:19:01.542900: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

- optimizer
```optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)```

- timestep encoder
```timestep_layer = torch.nn.Linear(1, hidden_size, bias=True)```

- embedding sum layer
```embedding_sum_layer = torch.nn.Linear(vocab_size, hidden_size, bias=False)```



### Import RoBERTa model

In [11]:
# roberta = keras_nlp.models.RobertaBackbone.from_preset(tf_hub_roberta_path)

In [12]:
# embedding = roberta.get_layer("embeddings")
# embedding_layer_norm = roberta.get_layer("embeddings_layer_norm")
# embeddings_dropout = roberta.get_layer("embeddings_dropout")

# roberta_embedder_context = Model(inputs=embedding.input, outputs=embeddings_dropout.output, name='roberta_embedder_context')
# roberta_embedder_diffusion = Model(inputs=embedding.input, outputs=embeddings_dropout.output, name='roberta_embedder_diffusion')
# hidden_size = embedding.weights[0].shape[1]

In [13]:
# timestamp_input = Input(shape=(1), name='timestamp_input', dtype=tf.int32)
# timestamp_embeddings = Dense(hidden_size, name='timestamp_embeddings')(timestamp_input)
# timestamp_encoder = Model(inputs=timestamp_input, outputs=timestamp_embeddings, name='timestamp_encoder')

In [14]:
# token_ids = Input(shape=(L, hidden_size), name='new_token_ids')
# padding_mask = Input(shape=(L), name='new_padding_mask')

# x = roberta.layers[5](token_ids, padding_mask=padding_mask)

# for i in range(6, len(roberta.layers)):
#     x = roberta.layers[i](x, padding_mask=padding_mask)

# new_roberta = Model(inputs=[token_ids, padding_mask], outputs=x)

In [15]:
# new_roberta.summary()

## huggingface

In [23]:
BATCH_SIZE = 10

dataset = tf.data.Dataset.from_generator(
    sample_text, 
    output_signature=(
       tf.TensorSpec(shape=(L), dtype=tf.int32),
       tf.TensorSpec(shape=(L), dtype=tf.int32),
    )).batch(BATCH_SIZE).map(dict_map).prefetch(tf.data.AUTOTUNE)

In [17]:
roberta = TFRobertaForMaskedLM.from_pretrained(roberta_path)
roberta.resize_token_embeddings(vocab_size)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaForMaskedLM: ['roberta.embeddings.position_ids']
- This IS expected if you are initializing TFRobertaForMaskedLM from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaForMaskedLM from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFRobertaForMaskedLM were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForMaskedLM for predictions without further training.


<tf.Variable 'tf_roberta_for_masked_lm/roberta/embeddings/word_embeddings/weight:0' shape=(50265, 1024) dtype=float32, numpy=
array([[-0.140625  , -0.0096283 ,  0.03909302, ...,  0.05078125,
        -0.00592422, -0.03604126],
       [ 0.0078125 , -0.015625  ,  0.015625  , ..., -0.015625  ,
         0.02305603,  0.015625  ],
       [-0.08282471, -0.0007    , -0.11737061, ...,  0.10864258,
         0.06964111, -0.0355835 ],
       ...,
       [ 0.03930664,  0.00305557,  0.04650879, ..., -0.02404785,
        -0.05050659,  0.03424072],
       [ 0.04986572,  0.02723694,  0.04125977, ..., -0.0369873 ,
        -0.00996399,  0.00713348],
       [-0.01489258, -0.01136017, -0.02224731, ...,  0.04406738,
         0.0116272 , -0.03302002]], dtype=float32)>

In [18]:
roberta_embedder = roberta.get_input_embeddings()
hidden_size = roberta.get_input_embeddings().weight.shape[-1]

In [19]:
timestamp_input = Input(shape=(1), name='timestamp_input', dtype=tf.int32)
timestamp_embeddings = Dense(hidden_size, name='timestamp_embeddings')(timestamp_input)
timestamp_encoder = Model(inputs=timestamp_input, outputs=timestamp_embeddings, name='timestamp_encoder')

In [20]:
preprocess_text_input = Input(shape=(B, vocab_size), name='diffusion text input')
embedding_sum_layer = Dense(hidden_size, use_bias=False, name='middle_embeddings')(preprocess_text_input)
pertubed_inputs = Softmax(name='softmax')(embedding_sum_layer)
diffusion_preprocessor = Model(inputs=[preprocess_text_input], outputs=[pertubed_inputs])

diffusion_preprocessor.layers[1].set_weights([roberta_embedder.weights[0]])

## Test Run

In [21]:
max_steps = 50000
decay_steps = 1000
base_lr = 1e-4

lr_scheduler = tf.keras.optimizers.schedules.CosineDecayRestarts(
    base_lr,
    decay_steps,
    alpha=0.1
)
optimizer_preprocessor=keras.optimizers.AdamW(learning_rate=lr_scheduler)
optimizer_embedder=keras.optimizers.AdamW(learning_rate=lr_scheduler)
optimizer_roberta=keras.optimizers.AdamW(learning_rate=lr_scheduler)

optimizer_preprocessor.build(diffusion_preprocessor.trainable_weights)
optimizer_embedder.build(roberta_embedder.trainable_weights)
optimizer_roberta.build(roberta.trainable_weights)

2024-02-07 17:19:06.576956: I external/local_tsl/tsl/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory


In [24]:
i = 0
while i < max_steps:
    test_batch = dataset.as_numpy_iterator().next()
    test_ids = test_batch['input_ids']
    attention_mask = test_batch['attention_mask']

    context = test_ids[:, :C]
    diffusion_text = test_ids[:, C:L]

    diffusion_logits = tokens_to_logits(diffusion_text)

    t = randint(0, timesteps)

    t_embed = timestamp_encoder(np.array([t]))

    diffusion_logits, noise = forward_noise(diffusion_logits, t)

    with tf.GradientTape() as tape:

        perturbed_input_text = diffusion_preprocessor(diffusion_logits)
        diffusion_stuff = perturbed_input_text + t_embed

        context_embeds = roberta_embedder(context)

        final_embeds = tf.concat((context_embeds, perturbed_input_text), axis=1)

        outputs = roberta.call(inputs_embeds=final_embeds, output_hidden_states=False).logits
        diffusion_outputs = outputs[:, C:]
        loss_value = keras.losses.sparse_categorical_crossentropy(diffusion_text, diffusion_outputs, from_logits=True)
        loss_value = tf.math.reduce_mean(keras.losses.sparse_categorical_crossentropy(diffusion_text, diffusion_outputs, from_logits=True))

    preprocessor_grads, embedder_grads, model_grads = tape.gradient(loss_value, [diffusion_preprocessor.trainable_weights, roberta_embedder.trainable_weights, roberta.trainable_weights])

    optimizer_preprocessor.apply_gradients(zip(preprocessor_grads, diffusion_preprocessor.trainable_weights))
    optimizer_embedder.apply_gradients(zip(embedder_grads, roberta_embedder.trainable_weights))
    optimizer_roberta.apply_gradients(zip(model_grads, roberta.trainable_weights))

    print(f"{i}: {loss_value}", end='\r')
    i+=1

985: 7.5660467147827155