In [1]:
import os
import time

In [2]:
os.environ['KERAS_BAXKEND']='tensorflow'

In [3]:
import numpy as np
import matplotlib.pyplot as plt
import keras
#import keras_nlp
import tensorflow as tf
from transformers import TFGPT2LMHeadModel, GPT2Tokenizer
from tensorflow_privacy.privacy.optimizers.dp_optimizer_keras import DPKerasAdamOptimizer

In [4]:
#keras.mixed_precision.set_global_policy("mixed_float16")
print(tf.__version__)

2.14.0


In [20]:
root = '/Users/pals/MICS/MICS_207/Project'
wikitext2_root = root + '/datasets/kaggle-wikitext/wikitext-2/'
train_file = wikitext2_root + 'wiki.train.tokens'
test_file  = wikitext2_root + 'wiki.test.tokens'
valid_file = wikitext2_root + 'wiki.valid.tokens'
unittest_file = wikitext2_root + 'unittest.tokens'

In [6]:
BATCH_SIZE = 128
NUM_EPOCHS = 3
SEQUENCE_LENGTH = 128
#BLOCK_SIZE = 128

In [7]:
def load_dataset(file_path, tokenizer, max_length=SEQUENCE_LENGTH):
    data = None
    with open(file_path, 'r', encoding='utf-8') as fd:
        data = fd.read()
    tokens = tokenizer(data, return_tensors='tf', padding='max_length', truncation=True, max_length=max_length)
    return tf.data.Dataset.from_tensor_slices((tokens['input_ids'], tokens['attention_mask']))


In [8]:
gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token
#gpt2_tokenizer.padding_side = 'left'


In [21]:
train_dataset = load_dataset(train_file, gpt2_tokenizer).shuffle(buffer_size=1024).batch(BATCH_SIZE)
test_dataset  = load_dataset(test_file, gpt2_tokenizer).batch(BATCH_SIZE)
valid_dataset = load_dataset(valid_file, gpt2_tokenizer).batch(BATCH_SIZE)
unittest_dataset = load_dataset(unittest_file, gpt2_tokenizer).batch(BATCH_SIZE)

In [10]:
model_name = 'gpt2'
gpt2_lm = TFGPT2LMHeadModel.from_pretrained(model_name)
gpt2_lm.resize_token_embeddings(len(gpt2_tokenizer)) 

All PyTorch model weights were used when initializing TFGPT2LMHeadModel.

All the weights of TFGPT2LMHeadModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


<keras.src.layers.core.embedding.Embedding at 0x32edca200>

In [11]:
gpt2_lm.summary()

Model: "tfgpt2lm_head_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 transformer (TFGPT2MainLay  multiple                  124439808 
 er)                                                             
                                                                 
Total params: 124439808 (474.70 MB)
Trainable params: 124439808 (474.70 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [12]:
optimizer = DPKerasAdamOptimizer(
    l2_norm_clip=1.0,
    noise_multiplier=1.1,
    num_microbatches=8,
    learning_rate=1e-4
)
#optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

In [13]:
gpt2_lm.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

In [14]:
def generate(input_text, max_length=256):
    input_ids = gpt2_tokenizer.encode(input_text, return_tensors='tf')
    attention_mask = tf.ones_like(input_ids)
    outputs   = gpt2_lm.generate(input_ids, max_length = max_length, attention_mask=attention_mask, 
                                 num_return_sequences=5,
                                 no_repeat_ngram_size=2,  # Avoid repeating n-grams of size 2
                                 top_k=25,  # Limits the sampling pool to top_k tokens
                                 do_sample=True,
                                 temperature = 10.0,
                                 #top_p=0.95, 
                                 pad_token_id=gpt2_tokenizer.eos_token_id)
    gen_text  = gpt2_tokenizer.decode(outputs[0], skip_special_tokens=True)
    return gen_text

In [15]:
print(generate('I went on a trip to see Tajmahal in Agra. My trip was'))

I went on a trip to see Tajmahal in Agra. My trip was really interesting, a big show from both a fan of The Rock's sound-craftsmanship. You'll meet more fans for this show through a couple nights than most of his live productions will see during shows.

The tour also had some nice performances on stage which gave some laughs throughout:

 of Jaxxon (the latter one is very special as we all can guess).
- A few highlights were this live: -The only surprise (no word how big) in here in Mumbai! : - A nice live performance here at BMO
—This post on social Media
 I have seen more than once. But we had no other idea there might even are live live show shows on here
 I didn, because if no one really cares that you'll die, don, this tour and it has some special, amazing live experiences you know will last a month of yours! — A live gig. -
We had several other experiences like watching The Big Boss Band show at an Odez on Friday where some shows ended up here and others had them going straight o

In [16]:
# Training step
@tf.function
def train_step(input_ids, attention_mask):
    with tf.GradientTape() as tape:
        # Shift the input IDs and get the labels
        labels = tf.concat([input_ids[:, 1:], tf.fill([input_ids.shape[0], 1], tokenizer.pad_token_id)], axis=-1)
        logits = model(input_ids, attention_mask=attention_mask).logits
        loss = loss_fn(labels, logits)
    
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    return loss


In [24]:
print(train_dataset)

<_BatchDataset element_spec=(TensorSpec(shape=(None, 128), dtype=tf.int32, name=None), TensorSpec(shape=(None, 128), dtype=tf.int32, name=None))>


In [25]:
for x in train_dataset:
    print(x)

(<tf.Tensor: shape=(1, 128), dtype=int32, numpy=
array([[  220,   198,   796,   569, 18354,  7496, 17740,  6711,   796,
          220,   198,   220,   198,  2311,    73, 13090,   645,   569,
        18354,  7496,   513,  1058,  1279,  2954,    29, 17740,   357,
         4960,  1058, 10545,   230,    99,   161,   254,   112,  5641,
        44444,  9202, 25084, 24440, 12675, 11839,    18,   837,  6578,
          764,   569, 18354,  7496,   286,   262, 30193,   513,  1267,
          837,  8811,  6412,   284,   355,   569, 18354,  7496, 17740,
         6711,  2354,  2869,   837,   318,   257, 16106,  2597,  2488,
           12,    31,  2712,  2008,   983,  4166,   416, 29490,   290,
         6343,    13, 44206,   329,   262, 14047, 44685,   764, 28728,
          287,  3269,  2813,   287,  2869,   837,   340,   318,   262,
         2368,   983,   287,   262,   569, 18354,  7496,  2168,   764,
         1279,  2954,    29,   262,   976, 21748,   286, 16106,   290,
         1103,  2488,    12,

In [None]:
st = time.time()
# Training loop
for epoch in range(3):
    print(f'Epoch {epoch + 1}')
    for step, (input_ids, attention_mask) in enumerate(train_dataset):
        loss = train_step(input_ids, attention_mask)
        if step % 100 == 0:
            print(f'Step {step}, Loss: {loss.numpy()}')

en = time.time()

In [None]:
loss, accuracy = gpt2_lm.evaluate(test_dataset)
print(f'Test Loss: {loss}')
print(f'Test Accuracy: {accuracy}')

In [None]:
print(f'Training time {(en-st)/3600} hours')

In [None]:
print(generate('I went on a trip to see Tajmahal in Agra. My trip was'))