In [1]:
import horovod.tensorflow as hvd

In [2]:
import json
import tensorflow as tf2
import tensorflow.compat.v1 as tf
import numpy as np
from gpt2_keras.gpt2 import GPT2
from gpt2_keras.builder import original_gpt2
from gpt2_keras.builder.builder import build
# from .builder.builder import build
from gpt2_keras.encoder import get_encoder

In [None]:
hvd.init()

In [None]:
def top_k_logits(logits, k):
    if k == 0:
        # no truncation
        return logits

    def _top_k():
        values, _ = tf.nn.top_k(logits, k=k)
        min_values = values[:, -1, tf.newaxis]
        return tf.compat.v1.where(
            logits < min_values,
            tf.ones_like(logits, dtype=logits.dtype) * -1e10,
            logits,
        )
    return tf.cond(
        pred=tf.equal(k, 0),
        true_fn=lambda: logits,
        false_fn=lambda: _top_k(),
    )




with open("./models/124M/hparams.json") as f:
    config = json.load(f)
#
gpt2 = GPT2(config, name='gpt2')

# x= tf.placeholder(dtype=tf.int32, shape=[None, None])
# y = gpt2(x)

# print(type(config))

# gpt2= build(config, "./models/124M/model.ckpt.data-00000-of-00001", name='gpt2')
gpt2= build(config, "./models/124M/model.ckpt", name='gpt2')

print(type(gpt2))
# print(gpt2.layers[1].layers) # The Transformer

embedding_layer = gpt2.layers[0]


print(embedding_layer)  # The Embedding Layer


print("printing vocab size:",  embedding_layer.vocab_size) #50257
print("printing word embedding:",  embedding_layer.word_embedding) #(50257 , 768)=


# gpt2.compile(
#     optimizer=tf2.optimizers.RMSprop(lr=0.01),
#     loss = tf2.keras.losses.MeanSquaredError(),
#     metrics = ['accuracy']
# )


print(gpt2.summary())
print("printing Transformer summary")
print(gpt2.layers[1].summary())

# print(gpt2.summary())
batch_size =1
max_seq_length = 1024
word_embedding = 768
tf.keras.backend.set_floatx('float64')
# input1 = np.random.randint(embedding_layer.vocab_size, size=(batch_size, 5, embedding_layer.word_embedding[-1]))



# input1 = np.random.randint(embedding_layer.vocab_size, size=(batch_size,max_seq_length))
# output = gpt2(input1)
# print(output)

model_dir = "./models/"
model_name = "124M"

In [None]:
enc =get_encoder(model_name, model_dir)
raw_text = "What is interesting is the fact that the first"
raw_text1 = "My family is doing fine."
raw_text2 = "But, I think"
# raw_text += '<|endoftext|>'
bpe_tokens = enc.encode(raw_text)
bpe_tokens1 = enc.encode(raw_text1)
# bpe_tokens2 = enc.encode(raw_text2)



print("bpe_tokens: ", bpe_tokens)
print("bpe_tokens1: ", bpe_tokens1)

In [None]:
# bpe_tokens1.append(50256)
# print(bpe_tokens1)
decoded = enc.decode(bpe_tokens)
print(decoded)

In [None]:
while len(bpe_tokens1) != len(bpe_tokens):
    bpe_tokens1.append(220)
    
print("bpe_tokens1 AFTER \n pad: ", bpe_tokens1)

    

"""
The tokens have to be either padded or be of the same length to be input as batch.
"""

In [None]:
#Without the endoftext : [3792, 534, 1641, 880, 30]

In [None]:
# start_token = enc.encoder['<|startoftext|>'] #
end_token = enc.encoder['<|endoftext|>']

# print(enc.decoder)

In [None]:
output2 = gpt2([bpe_tokens])
two_batch = [bpe_tokens, bpe_tokens1]
output3 = gpt2(two_batch)

print("**printing output2.shape**")
print(output2.shape)

print("printing argmax of logits")
output2_int = np.argmax(output2, axis=2)


print("**printing output3.shape**")
print(output3.shape)

print("printing output3")
print(output3)



print(output2_int)


In [None]:
# from transformers import GPT2Tokenizer

# tokenizer = GPT2Tokenizer()


In [None]:
decoded_output = enc.decode(output2_int[0])
print(decoded_output)

"""
The decoded output is similar to that of the RNN where one token gives one output.
(This one output is the argmax of the gpt output, the immediate token that has the highest logit value)
(So, top_k is basically taking k argmaxes from one row vector of the logit matrix)
SO, for raw_text = "What is interesting is the fact that the first"

the output is "is the about that fact that the first time"

This means
<input>      <output>
What ==>     is
is ===>      the
interesting ===> about
is ===> that

"""

In [None]:
encoded_space = enc.encode(" ")
print(encoded_space)

In [None]:
enc.decode([220])

In [None]:
enc.decode([198])

In [None]:
enc.decode([12])

In [None]:
# print("end_token is: ", start_token)
print("end_token is: ", end_token)

In [None]:
print("printing output2 :", output2)
print("")

In [None]:
print("printing output3 :", output3)
print("printing decoded output3")
output3_int_arr = np.argmax(output3, axis=2)
decoded_output3 = [enc.decode(output3_int) for output3_int in output3_int_arr]
print(decoded_output3)

In [None]:
print(np.array(two_batch).shape)

In [None]:
"""

For sparse_softmax_cross_entropy_with_logits, labels must have the shape [batch_size] and the dtype int32 or int64. Each label is an int in range [0, num_classes-1].
For softmax_cross_entropy_with_logits, labels must have the shape [batch_size, num_classes] and dtype float32 or float64.
Labels used in softmax_cross_entropy_with_logits are the one hot version of labels used in sparse_softmax_cross_entropy_with_logits.

Another tiny difference is that with sparse_softmax_cross_entropy_with_logits, you can give -1 as a label to have loss 0 on this label.

"""

In [None]:
print(two_batch)

In [None]:
tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=np.array(two_batch)[:, 1:],
            logits=output3[:, :-1])

In [None]:
loss = tf.reduce_mean(
        input_tensor=tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=np.array(two_batch)[:, 1:],
            logits=output3[:, :-1])
)


In [None]:
print(loss)

In [3]:
optimizer = tf2.keras.optimizers.Adam
optimizer = hvd.DistributedOptimizer(optimizer)


ValueError: Provided optimizer doesn't inherit from either legacy TensorFlow or Keras optimizer: <class 'tensorflow.python.keras.optimizer_v2.adam.Adam'>