In [3]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch

# Load pre-trained model and tokenizer with error handling
try:
    model = GPT2LMHeadModel.from_pretrained('gpt2')
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
except Exception as e:
    print(f"Error loading model or tokenizer: {e}")
    exit()

# Tokenization
inputs = tokenizer.encode("How big is the Earth?", return_tensors='pt')

# Prepare attention mask
attention_mask = torch.ones(inputs.shape)

# Set pad token id
pad_token_id = tokenizer.eos_token_id

# Model Processing with enhancements
with torch.no_grad():  # Efficient generation without gradients
    outputs = model.generate(
        inputs,
        max_length=50,  # Control the maximum length
        top_k=50,       # Top-k sampling
        top_p=0.95,     # Top-p sampling
        no_repeat_ngram_size=2,  # Avoid repeating n-grams
        attention_mask=attention_mask,  # Use attention mask
        pad_token_id=pad_token_id       # Set pad token id
    )

# Detokenization
text_output = tokenizer.decode(outputs[0])
print(text_output)




How big is the Earth?

The Earth is about 1.5 billion miles (2.4 billion kilometers) across. The Earth's surface is composed of about 2.6 billion cubic miles of water.
...
 (The


In [None]:
print()

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Embedding, Dense, LayerNormalization, Dropout
from tensorflow.keras.models import Model

# Define the Transformer block
class TransformerBlock(tf.keras.layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerBlock, self).__init__()
        self.att = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = tf.keras.Sequential(
            [Dense(ff_dim, activation="relu"), Dense(embed_dim),]
        )
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)

    def call(self, inputs, training):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

# Token and Position Embedding
class TokenAndPositionEmbedding(tf.keras.layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super(TokenAndPositionEmbedding, self).__init__()
        self.token_emb = Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

# Create the model
class TransformerLanguageModel(Model):
    def __init__(self, vocab_size, maxlen, embed_dim, num_heads, ff_dim, num_blocks, rate=0.1):
        super(TransformerLanguageModel, self).__init__()
        self.embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
        self.transformer_blocks = [TransformerBlock(embed_dim, num_heads, ff_dim, rate) for _ in range(num_blocks)]
        self.out = Dense(vocab_size)

    def call(self, inputs):
        x = self.embedding_layer(inputs)
        for transformer_block in self.transformer_blocks:
            x = transformer_block(x)
        x = self.out(x)
        return x

# Model parameters
vocab_size = 10000  # Size of the vocabulary
maxlen = 100  # Maximum sequence length
embed_dim = 256  # Embedding size for each token
num_heads = 4  # Number of attention heads
ff_dim = 256  # Hidden layer size in feedforward network
num_blocks = 2  # Number of transformer blocks

model = TransformerLanguageModel(vocab_size, maxlen, embed_dim, num_heads, ff_dim, num_blocks)

# Example usage
input_sequence = tf.random.uniform((1, maxlen), dtype=tf.int32, minval=0, maxval=vocab_size)
output = model(input_sequence)
print(output.shape)  # (batch_size, sequence_length, vocab_size)


In [None]:
import math
import torch
import torch.nn as nn

# Define the model
class TransformerModel(nn.Module):
    def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5):
        super(TransformerModel, self).__init__()
        from torch.nn import TransformerEncoder, TransformerEncoderLayer
        self.model_type = 'Transformer'
        self.pos_encoder = PositionalEncoding(ninp, dropout)
        encoder_layers = TransformerEncoderLayer(ninp, nhead, nhid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.encoder = nn.Embedding(ntoken, ninp)
        self.ninp = ninp
        self.decoder = nn.Linear(ninp, ntoken)

        self.init_weights()

    def _generate_square_subsequent_mask(self, sz):
        mask = torch.triu(torch.ones(sz, sz) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, src, src_mask):
        src = self.encoder(src) * math.sqrt(self.ninp)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src, src_mask)
        output = self.decoder(output)
        return output

# Implement positional encoding
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

# Example usage
ntokens = 1000  # size of vocabulary
emsize = 200    # embedding dimension
nhid = 200      # the dimension of the feedforward network model in nn.TransformerEncoder
nlayers = 2     # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder
nhead = 2       # the number of heads in the multiheadattention models
dropout = 0.2   # the dropout value
model = TransformerModel(ntokens, emsize, nhead, nhid, nlayers, dropout)

# Mock input to the model for demonstration (a sequence of token IDs)
src = torch.randint(ntokens, (35, 20))  # 35 sequence length, 20 batch size
src_mask = model._generate_square_subsequent_mask(src.size(0))
output = model(src, src_mask)

print(output.shape)  # (sequence length, batch size, vocabulary size)
