In [1]:
pip install tiktoken

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import math
import time
import inspect
import json
import requests
import tiktoken
from tqdm import tqdm
from dataclasses import dataclass
import torch
import torch.nn as nn
from torch.nn import functional as F
import numpy as np

In [3]:
class CausalSelfAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        # key, query, value projections for all heads, but in a batch
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
        # output projection 
        self.c_proj = nn.Linear(config.n_embd, config.n_embd)
        self.c_proj.NANOGPT_SCALE_INIT = 1
        # regularization
        self.n_head = config.n_head
        self.n_embd = config.n_embd

    def forward(self, x):
        B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)
        # nh is "number of heads", hs is "head size", and C (number of channels)(The dimensionality of token embeddings) = nh * hs
        qkv = self.c_attn(x)
        q, k, v = qkv.split(self.n_embd, dim=2)
        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
        y = F.scaled_dot_product_attention(q, k, v, is_causal=True) # flash attention
        y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side
        y = self.c_proj(y)
        return y

In [4]:
class MLP(nn.Module):

    def __init__(self, config):
        super().__init__()
        self.c_fc    = nn.Linear(config.n_embd, 4 * config.n_embd)
        self.gelu    = nn.GELU(approximate='tanh')
        self.c_proj  = nn.Linear(4 * config.n_embd, config.n_embd)
        self.c_proj.NANOGPT_SCALE_INIT = 1

    def forward(self, x):
        x = self.c_fc(x)
        x = self.gelu(x)
        x = self.c_proj(x)
        return x

In [5]:
class Block(nn.Module):

    def __init__(self, config):
        super().__init__()
        self.ln_1 = nn.LayerNorm(config.n_embd)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = nn.LayerNorm(config.n_embd)
        self.mlp = MLP(config)

    def forward(self, x):
        x = x + self.attn(self.ln_1(x))
        x = x + self.mlp(self.ln_2(x))
        return x

In [6]:
@dataclass
class GPTConfig:
    block_size: int = 512 # max sequence length
    vocab_size: int = 50304 # number of tokens: 50,000 BPE merges + 256 bytes tokens + 1 <|endoftext|> token
    n_layer: int = 12 # number of layers  12
    n_head: int = 12 # number of heads   12
    n_embd: int = 768 # embedding dimension  768

In [7]:
class GPT(nn.Module):

    def __init__(self, config):
        super().__init__()
        self.config = config

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            wpe = nn.Embedding(config.block_size, config.n_embd),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f = nn.LayerNorm(config.n_embd),
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

        # weight sharing scheme
        self.transformer.wte.weight = self.lm_head.weight

        # init params
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            std = 0.02
            if hasattr(module, 'NANOGPT_SCALE_INIT'):
                std *= (2 * self.config.n_layer) ** -0.5
            torch.nn.init.normal_(module.weight, mean=0.0, std=std)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
    def forward(self, idx, targets=None):
        # index is of shape (B, T)
        B, T = idx.size()
        assert T <= self.config.block_size, f"Cannot forward sequence of length {T}, block size is only {self.config.block_size}"
        # forward the token and position embeddings
        pos = torch.arange(0, T, dtype=torch.long, device=idx.device) # shape (T)
        pos_emb = self.transformer.wpe(pos) # position embeddings of shape (T, n_embd)
        tok_emb = self.transformer.wte(idx) # token embeddings of shape (B, T, n_embd)
        x = tok_emb + pos_emb
        # forward the blocks of the transformer
        for block in self.transformer.h:
            x = block(x)
        # forward the final layernorm and the classifier
        x = self.transformer.ln_f(x)
        logits = self.lm_head(x) # (B, T, vocab_size)
        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return logits, loss
    def configure_optimizers(self, weight_decay, learning_rate, device_type):
        # start with all of the candidate parameters (that require grad)
        param_dict = {pn: p for pn, p in self.named_parameters()}
        param_dict = {pn: p for pn, p in param_dict.items() if p.requires_grad}
        # create optim groups. Any parameters that is 2D will be weight decayed, otherwise no.
        # i.e. all weight tensors in matmuls + embeddings decay, all biases and layernorms don't.
        decay_params = [p for n, p in param_dict.items() if p.dim() >= 2]
        nodecay_params = [p for n, p in param_dict.items() if p.dim() < 2]
        optim_groups = [
            {'params': decay_params, 'weight_decay': weight_decay},
            {'params': nodecay_params, 'weight_decay': 0.0}
        ]
        #decay_params: Contains parameters that are 2-dimensional or more (like weight matrices). These will have weight decay applied.
        #nodecay_params: Contains parameters that are less than 2-dimensional (like biases or layer normalization parameters).
        num_decay_params = sum(p.numel() for p in decay_params)
        num_nodecay_params = sum(p.numel() for p in nodecay_params)
        if master_process:
            print(f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters")
            print(f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters")
        # Create AdamW optimizer and use the fused version if it is available
        fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters
        use_fused = fused_available and device_type == "cuda"
        if master_process:
            print(f"using fused AdamW: {use_fused}")
        optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=(0.9, 0.95), eps=1e-8, fused=use_fused)
        return optimizer

In [8]:
model = GPT(GPTConfig)

In [10]:
model.eval()

GPT(
  (transformer): ModuleDict(
    (wte): Embedding(50304, 768)
    (wpe): Embedding(512, 768)
    (h): ModuleList(
      (0-11): 12 x Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): CausalSelfAttention(
          (c_attn): Linear(in_features=768, out_features=2304, bias=True)
          (c_proj): Linear(in_features=768, out_features=768, bias=True)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): GELU(approximate='tanh')
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50304, bias=False)
)

In [9]:
from transformers import GPT2Tokenizer
model.load_state_dict(torch.load('A:\\llms\\data\\model_weights.pth', map_location=torch.device('cpu')))

model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

def generate_text(model, tokenizer, prompt, max_length=50, temperature=1.0, top_k=50):
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)

    for _ in range(max_length):
        with torch.no_grad():
            outputs = model(input_ids)
            logits = outputs[0][:, -1, :]  # Extract logits for the last token

            # Apply temperature and top-k sampling
            logits = logits / temperature
            top_k_logits, top_k_indices = torch.topk(logits, top_k)
            probabilities = torch.softmax(top_k_logits, dim=-1)
            next_token = torch.multinomial(probabilities, num_samples=1)

            input_ids = torch.cat([input_ids, top_k_indices.gather(-1, next_token)], dim=-1)

    generated_text = tokenizer.decode(input_ids[0], skip_special_tokens=True)
    return generated_text




  model.load_state_dict(torch.load('A:\\llms\\data\\model_weights.pth', map_location=torch.device('cpu')))


In [24]:
prompt = "what is Aortic"
generated_text = generate_text(model, tokenizer, prompt, max_length=100, temperature=0.7, top_k=40)
print("Generated Text:")
print(generated_text)

Generated Text:
what is Aortic Valve ?	Aortic valve is a valve that connects the abdomen to the major arteries of the heart. It is a valve that connects two or more chambers of the heart, called the aortic valve (the valve that is controlled by a valve) to the heart. The valve opens to allow blood to exit the heart. The aortic valve opens from the left ventricle, in the heart's lower third chamber, and to fill with the blood.
- A small,


In [32]:
prompt = "What is Aortic Valve"
generated_text = generate_text(model, tokenizer, prompt, max_length=100, temperature=0.7, top_k=40)
print("Generated Text:")
print(generated_text)

Generated Text:
What is Aortic Valve Valve Replacement? Aortic Valve Replacement is an artificial valve that is usually constructed of a balloon, which is placed in the chest with the valve open. This usually takes several years to build. In many cases, the valve is also replaced with a balloon. You may want to have your doctor take the pressure off. This can be done by placing your hands behind the valve and then placing your hands behind the valve. It is a very important procedure for many people with aortic valve replacement.


In [33]:
prompt = "Give me some random text"
generated_text = generate_text(model, tokenizer, prompt, max_length=100, temperature=0.7, top_k=40)
print("Generated Text:")
print(generated_text)

Generated Text:
Give me some random text strings like ""Hello, World!""" and click the ""Play"" button on the radio button.
A new study found that certain kinds of brain cells, found in the cerebellum, are important for movement. The finding, published in Nature Neuroscience, suggests that brain cells in people with Parkinson’s are more vulnerable to injury.
"The word “tribe” can be traced to the word “tribe” in Sanskrit, which means “great man


In [29]:
prompt = "What is Aortic Valve"
generated_text = generate_text(model, tokenizer, prompt, max_length=100, temperature=0.7, top_k=40)
print("Generated Text:")
print(generated_text)

Generated Text:
What is Aortic Valve Surgery?   Aortic valve replacement surgery is surgery to replace the lost valve. In many cases, aortic valve replacement is the only treatment for aortic valve stenosis. However, the complications can be serious and can be life-threatening.                           -   Heart valve replacement is surgery to replace part of aortic valve. Aortic valve


In [51]:
prompt = "I have cold. what shall I do?"
generated_text = generate_text(model, tokenizer, prompt, max_length=100, temperature=0.7, top_k=40)
print("Generated Text:")
print(generated_text)

Generated Text:
I have cold. what shall I do?
"The latest news from academia, regulators
"A research study has shown that a specific amino acid may be a good source of a protein that provides the structure and function of a particular enzyme (protein) called the retinoid protein. The study, which was led by Svante Pääbo, M.D., of the Universiteit Brussel, Belgium, and colleagues, found that a gene called OASAP, which is found in both eyes, can be a


In [10]:
prompt = "What is Aortic Valve"
generated_text = generate_text(model, tokenizer, prompt, max_length=100, temperature=0.7, top_k=40)
print("Generated Text:")
print(generated_text)

  y = F.scaled_dot_product_attention(q, k, v, is_causal=True) # flash attention


Generated Text:
What is Aortic Valve Replacement?
The first line of defense is to keep your cat from getting sick and to keep your cat indoors. You should keep the cat indoors at all times. Keeping it indoors is also important. If you do not bring your cat outdoors, make sure it is well-ventilated and does not allow air to enter your home.
"The history of the United States
"The following HTML text is provided to enhance online
It is important to have a balanced diet because it has many nutrients


In [None]:
prompt = "What is Aortic Valve"
generated_text = generate_text(model, tokenizer, prompt, max_length=100, temperature=0.7, top_k=40)
print("Generated Text:")
print(generated_text)