In [2]:
from dataclasses import dataclass
import torch
import torch.nn as nn
from torch.nn import functional as F
import torch
from rotary_embedding_torch import RotaryEmbedding


In [3]:
# pip install transformers
#https://huggingface.co/HuggingFaceTB/SmolLM2-135M/blob/main/config.json
from transformers import AutoModelForCausalLM, AutoTokenizer
checkpoint = "HuggingFaceTB/SmolLM2-135M"
device = "cpu" # for GPU usage or "cpu" for CPU usage
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# for multiple GPUs install accelerate and do `model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map="auto")`
model = AutoModelForCausalLM.from_pretrained(checkpoint).to(device)
inputs = tokenizer.encode("Gravity is", return_tensors="pt").to(device)
outputs = model.generate(inputs)
print(tokenizer.decode(outputs[0]))


  from .autonotebook import tqdm as notebook_tqdm
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Gravity is the force that holds the Earth and the Moon together.

The Moon is a satellite of the


In [4]:
print(model)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(49152, 576)
    (layers): ModuleList(
      (0-29): 30 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=576, out_features=576, bias=False)
          (k_proj): Linear(in_features=576, out_features=192, bias=False)
          (v_proj): Linear(in_features=576, out_features=192, bias=False)
          (o_proj): Linear(in_features=576, out_features=576, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=576, out_features=1536, bias=False)
          (up_proj): Linear(in_features=576, out_features=1536, bias=False)
          (down_proj): Linear(in_features=1536, out_features=576, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((576,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((576,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((576,), eps=1e-05)
    (rotary_emb): LlamaRotaryEm

In [5]:
# The tokenizer is an instance of GPT2TokenizerFast, pre-trained with the checkpoint "HuggingFaceTB/SmolLM2-135M".
# It has a vocabulary size of 49152 and a maximum model length of 8192 tokens.
# Special tokens like <|endoftext|>, <|im_start|>, and others are defined for specific purposes.
# https://huggingface.co/HuggingFaceTB/SmolLM2-360M/raw/main/tokenizer.json
# This tokenizer is used to encode and decode text for the language model.

In [6]:
with open('input.txt', 'r') as f:
    text = f.read()
print (text[0:100])
sample = text[0:100]
inputs = tokenizer.encode(sample,return_tensors="pt")
print (inputs)
print(tokenizer.decode(inputs[0]))

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You
tensor([[ 5345, 32062,    42,   198,  6121,   392,  7219,   750,  2030,    28,
          4875,   549,  3287,    30,   198,   198,  4518,    42,   198, 15024,
           494,    28,  3287,    30,   198,   198,  5345, 32062,    42,   198,
          2683]])
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [7]:
import json

# Load the JSON data
json_data = ''
with open('config.json', 'r') as f:
    json_data = f.read()


data = json.loads(json_data)

# Fill the class with the JSON data


In [8]:
@dataclass
class SmolLM2Config:
        architectures: list
        attention_bias: bool
        attention_dropout: float
        bos_token_id: int
        eos_token_id: int
        hidden_act: str
        hidden_size: int
        initializer_range: float
        intermediate_size: int
        is_llama_config: bool
        max_position_embeddings: int
        model_type: str
        num_attention_heads: int
        num_hidden_layers: int
        num_key_value_heads: int
        pretraining_tp: int
        rms_norm_eps: float
        rope_interleaved: bool
        rope_scaling: any
        rope_theta: int
        tie_word_embeddings: bool
        torch_dtype: str
        device: str
        transformers_version: str
        use_cache: bool
        vocab_size: int

config = SmolLM2Config(**data)
print(config)

SmolLM2Config(architectures=['LlamaForCausalLM'], attention_bias=False, attention_dropout=0.0, bos_token_id=0, eos_token_id=0, hidden_act='silu', hidden_size=576, initializer_range=0.041666666666666664, intermediate_size=1536, is_llama_config=True, max_position_embeddings=8192, model_type='llama', num_attention_heads=9, num_hidden_layers=30, num_key_value_heads=3, pretraining_tp=1, rms_norm_eps=1e-05, rope_interleaved=False, rope_scaling=None, rope_theta=100000, tie_word_embeddings=True, torch_dtype='bfloat16', device='cpu', transformers_version='4.40.1', use_cache=True, vocab_size=49152)


#explain the below code with an example 

In [9]:
class CausalLMAttention(nn.Module):
    """
    Causal Attention module for the SmolLM2 model.
     LlamaAttention(
          (q_proj): Linear(in_features=576, out_features=576, bias=False)
          (k_proj): Linear(in_features=576, out_features=576, bias=False)
          (v_proj): Linear(in_features=576, out_features=576, bias=False)
          (o_proj): Linear(in_features=576, out_features=576, bias=False)
        )
    """
    def __init__(self, config):
        super().__init__()
        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = config.hidden_size // self.num_attention_heads
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.q_proj = nn.Linear(config.hidden_size, self.all_head_size,bias=False)  #(576,576)
        self.k_proj = nn.Linear(config.hidden_size, self.all_head_size,bias=False)  #(576,576)
        self.v_proj = nn.Linear(config.hidden_size, self.all_head_size,bias=False) #(576,576)
        self.o_proj = nn.Linear(config.hidden_size, config.hidden_size,bias=False)
        rotary_emb = RotaryEmbedding(dim = config.hidden_size)
        self.rotary_emb = rotary_emb
        self.dropout = nn.Dropout(config.attention_dropout)
        self.register_buffer("bias", torch.tril(torch.ones(8192, 8192)).view(1, 1, 8192, 8192))
    
    def forward(self,x):
        bsz, seq_len, _ = x.size()  #(B,S,HiddenSize) (Eg: (2,10,576) Batch=2,SeqLen=10,HiddenSize=576)
        #Below one change the dimension from (2,10,576) to (2,10,15,64) and then transpose to (2,15,)
        query_layer = self.q_proj(x).view(bsz, seq_len, self.num_attention_heads, self.attention_head_size).transpose(1, 2) #(2,15,10,64)
        key_layer = self.k_proj(x).view(bsz, seq_len, self.num_attention_heads, self.attention_head_size).transpose(1, 2)
        value_layer = self.v_proj(x).view(bsz, seq_len, self.num_attention_heads, self.attention_head_size).transpose(1, 2)
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) / (self.attention_head_size ** 0.5)
        attention_scores = attention_scores.masked_fill(self.bias[:, :, :seq_len, :seq_len] == 0, float("-inf"))
        attention_probs = F.softmax(attention_scores, dim=-1)
        attention_probs = self.dropout(attention_probs)

        context_layer = torch.matmul(attention_probs, value_layer)
        context_layer = context_layer.transpose(1, 2).contiguous().view(bsz, seq_len, self.all_head_size)

        return context_layer


In [10]:
class CausalLMAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.num_attention_heads = config.num_attention_heads
        self.num_kv_heads = config.num_key_value_heads
        self.num_key_value_groups = self.num_attention_heads // self.num_kv_heads
        self.attention_head_size = config.hidden_size // self.num_attention_heads   

        self.q_proj = nn.Linear(config.hidden_size, config.hidden_size,bias=False)  #(576,576)
        self.k_proj = nn.Linear(config.hidden_size, self.num_kv_heads * self.attention_head_size,bias=False)  #(576,192)
        self.v_proj = nn.Linear(config.hidden_size, self.num_kv_heads * self.attention_head_size,bias=False) #(576,192)
        self.o_proj = nn.Linear(config.hidden_size, config.hidden_size,bias=False)
        self.dropout = nn.Dropout(config.attention_dropout)
        self.register_buffer("bias", torch.tril(torch.ones(8192, 8192)).view(1, 1, 8192, 8192))
    
    def forward(self,x):
        B, S, _ = x.size()  #(B,S,HiddenSize) (Eg: (2,10,576) Batch=2,SeqLen=10,HiddenSize=576)
        H, K = self.num_attention_heads, self.num_kv_heads
        D = self.attention_head_size
        #print (B,S,H,D)
        q = self.q_proj(x).view(B, S, H, D).transpose(1, 2)  # (B, H, S, D)
        k = self.k_proj(x).view(B, S, K, D).transpose(1, 2)  # (B, K, S, D)
        v = self.v_proj(x).view(B, S, K, D).transpose(1, 2)  # (B, K, S, D)
        #Below one change the dimension from (2,10,576) to (2,10,15,64) and then transpose to (2,15,)
        
        if K < H:
            kv_repeat = H // K
            k = k.repeat_interleave(kv_repeat, dim=1)  # (B, H, S, D)
            v = v.repeat_interleave(kv_repeat, dim=1)  # (B, H, S, D)

        attention_scores = torch.matmul(q, k.transpose(-1, -2)) / (self.attention_head_size ** 0.5)
        attention_scores = attention_scores.masked_fill(self.bias[:, :, :S, :S] == 0, float("-inf"))
        attention_probs = F.softmax(attention_scores, dim=-1)
        attention_probs = self.dropout(attention_probs)

        context_layer = torch.matmul(attention_probs, v)
        context_layer = context_layer.transpose(1, 2).contiguous().view(B, S, H * D)

        return context_layer


In [11]:
attention = CausalLMAttention(config)
input_tensor = torch.randn(2, 10, config.hidden_size)
print (input_tensor.shape)
output = attention(input_tensor)
print (output.shape)


torch.Size([2, 10, 576])
torch.Size([2, 10, 576])


In [12]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math




class CausalLMAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.num_attention_heads = config.num_attention_heads
        self.head_dim = config.hidden_size // config.num_attention_heads
        self.all_head_size = self.num_attention_heads * self.head_dim

        self.q_proj = nn.Linear(config.hidden_size, self.all_head_size, bias=False)
        self.k_proj = nn.Linear(config.hidden_size, self.all_head_size, bias=False)
        self.v_proj = nn.Linear(config.hidden_size, self.all_head_size, bias=False)
        self.o_proj = nn.Linear(self.all_head_size, config.hidden_size, bias=False)

        #self.rotary_emb = RotaryEmbedding(dim=self.head_dim, base=config.rope_theta, max_position_embeddings=config.max_position_embeddings)
        self.dropout = nn.Dropout(config.attention_dropout)

        self.register_buffer("bias", torch.tril(torch.ones(config.max_position_embeddings, config.max_position_embeddings)).view(1, 1, config.max_position_embeddings, config.max_position_embeddings), persistent=False)

    def forward(self, x):
        bsz, seq_len, _ = x.size()

        q = self.q_proj(x).view(bsz, seq_len, self.num_attention_heads, self.head_dim).transpose(1, 2)  # (B, H, S, D)
        k = self.k_proj(x).view(bsz, seq_len, self.num_attention_heads, self.head_dim).transpose(1, 2)
        v = self.v_proj(x).view(bsz, seq_len, self.num_attention_heads, self.head_dim).transpose(1, 2)

        # Attention
        attn_scores = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(self.head_dim)  # (B, H, S, S)
        attn_scores = attn_scores.masked_fill(self.bias[:, :, :seq_len, :seq_len] == 0, float("-inf"))
        attn_probs = F.softmax(attn_scores, dim=-1)
        attn_probs = self.dropout(attn_probs)

        context = torch.matmul(attn_probs, v)  # (B, H, S, D)
        context = context.transpose(1, 2).contiguous().view(bsz, seq_len, self.all_head_size)

        output = self.o_proj(context)
        return output


In [13]:
attention = CausalLMAttention(config)
input_tensor = torch.randn(2, 10, config.hidden_size)
print (input_tensor.shape)
output = attention(input_tensor)
print (output.shape)
#assertEqual(output.shape, (2, 10, config.num_attention_heads * (config.hidden_size // config.num_attention_heads)))

torch.Size([2, 10, 576])
torch.Size([2, 10, 576])


In [14]:
class LlamaMLP(nn.Module):

    def __init__(self, config):
        super().__init__()
        self.gate_proj = nn.Linear(config.hidden_size, config.intermediate_size, bias=False)
        self.up_proj = nn.Linear(config.hidden_size, config.intermediate_size, bias=False)
        self.down_proj = nn.Linear(config.intermediate_size, config.hidden_size, bias=False)
        self.act_fn = nn.SiLU()

    def forward(self, x):
        gate = self.gate_proj(x)
        up = self.up_proj(x)
        hidden_states = self.act_fn(gate) * up
        hidden_states = self.down_proj(hidden_states)
        return hidden_states
    

    
    

In [15]:
class TestLlamaMLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.mlp = LlamaMLP(config)
        
    def forward(self, x):
        return self.mlp(x)

mlp = LlamaMLP(config)
input_tensor = torch.randn(2, 10, config.hidden_size)
print (input_tensor.shape)
output = mlp(input_tensor)  
print (output.shape)  # Should print the shape of the output tensor
print (output)


torch.Size([2, 10, 576])
torch.Size([2, 10, 576])
tensor([[[-1.1089e-01, -5.0312e-02,  6.5516e-04,  ..., -3.8605e-02,
           4.0738e-02, -7.1248e-02],
         [-1.0259e-01,  1.1967e-01,  3.9113e-02,  ..., -1.8470e-02,
           4.4099e-02,  1.5179e-01],
         [-8.5697e-02,  2.2259e-01,  1.1384e-01,  ..., -1.5216e-01,
           2.0092e-02,  1.6551e-01],
         ...,
         [ 1.3000e-01,  3.3360e-02, -4.3441e-03,  ..., -1.9729e-01,
           3.4625e-02, -1.3330e-01],
         [ 9.3133e-02, -3.0581e-02,  1.7063e-01,  ...,  1.6873e-01,
          -1.1176e-01, -6.2194e-02],
         [ 8.5999e-02, -7.2520e-03,  1.4242e-01,  ...,  1.2495e-01,
           1.8139e-01, -1.0540e-01]],

        [[ 2.4602e-02,  5.0409e-02,  8.1532e-03,  ..., -1.0132e-01,
           2.5985e-02, -5.1852e-03],
         [-1.5375e-01,  1.1229e-01, -2.5368e-02,  ..., -2.9468e-01,
           5.9173e-02,  3.9871e-02],
         [-1.0333e-04,  2.4436e-02, -2.0330e-01,  ..., -6.2836e-04,
           1.3342e-02,  1.

In [16]:

class LlamaDecoderLayer(nn.Module):
    """"""
    def __init__(self, config):
        super().__init__()
        self.attention = CausalLMAttention(config)
        self.mlp = LlamaMLP(config)
        self.norm1 = nn.LayerNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.norm2 = nn.LayerNorm(config.hidden_size, eps=config.rms_norm_eps)
    def forward(self, x):
        attn_output = self.attention(self.norm1(x))
        #print (self.norm1(x).shape,attn_output.shape,x.shape)
        x = x + attn_output
        mlp_output = self.mlp(self.norm2(x))
        x = x + mlp_output
        return x
 
decoder_layer = LlamaDecoderLayer(config)
input_tensor = torch.randn(2, 10, config.hidden_size)
print (input_tensor.shape)
output = decoder_layer(input_tensor)
print (output.shape)  # Should print the shape of the output tensor
print (output)

torch.Size([2, 10, 576])
torch.Size([2, 10, 576])
tensor([[[ 0.2847,  0.6319,  0.2329,  ...,  0.0667, -1.4657,  1.1215],
         [-0.6469,  1.3853,  1.6129,  ..., -0.6839, -0.3107,  0.4714],
         [-0.3615,  1.4723, -0.9383,  ..., -1.0452,  1.1087,  0.3238],
         ...,
         [ 1.2221,  0.3445, -0.6124,  ...,  0.3709, -1.1333, -1.0381],
         [ 0.2758,  0.0827, -0.2702,  ...,  0.0993,  0.2575,  0.3023],
         [ 0.1112, -0.6585,  0.6734,  ..., -2.2866, -0.3144, -0.1757]],

        [[-0.6015,  0.5314,  1.9509,  ...,  0.7730,  0.0490, -1.3500],
         [ 0.0369, -0.6855, -0.5591,  ...,  1.5819,  1.5275, -1.4021],
         [-0.9858,  0.9937, -0.2405,  ...,  0.5869, -0.8140,  0.5876],
         ...,
         [ 1.7669,  0.0436, -0.4536,  ..., -0.6836,  0.3049, -0.3023],
         [-1.8075, -0.1020,  0.0537,  ...,  0.3341, -1.7952,  0.4658],
         [ 1.1545,  0.2420, -1.1062,  ...,  0.9561, -0.5342,  2.0066]]],
       grad_fn=<AddBackward0>)


In [17]:
class LlamaDecoder(nn.Module):
    """ LlamaDecoder module for the Llama model.
    """
    def __init__(self, config):
        super().__init__()
        self.head_dim = config.hidden_size // config.num_attention_heads
        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size)
        self.layers = nn.ModuleList([LlamaDecoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.norm = nn.LayerNorm(config.hidden_size, eps=config.rms_norm_eps)        
        self.rotary_emb = RotaryEmbedding(config.hidden_size)
        self.config = config
    def forward(self, x):
        B, T = x.size()
        assert T <= self.config.max_position_embeddings, f"Cannot forward sequence of length {T}, block size is only {self.config.max_position_embeddings}"
        # forward the token and posisition embeddings
        pos = torch.arange(0, T, dtype=torch.long, device=config.device) # shape (T)
        tok_emb = self.embed_tokens(x) # token embeddings of shape (B, T, n_embd)
        x = tok_emb #+ pos_emb
        for layer in self.layers:
            x = layer(x)
        x = self.norm(x)
        x = self.rotary_emb.rotate_queries_or_keys(x)
        return x

decoder = LlamaDecoder(config)
input_tensor = torch.randint(0,49152,(2, 10))
print (input_tensor)
input_tensor = input_tensor.to(torch.long)
print (input_tensor)
output = decoder(input_tensor)
print (output.shape)  # Should print the shape of the output tensor
print (output)

tensor([[14927, 32118, 28275,  4132, 38682, 32321, 20814, 34845, 13881, 22592],
        [ 6902, 47651, 40095,  1074, 46998,  3829, 31049, 19573, 26302, 45389]])
tensor([[14927, 32118, 28275,  4132, 38682, 32321, 20814, 34845, 13881, 22592],
        [ 6902, 47651, 40095,  1074, 46998,  3829, 31049, 19573, 26302, 45389]])
torch.Size([2, 10, 576])
tensor([[[ 1.8976,  0.3304,  0.9388,  ...,  1.5390,  0.0916,  0.1486],
         [ 1.2567,  1.2446, -0.7670,  ...,  0.1287, -0.9730,  0.1379],
         [-0.1880,  1.9482,  0.4236,  ...,  1.1748,  0.3775, -0.1171],
         ...,
         [ 1.1041, -0.4231,  0.9535,  ...,  0.6886,  0.5546,  1.3367],
         [-0.2245,  0.4594,  0.8721,  ..., -0.2695,  0.0997,  1.5988],
         [ 0.1299,  0.5747,  0.9191,  ...,  0.3919,  0.5205, -1.0632]],

        [[ 0.5842, -0.5169,  0.8482,  ...,  2.0122, -0.3850,  1.6524],
         [ 1.5441,  0.8089,  1.6379,  ...,  0.0744,  0.4798,  0.8396],
         [-0.3542,  1.5570, -0.7681,  ...,  1.4156,  0.6676, -0.0556]

In [18]:
class LlamaForCausalLM(nn.Module):
    """ LlamaForCausalLM module for the Llama model.
    """
    def __init__(self, config):
        super().__init__()
        self.decoder = LlamaDecoder(config)
        self.config = config
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

    def forward(self, input_ids=None, labels=None):
        hidden_states = self.decoder(input_ids)
        logits = self.lm_head(hidden_states)
        outputs = {"logits": logits}
        loss = None
        if labels is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), labels.view(-1))
        return logits, loss

    def generate(self, input_ids, max_length=20):
        self.eval()
        with torch.no_grad():
            for _ in range(max_length):
                outputs = self.forward(input_ids)
                next_token = torch.argmax(outputs[0][:, -1, :], dim=-1)
                input_ids = torch.cat([input_ids, next_token.unsqueeze(-1)], dim=-1)
        return input_ids

class TestLlamaForCausalLM(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.model = LlamaForCausalLM(config)
        
    def forward(self, input_ids=None, labels=None):
        return self.model(input_ids, labels)
    

model = LlamaForCausalLM(config)
model = model.to(device)
input_tensor = torch.randint(0,49152,(2, 10))
print (input_tensor)
input_tensor = input_tensor.to(torch.long)
model = model.to(torch.float32)
print(input_tensor.shape)  
logits,loss = model(input_tensor,input_tensor)
print(logits.shape)  # Should print the shape of the logits tensor
print(loss)
print(tokenizer.decode(input_tensor[0]))
input_ids = model.generate(input_tensor, max_length=20)
print(input_ids)
print(tokenizer.decode(input_ids[0]))


tensor([[31447, 15302,  8555, 20932, 45924, 44282, 22771, 13163, 40173, 46096],
        [40877,  8941, 47071, 33536, 41507,  4486, 17802,  2556,  7814, 43266]])
torch.Size([2, 10])
torch.Size([2, 10, 49152])
tensor(11.1913, grad_fn=<NllLossBackward0>)
Mic financing Dou Niktrunctow transmitting PRchronicenters
tensor([[31447, 15302,  8555, 20932, 45924, 44282, 22771, 13163, 40173, 46096,
         46354, 16249, 24727, 26709,  6890, 42639, 26942, 38848,  2857, 41454,
         36231, 19169, 47439, 16512,  9972, 39999, 29108, 15537,  5980, 21076],
        [40877,  8941, 47071, 33536, 41507,  4486, 17802,  2556,  7814, 43266,
         26331, 25537,  8730, 26910, 15305, 24471,  5296, 44414,  8869,  9213,
         43290, 10142, 22393, 33649, 23573,  1279, 38954, 15559, 27768, 36842]])
Mic financing Dou Niktrunctow transmitting PRchronicenters desperation tragicCreat fp participation+\ grit fmt injlatentolysisilis� cervical Costowe boosts
      icing contagious


In [19]:
def saveCheckpoint(epoch,step,model,optimizer,loss):
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': loss,
    }, 'checkpoint.pth'+str(epoch)+str(step))

def loadCheckpoint(model,optimizer,path='checkpoint.pth0'):
    checkpoint = torch.load(path)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    epoch = checkpoint['epoch']
    loss = checkpoint['loss']
    return epoch,loss

In [None]:
def inference(model,prompt, max_length=50):
    model.eval()
    inputs = tokenizer.encode(prompt, return_tensors="pt").to(device)
    outputs = model.generate(inputs, max_length=max_length)
    print ("Input = ",tokenizer.decode(inputs[0]))
    print ("Output = ",tokenizer.decode(outputs[0]))
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


In [None]:

def train_model(model, inputs, labels, epochs=1, learning_rate=5e-5):
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
    model.train()
    for epoch in range(epochs):
        optimizer.zero_grad()
        #print (inputs.shape,labels.shape)
        logits,loss = model(inputs, labels=labels)
        loss.backward()
        optimizer.step()
        #print(f'step{i}, loss: {loss.item()}')
        print(f"Epoch {epoch+1}, Loss: {loss.item()}")

: 

In [None]:
#Create a Data Loader for training data
from torch.utils.data import DataLoader, Dataset    
class TextDataset(Dataset):
    def __init__(self, text, tokenizer, block_size=128):
        self.tokenizer = tokenizer
        self.block_size = block_size
        self.examples = []
        tokens = tokenizer.encode(text)
        for i in range(0, len(tokens) - block_size + 1, block_size):
            self.examples.append(tokens[i:i + block_size])
    
    def __len__(self):
        return len(self.examples)
    
    def __getitem__(self, idx):
        return torch.tensor(self.examples[idx], dtype=torch.long)
    
dataset = TextDataset(text, tokenizer, block_size=128)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

for epoch in range(2):
    step = 0
    for batch in dataloader:
        inputs = batch
        labels = batch.clone()
        train_model(model, inputs.to(device), labels.to(device), epochs=1, learning_rate=5e-5)  
        step += 1
        if step % 50 == 0:
            saveCheckpoint(epoch,step,model,torch.optim.AdamW(model.parameters(), lr=5e-5),loss.item())
            print(f"Checkpoint saved at epoch {epoch}, step {step}")
            inference(model,"Gravity is", max_length=50)
            inference(model,"The meaning of life is", max_length=50)
            inference(model,"Once upon a time", max_length=50)
saveCheckpoint(2,0,model,torch.optim.AdamW(model.parameters(), lr=5e-5),loss.item())
loadCheckpoint(model,torch.optim.AdamW(model.parameters(), lr=5e-5),'checkpoint.pth20')


Token indices sequence length is longer than the specified maximum sequence length for this model (341094 > 8192). Running this sequence through the model will result in indexing errors


Epoch 1, Loss: 10.946313858032227
Epoch 1, Loss: 10.460905075073242
Epoch 1, Loss: 10.045276641845703
Epoch 1, Loss: 9.67160701751709
Epoch 1, Loss: 9.852033615112305
Epoch 1, Loss: 9.15063190460205
Epoch 1, Loss: 8.992661476135254
Epoch 1, Loss: 8.475113868713379
Epoch 1, Loss: 8.938053131103516
Epoch 1, Loss: 8.432129859924316
Epoch 1, Loss: 8.705982208251953
Epoch 1, Loss: 8.345471382141113
Epoch 1, Loss: 8.268653869628906
Epoch 1, Loss: 8.193044662475586
Epoch 1, Loss: 7.912673473358154
Epoch 1, Loss: 7.4542236328125
Epoch 1, Loss: 7.805539131164551
Epoch 1, Loss: 7.682666778564453
Epoch 1, Loss: 7.520929336547852
Epoch 1, Loss: 7.702826499938965
Epoch 1, Loss: 7.208317756652832
Epoch 1, Loss: 7.33823299407959
Epoch 1, Loss: 7.3570733070373535
Epoch 1, Loss: 7.377012729644775
Epoch 1, Loss: 6.798823833465576
Epoch 1, Loss: 6.387131214141846
Epoch 1, Loss: 6.940001964569092
Epoch 1, Loss: 6.674792289733887
Epoch 1, Loss: 6.93224573135376
Epoch 1, Loss: 6.980488300323486
Epoch 1, Los

In [None]:
print("New model built from scratch ",model)

LlamaForCausalLM(
  (decoder): LlamaDecoder(
    (embed_tokens): Embedding(49152, 576)
    (layers): ModuleList(
      (0-29): 30 x LlamaDecoderLayer(
        (attention): CausalLMAttention(
          (q_proj): Linear(in_features=576, out_features=576, bias=False)
          (k_proj): Linear(in_features=576, out_features=576, bias=False)
          (v_proj): Linear(in_features=576, out_features=576, bias=False)
          (o_proj): Linear(in_features=576, out_features=576, bias=False)
          (dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=576, out_features=1536, bias=False)
          (up_proj): Linear(in_features=576, out_features=1536, bias=False)
          (down_proj): Linear(in_features=1536, out_features=576, bias=False)
          (act_fn): SiLU()
        )
        (norm1): LayerNorm((576,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((576,), eps=1e-05, elementwise_affine=True)
      )
    )
  

In [1]:

inference(model,"Gravity is", max_length=50)
inference(model,"The meaning of life is", max_length=50)
inference(model,"Once upon a time", max_length=50)

NameError: name 'inference' is not defined