In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
! cp /content/drive/MyDrive/session_12/shakespeare_drama.txt .

In [6]:
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer
from tqdm import tqdm
import os

In [7]:
# Define the SmolLM2-135M model (a simplified version of a Transformer)
class SmolLM(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_heads, num_layers, max_seq_len):
        super(SmolLM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.pos_embedding = nn.Parameter(torch.zeros(1, max_seq_len, embed_dim))
        self.layers = nn.ModuleList([
            nn.TransformerEncoderLayer(d_model=embed_dim, nhead=num_heads, batch_first=True)
            for _ in range(num_layers)
        ])
        self.fc_out = nn.Linear(embed_dim, vocab_size)

    def forward(self, x):
        seq_len = x.size(1)
        x = self.embedding(x) + self.pos_embedding[:, :seq_len, :]
        for layer in self.layers:
            x = layer(x)
        return self.fc_out(x)

    def parameter_count(self):
        """Calculates the number of trainable parameters in the model."""
        return sum(p.numel() for p in self.parameters() if p.requires_grad)

In [9]:
# Shakespeare dataset
class ShakespeareDataset(Dataset):
    def __init__(self, tokenizer, text, seq_len):
        self.tokens = tokenizer(text, return_tensors="pt", truncation=True, padding=True).input_ids[0]
        self.seq_len = seq_len

    def __len__(self):
        return len(self.tokens) - self.seq_len

    def __getitem__(self, idx):
        return (
            self.tokens[idx:idx + self.seq_len],
            self.tokens[idx + 1:idx + self.seq_len + 1]
        )

In [10]:
# Training parameters
embed_dim = 512
num_heads = 8
num_layers = 4
max_seq_len = 128
vocab_size = 50257
batch_size = 16
initial_steps = 5000
resume_steps = 50
eval_interval = 500

# Optimizations
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.set_float32_matmul_precision("high")

# Load tokenizer and dataset
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

text = open("shakespeare_drama.txt").read()  # Load Shakespeare text
train_dataset = ShakespeareDataset(tokenizer, text, max_seq_len)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Initialize model
model = SmolLM(vocab_size, embed_dim, num_heads, num_layers, max_seq_len).to(device)
optimizer = AdamW(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss()
model = torch.compile(model)  # Torch.compile for optimization

In [11]:
print(f"Model has {model.parameter_count():,} trainable parameters.")

Model has 64,188,497 trainable parameters.


In [12]:
print(model)

OptimizedModule(
  (_orig_mod): SmolLM(
    (embedding): Embedding(50257, 512)
    (layers): ModuleList(
      (0-3): 4 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (linear1): Linear(in_features=512, out_features=2048, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=2048, out_features=512, bias=True)
        (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
    (fc_out): Linear(in_features=512, out_features=50257, bias=True)
  )
)


In [3]:

# Training loop
def train_model(model, optimizer, criterion, dataloader, steps, eval_interval, checkpoint_path):
    model.train()
    progress = tqdm(total=steps, desc="Training")
    step = 0
    for epoch in range(steps // len(dataloader) + 1):
        for batch_idx, (x, y) in enumerate(dataloader):
            if step >= steps:
                break

            x, y = x.to(device), y.to(device)

            with torch.autocast(device_type="cuda", dtype=torch.float16):  # Autocast
                outputs = model(x)
                loss = criterion(outputs.view(-1, vocab_size), y.view(-1))

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            step += 1
            progress.update(1)

            if step % eval_interval == 0:
                print(f"Step {step}: Loss = {loss.item():.4f}")
                eval_model(model, tokenizer, device)

            if step >= steps:
                break

    # Save checkpoint
    torch.save(model.state_dict(), checkpoint_path)
    print(f"Checkpoint saved to {checkpoint_path}")

# Evaluation
@torch.no_grad()
def eval_model(model, tokenizer, device, prompt="To be, or not to be"):
    model.eval()
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
    outputs = model(input_ids)
    predictions = torch.argmax(outputs, dim=-1)
    decoded = tokenizer.decode(predictions[0], skip_special_tokens=True)
    print(f"Model utterance: {decoded}")

# Train for initial steps
checkpoint_path = "smollm_checkpoint.pth"
train_model(model, optimizer, criterion, train_loader, initial_steps, eval_interval, checkpoint_path)

# Load checkpoint and continue training
model.load_state_dict(torch.load(checkpoint_path))
print("Checkpoint loaded. Resuming training...")
train_model(model, optimizer, criterion, train_loader, resume_steps, eval_interval, "final_checkpoint.pth")

print("Training complete.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Training:  10%|█         | 500/5000 [01:07<05:06, 14.68it/s]

Step 500: Loss = 0.7291


Training:  10%|█         | 502/5000 [01:08<16:04,  4.66it/s]

Model utterance: , barren not
 arms give barren


Training:  20%|██        | 1002/5000 [01:44<04:52, 13.66it/s]

Step 1000: Loss = 0.4729
Model utterance: , barren not
 arms
 barren


Training:  30%|███       | 1502/5000 [02:21<04:16, 13.66it/s]

Step 1500: Loss = 0.2971
Model utterance: , partly must I arms tire partly


Training:  40%|████      | 2002/5000 [02:57<03:38, 13.73it/s]

Step 2000: Loss = 0.1934
Model utterance: , partly must I arms tire partly


Training:  50%|█████     | 2502/5000 [03:33<03:03, 13.58it/s]

Step 2500: Loss = 0.1538
Model utterance: , partly in I arms be partly


Training:  60%|██████    | 3002/5000 [04:10<02:26, 13.61it/s]

Step 3000: Loss = 0.1381
Model utterance: , partly not it be tire barren


Training:  70%|███████   | 3502/5000 [04:46<01:49, 13.67it/s]

Step 3500: Loss = 0.1283
Model utterance: , barren in it arms tire barren


Training:  80%|████████  | 4002/5000 [05:22<01:12, 13.69it/s]

Step 4000: Loss = 0.1039
Model utterance: , barren in it be tire barren


Training:  90%|█████████ | 4502/5000 [05:59<00:36, 13.66it/s]

Step 4500: Loss = 0.0938
Model utterance: , barren in it arms tire barren


Training: 100%|██████████| 5000/5000 [06:35<00:00, 13.68it/s]

Step 5000: Loss = 0.0958
Model utterance: , barren in it be be barren
Checkpoint saved to smollm_checkpoint.pth


  model.load_state_dict(torch.load(checkpoint_path))


Checkpoint loaded. Resuming training...



Training:   0%|          | 0/50 [00:00<?, ?it/s][A
Training:   4%|▍         | 2/50 [00:00<00:02, 18.14it/s][A
Training:   8%|▊         | 4/50 [00:00<00:03, 15.10it/s][A
Training:  12%|█▏        | 6/50 [00:00<00:03, 14.35it/s][A
Training:  16%|█▌        | 8/50 [00:00<00:02, 14.01it/s][A
Training:  20%|██        | 10/50 [00:00<00:02, 13.87it/s][A
Training:  24%|██▍       | 12/50 [00:00<00:02, 13.79it/s][A
Training:  28%|██▊       | 14/50 [00:00<00:02, 13.62it/s][A
Training:  32%|███▏      | 16/50 [00:01<00:02, 13.61it/s][A
Training:  36%|███▌      | 18/50 [00:01<00:02, 13.56it/s][A
Training:  40%|████      | 20/50 [00:01<00:02, 13.57it/s][A
Training:  44%|████▍     | 22/50 [00:01<00:02, 13.57it/s][A
Training:  48%|████▊     | 24/50 [00:01<00:01, 13.59it/s][A
Training:  52%|█████▏    | 26/50 [00:01<00:01, 13.52it/s][A
Training:  56%|█████▌    | 28/50 [00:02<00:01, 13.51it/s][A
Training:  60%|██████    | 30/50 [00:02<00:01, 13.52it/s][A
Training:  64%|██████▍   | 32/50 [00

Checkpoint saved to final_checkpoint.pth
Training complete.





AttributeError: 'SmolLM' object has no attribute 'parameter_count'

In [4]:
! cp final_checkpoint.pth /content/drive/MyDrive/session_12

Training: 100%|██████████| 5000/5000 [06:50<00:00, 13.68it/s]