#**GPT-0.5M** by **onemriganka**

Step 0: Install Dependencies

In [1]:
!pip install torch requests -q


Step 1: Base GPT (Character-Level)

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import requests
import time

device = "cuda" if torch.cuda.is_available() else "cpu"

# ----------------------------
# Load dataset from a URL
# ----------------------------
def load_text(url):
    text = requests.get(url).text.lower()
    print("Dataset length:", len(text))
    return text

# ----------------------------
# Build character-level vocab
# ----------------------------
def build_vocab(text):
    chars = sorted(list(set(text)))
    stoi = {ch:i for i,ch in enumerate(chars)}
    itos = {i:ch for i,ch in enumerate(chars)}
    vocab_size = len(chars)
    print("Vocab size:", vocab_size)
    return stoi, itos, vocab_size

# ----------------------------
# Encode / Decode
# ----------------------------
def encode(s, stoi): return [stoi[c] for c in s]
def decode(l, itos): return ''.join([itos[i] for i in l])

# ----------------------------
# Positional Encoding
# ----------------------------
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-torch.log(torch.tensor(10000.0)) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        return x + self.pe[:, :x.size(1), :]

# ----------------------------
# Transformer Block
# ----------------------------
class TransformerBlock(nn.Module):
    def __init__(self, embed_dim, num_heads, ff_dim):
        super().__init__()
        self.attn = nn.MultiheadAttention(embed_dim, num_heads, batch_first=True)
        self.ln1 = nn.LayerNorm(embed_dim)
        self.ff = nn.Sequential(
            nn.Linear(embed_dim, ff_dim),
            nn.ReLU(),
            nn.Linear(ff_dim, embed_dim)
        )
        self.ln2 = nn.LayerNorm(embed_dim)

    def forward(self, x, attn_mask=None):
        attn_output, _ = self.attn(x, x, x, attn_mask=attn_mask)
        x = self.ln1(x + attn_output)
        ff_output = self.ff(x)
        x = self.ln2(x + ff_output)
        return x

# ----------------------------
# baseGPT Model
# ----------------------------
class baseGPT(nn.Module):
    def __init__(self, vocab_size, embed_dim=128, num_heads=4, num_layers=4, ff_dim=256, block_size=128):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim)
        self.pos_enc = PositionalEncoding(embed_dim, block_size)
        self.layers = nn.ModuleList([
            TransformerBlock(embed_dim, num_heads, ff_dim) for _ in range(num_layers)
        ])
        self.fc_out = nn.Linear(embed_dim, vocab_size)
        self.block_size = block_size

    def forward(self, x):
        x = self.embed(x)
        x = self.pos_enc(x)
        seq_len = x.size(1)
        mask = torch.triu(torch.ones(seq_len, seq_len), diagonal=1).bool().to(x.device)
        for layer in self.layers:
            x = layer(x, attn_mask=mask)
        logits = self.fc_out(x)
        return logits


Step 2: Prepare Data & Batching

In [3]:
block_size = 128
batch_size = 32

def get_batch(data):
    ix = torch.randint(len(data)-block_size-1, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x.to(device), y.to(device)


Step 3: Train the Base Model

In [4]:
def train_model(text, steps=100000, lr=3e-4):
    stoi, itos, vocab_size = build_vocab(text)
    data = torch.tensor(encode(text, stoi), dtype=torch.long)

    model = baseGPT(vocab_size).to(device)
    optimizer = optim.AdamW(model.parameters(), lr=lr)
    loss_fn = nn.CrossEntropyLoss()

    for step in range(steps):
        xb, yb = get_batch(data)
        logits = model(xb)
        loss = loss_fn(logits.view(-1, vocab_size), yb.view(-1))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if step % 1000 == 0:
            print(f"Step {step} | Loss: {loss.item():.4f}")

    return model, stoi, itos


Step 4: Text Generation

In [5]:
def generate(model, stoi, itos, start="The", max_new_tokens=200):
    model.eval()
    idx = torch.tensor([stoi.get(c,0) for c in start], dtype=torch.long).unsqueeze(0).to(device)
    for _ in range(max_new_tokens):
        idx_cond = idx[:, -model.block_size:]
        logits = model(idx_cond)
        logits = logits[:, -1, :]
        probs = F.softmax(logits, dim=-1)
        next_id = torch.multinomial(probs, num_samples=1)
        idx = torch.cat([idx, next_id], dim=1)
    return ''.join([itos[i] for i in idx[0].tolist()])


Step 5: Run Everything

In [6]:
# Example: train on Alice in Wonderland
url = "https://www.gutenberg.org/files/11/11-0.txt"
text = load_text(url)

# Train base GPT
start_base = time.time()
model, stoi, itos = train_model(text, steps=50000)  # increase steps for better output
end_base = time.time()

# Generate text
print("\nGenerated Text:\n")
print(generate(model, stoi, itos, start="alice", max_new_tokens=5000))


Dataset length: 144696
Vocab size: 50
Step 0 | Loss: 4.0982
Step 1000 | Loss: 1.6565
Step 2000 | Loss: 1.3842
Step 3000 | Loss: 1.1767
Step 4000 | Loss: 1.0614
Step 5000 | Loss: 0.9461
Step 6000 | Loss: 0.8740
Step 7000 | Loss: 0.8208
Step 8000 | Loss: 0.7401
Step 9000 | Loss: 0.6286
Step 10000 | Loss: 0.5860
Step 11000 | Loss: 0.5376
Step 12000 | Loss: 0.4627
Step 13000 | Loss: 0.4181
Step 14000 | Loss: 0.3815
Step 15000 | Loss: 0.3369
Step 16000 | Loss: 0.2991
Step 17000 | Loss: 0.2659
Step 18000 | Loss: 0.2565
Step 19000 | Loss: 0.2409
Step 20000 | Loss: 0.2380
Step 21000 | Loss: 0.2367
Step 22000 | Loss: 0.2265
Step 23000 | Loss: 0.2051
Step 24000 | Loss: 0.2073
Step 25000 | Loss: 0.2064
Step 26000 | Loss: 0.2001
Step 27000 | Loss: 0.1881
Step 28000 | Loss: 0.2043
Step 29000 | Loss: 0.1919
Step 30000 | Loss: 0.1875
Step 31000 | Loss: 0.1784
Step 32000 | Loss: 0.1820
Step 33000 | Loss: 0.1756
Step 34000 | Loss: 0.1717
Step 35000 | Loss: 0.1797
Step 36000 | Loss: 0.1714
Step 37000 | 

Now we have a fully functional base character-level GPT.

we can paste any dataset link into url and it will train.

After training, it will generate text in the style of your dataset.

---

Step 6: Next Step → Fine-Tuning Q&A

we have the base model trained, we can fine-tune it on a Q&A dataset:

Prepare a Q&A text dataset.

Combine it with your base dataset (or train only on Q&A).

Use the same train_model function but with smaller steps and lower learning rate.

Generate answers with prompt "Q: <your question>\nA:".

Step 1: Prepare Q&A Dataset

we can make a small Q&A text file like this (or use your own):


In [8]:
qa_data = """
<Q> Who follows the White Rabbit into the rabbit hole?
<A> Alice follows the White Rabbit.

<Q> What does Alice fall into when she follows the White Rabbit?
<A> A deep rabbit hole that leads to Wonderland.

<Q> What is written on the bottle Alice drinks from?
<A> "Drink Me."

<Q> What happens when Alice drinks from the bottle labeled 'Drink Me'?
<A> She shrinks to a very small size.

<Q> What is written on the cake Alice eats?
<A> "Eat Me."

<Q> What happens when Alice eats the cake labeled 'Eat Me'?
<A> She grows very tall.

<Q> Who is always worried about being late?
<A> The White Rabbit.

<Q> What animal does Alice try to play croquet with?
<A> A flamingo as the mallet and hedgehogs as the balls.

<Q> Who often says "Off with their heads!"?
<A> The Queen of Hearts.

<Q> Who is the Queen of Hearts married to?
<A> The King of Hearts.

<Q> What creature gives Alice advice while smoking a hookah?
<A> The Caterpillar.

<Q> What does the Caterpillar tell Alice about the mushroom?
<A> One side makes her grow taller, and the other side makes her smaller.

<Q> Who smiles and disappears, leaving only its grin behind?
<A> The Cheshire Cat.

<Q> Where does Alice meet the Mad Hatter?
<A> At the Mad Tea Party.

<Q> Who are the guests at the Mad Tea Party?
<A> The Mad Hatter, the March Hare, and the Dormouse.

<Q> What day do the Mad Hatter and March Hare celebrate endlessly?
<A> Their unbirthday.

<Q> Who tries to put Alice on trial?
<A> The Queen of Hearts and her court.

<Q> What crime is Alice accused of during the trial?
<A> Stealing tarts.

<Q> Who testifies against Alice at the trial?
<A> The Mad Hatter and the Cook.

<Q> What finally wakes Alice from Wonderland?
<A> She wakes up from a dream.

"""


Combine Datasets to Build a Full Vocabulary

Before training/fine-tuning, combine the base dataset + Q&A dataset:

In [9]:
# Combine datasets
full_text = text.lower() + "\n" + qa_data.lower()

# Rebuild vocab
chars = sorted(list(set(full_text)))
stoi = {ch:i for i,ch in enumerate(chars)}
itos = {i:ch for i,ch in enumerate(chars)}
vocab_size = len(chars)
print("New vocab size:", vocab_size)

# Encode full text for training/fine-tuning
data_full = torch.tensor(encode(full_text, stoi), dtype=torch.long)
train_tokens = data_full  # you can use this for fine-tuning too


New vocab size: 54


Step 2: Encode Q&A Dataset

In [10]:
train_tokens = torch.tensor(encode(qa_data.lower(), stoi), dtype=torch.long)


Step 3: Fine-Tune Function

We’ll reuse the same training loop but with:

Lower learning rate (1e-4)

Fewer steps (500-1000) for demo

Smaller batch if dataset is tiny

In [11]:
def fine_tune_qa(model, data, steps=1000, lr=1e-4):
    optimizer = optim.AdamW(model.parameters(), lr=lr)
    loss_fn = nn.CrossEntropyLoss()

    for step in range(steps):
        ix = torch.randint(len(data)-block_size-1, (batch_size,))
        x = torch.stack([data[i:i+block_size] for i in ix])
        y = torch.stack([data[i+1:i+block_size+1] for i in ix])
        x, y = x.to(device), y.to(device)

        logits = model(x)
        loss = loss_fn(logits.view(-1, model.fc_out.out_features), y.view(-1))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if step % 1000 == 0:
            print(f"Fine-tuning step {step} | Loss: {loss.item():.4f}")


Step 4: Generate Answers

Same generate function as before:

In [12]:
def generate_qa(model, prompt, max_new_tokens=100):
    return generate(model, stoi, itos, start=prompt, max_new_tokens=max_new_tokens)


Step 5: Run Fine-Tuning and Test

In [13]:
# Fine-tune base model on Q&A
start_finetune = time.time()
fine_tune_qa(model, train_tokens, steps=10000, lr=1e-4)
end_finetune = time.time()

# Test Q&A
prompt1 = "Q: Who is the White Rabbit?\nA:"
prompt2 = "Q: What does Alice drink to shrink?\nA:"
prompt3 = "Q: What happens when Alice eats the cake labeled 'Eat Me'?\nA:"
prompt4 = "Q: Who often says 'Off with their heads!'?\nA:"
prompt5 = "Q: What advice does the Caterpillar give Alice?\nA:"

print("Answer 1:\n", generate_qa(model, prompt1))
print("Answer 2:\n", generate_qa(model, prompt2))
print("Answer 3:\n", generate_qa(model, prompt3))
print("Answer 4:\n", generate_qa(model, prompt4))
print("Answer 5:\n", generate_qa(model, prompt5))


Fine-tuning step 0 | Loss: 26.1549
Fine-tuning step 1000 | Loss: 0.0620
Fine-tuning step 2000 | Loss: 0.0439
Fine-tuning step 3000 | Loss: 0.0422
Fine-tuning step 4000 | Loss: 0.0382
Fine-tuning step 5000 | Loss: 0.0401
Fine-tuning step 6000 | Loss: 0.0381
Fine-tuning step 7000 | Loss: 0.0391
Fine-tuning step 8000 | Loss: 0.0378
Fine-tuning step 9000 | Loss: 0.0345
Answer 1:
 
: 
ho is the 
hite 
abbit?

:<a> the mad hatter and the cook.

<q> what finally wakes alice from wonderland?
<a> she wakes up fro
Answer 2:
 
: 
hat does 
lice drink to shrink?

:—

<q> what creature grows very tall.

<q> who is always worried about being late?
<a> the white rab
Answer 3:
 
: 
hat happens when 
lice eats the cake labeled '
at 
e'?

:—


<q> —


<q> where does alice meet the mad hatter?
<a> at the mad tea party.

<q> who are the gue
Answer 4:
 
: 
ho often says '
ff with their heads!'?

:<a> the queen of hearts married to?
<a> the king of hearts.

<q> what creature gives alice advice wh
Answer 5:


** Model training time and Model parameter count**

In [15]:
print("Base model training time on google colab T4 GPU:", round((end_base - start_base) / 60, 2), "Minutes")



print("Fine-tuning training time on google colab T4 GPU:", round((end_finetune - start_finetune) / 60, 2), "Minutes")

total = sum(p.numel() for p in model.parameters())
print(f"Total parameters: {total} = {round(total/1e6, 2)} Million")


Base model training time on google colab T4 GPU: 10.97 Minutes
Fine-tuning training time on google colab T4 GPU: 2.21 Minutes
Total parameters: 542770 = 0.54 Million


GPT-0.5M, we’ve built a minimal yet powerful foundation to understand how GPT-style models work — a starting point for further learning, experimentation, and innovation in modern AI.

#onemriganka