In [6]:
# Load and use PicoGPT (or custom GPT) in Jupyter Notebook

# Step 1: Imports and Config
import torch
import torch.nn as nn
import torch.nn.functional as F
import tiktoken

# Define model config (adapt to your model)
cfg = {
    "vocab_size": 50257,    # rows of the token embedding / width of the output logits
    "emb_dim": 256,         # embedding (hidden) size used throughout the model
    "context_length": 128,  # number of positions in the positional embedding and causal mask
    "n_heads": 4,           # attention heads per transformer block
    "n_layers": 4,          # number of stacked transformer blocks
    "drop_rate": 0.1,       # dropout probability passed to every nn.Dropout
    "qkv_bias": True,       # whether the query/key/value projections carry a bias term
}

# Step 2: Define GPTModel (must match training architecture)
class LayerNorm(nn.Module):
    """Layer normalization over the last dimension with a learnable scale and shift.

    Each vector along the last axis is normalized to zero mean and unit
    (biased) variance, then multiplied element-wise by ``scale`` and offset
    by ``shift``.

    Args:
        emb_dim: Size of the last dimension of the inputs; also the length of
            the ``scale`` and ``shift`` parameters.
    """

    def __init__(self, emb_dim):
        super().__init__()
        # Small constant added to the variance to avoid division by zero.
        self.eps = 1e-5
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        """Return ``x`` normalized along its last dimension, then scaled and shifted."""
        mean = x.mean(dim=-1, keepdim=True)
        # unbiased=False divides by N rather than N - 1.
        var = x.var(dim=-1, keepdim=True, unbiased=False)
        return self.scale * (x - mean) / torch.sqrt(var + self.eps) + self.shift


class GELU(nn.Module):
    """Tanh approximation of the GELU activation.

    Computes ``0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x**3)))``
    element-wise.
    """

    # sqrt(2 / pi) as a plain Python float, computed once at class creation.
    # This avoids allocating a new tensor on every forward call and keeps the
    # constant at full precision whatever the dtype of the input.
    _SQRT_2_OVER_PI = (2.0 / torch.pi) ** 0.5

    def forward(self, x):
        """Apply the activation element-wise; the output has the shape and dtype of ``x``."""
        inner = self._SQRT_2_OVER_PI * (x + 0.044715 * x ** 3)
        return 0.5 * x * (1 + torch.tanh(inner))


class FeedForward(nn.Module):
    """Two-layer MLP: expand to 4x ``emb_dim``, apply GELU, project back.

    Args:
        cfg: Mapping that provides ``"emb_dim"``.
    """

    def __init__(self, cfg):
        super().__init__()
        width = cfg["emb_dim"]
        hidden = 4 * width
        layers = [
            nn.Linear(width, hidden),
            GELU(),
            nn.Linear(hidden, width),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the expand / activate / project stack."""
        return self.net(x)


class MultiHeadAttention(nn.Module):
    """Causal multi-head self-attention.

    Args:
        d_in: Size of the last dimension of the input.
        d_out: Total output size; split evenly across the heads.
        context_length: Side length of the square causal mask, i.e. the
            longest sequence the layer can attend over.
        dropout: Dropout probability applied to the attention weights
            (active only in training mode).
        num_heads: Number of attention heads.
        qkv_bias: Whether the query/key/value projections have a bias.

    Raises:
        ValueError: If ``d_out`` is not divisible by ``num_heads``.
    """

    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()
        # Fail early with a clear message; otherwise the per-head reshape in
        # forward() fails later with an opaque size error.
        if d_out % num_heads != 0:
            raise ValueError(
                f"d_out ({d_out}) must be divisible by num_heads ({num_heads})"
            )
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads

        self.q = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.k = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.v = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.proj = nn.Linear(d_out, d_out)
        self.dropout = nn.Dropout(dropout)

        # Ones strictly above the diagonal mark the "future" positions that a
        # query must not attend to. Stored as a buffer so it follows the
        # module across devices and is saved in the state dict.
        self.register_buffer("mask", torch.triu(torch.ones(context_length, context_length), diagonal=1))

    def forward(self, x):
        """Attend over ``x`` of shape (B, T, d_in) and return a (B, T, d_out) tensor."""
        B, T, _ = x.shape
        # Project, then split the feature axis into heads: (B, num_heads, T, head_dim).
        q = self.q(x).view(B, T, self.num_heads, self.head_dim).transpose(1, 2)
        k = self.k(x).view(B, T, self.num_heads, self.head_dim).transpose(1, 2)
        v = self.v(x).view(B, T, self.num_heads, self.head_dim).transpose(1, 2)

        # Scaled dot-product scores, shape (B, num_heads, T, T).
        attn = (q @ k.transpose(-2, -1)) / (self.head_dim ** 0.5)
        # The (T, T) mask broadcasts over the batch and head axes.
        mask = self.mask[:T, :T].bool()
        attn = attn.masked_fill(mask, float("-inf"))
        attn = torch.softmax(attn, dim=-1)
        # Apply the configured dropout to the attention weights; this layer
        # was previously constructed but never used.
        attn = self.dropout(attn)
        # Merge the heads back into a single feature axis.
        out = (attn @ v).transpose(1, 2).contiguous().view(B, T, -1)
        return self.proj(out)


class TransformerBlock(nn.Module):
    """Pre-norm transformer block: attention and feed-forward, each with a residual.

    Args:
        cfg: Mapping that provides ``"emb_dim"``, ``"context_length"``,
            ``"drop_rate"``, ``"n_heads"`` and ``"qkv_bias"``.
    """

    def __init__(self, cfg):
        super().__init__()
        dim = cfg["emb_dim"]
        p_drop = cfg["drop_rate"]

        self.ln1 = LayerNorm(dim)
        self.ln2 = LayerNorm(dim)
        self.att = MultiHeadAttention(
            d_in=dim,
            d_out=dim,
            context_length=cfg["context_length"],
            dropout=p_drop,
            num_heads=cfg["n_heads"],
            qkv_bias=cfg["qkv_bias"],
        )
        self.ff = FeedForward(cfg)
        self.dropout = nn.Dropout(p_drop)

    def forward(self, x):
        """Apply both sub-layers to ``x``; the output has the same shape as the input."""
        # Attention sub-layer: normalize, attend, drop out, add the residual.
        attended = self.att(self.ln1(x))
        x = x + self.dropout(attended)
        # Feed-forward sub-layer, same pattern.
        transformed = self.ff(self.ln2(x))
        return x + self.dropout(transformed)


class GPTModel(nn.Module):
    """GPT-style decoder: token + position embeddings, transformer blocks, LM head.

    Args:
        cfg: Mapping that provides ``"vocab_size"``, ``"emb_dim"``,
            ``"context_length"``, ``"drop_rate"`` and ``"n_layers"``, plus
            whatever ``TransformerBlock`` reads from it.
    """

    def __init__(self, cfg):
        super().__init__()
        dim = cfg["emb_dim"]
        vocab = cfg["vocab_size"]

        self.token_embed = nn.Embedding(vocab, dim)
        self.pos_embed = nn.Embedding(cfg["context_length"], dim)
        self.dropout = nn.Dropout(cfg["drop_rate"])
        layers = [TransformerBlock(cfg) for _ in range(cfg["n_layers"])]
        self.blocks = nn.Sequential(*layers)
        self.ln_f = LayerNorm(dim)
        self.head = nn.Linear(dim, vocab, bias=False)

    def forward(self, x):
        """Map token ids of shape (B, T) to logits of shape (B, T, vocab_size)."""
        _, seq_len = x.size()
        # Position indices 0..T-1, created on the same device as the input ids.
        positions = torch.arange(seq_len, device=x.device)
        hidden = self.token_embed(x) + self.pos_embed(positions)
        hidden = self.blocks(self.dropout(hidden))
        return self.head(self.ln_f(hidden))

# Step 3: Build the model and restore its pretrained weights
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = GPTModel(cfg)
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source.
model.load_state_dict(torch.load("psychogpt.pt", map_location=device))
# Both .to() and .eval() return the module itself; eval() switches the dropout
# layers off for inference.
model = model.to(device).eval()

# Step 4: Load the GPT-2 tokenizer
tokenizer = tiktoken.get_encoding("gpt2")

# Step 5: Define generation function
def generate(model, tokenizer, prompt, max_new_tokens=50, temperature=1.0, stop_token="<|endoftext|>"):
    """Generate a continuation of ``prompt`` one token at a time.

    Args:
        model: Module mapping a (1, T) tensor of token ids to logits of shape
            (1, T, vocab). It is switched to eval mode.
        tokenizer: Object with ``encode(str) -> list[int]`` and
            ``decode(list[int]) -> str``.
        prompt: Text to continue; must encode to at least one token.
        max_new_tokens: Upper bound on the number of tokens to generate.
        temperature: Divisor applied to the logits before sampling. ``0``
            selects the most likely token at every step (greedy decoding).
        stop_token: Generation stops once a generated token decodes to
            exactly this string.

    Returns:
        The decoded prompt followed by the generated tokens (including the
        stop token when one was produced).

    Raises:
        ValueError: If ``temperature`` is negative or ``prompt`` encodes to
            no tokens.
    """
    if temperature < 0:
        raise ValueError(f"temperature must be >= 0, got {temperature}")

    tokens = tokenizer.encode(prompt)
    if not tokens:
        # Without at least one token there is no "last position" to read logits from.
        raise ValueError("prompt must encode to at least one token")

    model.eval()

    # Run on whichever device holds the model's weights, so the function works
    # for any model passed in; fall back to the notebook-level `device` only
    # for a model without parameters.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else device

    # Prefer the model's own positional-table size as the attention window;
    # fall back to the notebook-level `cfg` when the model does not expose one.
    context_length = getattr(getattr(model, "pos_embed", None), "num_embeddings", None)
    if context_length is None:
        context_length = cfg["context_length"]

    x = torch.tensor(tokens, dtype=torch.long, device=run_device).unsqueeze(0)

    with torch.no_grad():
        for _ in range(max_new_tokens):
            # Truncate to the most recent tokens if the sequence grows too long.
            x_cond = x[:, -context_length:]
            logits = model(x_cond)[:, -1, :]

            if temperature == 0:
                # Dividing by zero would produce inf/nan logits; decode greedily instead.
                next_token = torch.argmax(logits, dim=-1, keepdim=True)
            else:
                # Apply the temperature, then sample from the resulting distribution.
                probs = F.softmax(logits / temperature, dim=-1)
                next_token = torch.multinomial(probs, num_samples=1)

            x = torch.cat([x, next_token], dim=1)

            # Optional: stop once the end-of-text token has been produced.
            if tokenizer.decode([next_token.item()]) == stop_token:
                break

    return tokenizer.decode(x[0].tolist())

# Step 6: Generate and print a continuation of a sample prompt
prompt = "psychoanalysis is"
completion = generate(model, tokenizer, prompt, max_new_tokens=40, temperature=0.8)
print(completion)

the cat is fast.
they eats on the bed.
the teacher jumps quietly.
they sleeps milk.
the teacher drinks walks with me.
a bird sings at school.
my friend reads fast
