In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import tiktoken
import re

# Seed PyTorch's global RNG before anything random happens so that weight
# initialisation, DataLoader shuffling and multinomial sampling are repeatable
# under "Restart Kernel -> Run All".
SEED = 42
torch.manual_seed(SEED)

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Hyperparameters read by the model classes, the data loader and sampling.
cfg = {
    "vocab_size": 50257,    # rows of the token embedding / width of the output head
    "emb_dim": 256,         # embedding (model) width
    "context_length": 128,  # rows of the positional embedding; also the training window length
    "n_heads": 4,           # attention heads per block
    "n_layers": 4,          # number of stacked transformer blocks
    "drop_rate": 0.1,       # dropout probability used throughout the model
    "qkv_bias": True        # whether the query/key/value projections have a bias
}

# The attention layer splits emb_dim evenly across heads, so fail early on a
# configuration that cannot be split.
assert cfg["emb_dim"] % cfg["n_heads"] == 0, "emb_dim must be divisible by n_heads"

In [2]:
# Read the whole corpus in one go; the "utf-8-sig" codec drops a leading
# byte-order mark if the file has one.
with open("a_general_introduction_to_psychoanalysis.txt", encoding="utf-8-sig") as corpus_file:
    raw_text = corpus_file.read()

# Normalise the corpus: trim both ends, lowercase everything, then squeeze any
# whitespace-only gap between two newlines down to a single blank line.
text = re.sub(r'\n\s*\n', '\n\n', raw_text.strip().lower())

In [3]:
class GPTDatasetV1(Dataset):
    """Sliding-window next-token dataset built from a single text string.

    The text is tokenised once (an end-of-text marker is appended when the
    text does not already end with one) and cut into windows of
    ``max_length`` tokens whose start positions are ``stride`` tokens apart.
    Each sample is an ``(input, target)`` pair of 1-D tensors where the target
    is the input window shifted forward by one token. Only start positions
    that leave room for a full target window are used, so a text with no more
    than ``max_length`` tokens produces an empty dataset.

    Args:
        text: Raw training text.
        tokenizer: Object with ``encode(text, allowed_special=...)`` that
            returns a list of token ids.
        max_length: Number of tokens per window.
        stride: Distance in tokens between consecutive window starts.
    """

    def __init__(self, text, tokenizer, max_length, stride):
        corpus = text if text.endswith("<|endoftext|>") else text + " <|endoftext|>"
        ids = tokenizer.encode(corpus, allowed_special={"<|endoftext|>"})

        # Stop early enough that the one-token-shifted target never runs off the end.
        window_starts = range(0, len(ids) - max_length, stride)
        self.input_ids = [
            torch.tensor(ids[start:start + max_length]) for start in window_starts
        ]
        self.target_ids = [
            torch.tensor(ids[start + 1:start + max_length + 1]) for start in window_starts
        ]

    def __len__(self):
        """Number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``idx``-th (input, target) tensor pair."""
        sample = (self.input_ids[idx], self.target_ids[idx])
        return sample


def create_dataloader_v1(text, batch_size=16, max_length=128, stride=64):
    """Tokenise ``text`` with the GPT-2 encoding and wrap it in a shuffled DataLoader.

    Args:
        text: Raw training text.
        batch_size: Samples per batch.
        max_length: Tokens per training window.
        stride: Distance in tokens between consecutive window starts.

    Returns:
        A ``(loader, tokenizer)`` tuple: the DataLoader over ``GPTDatasetV1``
        windows and the tiktoken encoding that produced the token ids.
    """
    gpt2_encoding = tiktoken.get_encoding("gpt2")
    windows = GPTDatasetV1(text, gpt2_encoding, max_length, stride)

    def stack_pairs(samples):
        # Every sample is an (input, target) pair; batch each side on its own.
        inputs = torch.stack([pair[0] for pair in samples])
        targets = torch.stack([pair[1] for pair in samples])
        return inputs, targets

    # Pinned host memory is only requested when a CUDA device is available.
    use_pinned_memory = torch.cuda.is_available()
    batches = DataLoader(
        windows,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=stack_pairs,
        pin_memory=use_pinned_memory,
    )
    return batches, gpt2_encoding

In [4]:
class LayerNorm(nn.Module):
    """Layer normalisation over the last dimension with a learnable scale and shift.

    Each vector along the last axis is shifted to zero mean and scaled to unit
    (biased) variance, then multiplied element-wise by ``scale`` and offset by
    ``shift``.

    Args:
        emb_dim: Size of the last dimension of the inputs.
    """

    def __init__(self, emb_dim):
        super().__init__()
        # Added to the variance before the square root to avoid dividing by zero.
        self.eps = 1e-5
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        """Normalise ``x`` along its last dimension; the output has the same shape."""
        mean = x.mean(dim=-1, keepdim=True)
        # unbiased=False divides by N rather than N - 1.
        var = x.var(dim=-1, keepdim=True, unbiased=False)
        return self.scale * (x - mean) / torch.sqrt(var + self.eps) + self.shift


class GELU(nn.Module):
    """Element-wise GELU activation using the tanh approximation."""

    def forward(self, x):
        """Return ``0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))``."""
        # sqrt(2 / pi), built as a 0-dim tensor on every call.
        coeff = torch.sqrt(torch.tensor(2.0 / torch.pi))
        cubic = x + 0.044715 * x ** 3
        gate = 1 + torch.tanh(coeff * cubic)
        return 0.5 * x * gate


class FeedForward(nn.Module):
    """Two-layer MLP: expand to four times the embedding width, apply GELU, project back.

    Args:
        cfg: Mapping that provides ``"emb_dim"``.
    """

    def __init__(self, cfg):
        super().__init__()
        width = cfg["emb_dim"]
        hidden = 4 * width
        expand = nn.Linear(width, hidden)
        project = nn.Linear(hidden, width)
        self.net = nn.Sequential(expand, GELU(), project)

    def forward(self, x):
        """Apply the MLP to ``x``."""
        return self.net(x)


class MultiHeadAttention(nn.Module):
    """Causal multi-head self-attention.

    Inputs of shape ``(batch, tokens, d_in)`` are projected to queries, keys
    and values of width ``d_out``, split into ``num_heads`` heads, combined
    with scaled dot-product attention under a causal mask (a position can only
    attend to itself and earlier positions), and passed through a final linear
    projection. The output has shape ``(batch, tokens, d_out)``.

    Args:
        d_in: Width of the input vectors.
        d_out: Total width of the projections; must be divisible by ``num_heads``.
        context_length: Largest sequence length the causal mask covers.
        dropout: Dropout probability applied to the attention weights.
        num_heads: Number of attention heads.
        qkv_bias: Whether the query/key/value projections have a bias term.

    Raises:
        ValueError: If ``d_out`` is not divisible by ``num_heads``.
    """

    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()
        # Without this check the mismatch only shows up later as a reshape
        # error inside forward().
        if d_out % num_heads != 0:
            raise ValueError(f"d_out ({d_out}) must be divisible by num_heads ({num_heads})")
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads

        self.q = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.k = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.v = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.proj = nn.Linear(d_out, d_out)
        self.dropout = nn.Dropout(dropout)

        # Ones strictly above the diagonal mark the "future" positions to hide.
        # Registered as a buffer so it follows the module across devices.
        self.register_buffer("mask", torch.triu(torch.ones(context_length, context_length), diagonal=1))

    def forward(self, x):
        """Attend over ``x`` of shape ``(batch, tokens, d_in)``."""
        B, T, _ = x.shape
        # (B, T, d_out) -> (B, num_heads, T, head_dim)
        q = self.q(x).view(B, T, self.num_heads, self.head_dim).transpose(1, 2)
        k = self.k(x).view(B, T, self.num_heads, self.head_dim).transpose(1, 2)
        v = self.v(x).view(B, T, self.num_heads, self.head_dim).transpose(1, 2)

        attn = (q @ k.transpose(-2, -1)) / (self.head_dim ** 0.5)
        # The (T, T) mask broadcasts over the batch and head dimensions.
        causal = self.mask[:T, :T].bool()
        attn = attn.masked_fill(causal, float("-inf"))
        attn = torch.softmax(attn, dim=-1)
        # Apply the configured attention dropout; it was constructed but never
        # used before. It is an identity in eval mode.
        attn = self.dropout(attn)
        # (B, num_heads, T, head_dim) -> (B, T, d_out)
        out = (attn @ v).transpose(1, 2).contiguous().view(B, T, -1)
        return self.proj(out)


class TransformerBlock(nn.Module):
    """One transformer block: pre-normalised attention and feed-forward residual branches.

    Args:
        cfg: Mapping that provides ``"emb_dim"``, ``"context_length"``,
            ``"drop_rate"``, ``"n_heads"`` and ``"qkv_bias"``.
    """

    def __init__(self, cfg):
        super().__init__()
        width = cfg["emb_dim"]
        drop_rate = cfg["drop_rate"]

        self.ln1 = LayerNorm(width)
        self.ln2 = LayerNorm(width)
        self.att = MultiHeadAttention(
            d_in=width,
            d_out=width,
            context_length=cfg["context_length"],
            dropout=drop_rate,
            num_heads=cfg["n_heads"],
            qkv_bias=cfg["qkv_bias"],
        )
        self.ff = FeedForward(cfg)
        self.dropout = nn.Dropout(drop_rate)

    def forward(self, x):
        """Run both residual branches; the output has the same shape as ``x``."""
        # Branch 1: normalise, attend, drop out, add back onto the input.
        attended = self.att(self.ln1(x))
        x = x + self.dropout(attended)
        # Branch 2: normalise, feed-forward, drop out, add back.
        transformed = self.ff(self.ln2(x))
        return x + self.dropout(transformed)


class GPTModel(nn.Module):
    """GPT-style decoder: token + position embeddings, a stack of transformer blocks, and an output head.

    Args:
        cfg: Mapping that provides ``"vocab_size"``, ``"emb_dim"``,
            ``"context_length"``, ``"drop_rate"`` and ``"n_layers"``, plus
            whatever ``TransformerBlock`` reads from it.
    """

    def __init__(self, cfg):
        super().__init__()
        vocab_size = cfg["vocab_size"]
        width = cfg["emb_dim"]

        self.token_embed = nn.Embedding(vocab_size, width)
        self.pos_embed = nn.Embedding(cfg["context_length"], width)
        self.dropout = nn.Dropout(cfg["drop_rate"])
        layers = [TransformerBlock(cfg) for _ in range(cfg["n_layers"])]
        self.blocks = nn.Sequential(*layers)
        self.ln_f = LayerNorm(width)
        self.head = nn.Linear(width, vocab_size, bias=False)

    def forward(self, x):
        """Map token ids of shape ``(batch, tokens)`` to logits of shape ``(batch, tokens, vocab_size)``."""
        _, seq_len = x.size()
        # Position ids 0..seq_len-1, created on the same device as the input.
        positions = torch.arange(seq_len, device=x.device)
        hidden = self.token_embed(x) + self.pos_embed(positions)
        hidden = self.dropout(hidden)
        hidden = self.blocks(hidden)
        return self.head(self.ln_f(hidden))

In [5]:
# Build the model on the selected device and give AdamW all of its parameters.
model = GPTModel(cfg).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)

# Training windows are exactly as long as the model's context.
loader, tokenizer = create_dataloader_v1(
    text,
    batch_size=32,
    max_length=cfg["context_length"],
    stride=64,
)

model.train()
for epoch in range(20):
    progress = tqdm(loader)
    for inputs, targets in progress:
        inputs = inputs.to(device)
        targets = targets.to(device)

        # Next-token loss: fold the batch and position axes together so every
        # position counts as one classification example.
        logits = model(inputs)
        vocab_dim = logits.size(-1)
        loss = F.cross_entropy(logits.view(-1, vocab_dim), targets.view(-1))

        # Standard update: clear stale gradients, backpropagate, step.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Show the most recent batch loss on the progress bar.
        progress.set_description(f"Epoch {epoch+1} | Loss: {loss.item():.4f}")

Epoch 1 | Loss: 6.4160: 100%|████████████████████████████████████████████████████████| 106/106 [03:59<00:00,  2.26s/it]
Epoch 2 | Loss: 5.9216: 100%|████████████████████████████████████████████████████████| 106/106 [03:47<00:00,  2.15s/it]
Epoch 3 | Loss: 5.5305: 100%|████████████████████████████████████████████████████████| 106/106 [03:51<00:00,  2.18s/it]
Epoch 4 | Loss: 5.4178: 100%|████████████████████████████████████████████████████████| 106/106 [03:47<00:00,  2.14s/it]
Epoch 5 | Loss: 5.0905: 100%|████████████████████████████████████████████████████████| 106/106 [03:40<00:00,  2.08s/it]
Epoch 6 | Loss: 4.8296: 100%|████████████████████████████████████████████████████████| 106/106 [03:38<00:00,  2.06s/it]
Epoch 7 | Loss: 4.3045: 100%|████████████████████████████████████████████████████████| 106/106 [03:37<00:00,  2.05s/it]
Epoch 8 | Loss: 4.5775: 100%|████████████████████████████████████████████████████████| 106/106 [03:41<00:00,  2.09s/it]
Epoch 9 | Loss: 4.2599: 100%|███████████

In [7]:
def generate(model, tokenizer, prompt, max_new_tokens=50, temperature=1.0, stop_token="<|endoftext|>"):
    """Autoregressively sample a continuation of ``prompt``.

    The model is switched to eval mode and left there. Sampling runs on the
    device of the model's parameters, and the input is truncated to the size
    of ``model.pos_embed``; the notebook globals ``device`` and ``cfg`` are
    only consulted when the model exposes neither.

    Args:
        model: Module mapping ``(batch, tokens)`` ids to ``(batch, tokens, vocab)`` logits.
        tokenizer: Object with ``encode(str) -> list[int]`` and ``decode(list[int]) -> str``.
        prompt: Text to continue; must encode to at least one token.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Divides the logits before the softmax. ``0`` selects
            greedy (argmax) decoding instead of dividing by zero.
        stop_token: Generation stops once a sampled token decodes to exactly
            this string; that token is kept in the output.

    Returns:
        The decoded prompt followed by the generated continuation.

    Raises:
        ValueError: If ``prompt`` encodes to no tokens.
    """
    model.eval()

    # Prefer what the model itself reports over the notebook-level globals.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else device
    max_context = getattr(getattr(model, "pos_embed", None), "num_embeddings", None)
    if max_context is None:
        max_context = cfg["context_length"]

    tokens = tokenizer.encode(prompt)
    if not tokens:
        raise ValueError("prompt must encode to at least one token")
    x = torch.tensor(tokens, dtype=torch.long, device=run_device).unsqueeze(0)

    with torch.no_grad():
        for _ in range(max_new_tokens):
            # Truncate once the sequence outgrows the model's context window.
            x_cond = x[:, -max_context:]
            logits = model(x_cond)[:, -1, :]

            if temperature == 0:
                # Greedy decoding: the zero-temperature limit of sampling.
                next_token = torch.argmax(logits, dim=-1, keepdim=True)
            else:
                # Apply the temperature, then sample from the distribution.
                probs = F.softmax(logits / temperature, dim=-1)
                next_token = torch.multinomial(probs, num_samples=1)

            x = torch.cat([x, next_token], dim=1)

            # Optional early stop on the end-of-text token.
            if tokenizer.decode([next_token.item()]) == stop_token:
                break

    return tokenizer.decode(x[0].tolist())

In [8]:
# Assumes the model has been trained and that these names exist:
# - model (a GPTModel instance)
# - tokenizer (the tiktoken GPT-2 encoding)

prompt = "psychoanalysis is"
generated_text = generate(
    model,
    tokenizer,
    prompt,
    max_new_tokens=50,
    temperature=0.8,
)

# Header and sample on separate lines, exactly as two print calls would give.
print("=== Generated text ===", generated_text, sep="\n")

=== Generated text ===
psychoanalysis is enough that it is
was that it would be only in the dream. it is a dream should be a belief to
the dream as it necessary to us. it is obvious far that in the dream becomes seems
to the dream, and so


In [9]:
# Save only the model's state dict (parameters and buffers), not the module object.
trained_weights = model.state_dict()
torch.save(trained_weights, "psycogpt.pt")