In [1]:
import torch
from torch.utils.data import Dataset, DataLoader

# 1. Load the text file, trimming surrounding whitespace and lowercasing it
with open('exemple.txt', 'r', encoding='utf-8') as f:
    text = f.read().strip().lower()

# 2. Vocabulary: every distinct character of the text, in sorted order,
#    with lookup tables in both directions
chars = sorted(set(text))
vocab_size = len(chars)
stoi = dict(zip(chars, range(vocab_size)))  # character -> integer id
itos = dict(enumerate(chars))               # integer id -> character

# 3. Encoding: the whole text as a 1-D tensor of character ids
data = torch.tensor([stoi[ch] for ch in text], dtype=torch.long)
block_size = 64  # window length used by the dataset and by generation

# 4. Custom dataset
class CharDataset(Dataset):
    """Sliding-window dataset over an encoded sequence.

    Item ``i`` is the pair ``(x, y)`` where ``x`` is the window
    ``data[i : i + block_size]`` and ``y`` is the same window shifted one
    position to the right, i.e. the next-element target for every position
    of ``x``.

    Args:
        data: Indexable sequence supporting ``len()`` and slicing (a 1-D
            tensor of ids in this notebook).
        block_size: Number of elements in each window.
    """

    def __init__(self, data, block_size):
        self.data = data
        self.block_size = block_size

    def __len__(self):
        """Return the number of full (x, y) windows, never less than zero."""
        # A sequence shorter than block_size would give a negative value,
        # which makes len() raise ValueError; report an empty dataset instead.
        return max(0, len(self.data) - self.block_size)

    def __getitem__(self, idx):
        """Return the ``(x, y)`` window pair starting at position ``idx``.

        Negative indices count from the end, as for a list.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
        """
        n_items = len(self)
        position = idx + n_items if idx < 0 else idx
        # Slicing never fails on its own, so without this check an
        # out-of-range index would silently return truncated windows and
        # plain iteration over the dataset would never stop.
        if not 0 <= position < n_items:
            raise IndexError(
                f"index {idx} is out of range for a dataset of {n_items} items"
            )
        x = self.data[position : position + self.block_size]
        y = self.data[position + 1 : position + 1 + self.block_size]
        return x, y

# 5. Build the DataLoader (shuffled mini-batches of (x, y) windows)
batch_size = 4
dataset = CharDataset(data, block_size)
dataloader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True)

# 6. Usage example: pull a single batch and report its shapes
x, y = next(iter(dataloader))
print("Texte encodÃ©. Taille vocabulaire :", vocab_size)
for label, batch_tensor in (("x shape :", x), ("y shape :", y)):
    print(label, batch_tensor.shape)  # [batch_size, block_size]

Texte encodÃ©. Taille vocabulaire : 36
x shape : torch.Size([4, 64])
y shape : torch.Size([4, 64])


In [2]:
import torch.nn as nn
import torch.nn.functional as F

# Definition of a mini GPT model
class TinyGPT(nn.Module):
    """Minimal character model: a token embedding followed by a linear head.

    Each position is embedded and projected to vocabulary logits on its own;
    nothing in the forward pass mixes information between positions.

    Args:
        vocab_size: Number of distinct token ids (size of the embedding table
            and of the output logits).
        n_embed: Width of the embedding vectors.
    """

    def __init__(self, vocab_size, n_embed=64):
        super().__init__()
        # token id -> n_embed-dimensional vector
        self.embed = nn.Embedding(vocab_size, n_embed)
        # n_embed-dimensional vector -> one logit per vocabulary entry
        self.lm_head = nn.Linear(n_embed, vocab_size)

    def forward(self, idx):
        """Return logits with a trailing ``vocab_size`` axis appended to ``idx``'s shape."""
        return self.lm_head(self.embed(idx))

# Set up the model, the optimiser and the loss
model = TinyGPT(vocab_size)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
loss_fn = nn.CrossEntropyLoss()

# Persistent iterator over the dataloader, re-created whenever it runs out
data_iter = iter(dataloader)

# Simple training loop: 500 optimisation steps, one mini-batch per step
for step in range(500):
    batch = next(data_iter, None)
    if batch is None:
        # The current pass over the data is exhausted: start a new one
        data_iter = iter(dataloader)
        batch = next(data_iter)
    x, y = batch

    logits = model(x)
    B, T, C = logits.shape
    # Flatten batch and time so every position is one classification example
    loss = loss_fn(logits.view(B * T, C), y.view(B * T))

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Report the loss every 100 steps
    if step % 100 == 0:
        print(f"Ã‰tape {step} â€“ Perte : {loss.item():.4f}")

print("âœ… EntraÃ®nement terminÃ©.")

Ã‰tape 0 â€“ Perte : 3.7334
Ã‰tape 100 â€“ Perte : 2.5172
Ã‰tape 200 â€“ Perte : 2.2097
Ã‰tape 300 â€“ Perte : 2.1297
Ã‰tape 400 â€“ Perte : 2.0330
âœ… EntraÃ®nement terminÃ©.


In [3]:
@torch.no_grad()
def generate(model, prompt, max_new_tokens=100, temperature=1.0, top_k=None, top_p=None):
    """Sample a continuation of ``prompt`` from ``model``, one character at a time.

    Uses the notebook-level ``stoi`` / ``itos`` lookup tables to convert
    between characters and ids, and ``block_size`` to crop the context fed to
    the model at every step.

    Args:
        model: Module called on a ``(1, T)`` tensor of ids; the logits of the
            last position are used to sample the next id. The model is
            switched to eval mode and left in that mode, so call
            ``model.train()`` before resuming training.
        prompt: Seed text. Every character must be a key of ``stoi``.
        max_new_tokens: Number of characters to sample and append.
        temperature: Divisor applied to the logits before sampling; must be
            strictly positive.
        top_k: If given, only the ``top_k`` highest logits remain eligible.
            Values larger than the vocabulary are clamped to its size.
        top_p: If given, nucleus filtering: candidates are taken in order of
            decreasing probability until their cumulative probability exceeds
            ``top_p``; the rest are discarded. The best candidate is always kept.

    Returns:
        The prompt followed by the sampled characters, as a single string.

    Raises:
        KeyError: If ``prompt`` contains a character that is not in ``stoi``.
        ValueError: If ``temperature`` is not strictly positive, if ``top_k``
            is smaller than 1, or if ``prompt`` is empty while
            ``max_new_tokens`` is positive.
    """
    # Validate up front so bad arguments fail with a clear message instead of
    # surfacing later as NaN probabilities or an opaque indexing error.
    if temperature <= 0:
        raise ValueError(f"temperature must be > 0, got {temperature}")
    if top_k is not None and top_k < 1:
        raise ValueError(f"top_k must be >= 1, got {top_k}")
    if not prompt and max_new_tokens > 0:
        raise ValueError("prompt must contain at least one character")
    unknown = sorted({ch for ch in prompt if ch not in stoi})
    if unknown:
        raise KeyError(
            f"prompt contains characters missing from the vocabulary: {unknown}"
        )

    model.eval()
    idx = torch.tensor([stoi[ch] for ch in prompt], dtype=torch.long).unsqueeze(0)

    for _ in range(max_new_tokens):
        # Keep at most the last block_size ids as context
        idx_cond = idx[:, -block_size:]
        # Logits of the last position only, scaled by the temperature
        logits = model(idx_cond)[:, -1, :] / temperature

        # Top-k: keep the k largest logits, push everything else to -inf
        if top_k is not None:
            k = min(top_k, logits.size(-1))  # torch.topk rejects k > vocabulary size
            topk_vals, topk_idx = torch.topk(logits, k)
            logits = torch.full_like(logits, float('-inf')).scatter_(1, topk_idx, topk_vals)

        # Top-p (nucleus): drop the low-probability tail
        if top_p is not None:
            sorted_logits, sorted_indices = torch.sort(logits, descending=True)
            cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

            # Mark candidates past the threshold; shifting by one keeps the
            # candidate that crosses it, and the best one is never removed.
            sorted_mask = cumulative_probs > top_p
            sorted_mask[..., 1:] = sorted_mask[..., :-1].clone()
            sorted_mask[..., 0] = False

            # Map the mask from sorted order back to vocabulary order, row by row
            remove = torch.zeros_like(sorted_mask).scatter(1, sorted_indices, sorted_mask)
            logits = logits.masked_fill(remove, float('-inf'))

        probs = F.softmax(logits, dim=-1)
        next_id = torch.multinomial(probs, num_samples=1)
        idx = torch.cat([idx, next_id], dim=1)

    return ''.join(itos[i] for i in idx[0].tolist())

In [6]:
# Sample 200 characters from the trained model, starting from a short prompt
prompt = "un matin,"
generated_text = generate(
    model,
    prompt,
    max_new_tokens=200,
    temperature=0.1,
    top_k=20,
    top_p=0.9,
)

print("ðŸ”® Texte gÃ©nÃ©rÃ© :\n")
print(generated_text)

ðŸ”® Texte gÃ©nÃ©rÃ© :

un matin, le le mourit le me le le moure le le me le le le le mme le le le le le le le le le le me le moure le le le le le le le le le le le me le le mmit le le le le le le le le le le le mount le mourit le le
