In [3]:
import torch
from torch.utils.data import Dataset, DataLoader

# 1. Read the raw training corpus from disk
with open('exemple.txt', 'r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

# Normalise: trim surrounding whitespace and fold everything to lower case,
# which keeps the character vocabulary small.
text = text.strip().lower()

# 2. Character-level vocabulary: every distinct character, in sorted order
chars = sorted(set(text))
vocab_size = len(chars)
stoi = dict(zip(chars, range(vocab_size)))  # character -> integer id
print("stoi :",stoi)
itos = dict(enumerate(chars))               # integer id -> character
print("itos :",itos)

# 3. Encode the whole corpus as a single 1-D tensor of integer ids
data = torch.tensor(list(map(stoi.__getitem__, text)), dtype=torch.long)
print(data)

# Number of tokens per training window
block_size = 64

# 4. Dataset personnalisé
class CharDataset(Dataset):
    """Sliding-window dataset for next-token prediction over a 1-D sequence.

    Item ``i`` is the pair ``(x, y)`` where ``x = data[i : i + block_size]``
    and ``y`` is the same window shifted one position to the right, so
    ``y[t]`` is the token that follows ``x[t]`` in the sequence.

    Args:
        data: 1-D sequence supporting ``len()`` and slicing (in this notebook,
            a tensor of token ids).
        block_size: number of tokens in each window.
    """

    def __init__(self, data, block_size):
        self.data = data
        self.block_size = block_size

    def __len__(self):
        # Each sample needs block_size + 1 tokens (input window plus the one
        # extra target token). Clamp at zero: a negative value would make
        # len() itself raise when the sequence is shorter than block_size.
        return max(0, len(self.data) - self.block_size)

    def __getitem__(self, idx):
        """Return the ``(x, y)`` window pair starting at position ``idx``.

        Negative indices count from the end, as for built-in sequences.

        Raises:
            IndexError: if ``idx`` is outside ``[-len(self), len(self))``.
        """
        n = len(self)
        if idx < 0:
            idx += n
        # Raising IndexError is what lets plain iteration over the dataset
        # stop; slicing alone would silently hand back truncated or empty
        # windows forever.
        if not 0 <= idx < n:
            raise IndexError(f"index out of range for CharDataset of length {n}")
        x = self.data[idx : idx + self.block_size]
        y = self.data[idx + 1 : idx + 1 + self.block_size]
        return x, y

# 5. Build the DataLoader
# Seed the global RNG so the shuffle order (and any weight initialisation
# performed afterwards when cells run top to bottom) is identical on every run.
SEED = 42
torch.manual_seed(SEED)

# Every sample needs block_size input tokens plus one extra target token;
# fail early with a clear message instead of an obscure sampler error.
if len(data) <= block_size:
    raise ValueError(
        f"corpus has {len(data)} tokens but at least {block_size + 1} are "
        f"required for block_size={block_size}"
    )

batch_size = 4
dataset = CharDataset(data, block_size)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# 6. Usage example: pull one batch and inspect its shapes
x, y = next(iter(dataloader))
print("Texte encodé. Taille vocabulaire :", vocab_size)
print("x shape :", x.shape)  # [batch_size, block_size]
print("y shape :", y.shape)

stoi : {'\n': 0, ' ': 1, ',': 2, '-': 3, '.': 4, ':': 5, '?': 6, 'a': 7, 'b': 8, 'c': 9, 'd': 10, 'e': 11, 'f': 12, 'g': 13, 'h': 14, 'i': 15, 'j': 16, 'l': 17, 'm': 18, 'n': 19, 'o': 20, 'p': 21, 'q': 22, 'r': 23, 's': 24, 't': 25, 'u': 26, 'v': 27, 'x': 28, 'y': 29, 'à': 30, 'è': 31, 'é': 32, 'ê': 33, 'î': 34, '’': 35}
itos : {0: '\n', 1: ' ', 2: ',', 3: '-', 4: '.', 5: ':', 6: '?', 7: 'a', 8: 'b', 9: 'c', 10: 'd', 11: 'e', 12: 'f', 13: 'g', 14: 'h', 15: 'i', 16: 'j', 17: 'l', 18: 'm', 19: 'n', 20: 'o', 21: 'p', 22: 'q', 23: 'r', 24: 's', 25: 't', 26: 'u', 27: 'v', 28: 'x', 29: 'y', 30: 'à', 31: 'è', 32: 'é', 33: 'ê', 34: 'î', 35: '’'}
tensor([25, 15, 25, 23, 11,  1,  5,  1, 17, 11,  1,  9, 14,  7, 25,  1,  9, 26,
        23, 15, 11, 26, 28,  0,  0, 26, 19,  1, 16, 20, 26, 23,  2,  1, 10,  7,
        19, 24,  1, 26, 19,  1, 21, 11, 25, 15, 25,  1, 27, 15, 17, 17,  7, 13,
        11,  1, 11, 19, 25, 20, 26, 23, 32,  1, 10, 11,  1,  9, 20, 17, 17, 15,
        19, 11, 24,  2,  1, 27, 15

In [2]:
import torch.nn as nn
import torch.nn.functional as F

# Définition d’un mini-modèle GPT
class TinyGPT(nn.Module):
    """Minimal character-level language model: an embedding plus a linear head.

    Each token id is looked up in an ``n_embed``-dimensional embedding table
    and the resulting vector is projected to ``vocab_size`` logits. Positions
    are processed independently of one another.

    Args:
        vocab_size: number of distinct token ids.
        n_embed: width of the embedding vectors.
    """

    def __init__(self, vocab_size, n_embed=64):
        super().__init__()
        # Attribute names and creation order are part of the contract: they
        # determine the state_dict keys and the order in which the RNG is
        # consumed during parameter initialisation.
        self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=n_embed)
        self.lm_head = nn.Linear(in_features=n_embed, out_features=vocab_size)

    def forward(self, idx):
        """Map integer token ids ``idx`` to logits with a trailing ``vocab_size`` axis."""
        return self.lm_head(self.embed(idx))

# Model, optimiser and loss
model = TinyGPT(vocab_size)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
loss_fn = nn.CrossEntropyLoss()

# Keep one iterator alive across steps so that consecutive steps consume
# consecutive batches of the same pass over the data.
data_iter = iter(dataloader)

# Simple training loop
for step in range(500):
    batch = next(data_iter, None)
    if batch is None:
        # The current pass over the data is exhausted: start a new one.
        data_iter = iter(dataloader)
        batch = next(data_iter)
    inputs, targets = batch

    logits = model(inputs)
    # Flatten the batch and time axes so every position counts as one
    # classification example for the cross-entropy loss.
    n_classes = logits.size(-1)
    loss = loss_fn(logits.view(-1, n_classes), targets.view(-1))

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if step % 100 == 0:
        print(f"Étape {step} – Perte : {loss.item():.4f}")

print("✅ Entraînement terminé.")

Étape 0 – Perte : 3.7334
Étape 100 – Perte : 2.5172
Étape 200 – Perte : 2.2097
Étape 300 – Perte : 2.1297
Étape 400 – Perte : 2.0330
✅ Entraînement terminé.


In [3]:
@torch.no_grad()
def generate(model, prompt, max_new_tokens=100, temperature=1.0, top_k=None, top_p=None):
    """Autoregressively sample characters from ``model``, starting from ``prompt``.

    Relies on the notebook-level ``stoi`` / ``itos`` mappings and on
    ``block_size`` (maximum context length fed to the model).

    Args:
        model: module mapping a ``(1, T)`` tensor of token ids to logits of
            shape ``(1, T, vocab)``.
        prompt: non-empty seed string; every character must be a key of ``stoi``.
        max_new_tokens: number of characters to append to the prompt.
        temperature: positive divisor applied to the logits; values below 1
            sharpen the distribution, values above 1 flatten it.
        top_k: if given, only the ``top_k`` highest logits stay eligible.
            Values larger than the vocabulary are clamped to its size.
        top_p: if given, nucleus filtering: keep the smallest set of most
            likely tokens whose cumulative probability exceeds ``top_p``
            (always at least one token).

    Returns:
        The prompt followed by the generated characters, as one string.

    Raises:
        ValueError: if ``temperature`` is not strictly positive or ``top_k``
            is smaller than 1.
        KeyError: if ``prompt`` contains a character missing from ``stoi``.
    """
    if temperature <= 0:
        raise ValueError(f"temperature must be > 0, got {temperature}")
    if top_k is not None and top_k < 1:
        raise ValueError(f"top_k must be >= 1, got {top_k}")

    # Remember the current mode so that sampling in the middle of training
    # does not leave the model stuck in eval mode afterwards.
    was_training = model.training
    model.eval()
    try:
        idx = torch.tensor([stoi[c] for c in prompt], dtype=torch.long).unsqueeze(0)

        for _ in range(max_new_tokens):
            # Feed at most the last block_size tokens as context.
            idx_cond = idx[:, -block_size:]
            logits = model(idx_cond)
            # Only the prediction at the last position is needed.
            logits = logits[:, -1, :] / temperature

            # Top-k: everything outside the k best logits becomes -inf.
            if top_k is not None:
                # torch.topk raises if k exceeds the size of the axis.
                k = min(top_k, logits.size(-1))
                topk_vals, topk_idx = torch.topk(logits, k)
                logits_filtered = torch.full_like(logits, float('-inf'))
                logits_filtered.scatter_(1, topk_idx, topk_vals)
                logits = logits_filtered

            # Top-p (nucleus)
            if top_p is not None:
                sorted_logits, sorted_indices = torch.sort(logits, descending=True)
                cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

                # Mark tokens past the top_p threshold, shifted right by one
                # so the token that crosses the threshold is itself kept.
                sorted_mask = cumulative_probs > top_p
                sorted_mask[..., 1:] = sorted_mask[..., :-1].clone()
                sorted_mask[..., 0] = False  # always keep the most likely token

                # Map the mask from sorted order back to vocabulary order,
                # for every row of the batch rather than row 0 only.
                remove = torch.zeros_like(sorted_mask).scatter(1, sorted_indices, sorted_mask)
                logits = logits.masked_fill(remove, float('-inf'))

            probs = F.softmax(logits, dim=-1)
            next_id = torch.multinomial(probs, num_samples=1)
            idx = torch.cat([idx, next_id], dim=1)
    finally:
        model.train(was_training)

    return ''.join(itos[i] for i in idx[0].tolist())

In [10]:
# Sample a continuation of the prompt, combining temperature scaling with
# top-k and nucleus (top-p) filtering.
prompt = "un matin,"
generated_text = generate(
    model,
    prompt,
    max_new_tokens=200,
    temperature=0.8,
    top_k=20,
    top_p=0.9,
)

# Header, blank line, then the generated text.
print("🔮 Texte généré :\n", generated_text, sep="\n")

🔮 Texte généré :

un matin, l’ait le pa l’ailet llle mererun fletaqun, choiese pes dare dese à ur. mmat qugerier. pait mis lesat l bo uvan chailant de mauge llqun mouromer. mitét mit, à ese lant les, borounta omomouge boses, ll
