In [None]:
import torch
print(torch.cuda.is_available())           # ✅ True attendu
print(torch.cuda.get_device_name(0))       # ✅ "GeForce GTX 1650 Ti"


In [1]:
###############################################################################
# 1) PRÉAMBULE ET IMPORTS
###############################################################################

# Pour affichage dans les notebooks Jupyter
%matplotlib inline

# Imports de base
import os
import torch
import numpy as np
import matplotlib.pyplot as plt
import random

# Imports des modules internes (à adapter selon l’organisation de ton projet)
from train import Arguments, train, train_m_models
from trainer_solution import train as train_loop_only
from checkpointing import get_extrema_performance_steps, get_all_checkpoints
from plotter import (
    plot_loss_accs, 
    plot_scaling_results,
    analyze_generalization,
    plot_scaling_results
)

# Infos PyTorch
print("PyTorch version :", torch.__version__)

# === Contrôle manuel du device ===
# Pour forcer l'utilisation du CPU, passe `use_gpu = False`
use_gpu = False
device = torch.device("cuda" if use_gpu and torch.cuda.is_available() else "cpu")
print("Device:", device)

# Contrôle du seed (optionnel, mais recommandé pour la reproductibilité)
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if device.type == "cuda":
    torch.cuda.manual_seed_all(seed)


PyTorch version : 2.4.1+cu121
Device: cpu


In [2]:
###############################################################################
# Exemple : Entraînement LSTM et GPT (2 seeds) avec paramètres plus régularisés
###############################################################################
import torch
from train import Arguments, train
from plotter import plot_loss_accs

def run_model_with_params(model_type, seed):
    """
    Prépare et lance un entraînement d'un modèle (LSTM ou GPT)
    avec des hyperparamètres 'optimisés'.
    """
    args = Arguments()

    # Paramètres "data"
    args.p = 31
    args.operator = "+"
    args.r_train = 0.5
    args.operation_orders = 2

    # Paramètres "training"
    args.train_batch_size = 512
    args.eval_batch_size   = 2**12
    args.num_workers = 0
    args.n_steps = 8000 + 1     # on réduit un peu pour éviter de trop pousser le surapprentissage
    args.eval_first = 100
    args.eval_period = 500
    args.print_step = 500
    args.save_model_step = 1000
    args.save_statistic_step = 1000

    # Modèle
    args.model = model_type
    args.num_layers = 2
    args.embedding_size = 128
    if model_type == "lstm":
        args.hidden_size = 128
    # Ajout de dropout
    args.dropout = 0.2

    # Optimiseur
    args.optimizer = "adamw"
    args.lr = 1e-3
    # Weight decay plus élevé pour davantage de régularisation
    args.weight_decay = 1e-2

    # Exp & device
    args.device = "cuda" if torch.cuda.is_available() else "cpu"
    args.exp_name = f"{model_type}_seed{seed}_betterReg"
    args.log_dir = "./logs_better"
    args.seed = seed
    args.verbose = True

    # Lance l'entraînement
    all_metrics, checkpoint_path = train(args)
    return all_metrics, checkpoint_path

###############################################################################
# Boucle : on entraîne LSTM puis GPT, chacun pour 2 seeds (0 et 42).
###############################################################################
all_runs = {}
for model_type in ["lstm", "gpt"]:
    for sd in [0, 42]:
        print(f"\n=== Training {model_type} (seed={sd}) ===\n")
        mets, ckpt = run_model_with_params(model_type, sd)
        all_runs[(model_type, sd)] = (mets, ckpt)

###############################################################################
# Résultats :
#  - On a des logs dans ./logs_better/<expName> 
#  - On a tracé un PDF/PNG (si paramétré dans train(...)) 
#  - On peut re-tracer ici manuellement si on veut, par ex. pour comparer tout ensemble
###############################################################################

# Par exemple, on refait un plot "manuellement" pour l'un de nos runs 
# (car train(...) appelle déjà plot_loss_accs(...) en interne).
from plotter import plot_loss_accs

# Supposez qu'on veut re-visualiser LSTM seed=0
lstm_seed0_metrics = all_runs[("lstm", 0)][0]
plot_loss_accs(
    lstm_seed0_metrics, 
    multiple_runs=False, 
    log_x=False, 
    log_y=False, 
    fileName="LSTM_seed0_betterReg", 
    filePath=None, # ou un chemin 
    show=True
)

print("\n== Fini : Modèles entraînés avec dropout=0.2, weight_decay=1e-2, 8000 steps ==")



=== Training lstm (seed=0) ===



RuntimeError: CUDA error: unknown error
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
###############################################################################
# 3) PARTIE 4.2 : Variation de r_train ∈ {0.1,...,0.9} avec des paramètres
#    ajustés pour limiter le surapprentissage.
###############################################################################

def run_rtrain_experiments(
    model_type="lstm", 
    r_train_values=None, 
    seeds=[0], 
    dropout=0.2, 
    weight_decay=1e-2, 
    n_steps=8000
):
    """
    Entraîne un modèle (LSTM ou GPT) pour plusieurs valeurs de r_train,
    en appliquant un dropout plus élevé, un weight_decay plus grand, 
    et un n_steps moins élevé que par défaut pour limiter le surapprentissage.

    Parameters
    ----------
    model_type : str
        "lstm" ou "gpt".
    r_train_values : list
        Les valeurs de r_train à tester (ex: [0.1, 0.2, ..., 0.9]).
    seeds : list
        Liste des seeds aléatoires.
    dropout : float
        Dropout appliqué au modèle.
    weight_decay : float
        Coefficient de L2 régularisation (AdamW).
    n_steps : int
        Nombre de pas d'entraînement (étapes).

    Returns
    -------
    results : dict
        Dictionnaire { (r_train, seed) : (all_metrics, checkpoint_path) }
        all_metrics contient les stats d'entraînement,
        checkpoint_path est le chemin de sauvegarde.
    """
    if r_train_values is None:
        r_train_values = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

    results = {}
    for rtr in r_train_values:
        for sd in seeds:
            args = Arguments()

            # Données
            args.p = 31
            args.operator = "+"
            args.r_train = rtr
            args.operation_orders = 2

            # Entraînement
            args.train_batch_size = 512
            args.eval_batch_size  = 4096
            args.n_steps = n_steps + 1  
            args.eval_first = 100
            args.eval_period = 500
            args.print_step = 500
            args.save_model_step = 1000
            args.save_statistic_step = 1000

            # Modèle
            args.model = model_type
            args.num_layers = 2
            args.embedding_size = 128
            if model_type=="lstm":
                args.hidden_size = 128
            args.dropout = dropout

            # Optimiseur
            args.optimizer = "adamw"
            args.lr = 1e-3
            args.weight_decay = weight_decay

            # Nom de l'exp
            args.exp_name = f"rtrain_{model_type}_r{rtr}_seed{sd}_drop{dropout}_wd{weight_decay}"
            args.log_dir   = "./logs_rtrain"
            args.seed      = sd
            args.device    = device
            args.verbose   = False  # Passez à True pour plus de logs

            # Lance l'entraînement
            mets, ckp = train(args)
            results[(rtr, sd)] = (mets, ckp)
    return results


################################
# EXEMPLE D'EXECUTION
################################
r_values = [0.1, 0.2, 0.3, 0.4, 0.5]
seeds    = [0, 42]

# On applique dropout=0.2, weight_decay=1e-2, n_steps=8000
res_lstm = run_rtrain_experiments(
    model_type="lstm",
    r_train_values=r_values,
    seeds=seeds,
    dropout=0.2,
    weight_decay=1e-2,
    n_steps=8000
)

res_gpt = run_rtrain_experiments(
    model_type="gpt",
    r_train_values=r_values,
    seeds=seeds,
    dropout=0.2,
    weight_decay=1e-2,
    n_steps=8000
)

print("=== ENTRAÎNEMENT TERMINÉ ===")


# Pour analyser et imprimer un petit résumé :
from checkpointing import get_extrema_performance_steps
print("=== RÉSULTATS LSTM ===")
for (rtr, sd), (mets, ckp) in res_lstm.items():
    ext = get_extrema_performance_steps(mets)
    print(f"r_train={rtr}, seed={sd} => {ckp}")
    print("   - min_train_loss =", ext["min_train_loss"], "at step=", ext["min_train_loss_step"])
    print("   - max_train_acc  =", ext["max_train_accuracy"], "at step=", ext["max_train_accuracy_step"])
    print("   - min_val_loss   =", ext["min_test_loss"], "at step=", ext["min_test_loss_step"])
    print("   - max_val_acc    =", ext["max_test_accuracy"], "at step=", ext["max_test_accuracy_step"])
    print("")

print("=== RÉSULTATS GPT ===")
for (rtr, sd), (mets, ckp) in res_gpt.items():
    ext = get_extrema_performance_steps(mets)
    print(f"r_train={rtr}, seed={sd} => {ckp}")
    print("   - min_train_loss =", ext["min_train_loss"], "at step=", ext["min_train_loss_step"])
    print("   - max_train_acc  =", ext["max_train_accuracy"], "at step=", ext["max_train_accuracy_step"])
    print("   - min_val_loss   =", ext["min_test_loss"], "at step=", ext["min_test_loss_step"])
    print("   - max_val_acc    =", ext["max_test_accuracy"], "at step=", ext["max_test_accuracy_step"])
    print("")


In [None]:
###############################################################################
# 4) PARTIE 4.3 : Mélange binaire/ternaire (p=11, operation_orders=[2,3])
#    => Diviser le dataset manuellement (moitié binaire, moitié ternaire)
#       ... puis entraîner LSTM et GPT ... 
###############################################################################

import torch
from data import get_arithmetic_dataset

def get_mixed_bin_tern_dataset(p=11, operator="+", r_train=0.5, seed=0):
    """
    Extrait un dataset combinant eq_position=3 (binaire) et eq_position=5 (ternaire).
    On veut un train avec ~50% binaire + 50% ternaire, 
    et un val set identique. 
    Cf. instructions de l'énoncé.
    """
    from train import Arguments
    args = Arguments()
    args.p = p
    args.operator = operator
    args.r_train = 1.0  # On prend tout, on split manuellement
    args.operation_orders = [2,3]
    args.seed = seed

    (full_dataset, _), tokenizer, maxlen, pad_idx = get_arithmetic_dataset(
        p, p, operator, 1.0, [2,3], seed=seed
    )
    # eq_positions => 3 (binaire) ou 5 (ternaire)
    idx_bin = [i for i in range(len(full_dataset)) if full_dataset[i][2]==3]
    idx_ter = [i for i in range(len(full_dataset)) if full_dataset[i][2]==5]
    bin_subset = torch.utils.data.Subset(full_dataset, idx_bin)
    ter_subset = torch.utils.data.Subset(full_dataset, idx_ter)

    # On fait un random split proportion r_train sur bin, r_train sur ter
    # => r_train correspond à la fraction en train 
    from torch.utils.data import random_split
    n_bin = len(bin_subset)
    train_bin_len = int(r_train * n_bin)
    val_bin_len   = n_bin - train_bin_len
    bin_train, bin_val = random_split(bin_subset, [train_bin_len, val_bin_len], generator=torch.Generator().manual_seed(seed))

    n_ter = len(ter_subset)
    train_ter_len = int(r_train * n_ter)
    val_ter_len   = n_ter - train_ter_len
    ter_train, ter_val = random_split(ter_subset, [train_ter_len, val_ter_len], generator=torch.Generator().manual_seed(seed))

    # Combine
    from torch.utils.data import ConcatDataset
    train_dataset = ConcatDataset([bin_train, ter_train])
    val_dataset   = ConcatDataset([bin_val,   ter_val])

    return train_dataset, val_dataset, tokenizer, maxlen, pad_idx


def train_mixed_bin_tern(p=11, operator="+", r_train=0.5, seed=0, model_type="lstm"):
    """
    Entraîne un seul modèle (LSTM ou GPT) sur le dataset mixte 
    binaire/ternaire, p=11, r_train=0.5.
    """
    from train import Arguments
    args = Arguments()
    args.p = p
    args.operator = operator
    args.r_train = r_train # mais on va l'ignorer pour data, on a un data custom
    args.operation_orders = [2,3]
    args.seed = seed
    args.device=device
    args.model=model_type
    args.num_layers=2
    args.embedding_size=64
    if model_type=="lstm":
        args.hidden_size=64
    args.n_steps=5000
    args.train_batch_size=128
    args.eval_batch_size=512
    args.log_dir="./logs_mixed"
    args.exp_name=f"mixed_{model_type}_p{p}_seed{seed}"
    args.verbose=True

    # On récupère un dataset custom:
    train_dataset, val_dataset, tokenizer, maxlen, pad_idx = get_mixed_bin_tern_dataset(
        p, operator, r_train, seed
    )
    # On construit manuellement DataLoader
    from torch.utils.data import DataLoader
    train_loader = DataLoader(train_dataset, batch_size=args.train_batch_size, shuffle=True)
    val_loader   = DataLoader(val_dataset,   batch_size=args.eval_batch_size,  shuffle=False)

    # On appelle la fonction train(...) du module trainer_solution, 
    # mais on doit nous-même créer le modèle, l'optimizer, etc. 
    # ou on essaie de réutiliser "train(args)" => alors on doit patcher
    # train.py pour qu'il accepte un "custom dataset" en paramètre.
    # Pour illustrer, on va faire "train(args)" et forcer la data 
    # (vous adapterez si besoin).
    
    # Trick: On modifie un petit peu "train.py" ou on fait un contournement
    # direct pour appeler "train_loop_only(...)"
    
    # 1) On crée le modèle (LSTM ou GPT).
    from lstm_solution import LSTMLM
    from gpt_solution import GPT
    vocabulary_size = len(tokenizer)
    if model_type=="lstm":
        model = LSTMLM(vocabulary_size, args.embedding_size, args.hidden_size, args.num_layers,
                       dropout=args.dropout, padding_index=pad_idx,
                       bias_lstm=True, bias_classifier=args.bias_classifier,
                       share_embeddings=args.share_embeddings)
    else:
        model = GPT(num_heads=4, num_layers=args.num_layers,
                    embedding_size=args.embedding_size, vocabulary_size=vocabulary_size,
                    sequence_length=maxlen,
                    multiplier=4, dropout=0.0, non_linearity="gelu",
                    padding_index=pad_idx, bias_attention=True, bias_classifier=args.bias_classifier,
                    share_embeddings=args.share_embeddings)

    model.to(device)

    # 2) Create optimizer
    import torch.optim as optim
    optimizer = optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-3)
    # Dummy scheduler
    from train import DummyScheduler
    scheduler = DummyScheduler(optimizer)

    # 3) Appel du "train_loop_only"
    from trainer_solution import train as train_loop
    # => train_loop(model, train_loader, train_loader_eval, test_loader, optimizer, scheduler, ...)

    checkpoint_path = os.path.join(args.log_dir, args.exp_name)
    os.makedirs(checkpoint_path, exist_ok=True)

    all_metrics = train_loop(
        model, 
        train_loader, 
        train_loader,  # for eval on train
        val_loader, 
        optimizer, scheduler, 
        device,
        args.exp_name, checkpoint_path,
        n_steps=args.n_steps,
        eval_first=100,
        eval_period=200,
        print_step=200,
        save_model_step=1000,
        save_statistic_step=1000,
        verbose=True
    )
    # On peut tracer
    from plotter import plot_loss_accs
    plot_loss_accs(all_metrics, multiple_runs=False, fileName=args.exp_name, filePath=checkpoint_path, show=True)
    
    return all_metrics, checkpoint_path


# EXEMPLE D'UTILISATION:
mets_lstm, ckp_lstm = train_mixed_bin_tern(model_type="lstm", seed=0)
mets_gpt,  ckp_gpt  = train_mixed_bin_tern(model_type="gpt",  seed=0)
# => Ensuite, vous regardez si binaire (eq_pos=3) s'apprend plus vite que ternaire (eq_pos=5).


In [None]:
###############################################################################
# 5) PARTIE 4.4 : MISE A L'ECHELLE DU MODELE (L, d) => (L, d) ∈ {1,2,3} × {26,27,28}
###############################################################################

def run_scaling_experiments(model_type="lstm", L_values=[1,2,3], d_values=[26,27,28], seed=0):
    """
    Lance 3×3 expériences, pour L ∈ {1,2,3} et d ∈ {26,27,28}, 
    tout le reste inchangé (p=31, r_train=0.5, etc.).
    """
    results = {}
    for L in L_values:
        for d in d_values:
            args = Arguments()
            args.p=31
            args.operator="+"
            args.r_train=0.5
            args.operation_orders=2
            args.train_batch_size=512
            args.eval_batch_size=4096
            args.model=model_type
            args.num_layers=L
            args.embedding_size=d
            if model_type=="lstm":
                args.hidden_size=d
            args.dropout=0.0
            args.optimizer="adamw"
            args.lr=1e-3
            args.weight_decay=1e-3
            args.n_steps=10_000+1
            args.eval_period=500
            args.print_step=500
            args.exp_name=f"scaling_{model_type}_L{L}_d{d}_seed{seed}"
            args.log_dir="./logs_scaling"
            args.seed=seed
            args.device=device
            args.verbose=False

            mets, ckp = train(args)
            results[(L,d)] = (mets, ckp)

    return results

# EXEMPLE:
res_lstm_scaling = run_scaling_experiments("lstm", [1,2,3], [26,27,28], seed=0)
res_gpt_scaling  = run_scaling_experiments("gpt",  [1,2,3], [26,27,28], seed=0)
# => On récupère ensuite le nombre de paramètres (hors embeddings) 
#    et on trace la performance vs. P. 


In [None]:
###############################################################################
# 6) PARTIE 4.5 : Variation batch_size B et T=2×10^4
###############################################################################

def run_batch_experiments(model_type="lstm", B_values=[25,26,27,28,29], seed=0):
    results = {}
    for B in B_values:
        args = Arguments()
        args.p=31
        args.operator="+"
        args.r_train=0.5
        args.operation_orders=2
        args.model=model_type
        args.n_steps=20000+1
        args.train_batch_size=B
        args.eval_batch_size=4096
        args.num_layers=2
        args.embedding_size=64
        if model_type=="lstm":
            args.hidden_size=64
        args.optimizer="adamw"
        args.lr=1e-3
        args.weight_decay=1e-3
        args.exp_name=f"batch_{model_type}_B{B}_seed{seed}"
        args.log_dir="./logs_batch"
        args.seed=seed
        args.device=device
        args.verbose=False

        mets, ckp = train(args)
        results[B] = (mets, ckp)
    return results

# EX. => run_batch_experiments("lstm", [32,64,128], seed=0)
# Puis on analyse la performance en coupant l'entraînement à alpha*T steps, etc.


In [None]:
###############################################################################
# 7) PARTIE 4.6 : Régularisation (weight_decay)
#    => Ex: T=4×10^4 steps, wd ∈ {0.25, 0.5, 0.75,1.0}, et on trace la norme ℓ2 ...
###############################################################################

def run_weight_decay_experiments(wd_values=[0.25, 0.5, 0.75, 1.0], seed=0):
    results={}
    for wd in wd_values:
        args=Arguments()
        args.p=31
        args.operator="+"
        args.r_train=0.5
        args.operation_orders=2
        args.model="lstm"
        args.n_steps=40000+1
        args.weight_decay=wd
        args.exp_name=f"wd_lstm_{wd}_seed{seed}"
        args.log_dir="./logs_wd"
        args.seed=seed
        args.device=device
        args.verbose=False

        mets, ckp = train(args)
        results[wd] = (mets, ckp)
    return results

# => On pourra extraire la norme ℓ2 des paramètres si on modifie "eval_model" 
#    pour retourner la norme.


In [None]:
###############################################################################
# 8) PARTIE 4.7 : Interprétabilité GPT (Poids d'attention)
###############################################################################

def visualize_gpt_attention(model, tokenizer, sequences):
    """
    Montre un exemple de fonction pour visualiser 
    la matrice d'attentions (B, num_layers, num_heads, seq_len, seq_len).

    'model' : GPT entraîné
    'tokenizer' : votre tokenizer
    'sequences' : liste de strings (ex: ["[BOS] 3 + 4 = ?"])
    """
    model.eval()

    # 1) Tokenize
    encoded_batch = []
    max_len = 0
    for seq in sequences:
        # Supposez tokenizer.encode(seq) => liste d'IDs
        # Vous adapterez selon votre tokenizer
        ids = tokenizer.encode(seq)  
        if len(ids)>max_len:
            max_len = len(ids)
        encoded_batch.append(ids)

    # Padding
    pad_id = tokenizer.pad_token_id if hasattr(tokenizer, "pad_token_id") else 0
    batch_l = []
    for ids in encoded_batch:
        pad_len = max_len - len(ids)
        new_ids = ids + [pad_id]*pad_len
        batch_l.append(new_ids)
    batch_tensor = torch.tensor(batch_l, dtype=torch.long, device=device)

    # 2) Forward => (logits, (hidden_states, attentions))
    with torch.no_grad():
        logits, (hidden_states, attentions) = model(batch_tensor)
        # attentions : (B, num_layers, num_heads, seq_len, seq_len)

    import matplotlib.pyplot as plt
    import seaborn as sns

    B = batch_tensor.size(0)
    num_layers = attentions.size(1)
    num_heads  = attentions.size(2)

    for i in range(B):
        # get the i-th item
        att_i = attentions[i]  # shape (num_layers, num_heads, seq_len, seq_len)
        # label tokens
        tokens_ids = batch_l[i]
        # decode if possible
        tokens_str = tokenizer.decode(tokens_ids)  # ou autre

        fig, axes = plt.subplots(num_layers, num_heads, figsize=(4*num_heads, 4*num_layers))
        fig.suptitle(f"Attention for sample {i}")

        for layer in range(num_layers):
            for head in range(num_heads):
                mat = att_i[layer, head].cpu().numpy()
                ax = axes[layer, head] if num_layers>1 else axes[head]  # si single layer

                sns.heatmap(mat, vmin=0, vmax=1, cmap="Blues", ax=ax)
                ax.set_title(f"Layer={layer}, Head={head}", fontsize=8)
                # label x,y si vous voulez
        plt.tight_layout()
        plt.show()


In [None]:

from checkpointing import get_all_checkpoints
my_ckpt_dir = "./logs_verif/verif_gpt_seed0"
all_models, all_mets = get_all_checkpoints(my_ckpt_dir, "verif_gpt_seed0", just_files=False)
# supposez all_models[-1] = (last_model, last_step)
gpt_model = all_models[-1][0]

# Choisissez 2 échantillons, ex:
seqs = ["BOS 3 + 5 = ???", "BOS 7 + 2 = ???"]
visualize_gpt_attention(gpt_model, tokenizer, seqs)
