In [1]:
!pip install -U transformers datasets peft accelerate




In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, PeftModel, PeftConfig
from transformers import TrainingArguments, Trainer

# Choisis un modèle "Causal LM" (par exemple Bloom-560m)
MODEL_NAME = "bigscience/bloom-560m"

# Nom du dataset, ex. "Abirate/english_quotes" (ou autre si tu veux tester)
DATASET_NAME = "Abirate/english_quotes"


ModuleNotFoundError: No module named 'torch'

In [3]:
pip install torch


Collecting torch
  Downloading torch-2.6.0-cp313-none-macosx_11_0_arm64.whl.metadata (28 kB)
Collecting filelock (from torch)
  Downloading filelock-3.18.0-py3-none-any.whl.metadata (2.9 kB)
Collecting typing-extensions>=4.10.0 (from torch)
  Downloading typing_extensions-4.12.2-py3-none-any.whl.metadata (3.0 kB)
Collecting networkx (from torch)
  Downloading networkx-3.4.2-py3-none-any.whl.metadata (6.3 kB)
Collecting jinja2 (from torch)
  Downloading jinja2-3.1.6-py3-none-any.whl.metadata (2.9 kB)
Collecting fsspec (from torch)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting setuptools (from torch)
  Downloading setuptools-76.1.0-py3-none-any.whl.metadata (6.7 kB)
Collecting sympy==1.13.1 (from torch)
  Downloading sympy-1.13.1-py3-none-any.whl.metadata (12 kB)
Collecting mpmath<1.4,>=1.1.0 (from sympy==1.13.1->torch)
  Downloading mpmath-1.3.0-py3-none-any.whl.metadata (8.6 kB)
Collecting MarkupSafe>=2.0 (from jinja2->torch)
  Downloading MarkupSafe-3.0.2-

In [5]:
pip show torch


Name: torch
Version: 2.6.0
Summary: Tensors and Dynamic neural networks in Python with strong GPU acceleration
Home-page: https://pytorch.org/
Author: PyTorch Team
Author-email: packages@pytorch.org
License: BSD-3-Clause
Location: /Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages
Requires: filelock, fsspec, jinja2, networkx, setuptools, sympy, typing-extensions
Required-by: 
Note: you may need to restart the kernel to use updated packages.


In [7]:
pip install -U peft datasets transformers accelerate


Collecting peft
  Downloading peft-0.14.0-py3-none-any.whl.metadata (13 kB)
Collecting datasets
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting transformers
  Downloading transformers-4.49.0-py3-none-any.whl.metadata (44 kB)
Collecting accelerate
  Downloading accelerate-1.5.2-py3-none-any.whl.metadata (19 kB)
Collecting pyyaml (from peft)
  Downloading PyYAML-6.0.2-cp313-cp313-macosx_11_0_arm64.whl.metadata (2.1 kB)
Collecting tqdm (from peft)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting safetensors (from peft)
  Downloading safetensors-0.5.3-cp38-abi3-macosx_11_0_arm64.whl.metadata (3.8 kB)
Collecting huggingface-hub>=0.25.0 (from peft)
  Downloading huggingface_hub-0.29.3-py3-none-any.whl.metadata (13 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-19.0.1-cp313-cp313-macosx_12_0_arm64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
C

In [8]:
from datasets import load_dataset

# On charge 10% du training set
dataset = load_dataset("Abirate/english_quotes", split="train[:10%]")
print("Taille du dataset :", len(dataset))
print("Exemple :", dataset[0])


  from .autonotebook import tqdm as notebook_tqdm
Generating train split: 100%|██████████| 2508/2508 [00:00<00:00, 260675.88 examples/s]

Taille du dataset : 251
Exemple : {'quote': '“Be yourself; everyone else is already taken.”', 'author': 'Oscar Wilde', 'tags': ['be-yourself', 'gilbert-perreira', 'honesty', 'inspirational', 'misattributed-oscar-wilde', 'quote-investigator']}





In [9]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "bigscience/bloom-560m"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Charger le modèle pré-entraîné
foundation_model = AutoModelForCausalLM.from_pretrained(model_name)


In [10]:
def tokenize_function(examples):
    return tokenizer(
        examples["quote"], 
        truncation=True,
        max_length=128,
        padding="max_length"
    )

tokenized_dataset = dataset.map(tokenize_function, batched=True)


Map: 100%|██████████| 251/251 [00:00<00:00, 4539.78 examples/s]


In [11]:
tokenized_dataset = tokenized_dataset.remove_columns(["quote", "author"])
tokenized_dataset.set_format("torch")


In [12]:
from peft import LoraConfig, get_peft_model

lora_config = LoraConfig(
    r=8,                       # low-rank dimension
    lora_alpha=32,            # alpha (scaling factor)
    target_modules=["query_key_value"],  # couches Bloom à modifier
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

peft_model = get_peft_model(foundation_model, lora_config)
peft_model.print_trainable_parameters()


trainable params: 786,432 || all params: 560,001,024 || trainable%: 0.1404


In [15]:
from transformers import TrainingArguments, Trainer
import torch

def data_collator(batch):
    return {
        "input_ids": torch.stack([f["input_ids"] for f in batch]),
        "attention_mask": torch.stack([f["attention_mask"] for f in batch]),
        "labels": torch.stack([f["input_ids"] for f in batch]),
    }

training_args = TrainingArguments(
    output_dir="peft_lora_outputs",
    num_train_epochs=1,                  # Ajuste si tu veux plus d'époques
    per_device_train_batch_size=4,       # Ajuste selon ta RAM GPU
    gradient_accumulation_steps=1,
    logging_steps=10,
    save_steps=50,
    overwrite_output_dir=True,
    fp16=True,            # si tu as un GPU compatible
    evaluation_strategy="no", 
    # si tu veux évaluer, mets "epoch" ou "steps" et fournis un eval_dataset
    report_to="none"      # ou "tensorboard"
)

trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=None,   # pas de dataset de validation dans l'exemple
    data_collator=data_collator
)


ValueError: fp16 mixed precision requires a GPU (not 'mps').

In [16]:
def tokenize_function(examples):
    return tokenizer(
        examples["quote"],
        truncation=True,
        max_length=128,
        padding="max_length"   # <-- PADDING ICI
    )


In [17]:
from transformers import TrainingArguments, Trainer
import torch

def data_collator(batch):
    return {
        "input_ids": torch.stack([f["input_ids"] for f in batch]),
        "attention_mask": torch.stack([f["attention_mask"] for f in batch]),
        "labels": torch.stack([f["input_ids"] for f in batch]),
    }

training_args = TrainingArguments(
    output_dir="peft_lora_outputs",
    num_train_epochs=1,                  # Ajuste si tu veux plus d'époques
    per_device_train_batch_size=4,       # Ajuste selon ta RAM GPU
    gradient_accumulation_steps=1,
    logging_steps=10,
    save_steps=50,
    overwrite_output_dir=True,
    fp16=True,            # si tu as un GPU compatible
    evaluation_strategy="no", 
    # si tu veux évaluer, mets "epoch" ou "steps" et fournis un eval_dataset
    report_to="none"      # ou "tensorboard"
)

trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=None,   # pas de dataset de validation dans l'exemple
    data_collator=data_collator
)


ValueError: fp16 mixed precision requires a GPU (not 'mps').

In [18]:
def tokenize_function(examples):
    return tokenizer(
        examples["quote"],
        truncation=True,
        max_length=128,
        padding="max_length"  # <-- Le plus important
    )


In [19]:
tokenized_dataset = dataset.map(tokenize_function, batched=True)


Map: 100%|██████████| 251/251 [00:00<00:00, 4612.76 examples/s]


In [20]:
def data_collator(batch):
    return {
        "input_ids": torch.stack([f["input_ids"] for f in batch]),
        "attention_mask": torch.stack([f["attention_mask"] for f in batch]),
        "labels": torch.stack([f["input_ids"] for f in batch]),
    }


In [21]:
def tokenize_function(examples):
    return tokenizer(
        examples["quote"],
        truncation=True,
        max_length=128,
        padding="max_length"  # On force le padding ici
    )


In [22]:
tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset = tokenized_dataset.remove_columns(["quote", "author"])
tokenized_dataset.set_format("torch")


In [23]:
import torch

def data_collator(batch):
    return {
        "input_ids": torch.stack([f["input_ids"] for f in batch]),
        "attention_mask": torch.stack([f["attention_mask"] for f in batch]),
        "labels": torch.stack([f["input_ids"] for f in batch]),
    }


In [24]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./lora_bloom_checkpoint",
    overwrite_output_dir=True,
    num_train_epochs=1,            # Ajuste selon tes besoins
    per_device_train_batch_size=4, # Ajuste selon ta RAM
    logging_steps=50,
    save_steps=100,
    save_total_limit=1,
    evaluation_strategy="no",      # ou "epoch"/"steps" si tu as un eval_dataset
    fp16=True,                     # si tu as un GPU compatible
    report_to="none"
)

trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=tokenized_dataset,
    # eval_dataset=... (optionnel si tu as un dataset de validation)
    data_collator=data_collator
)


ValueError: fp16 mixed precision requires a GPU (not 'mps').