In [1]:
# --- Model -------------------------------------------------------------------
# Checkpoint identifier handed to `from_pretrained` in the model-loading cell.
BASE_MODEL = "meta-llama/Llama-3.2-1B"
# Sub-directory name appended to "../models/" to form the Trainer output path.
OUTPUT_DIR = "llama-1b-wikipedia"

# --- Prompting / tokenization ------------------------------------------------
# Passed to `generate_template` as `system_message`.
SYSTEM_PROMPT = (
    "You are a corrector of French texts. "
    "Correct the text without explaining."
)
# Passed to `generate_template` as `max_length`.
MAX_LENGTH = 1024

# --- Training schedule -------------------------------------------------------
# Forwarded to `TrainingArguments(num_train_epochs=...)`.
EPOCHS = 2
# If >0, will only train for this many steps (ignores EPOCHS).
MAX_STEPS = 0

In [2]:
import sys
sys.path.append("..")
from datasets import load_dataset

def _is_short_article(example):
    # Filter out long articles (too slow to process & not useful for our purpose).
    return len(example['text']) <= 2500


# Load the French Wikipedia dump, shuffle it with a fixed seed so the sample is
# repeatable, keep the first 2000 articles, then drop the long ones.
dataset = (
    load_dataset("wikipedia", "20220301.fr", split="train")
    .shuffle(seed=50)
    .select(range(2000))
    .filter(_is_short_article)
)

dataset

  from .autonotebook import tqdm as notebook_tqdm


Dataset({
    features: ['id', 'url', 'title', 'text'],
    num_rows: 1446
})

In [3]:
from src.data_utils import introduce_errors

def process_example(example):
    """Turn one article into an input/output training pair.

    'output' is the article's 'text' field, unchanged. 'input' is the result of
    calling `introduce_errors` on that same text (presumably a degraded copy;
    see src.data_utils for the exact behavior).
    """
    reference_text = example['text']
    noisy_text = introduce_errors(reference_text)
    return {'input': noisy_text, 'output': reference_text}

# Apply process_example to each row individually (not in batches); the returned
# 'input' and 'output' keys become dataset columns.
dataset = dataset.map(
    process_example,
    batched=False,
)

We now move on to training the model. We'll use the Hugging Face `transformers` library.

In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM


# Fall back to the default checkpoint when BASE_MODEL is left unset (None).
model_name = BASE_MODEL if BASE_MODEL is not None else "meta-llama/Llama-3.2-1B"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load without `device_map="auto"`: the model is placed on a single device
# explicitly just below. Combining both is redundant on one GPU, and on a
# multi-GPU machine `device_map="auto"` may spread layers across devices, after
# which calling `.to(...)` on the dispatched model is discouraged by Accelerate.
model = AutoModelForCausalLM.from_pretrained(model_name)

# Prefer the GPU when one is available; otherwise stay on CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm):

In [5]:
from src.train import generate_template

def _apply_template(examples):
    # Forward each batch to generate_template together with the notebook-level
    # tokenizer, system prompt and maximum length.
    return generate_template(
        examples,
        tokenizer=tokenizer,
        system_message=SYSTEM_PROMPT,
        max_length=MAX_LENGTH,
    )


# Batched pass over the dataset; the raw 'input'/'output' columns are removed
# from the result.
dataset = dataset.map(
    _apply_template,
    batched=True,
    remove_columns=["input", "output"],
)

In [6]:
from transformers import Trainer, TrainingArguments

# Trainer output directory: ../models/<OUTPUT_DIR>.
output_dir = f"../models/{OUTPUT_DIR}"

# Schedule comes from the config cell (EPOCHS / MAX_STEPS); one example per
# device per step, with the training loss logged every 100 steps.
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=EPOCHS,
    max_steps=MAX_STEPS,
    per_device_train_batch_size=1,
    logging_steps=100,
)

trainer = Trainer(model=model, args=training_args, train_dataset=dataset)

# Last expression of the cell, so the returned TrainOutput is displayed.
trainer.train()

Step,Training Loss
100,1.2811
200,1.3419
300,1.2026
400,1.3213
500,1.2047
600,1.1533
700,1.077
800,1.0461
900,0.8973
1000,1.0616


TrainOutput(global_step=2892, training_loss=0.8641085815957308, metrics={'train_runtime': 2075.5791, 'train_samples_per_second': 1.393, 'train_steps_per_second': 1.393, 'total_flos': 1.7291296087474176e+16, 'train_loss': 0.8641085815957308, 'epoch': 2.0})