In [2]:
import make_dataset

all_proverbs = make_dataset.load_data()


In [3]:
for key in all_proverbs:
    print(f"{key}: {len(all_proverbs[key])} proverbs")
print(f"Total: {sum(len(all_proverbs[key]) for key in all_proverbs)} proverbs")

print("\n\nExamples of proverbs:")
key = list(all_proverbs.keys())[0]
for i in range(5):
    print(all_proverbs[key][i])


proverbs_db.txt: 34142 proverbs
proverbs_db_only_english.txt: 2208 proverbs
proverbs_digest.txt: 1000 proverbs
Total: 37350 proverbs


Examples of proverbs:
Money's for buying and a horse is for riding.
A set of white teeth does not indicate a pure heart.
Time discloses the truth.
The earth has ears, the wind has a voice.
The fox will catch you with cunning, and the wolf with courage.


In [12]:
selected_proverbs_groups = [
    "proverbs_db_only_english.txt", 
    "proverbs_digest.txt"
]

proverbs = []
for group in selected_proverbs_groups:
    proverbs.extend(all_proverbs[group])

# Fine Tuning

In [13]:
!pip install transformers
!pip install datasets
!pip install peft
!pip install accelerate
!pip install bitsandbytes

/nix/store/58br4vk3q5akf4g8lx0pqzfhn47k3j8d-bash-5.2p37/bin/bash: line 1: pip: command not found
/nix/store/58br4vk3q5akf4g8lx0pqzfhn47k3j8d-bash-5.2p37/bin/bash: line 1: pip: command not found
/nix/store/58br4vk3q5akf4g8lx0pqzfhn47k3j8d-bash-5.2p37/bin/bash: line 1: pip: command not found
/nix/store/58br4vk3q5akf4g8lx0pqzfhn47k3j8d-bash-5.2p37/bin/bash: line 1: pip: command not found
/nix/store/58br4vk3q5akf4g8lx0pqzfhn47k3j8d-bash-5.2p37/bin/bash: line 1: pip: command not found


In [14]:
from transformers import AutoTokenizer, AutoModelForCausalLM

model_name = "facebook/opt-125m"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(model_name)

On crée un objet DataSet dans lequel on met nos proverbes non tokenisés.
Puis on les donne aux Tokenizer de opt-125m.

In [18]:
from datasets import Dataset

dataset = Dataset.from_dict({"text": proverbs[0:1500]})
print(dataset)

def tokenize_function(examples):
    tokenized_inputs = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)
    tokenized_inputs["labels"] = tokenized_inputs["input_ids"].copy()
    return tokenized_inputs

tokenized_dataset = dataset.map(tokenize_function, batched=True)

Dataset({
    features: ['text'],
    num_rows: 1500
})


Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

In [19]:
from peft import LoraConfig, get_peft_model

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 294,912 || all params: 125,534,208 || trainable%: 0.2349




In [20]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    logging_dir='./logs',
    logging_steps=10,
    eval_strategy="no",
    save_strategy="epoch",
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
)

trainer.train()

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
10,7.1459
20,6.6181
30,6.4068
40,6.086
50,5.8664
60,5.5399
70,5.191
80,4.8687
90,4.3173
100,3.6874


TrainOutput(global_step=188, training_loss=3.819911114712979, metrics={'train_runtime': 400.471, 'train_samples_per_second': 3.746, 'train_steps_per_second': 0.469, 'total_flos': 98324250624000.0, 'train_loss': 3.819911114712979, 'epoch': 1.0})

In [25]:
from transformers import pipeline

# Configurer un pipeline de génération
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

prompt = "The"
results = generator(prompt, max_length=50, num_return_sequences=3, do_sample=True, temperature=0.7)

for i, result in enumerate(results):
    print(f"--- Génération {i+1} ---")
    print(result['generated_text'])

Device set to use cpu
The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['AriaTextForCausalLM', 'BambaForCausalLM', 'BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'Cohere2ForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'DiffLlamaForCausalLM', 'ElectraForCausalLM', 'Emu3ForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FalconMambaForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'Gemma3ForConditionalGeneration', 'Gemma3ForCausalLM', 'GitForCausalLM', 'GlmForCausalLM', 'GotOcr2ForConditionalGeneration', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapan

--- Génération 1 ---
The sun has never
--- Génération 2 ---
The world is
--- Génération 3 ---
The real
