In [6]:
import pandas as pd
import matplotlib.pyplot as plt

chemin_fichier_csv = 'DBfinal.csv'
dataset = pd.read_csv(chemin_fichier_csv)

In [7]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(dataset, test_size=0.01)
print(f"Training set size: {len(train)}")
print(f"Test set size: {len(test)}")

Taille de l'ensemble d'entraînement: 117203
Taille de l'ensemble de test: 1184


In [8]:
from transformers import MarianTokenizer

model_name = "Helsinki-NLP/opus-mt-en-fr"
tokenizer = MarianTokenizer.from_pretrained(model_name)

In [9]:
from datasets import Dataset

train_dataset = Dataset.from_pandas(train)
test_dataset = Dataset.from_pandas(test)

In [10]:
def preprocess_function(examples):
    
    inputs = tokenizer(examples["eng"], max_length=35, truncation=True, padding="max_length")
    targets = tokenizer(examples["fr"], max_length=35, truncation=True, padding="max_length")
    return {"input_ids": inputs.input_ids, "attention_mask": inputs.attention_mask, "labels": targets.input_ids}

tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True)


Map:   0%|          | 0/117203 [00:00<?, ? examples/s]

Map:   0%|          | 0/1184 [00:00<?, ? examples/s]

In [13]:
from transformers import MarianMTModel, MarianConfig, TrainingArguments, Trainer
import torch

config = MarianConfig.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name, config=config)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)  
print(f"Le modèle utilise : {device}")

training_args = TrainingArguments(
    output_dir="./resultsENG",
    num_train_epochs=4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    eval_strategy="epoch",  
    save_strategy="epoch",  
    save_total_limit=3,  
    no_cuda=not torch.cuda.is_available(),  
    dataloader_num_workers=4,  
    dataloader_prefetch_factor=2  
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,  
    eval_dataset=tokenized_test_dataset     
)

trainer.train()



Le modèle utilise : cuda


Epoch,Training Loss,Validation Loss
1,0.3485,0.277609
2,0.2506,0.237238
3,0.1968,0.221928
4,0.1652,0.21634


Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59513]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59513]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59513]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59513]], 'forced_eos_token_id': 0}


TrainOutput(global_step=29304, training_loss=0.30409445008911334, metrics={'train_runtime': 8287.2016, 'train_samples_per_second': 56.571, 'train_steps_per_second': 3.536, 'total_flos': 4345457883217920.0, 'train_loss': 0.30409445008911334, 'epoch': 4.0})

In [14]:
tokenizer.save_pretrained('resultsENG\checkpoint-29304')

('resultsENG\\checkpoint-29304\\tokenizer_config.json',
 'resultsENG\\checkpoint-29304\\special_tokens_map.json',
 'resultsENG\\checkpoint-29304\\vocab.json',
 'resultsENG\\checkpoint-29304\\source.spm',
 'resultsENG\\checkpoint-29304\\target.spm',
 'resultsENG\\checkpoint-29304\\added_tokens.json')

In [3]:
from transformers import MarianMTModel, MarianTokenizer

model_path = "resultsENG\checkpoint-29304"
model = MarianMTModel.from_pretrained(model_path)
tokenizer = MarianTokenizer.from_pretrained(model_path)


def translate(text, model, tokenizer):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(model.device)
    translated_tokens = model.generate(**inputs)
    translated_text = tokenizer.decode(translated_tokens[0], skip_special_tokens=True)
    return translated_text


source_text = "To kill two birds with one stone"
translated_text = translate(source_text, model, tokenizer)
print(translated_text)   

Faire d'une pierre deux coups


In [35]:

model_name = "Helsinki-NLP/opus-mt-en-fr"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)


def translate(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    outputs = model.generate(**inputs)
    translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return translated_text


text = "It is the most famous painting in the world, and yet, when viewers manage to see"
translated_text = translate(text)
print("Translated text:", translated_text)


Translated text: C'est la peinture la plus célèbre au monde, et pourtant, quand les téléspectateurs parviennent à voir


In [2]:
from transformers import MarianMTModel, MarianTokenizer

# Load models and tokenizers
model_path = "resultsENG/checkpoint-29304"  # Replace with the correct path
custom_model = MarianMTModel.from_pretrained(model_path)
custom_tokenizer = MarianTokenizer.from_pretrained(model_path)

pretrained_model_name = "Helsinki-NLP/opus-mt-en-fr"
pretrained_tokenizer = MarianTokenizer.from_pretrained(pretrained_model_name)
pretrained_model = MarianMTModel.from_pretrained(pretrained_model_name)

# Translation function
def translate(text, model, tokenizer):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=35).to(model.device)
    translated_tokens = model.generate(**inputs)
    translated_text = tokenizer.decode(translated_tokens[0], skip_special_tokens=True)
    return translated_text

# New idiomatic phrases
idiomatic_phrases = [
    "We are in deep water, struggling to find a solution.",
    "It's not all sunshine and rainbows, there are challenges too.",
    "He has a chip on his shoulder, always ready for a fight.",
    "She bit off more than she could chew with this project.",
    "They are walking on thin ice with their risky decisions.",
    "She made goo-goo eyes at him, trying to win his affection.",
    "He is burning the midnight oil to get the job done.",
    "She is the black sheep of the family, always causing trouble.",
    "He hit the nail on the head with his analysis.",
    "They are on the same wavelength, understanding each other perfectly.",
    "She spilled the beans about the surprise party.",
    "He is in the doghouse after forgetting their anniversary.",
    "They are keeping their cards close to their chest about the merger.",
    "He is pulling strings to get his friend a job.",
    "She is in over her head with all these responsibilities.",
    "They are feeling under the weather and need some rest.",
    "With a batting of her eyelashes, she gave him the puppy-dog eyes to get what she wanted.",  
    "They accidentally let the cat out of the bag, ruining the whole plan.",  
    "He’s got the Midas touch, everything he attempts turns to gold.", 
    "He is bending over backwards to help his friends.",
    "She is in the spotlight after her recent success.",
    "They are breaking new ground with their innovative approach.",
    "He has a heart of gold, always helping others.",
    "She is skating on thin ice with her risky behavior."
    "He always lands on his feet, it's like he's born under a lucky star.",  
    "She made goo-goo eyes at him, trying to win his affection.",  
    "Despite their best efforts to keep it a secret, someone spilled the beans about the surprise party.",  
    "Every time, he dodges trouble by a hair's breadth, truly a stroke of luck.", 
]

# Compare translations
comparisons = []
for phrase in idiomatic_phrases:
    custom_translation = translate(phrase, custom_model, custom_tokenizer)
    pretrained_translation = translate(phrase, pretrained_model, pretrained_tokenizer)
    comparisons.append((phrase, custom_translation, pretrained_translation))

# Display results
for phrase, custom_translation, pretrained_translation in comparisons:
    print(f"Phrase: {phrase}")
    print(f"Custom Model Translation: {custom_translation}")
    print(f"Pretrained Model Translation: {pretrained_translation}")
    print("="*50)


Phrase: We are in deep water, struggling to find a solution.
Custom Model Translation: Nous sommes dans de beaux draps, ayant du mal à trouver une solution.
Pretrained Model Translation: Nous sommes en eau profonde, nous luttons pour trouver une solution.
Phrase: It's not all sunshine and rainbows, there are challenges too.
Custom Model Translation: Ce n'est pas tout le soleil et l'arcenciel, il y a aussi des défis.
Pretrained Model Translation: Ce n'est pas tout le soleil et les arcs-en-ciel, il y a aussi des défis.
Phrase: He has a chip on his shoulder, always ready for a fight.
Custom Model Translation: Il a une dent contre quelqu'un, toujours prêt pour une bagarre.
Pretrained Model Translation: Il a une puce sur l'épaule, toujours prête à se battre.
Phrase: She bit off more than she could chew with this project.
Custom Model Translation: Elle a vu trop grand avec ce projet.
Pretrained Model Translation: Elle a mordu plus qu'elle ne pouvait mâcher avec ce projet.
Phrase: They are wa