In [1]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from datasets import load_dataset, DatasetDict
from datasets import Dataset

In [2]:
dataset = load_dataset("Khalida1w/funny_quotes")

In [3]:
# Chargez votre dataset
dataset = load_dataset("Khalida1w/funny_quotes")

# Divisez le dataset en ensembles d'entraînement, de validation et de test
train_testvalid = dataset['train'].train_test_split(test_size=0.25)

# Divisez l'ensemble de test + validation en deux parts égales (test et validation)
test_valid = train_testvalid['test'].train_test_split(test_size=0.5)

# Créez un nouveau DatasetDict avec les ensembles d'entraînement, de test et de validation
dataset_dict = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'val': test_valid['train']
})


In [4]:
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['quote', 'author', 'tags'],
        num_rows: 2133
    })
    test: Dataset({
        features: ['quote', 'author', 'tags'],
        num_rows: 356
    })
    val: Dataset({
        features: ['quote', 'author', 'tags'],
        num_rows: 356
    })
})

In [5]:
# Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

tokenizer.pad_token = tokenizer.eos_token


In [6]:
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['quote', 'author', 'tags'],
        num_rows: 2133
    })
    test: Dataset({
        features: ['quote', 'author', 'tags'],
        num_rows: 356
    })
    val: Dataset({
        features: ['quote', 'author', 'tags'],
        num_rows: 356
    })
})

In [23]:
max_seq_length = 80

In [7]:
# Tokenization 1

tokenized_dataset = dataset_dict.map(lambda examples: tokenizer(examples["quote"], padding=True, max_length=max_seq_length,truncation=True),batched=True)

Map:   0%|          | 0/2133 [00:00<?, ? examples/s]

Map:   0%|          | 0/356 [00:00<?, ? examples/s]

Map:   0%|          | 0/356 [00:00<?, ? examples/s]

tokenized_dataset=tokenized_dataset.rename_columns({"tags":"labels"})

In [8]:
model = GPT2LMHeadModel.from_pretrained("gpt2")

In [9]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

In [10]:
training_args = TrainingArguments(
    per_device_train_batch_size=16,
    learning_rate=0.0001,
    evaluation_strategy="steps",
    eval_steps=500,
    save_steps=500,
    output_dir="./quotes_model",
    overwrite_output_dir=True,
    save_total_limit=2
)

In [11]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['quote', 'author', 'tags', 'input_ids', 'attention_mask'],
        num_rows: 2133
    })
    test: Dataset({
        features: ['quote', 'author', 'tags', 'input_ids', 'attention_mask'],
        num_rows: 356
    })
    val: Dataset({
        features: ['quote', 'author', 'tags', 'input_ids', 'attention_mask'],
        num_rows: 356
    })
})

In [12]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["val"],
    data_collator=data_collator
)

trainer.train()


Step,Training Loss,Validation Loss


TrainOutput(global_step=402, training_loss=1.9423117566464552, metrics={'train_runtime': 783.1519, 'train_samples_per_second': 8.171, 'train_steps_per_second': 0.513, 'total_flos': 261251205120000.0, 'train_loss': 1.9423117566464552, 'epoch': 3.0})

In [14]:
tokenizer.save_pretrained("quotes_model")

('quotes_model/tokenizer_config.json',
 'quotes_model/special_tokens_map.json',
 'quotes_model/vocab.json',
 'quotes_model/merges.txt',
 'quotes_model/added_tokens.json')

In [15]:
model.save_pretrained("quotes_model")

In [31]:
results = trainer.evaluate(tokenized_dataset["test"])

In [35]:
results

{'eval_loss': 2.1968119144439697,
 'eval_runtime': 15.4743,
 'eval_samples_per_second': 23.006,
 'eval_steps_per_second': 2.908,
 'epoch': 3.0}

In [44]:
model_generation = GPT2LMHeadModel.from_pretrained("/Users/n_ss/Desktop/joke-try/quotes_model")

In [46]:
tokenizer_generation = GPT2Tokenizer.from_pretrained("/Users/n_ss/Desktop/joke-try/quotes_model")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [64]:
prompt="  "

In [69]:
# Générez du texte à partir du modèle
generated_text = model.generate(
    tokenizer.encode(prompt, return_tensors="pt").to("mps"),
    max_length=50,  # Longueur maximale du texte généré
    num_return_sequences=5,  # Nombre de séquences générées
    no_repeat_ngram_size=2,  # Évite les répétitions de n-grammes
    do_sample=True
)

# Obtenez le tenseur du texte généré à partir de la liste
generated_text_tensor = generated_text[1]

# Décodez le tenseur en texte lisible
generated_quote = tokenizer.decode(generated_text_tensor, skip_special_tokens=True)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [66]:
generated_quote

'  \xa0I had a dream about you last night. You were in a car with a guy who was trying to sell you a vibrator. I was in the back seat, and he was telling me how much he loved the vibrators he'

In [72]:

# Obtenez le tenseur du texte généré à partir de la liste
generated_text_tensor = generated_text[0]

# Décodez le tenseur en texte lisible
generated_quote = tokenizer.decode(generated_text_tensor, skip_special_tokens=True)


In [73]:
generated_quote

"   What if it's all a dream? That's my idea of inspiration, if you can believe it. I'm a full blown dream girl. And you'd be remiss if not to admit that if I were to be your mom,"