## All imports

In [1]:
import numpy as np                                                                                    # Algèbre linéaire
import torch                                                                                          # pytorch
from sklearn.model_selection import train_test_split                                                  # division train/test
import pandas as pd                                                                                   # traitement de données
from datasets import load_dataset                                                                     # chargement de dataset
import glob                                                                                           # glob pour les fichiers
import os                                                                                             # os pour les fichiers
import re                                                                                             # regex
import nltk
import torch
import datasets
from datasets import Dataset
from datasets import load_metric
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer,PreTrainedTokenizerFast

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
model_checkpoint ='t5-small' # 
    
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
assert isinstance(tokenizer, PreTrainedTokenizerFast)

pad_on_right = tokenizer.padding_side == "right"

## Data

In [2]:
def read_data(path):                                                              # fonction pour lire les fichiers
    data = []
    for topic in os.listdir(path):
        for file in os.listdir(path + "/" + topic):                               # parcourir les fichiers par topic
            with open(path + "/" + topic + "/" + file) as f:
                data.append(f.read())
    return data

original_text = read_data("files/BBC News Summary/Summaries")                      # lire le texte original
summary_text = read_data("files/BBC News Summary/News Articles")                   # lire le texte résumé

df = pd.DataFrame({'original':original_text,'summary':summary_text})               # créer un dataframe avec les deux colonnes

df.to_csv('files/summary.csv', index=False)                                        # sauvegarder le dataframe en csv

In [2]:
dataset = load_dataset('csv', data_files='files/summary.csv', split='train')       # charger le dataset
dataset = dataset.train_test_split(test_size=0.1)                                  # diviser le dataset en train/test
train_dataset = dataset['train']                                                   # train dataset
test_dataset = dataset['test'] 
dataset = train_dataset.train_test_split(test_size=0.1)                                  
train_dataset = dataset['train']
val_dataset = dataset['test']

Using custom data configuration default-bc168876467c789e
Found cached dataset csv (C:/Users/moham/.cache/huggingface/datasets/csv/default-bc168876467c789e/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


In [15]:
max_input_length = 150
max_target_length = 80

In [16]:
def preprocess_function(examples):
    inputs = ['summarize:' + doc for doc in examples["original"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True,padding='max_length')

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["summary"], max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [17]:
tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_valid = val_dataset.map(preprocess_function, batched=True)

100%|██████████| 2/2 [00:06<00:00,  3.50s/ba]
100%|██████████| 1/1 [00:00<00:00,  1.36ba/s]


## Fine Tuning

In [20]:
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
batch_size = 16
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]
    
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Extract a few results
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
    
    # Add mean generated length
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    
    return {k: round(v, 4) for k, v in result.items()}

Downloading: 100%|██████████| 242M/242M [00:21<00:00, 11.4MB/s] 


In [24]:
import gc
gc.collect()

1417

In [29]:
# determine the device we will be using for training
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("[INFO] using {} device".format(DEVICE))

[INFO] using cpu device


In [26]:
%env WANDB_DISABLED=True


env: WANDB_DISABLED=True


In [32]:
model_name = model_checkpoint.split("/")[-1]
args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned-newsarticles",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=5,
    predict_with_generate=True,
    #fp16=True
)

trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

metric = load_metric("rouge")

trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
The following columns in the training set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: original, summary. If original, summary are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 1801
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total opti

{'eval_loss': 2.488041877746582, 'eval_rouge1': 26.2538, 'eval_rouge2': 17.4694, 'eval_rougeL': 23.2642, 'eval_rougeLsum': 24.6511, 'eval_gen_len': 19.0, 'eval_runtime': 86.56, 'eval_samples_per_second': 2.322, 'eval_steps_per_second': 0.15, 'epoch': 1.0}


 40%|████      | 226/565 [52:57<59:24, 10.51s/it]  The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: original, summary. If original, summary are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 201
  Batch size = 16
                                                 
 40%|████      | 226/565 [54:23<59:24, 10.51s/it]

{'eval_loss': 2.3971829414367676, 'eval_rouge1': 26.4822, 'eval_rouge2': 18.5141, 'eval_rougeL': 23.884, 'eval_rougeLsum': 25.0978, 'eval_gen_len': 19.0, 'eval_runtime': 85.0513, 'eval_samples_per_second': 2.363, 'eval_steps_per_second': 0.153, 'epoch': 2.0}


 60%|██████    | 339/565 [1:46:55<43:30, 11.55s/it]    The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: original, summary. If original, summary are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 201
  Batch size = 16
                                                   
 60%|██████    | 339/565 [1:48:32<43:30, 11.55s/it]

{'eval_loss': 2.359138250350952, 'eval_rouge1': 26.1709, 'eval_rouge2': 18.0859, 'eval_rougeL': 23.601, 'eval_rougeLsum': 24.7753, 'eval_gen_len': 19.0, 'eval_runtime': 96.8689, 'eval_samples_per_second': 2.075, 'eval_steps_per_second': 0.134, 'epoch': 3.0}


 80%|████████  | 452/565 [2:13:29<22:06, 11.74s/it]  The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: original, summary. If original, summary are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 201
  Batch size = 16
                                                   
 80%|████████  | 452/565 [2:15:04<22:06, 11.74s/it]

{'eval_loss': 2.3406260013580322, 'eval_rouge1': 26.1378, 'eval_rouge2': 17.9916, 'eval_rougeL': 23.6076, 'eval_rougeLsum': 24.8443, 'eval_gen_len': 19.0, 'eval_runtime': 94.0975, 'eval_samples_per_second': 2.136, 'eval_steps_per_second': 0.138, 'epoch': 4.0}


 88%|████████▊ | 500/565 [2:25:11<12:53, 11.90s/it]  Saving model checkpoint to t5-small-finetuned-newsarticles\checkpoint-500
Configuration saved in t5-small-finetuned-newsarticles\checkpoint-500\config.json


{'loss': 2.6348, 'learning_rate': 2.3008849557522127e-06, 'epoch': 4.42}


Model weights saved in t5-small-finetuned-newsarticles\checkpoint-500\pytorch_model.bin
tokenizer config file saved in t5-small-finetuned-newsarticles\checkpoint-500\tokenizer_config.json
Special tokens file saved in t5-small-finetuned-newsarticles\checkpoint-500\special_tokens_map.json
Copy vocab file to t5-small-finetuned-newsarticles\checkpoint-500\spiece.model
100%|██████████| 565/565 [2:37:28<00:00, 10.37s/it]The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: original, summary. If original, summary are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 201
  Batch size = 16
                                                   
100%|██████████| 565/565 [2:38:50<00:00, 10.37s/it]

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 565/565 [2:38:50<00:00, 16.8

{'eval_loss': 2.334956407546997, 'eval_rouge1': 26.1241, 'eval_rouge2': 18.0228, 'eval_rougeL': 23.5684, 'eval_rougeLsum': 24.8432, 'eval_gen_len': 19.0, 'eval_runtime': 81.1085, 'eval_samples_per_second': 2.478, 'eval_steps_per_second': 0.16, 'epoch': 5.0}
{'train_runtime': 9530.7585, 'train_samples_per_second': 0.945, 'train_steps_per_second': 0.059, 'train_loss': 2.621244231367533, 'epoch': 5.0}


TrainOutput(global_step=565, training_loss=2.621244231367533, metrics={'train_runtime': 9530.7585, 'train_samples_per_second': 0.945, 'train_steps_per_second': 0.059, 'train_loss': 2.621244231367533, 'epoch': 5.0})

In [33]:
# save model
trainer.save_model(f"files/{model_name}-finetuned-newsarticles")


Saving model checkpoint to files/t5-small-finetuned-newsarticles
Configuration saved in files/t5-small-finetuned-newsarticles\config.json
Model weights saved in files/t5-small-finetuned-newsarticles\pytorch_model.bin
tokenizer config file saved in files/t5-small-finetuned-newsarticles\tokenizer_config.json
Special tokens file saved in files/t5-small-finetuned-newsarticles\special_tokens_map.json
Copy vocab file to files/t5-small-finetuned-newsarticles\spiece.model


In [34]:
test_dataset = test_dataset.map(
                preprocess_function,
                batched=True)

predict_results = trainer.predict(
            test_dataset,max_length=128, num_beams=3)

metrics = predict_results.metrics
print(metrics)

100%|██████████| 1/1 [00:01<00:00,  1.09s/ba]
The following columns in the test set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: original, summary. If original, summary are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 223
  Batch size = 16
100%|██████████| 14/14 [09:03<00:00, 38.83s/it]

{'test_loss': 2.333233594894409, 'test_rouge1': 45.4217, 'test_rouge2': 29.7591, 'test_rougeL': 35.8945, 'test_rougeLsum': 41.9565, 'test_gen_len': 97.9417, 'test_runtime': 588.5964, 'test_samples_per_second': 0.379, 'test_steps_per_second': 0.024}





In [35]:
predict_results

PredictionOutput(predictions=array([[    0, 26353,   739, ...,     0,     0,     0],
       [    0,  1983,  9377, ...,     0,     0,     0],
       [    0,    94,    56, ...,     0,     0,     0],
       ...,
       [    0, 13816,    65, ...,     0,     0,     0],
       [    0,  1163,   243, ...,     0,     0,     0],
       [    0,    37,   515, ...,     0,     0,     0]], dtype=int64), label_ids=array([[26353,   739, 17640, ...,     0,     0,     0],
       [13824,   348,  9204, ...,     0,     0,     0],
       [    3, 31105,  4420, ...,     0,     0,     0],
       ...,
       [13816,   177,   725, ...,     0,     0,     0],
       [ 1163,    31,     7, ...,     0,     0,     0],
       [ 2180,  1982,   144, ...,     0,     0,     0]], dtype=int64), metrics={'test_loss': 2.333233594894409, 'test_rouge1': 45.4217, 'test_rouge2': 29.7591, 'test_rougeL': 35.8945, 'test_rougeLsum': 41.9565, 'test_gen_len': 97.9417, 'test_runtime': 588.5964, 'test_samples_per_second': 0.379, 'test_step

In [36]:
if args.predict_with_generate:
    predictions = tokenizer.batch_decode(predict_results.predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    predictions = [pred.strip() for pred in predictions]

In [37]:
predictions[0]

['Mandelson, a former Labour communications director, told BBC Radio 4\'s Today programme: "I understand why the Tories will be gunning for Alastair Campbell because they fear his campaigning skills. That charge was denied by Mr Mandelson, who said the Tories were afraid of Mr Campbell\'s campaigning skills. The European commissioner and former Labour minister was speaking amid claims that Mr Campbell is part of a Labour "dirty tricks" campaign.',
 'Actress Nicole Kidman has won a restraining order against two paparazzi photographers who she claims left her fearful of leaving her Sydney mansion. Nicole Kidman was prompted to take action following a reported high-speed car chase with members of the paparazzi in Sydney last weekend. Magistrate Lee Gilmore, who issued the restraining order at Waverley Local Court in Sydney, said she understood the photographers were entitled to earn a living but there had to be limits to their behaviour.']

In [40]:
test_dataset['summary'][0]

'Mandelson warns BBC on Campbell\n\nThe BBC should steer away from "demonising" ex-Downing Street media chief Alastair Campbell, Peter Mandelson has said.\n\nThe European commissioner and former Labour minister was speaking amid claims that Mr Campbell is part of a Labour "dirty tricks" campaign. That charge was denied by Mr Mandelson, who said the Tories were afraid of Mr Campbell\'s campaigning skills. He warned the BBC that attacking Mr Campbell had brought it trouble before. That was a reference to the Hutton inquiry following a BBC story claiming Downing Street "sexed up" Iraq\'s weapons of mass destruction dossier.\n\nThe affair prompted the resignation of BBC chairman Gavyn Davies, director-general Greg Dyke and reporter Andrew Gilligan. Labour has attracted media criticism for using new freedom of information laws to dig up information about Tory leader Michael Howard\'s past.\n\nMr Mandelson, a former Labour communications director, told BBC Radio 4\'s Today programme: "I unde