In [None]:
!pip install datasets rouge_score nltk
!pip install accelerate -U
!pip install transformers==4.27.0

In [1]:
# Transformers
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM     # BERT Tokenizer and architecture
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments         # These will help us to fine-tune our model
from transformers import pipeline                                         # Pipeline
from transformers import DataCollatorForSeq2Seq                           # DataCollator to batch the data 
import string                                                              # PyTorch
import pandas as pd
import numpy as np
import nltk
nltk.download('punkt')
from datasets import load_metric, Dataset
import re

2024-03-25 14:26:44.064927: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-25 14:26:44.065056: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-25 14:26:44.370735: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test_text.csv')
val_data = pd.read_csv('validation.csv')

In [3]:
def clean_text(text):
        text = text.lower()
        text = re.sub('^.*?- ', '', text)
        text = re.sub('\[.*?\]', '', text)
        text = re.sub('https?://\S+|www\.\S+', '', text)
        text = re.sub('<.*?>+', '', text)
        text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
        text = re.sub('\n', '', text)
        text = re.sub('\w*\d\w*', '', text)
        return text


def clean_text_source(text):
        text = text.lower()
        text = re.sub('^.*?- ', '', text)
        text = re.sub('\[.*?\]', '', text)
        text = re.sub('https?://\S+|www\.\S+', '', text)
        text = re.sub('<.*?>+', '', text)
        text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
        text = re.sub('\n', '', text)
        text = re.sub('\w*\d\w*', '', text)
        return 'summarize: ' + text


def clean_df(df, cols):
    for col in cols:
        if col == 'text':
            df[col] = df[col].fillna('').apply(clean_text_source)
        else:
            df[col] = df[col].fillna('').apply(clean_text)
    return df

In [4]:
train_data = clean_df(train_data,['text', 'titles'])
test_data = clean_df(test_data,['text'])
val_data = clean_df(val_data,['text', 'titles'])

In [5]:
train_ds = Dataset.from_pandas(train_data)
test_ds = Dataset.from_pandas(test_data)
val_ds = Dataset.from_pandas(val_data)

In [6]:
checkpoint = 'moussaKam/barthez' # Model
tokenizer = AutoTokenizer.from_pretrained(checkpoint) # Loading Tokenizer
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint).to('cuda')

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.51k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


generation_config.json:   0%|          | 0.00/309 [00:00<?, ?B/s]

In [7]:
def preprocess_function(examples):
    inputs = [doc for doc in examples["text"]]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["titles"], max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [8]:
# Applying preprocess_function to the datasets
tokenized_train = train_ds.map(preprocess_function, batched=True,
                               remove_columns=['text', 'titles']) # Removing features
# Removing features
tokenized_val = val_ds.map(preprocess_function, batched=True,
                               remove_columns=['text', 'titles']) # Removing features

  0%|          | 0/22 [00:00<?, ?ba/s]



  0%|          | 0/2 [00:00<?, ?ba/s]

In [9]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [11]:
metric = load_metric('rouge')

Downloading builder script:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

In [12]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred# Obtaining predictions and true labels
    
    # Decoding predictions
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    
    # Obtaining the true labels tokens, while eliminating any possible masked token (i.e., label = -100)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip(), language='french')) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip(), language='french')) for label in decoded_labels]
    
    
    # Computing rouge score
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()} # Extracting some results

    # Add mean-generated length
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [13]:
training_args = Seq2SeqTrainingArguments(
    output_dir = 'barthez',
    evaluation_strategy = "epoch",
    save_strategy = 'epoch',
    load_best_model_at_end = True,
    metric_for_best_model = 'eval_loss',
    seed = 8,
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    weight_decay=0.01,
    save_total_limit=4,
    num_train_epochs=16,
    predict_with_generate=True,
    fp16=True,
    report_to="none"
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [14]:
trainer.train()



Epoch,Training Loss,Validation Loss


OutOfMemoryError: CUDA out of memory. Tried to allocate 198.00 MiB. GPU 0 has a total capacty of 14.75 GiB of which 121.06 MiB is free. Process 3441 has 14.63 GiB memory in use. Of the allocated memory 13.35 GiB is allocated by PyTorch, and 1.08 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
# get all the directories in 'barthez' directory
import os
dirs = os.listdir('barthez')
tokenizer_kwargs = {'max_length':1024}

for dir in dirs:
    pipeline_sum = pipeline('summarization', model=f'barthez/{dir}', device=0 )
    test_data['titles'] = test_data['text'].apply(lambda x: pipeline_sum(x, **tokenizer_kwargs)[0]['summary_text'])
    test_data.drop('text', axis=1, inplace=True)
    test_data.to_csv(f'{dir}_test.csv', index=False)
    print(f'{dir} done')