# Master Thesis - Mattia Piazzalunga
In this notebook, the model development for the Italian dataset is carried out.

*Title*: Bridging a GAP: Text Style Transfer from Journalistic to Conversational for enhanced social media dissemination of news

*Supervisor*: Gabriella Pasi <br>
*Author*: Mattia Piazzalunga

*University*: Bicocca University of Milan <br>
*Department*: Informatics, Systems and Communication <br>
*Course*: Computer Science <br>
*Academic year*: 2023/2024

*Info*: This notebook was run on one of the servers of the DISCo department of the University of Milano Bicocca. Download the files offline if you want to run this.

*For suggestions or questions*: mattiapiazzalunga@outlook.com

## Inizializzazion

### Dowloading libraires

In [1]:
!pip install datasets sacrebleu meteor tiktoken transformers bert-score nltk peft psutil GPUtil torch datasets evaluate
!pip install --upgrade  nltk

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


### Importing libraries

In [2]:
# Import necessary libraries
from transformers import (
    MT5ForConditionalGeneration, MT5Tokenizer, DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments, Seq2SeqTrainer, Trainer,
    DataCollatorWithPadding, AutoModelForSequenceClassification,
    TrainingArguments, AutoTokenizer,
    AutoModelForSequenceClassification, EarlyStoppingCallback,
    GenerationConfig
)
from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
    classification_report,
    confusion_matrix
)
import gc
import nltk
import re
import psutil
import GPUtil
import requests
import platform
from evaluate import load
import random
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from peft import LoraConfig, get_peft_model
from peft.utils import TaskType
from datasets import Dataset



In [3]:
# Download additional NLTK resources required for METEOR
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to /home/ubuntu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/ubuntu/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Set the seed

In [4]:
# Set a fixed seed for reproducibility
seed = 1234
torch.manual_seed(seed)
random.seed(seed)
np.random.seed(seed)

### Get Hardware information

In [5]:
#Set the CUDA GPU
if torch.cuda.is_available():
    device_name = torch.device("cuda:0")
else:
    device_name = torch.device('cpu')
print("Running on {}.".format(device_name))

Running on cuda:0.


In [6]:
def get_system_info():
    info = {}
    # Operating System Information
    info['Operating System'] = platform.system()
    info['OS Version'] = platform.version()
    info['Architecture'] = platform.machine()
    # Processor Information
    info['Processor'] = platform.processor()
    # RAM Information
    info['Total RAM'] = f"{round(psutil.virtual_memory().total / (1024.0 **3))} GB"
    # Python Version
    info['Python Version'] = platform.python_version()
    # CUDA and GPU Information
    if torch.cuda.is_available():
        info['CUDA Available'] = True
        info['CUDA Version'] = torch.version.cuda
        gpus = GPUtil.getGPUs()
        gpu_list = []
        for gpu in gpus:
            gpu_info = {
                'Name': gpu.name,
                'Total Memory': f"{gpu.memoryTotal} MB",
                'UUID': gpu.uuid
            }
            gpu_list.append(gpu_info)
        info['GPUs'] = gpu_list
    else:
        info['CUDA Available'] = False
        info['GPUs'] = 'No GPU available'
    return info

# Retrieve and display system information
system_info = get_system_info()
print("\n*** System Information ***")
for key, value in system_info.items():
    print(f"{key}: {value}")


*** System Information ***
Operating System: Linux
OS Version: #127-Ubuntu SMP Fri Jul 5 20:13:28 UTC 2024
Architecture: x86_64
Processor: x86_64
Total RAM: 377 GB
Python Version: 3.10.12
CUDA Available: True
CUDA Version: 12.1
GPUs: [{'Name': 'NVIDIA RTX A6000', 'Total Memory': '49140.0 MB', 'UUID': 'GPU-c4437fe0-d47e-a25f-f056-1322dcd5f6e3'}]


### Make sure you have emptied the GPU

In [7]:
#This clean the GPU
torch.cuda.empty_cache()

# Run garbage collection
gc.collect()

80

### Importing the dataset

In [8]:
df = pd.read_csv("../corpora/J2C_news_IT.csv")

In [9]:
len(df)

1478

### Clean the dataset

In [10]:
#This preprocessing was created with the aim of, as far as possible, imitating that of mT5.

# Function to ensure text ends with a period
def ensure_period(text):
    if pd.isna(text):
        return text
    text = text.strip()
    if not text.endswith('.'):
        return text + '.'
    return text

# Apply the function to both columns
df['journalistic'] = df['journalistic'].apply(ensure_period)
df['conversational'] = df['conversational'].apply(ensure_period)

# Remove Rows Containing Dirty Words
dirty_words_url = 'https://raw.githubusercontent.com/LDNOOBW/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words/master/it'
response = requests.get(dirty_words_url)

if response.status_code == 200:
    dirty_words = response.text.splitlines()
    dirty_words = [word.lower() for word in dirty_words if word.strip()]
else:
    print("Failed to fetch dirty words list.")
    dirty_words = []

if dirty_words:
    dirty_words_pattern = re.compile(r'\b(' + '|'.join(re.escape(word) for word in dirty_words) + r')\b', re.IGNORECASE)

    def contains_dirty_words(text):
        if pd.isna(text):
            return False
        return bool(dirty_words_pattern.search(text))

    df = df[~df['journalistic'].apply(contains_dirty_words) & ~df['conversational'].apply(contains_dirty_words)]

# Remove Rows Containing "{"
def contains_curly_bracket(text):
    if pd.isna(text):
        return False
    return '{' in text

df = df[~df['journalistic'].apply(contains_curly_bracket) & ~df['conversational'].apply(contains_curly_bracket)]

# Shuffle the dataset using a seed of 42
df = df.sample(frac=1, random_state=seed).reset_index(drop=True)

df = df[["journalistic", "conversational"]]

In [11]:
len(df)

1377

### Split the dataset

In [12]:
# Split the Dataset
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=seed)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=seed)

### Dataset preprocessing

In [13]:
# Convert to Huggingface Dataset
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

## Train the classifier

### Prepare a new dataset

In [14]:
# Data Preparation
journalist_texts = df['journalistic'].dropna().tolist()
conversational_texts = df['conversational'].dropna().tolist()
texts = journalist_texts + conversational_texts
labels = [1] * len(journalist_texts) + [0] * len(conversational_texts)

# Create DataFrame and convert to Hugging Face Dataset
df_classifier = pd.DataFrame({'text': texts, 'label': labels})
df_classifier=df_classifier.sample(frac=1, random_state=42).reset_index(drop=True)

# First split into train and temp (80% train, 20% temp)
train_df_cls, temp_df_cls = train_test_split(df_classifier, test_size=0.2, random_state=seed)

# Then split temp into validation and test sets (each 10% of the original data)
val_df_cls, test_df_cls = train_test_split(temp_df_cls, test_size=0.5, random_state=seed)

In [15]:
# Convert DataFrames to Datasets
train_dataset_cls = Dataset.from_pandas(train_df_cls.reset_index(drop=True))
val_dataset_cls  = Dataset.from_pandas(val_df_cls.reset_index(drop=True))
test_dataset_cls  = Dataset.from_pandas(test_df_cls.reset_index(drop=True))

### Tokenization

In [16]:
# Initialize the Tokenizer
tokenizer_cls = AutoTokenizer.from_pretrained('xlm-roberta-base')

# Tokenize the Data
def tokenize_function_cls(examples):
    return tokenizer_cls(examples['text'], truncation=True, max_length=256)

train_dataset_cls = train_dataset_cls.map(tokenize_function_cls, batched=True, remove_columns=['text'])
val_dataset_cls = val_dataset_cls.map(tokenize_function_cls,batched=True, remove_columns=['text'])
test_dataset_cls = test_dataset_cls.map(tokenize_function_cls, batched=True, remove_columns=['text'])

# Format the Datasets for PyTorch
train_dataset_cls.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
val_dataset_cls.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset_cls.set_format(type='torch',columns=['input_ids', 'attention_mask', 'label'])

# Load the Pre-trained Model for Sequence Classification
id2label = {0: "conversational", 1: "journalistic"}
label2id = {"conversational": 0, "journalistic": 1}



Map:   0%|          | 0/2203 [00:00<?, ? examples/s]

Map:   0%|          | 0/275 [00:00<?, ? examples/s]

Map:   0%|          | 0/276 [00:00<?, ? examples/s]

### Model Training

In [17]:
model_cls = AutoModelForSequenceClassification.from_pretrained(
    'xlm-roberta-base',
    num_labels=2,
    id2label=id2label,
    label2id=label2id
)

model_cls.to(device_name)

# Define Training Arguments
training_args_cls = TrainingArguments(
    output_dir='./results_cls_IT',
    eval_strategy='steps',              
    logging_steps=50,                          
    eval_steps=50,                             
    save_steps=50,                            
    per_device_train_batch_size=8,
    per_device_eval_batch_size=4,        
    learning_rate=1e-5,                 
    num_train_epochs=10,   
    logging_dir='./logs',                   
    load_best_model_at_end=True,            
    metric_for_best_model='f1', 
    overwrite_output_dir=True,
    save_strategy='steps',                      
    save_total_limit=25,
    gradient_checkpointing=False,
    report_to="none",                    
    optim="adamw_torch",
    auto_find_batch_size=False,    
    lr_scheduler_type="linear",
    greater_is_better=True,
)

# Define Data Collator
data_collator_cls = DataCollatorWithPadding(tokenizer=tokenizer_cls, padding='longest')

# Define Evaluation Metrics
def compute_metrics_cls(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    acc = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='binary', zero_division=1
    )
    return {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

# Initialize the Trainer
trainer_cls = Trainer(
    model=model_cls,
    args=training_args_cls,
    train_dataset=train_dataset_cls,
    eval_dataset=val_dataset_cls,    
    tokenizer=tokenizer_cls,
    data_collator=data_collator_cls,
    compute_metrics=compute_metrics_cls,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)]
)

# Fine-tune the Model
trainer_cls.train()

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
50,0.6218,0.430411,0.923636,0.984252,0.868056,0.922509
100,0.2753,0.135185,0.956364,0.964789,0.951389,0.958042
150,0.139,0.249,0.938182,0.97037,0.909722,0.939068
200,0.1266,0.191893,0.974545,0.965986,0.986111,0.975945
250,0.0598,0.18931,0.970909,0.965753,0.979167,0.972414
300,0.0873,0.174676,0.974545,0.965986,0.986111,0.975945
350,0.0886,0.140231,0.981818,0.966443,1.0,0.982935
400,0.0717,0.184107,0.974545,0.965986,0.986111,0.975945
450,0.1173,0.133201,0.974545,0.965986,0.986111,0.975945
500,0.073,0.142792,0.978182,0.966216,0.993056,0.979452


TrainOutput(global_step=600, training_loss=0.1479583987593651, metrics={'train_runtime': 396.4505, 'train_samples_per_second': 55.568, 'train_steps_per_second': 6.962, 'total_flos': 624765423172800.0, 'train_loss': 0.1479583987593651, 'epoch': 2.1739130434782608})

In [18]:
# Save the Fine-tuned Model
trainer_cls.save_model('../models/IT/xlm-roberta-news-style-CLS-journalistic-conversational-IT-v1')

### Evaluate the model

In [19]:
# Evaluate the Model on the Test Set
test_results_cls = trainer_cls.evaluate(eval_dataset=test_dataset_cls)
print("Test Set Evaluation Metrics:")
for key, value in test_results_cls.items():
    if key.startswith("eval_"):
        print(f"{key.replace('eval_', '').capitalize()}: {value:.4f}")

# 20. Detailed Classification Report
predictions_output = trainer_cls.predict(test_dataset_cls)
predictions = predictions_output.predictions
labels = predictions_output.label_ids
preds = np.argmax(predictions, axis=1)
print("\nClassification Report:")
print(classification_report(
    labels, preds, target_names=["journalistic", "conversational"]
))

# 21. Confusion Matrix
conf_matrix = confusion_matrix(labels, preds)
print("Confusion Matrix:")
print(conf_matrix)

Test Set Evaluation Metrics:
Loss: 0.0562
Accuracy: 0.9928
Precision: 0.9857
Recall: 1.0000
F1: 0.9928
Runtime: 1.0296
Samples_per_second: 268.0770
Steps_per_second: 67.0190

Classification Report:
                precision    recall  f1-score   support

  journalistic       1.00      0.99      0.99       138
conversational       0.99      1.00      0.99       138

      accuracy                           0.99       276
     macro avg       0.99      0.99      0.99       276
  weighted avg       0.99      0.99      0.99       276

Confusion Matrix:
[[136   2]
 [  0 138]]


### Clear all the unuseful variables

In [20]:
# List of variable names to delete
variables_to_delete = [
    "journalist_texts",
    "conversational_texts",
    "texts",
    "preds",
    "predictions_output",
    "predictions",
    "conf_matrix",
    "test_results_cls",
    "labels",
    "df_classifier",
    "dataset_cls",
    "train_df_cls",
    "temp_df_cls",
    "val_df_cls",
    "test_df_cls",
    "train_dataset_cls",
    "val_dataset_cls",
    "test_dataset_cls",
    "id2label",
    "label2id",
    "model_cls",
    "lora_config_cls",
    "training_args_cls",
    "data_collator_cls",
    "save_path"
]

# Function to delete variables from the global scope
def delete_variables(var_list):
    for var in var_list:
        try:
            del globals()[var]
            print(f"Deleted variable: {var}")
        except KeyError:
            print(f"Variable '{var}' does not exist and cannot be deleted.")

# Call the function to delete variables
delete_variables(variables_to_delete)

#This clean the GPU
torch.cuda.empty_cache()

# Run garbage collection
gc.collect()

Deleted variable: journalist_texts
Deleted variable: conversational_texts
Deleted variable: texts
Deleted variable: preds
Deleted variable: predictions_output
Deleted variable: predictions
Deleted variable: conf_matrix
Deleted variable: test_results_cls
Deleted variable: labels
Deleted variable: df_classifier
Variable 'dataset_cls' does not exist and cannot be deleted.
Deleted variable: train_df_cls
Deleted variable: temp_df_cls
Deleted variable: val_df_cls
Deleted variable: test_df_cls
Deleted variable: train_dataset_cls
Deleted variable: val_dataset_cls
Deleted variable: test_dataset_cls
Deleted variable: id2label
Deleted variable: label2id
Deleted variable: model_cls
Variable 'lora_config_cls' does not exist and cannot be deleted.
Deleted variable: training_args_cls
Deleted variable: data_collator_cls
Variable 'save_path' does not exist and cannot be deleted.


102

## Useful declarations

### Sequence2Sequence customization

In [21]:
class CustomSeq2SeqTrainer(Seq2SeqTrainer):
    def __init__(self, generation_config=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.generation_config = generation_config

    def evaluate(self, *args, **kwargs):
        return super().evaluate(generation_config=self.generation_config, *args, **kwargs)

    def predict(self, *args, **kwargs):
        return super().predict(generation_config=self.generation_config, *args, **kwargs)

### Generation config

In [22]:
generation_config = GenerationConfig(
    max_new_tokens=64,
    num_beams=10,
    temperature=0.8,
    top_p=0.85,
    repetition_penalty=1.5,
    do_sample=True,
    length_penalty=0.85
)

### Compute metrics

In [23]:
# Define the Metrics
sacrebleu = load('sacrebleu')
bertscore = load('bertscore')
meteor = load('meteor')


def compute_metrics(eval_preds):
    preds, labels = eval_preds

    # Replace invalid token IDs with pad_token_id
    preds = np.where(
        (preds >= 0) & (preds < tokenizer.vocab_size),
        preds,
        tokenizer.pad_token_id)
    
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Wrap labels in lists for sacrebleu
    decoded_labels_for_bleu = [[label] for label in decoded_labels]

    bleu = sacrebleu.compute(predictions=decoded_preds, references=decoded_labels_for_bleu)

    # Compute METEOR
    meteor_score = meteor.compute(predictions=decoded_preds, references=decoded_labels)

    # Compute BERTScore
    bertscore_output = bertscore.compute(predictions=decoded_preds, references=decoded_labels, lang="it")

    # Aggregate BERTScore
    average_f1 = np.mean(bertscore_output['f1'])

    return {
        'bleu': bleu['score'],
        'meteor': meteor_score['meteor'],
        'bertscore_f1': average_f1
    }

[nltk_data] Downloading package wordnet to /home/ubuntu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/ubuntu/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


## Finetune mT5-base

### Tokenization

In [70]:
# Tokenization
tokenizer = MT5Tokenizer.from_pretrained('google/mt5-base')

In [71]:
# Define the preprocessing function
max_input_length = 512
max_target_length = 64

def preprocess_function(examples):
    inputs = examples['journalistic']
    targets = examples['conversational']
    inputs = ['' + inp for inp in inputs]

    model_inputs = tokenizer(
        inputs,
        max_length=max_input_length,
        truncation=True
    )

    labels = tokenizer(
        text_target=targets,
        max_length=max_target_length,
        truncation=True
    )

    labels['input_ids'] = [
        [(token if token != tokenizer.pad_token_id else -100) for token in label]
        for label in labels['input_ids']
    ]

    model_inputs['labels'] = labels['input_ids']
    return model_inputs

In [72]:
# Apply the preprocessing
tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_val = val_dataset.map(preprocess_function, batched=True)
tokenized_test = test_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/1101 [00:00<?, ? examples/s]

Map:   0%|          | 0/138 [00:00<?, ? examples/s]

Map:   0%|          | 0/138 [00:00<?, ? examples/s]

### Training the model

In [27]:
# Load the pre-trained MT5 BASE model
model = MT5ForConditionalGeneration.from_pretrained('google/mt5-base')

# LoRA configuration
peft_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    inference_mode=False,
    r=16,
    lora_alpha=32,
    target_modules=['q', 'k', 'v', 'o'],
    lora_dropout=0.1,
    bias="none"
)

# Apply LoRA to the model
model = get_peft_model(model, peft_config)

# Move the model to the correct device (GPU or CPU)
model.to(device_name)

# Check the number of trainable parameters
model.print_trainable_parameters()

training_args = Seq2SeqTrainingArguments(
    output_dir='./results-mT5-base',                     # Directory to save the results
    save_safetensors=False,
    eval_strategy='steps',                # Evaluation strategy to use
    logging_steps=50,                            # Number of steps between logging
    eval_steps=50,                               # Number of steps between evaluations
    save_steps=50,                             # Number of steps between model saves
    per_device_train_batch_size=8,
    per_device_eval_batch_size=4,              # Batch size per device during evaluation
    learning_rate=1e-3,                         # Learning rate for the optimizer
    num_train_epochs=10,                         # Total number of training epochs
    predict_with_generate=True,                 # Whether to use generate for predictions
    logging_dir='./logs',                       # Directory for storing logs
    load_best_model_at_end=True,                # Load the best model at the end of training
    metric_for_best_model='bleu',               # Metric to use to compare two different models
    overwrite_output_dir=True,
    save_strategy='steps',                      # Save strategy matches evaluation strategy
    save_total_limit=25,
    gradient_checkpointing=False,
    report_to="none",                           # Disable reporting to W&B or other services
    optim="adamw_torch",
    auto_find_batch_size=False,                  # Automatically find best batch size
    lr_scheduler_type="linear",
    greater_is_better=True,
)

# Data collator
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding='longest', return_tensors='pt')

# Prepare trainer
trainer = CustomSeq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    generation_config=generation_config,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)]
)

# Train the Model
trainer.train()

trainable params: 3,538,944 || all params: 585,940,224 || trainable%: 0.6040


Step,Training Loss,Validation Loss,Bleu,Meteor,Bertscore F1
50,10.4051,4.915122,1.968475,0.134317,0.64001
100,3.3187,1.805587,26.40636,0.378205,0.739518
150,2.4094,1.792886,29.046218,0.411764,0.756838
200,2.2011,1.772069,29.209905,0.415212,0.758725
250,2.2174,1.701956,29.71773,0.424992,0.762549
300,2.0419,1.722869,29.943848,0.426225,0.762473
350,2.0435,1.669188,30.354279,0.430727,0.764874
400,1.9845,1.687578,29.073243,0.418343,0.761361
450,2.0177,1.579032,30.038641,0.428363,0.762624
500,1.9648,1.578211,31.149092,0.437815,0.767224


TrainOutput(global_step=1000, training_loss=2.4395114440917967, metrics={'train_runtime': 1945.8595, 'train_samples_per_second': 5.658, 'train_steps_per_second': 0.709, 'total_flos': 9651230458306560.0, 'train_loss': 2.4395114440917967, 'epoch': 7.246376811594203})

In [28]:
trainer.save_model('../models/IT/mT5-base-news-style-J2C-IT-v1')

### Evaluate the model

In [29]:
# Generate Predictions
predictions = trainer.predict(tokenized_val)
preds = predictions.predictions
preds = np.where(
        (preds >= 0) & (preds < tokenizer.vocab_size),
        preds,
        tokenizer.pad_token_id)

# Extract labels from the predictions object
labels = predictions.label_ids

# Process predictions and labels
decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

# Wrap labels in lists for sacreBLEU
decoded_labels_for_bleu = [[label] for label in decoded_labels]

bleu_score = sacrebleu.compute(predictions=decoded_preds, references=decoded_labels_for_bleu)
meteor_score = meteor.compute(predictions=decoded_preds, references=decoded_labels)
bertscore_output = bertscore.compute(predictions=decoded_preds, references=decoded_labels, lang="it")
average_f1 = np.mean(bertscore_output['f1'])

print(f"SacreBLEU Score: {bleu_score['score']}")
print(f"METEOR Score: {meteor_score['meteor']}")
print(f"BERTScore F1: {average_f1:.4f}")

SacreBLEU Score: 32.84267047880379
METEOR Score: 0.4515500317833367
BERTScore F1: 0.7754


In [30]:
perplexity = load("perplexity")
results = perplexity.compute(model_id='LorenzoDeMattei/GePpeTto', add_start_token=False, predictions=decoded_preds)
print(round(results["mean_perplexity"], 2))



  0%|          | 0/9 [00:00<?, ?it/s]

56.24


In [76]:
# Create a Dataset from Generated Texts
gen_df = pd.DataFrame({'text': decoded_preds})
gen_dataset = Dataset.from_pandas(gen_df)

# Tokenize the Generated Texts
gen_dataset = gen_dataset.map(tokenize_function_cls, batched=True, remove_columns=['text'])
gen_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])

# Make Predictions
predictions = trainer_cls.predict(gen_dataset)
predicted_labels = np.argmax(predictions.predictions, axis=1)

# Calculate Conversational Style Percentage
num_conversational = np.sum(predicted_labels == 0)
total_texts = len(predicted_labels)
percentage_conversational = (num_conversational / total_texts) * 100

print(f"Number of Conversational Texts: {num_conversational}/{total_texts}")
print(f"Percentage of Conversational Texts: {percentage_conversational:.2f}%")

Map:   0%|          | 0/138 [00:00<?, ? examples/s]

Number of Conversational Texts: 138/138
Percentage of Conversational Texts: 100.00%


### Print some results with reference

In [32]:
print("The 4th")
print("Predicted:", decoded_preds[3], " Reference:", decoded_labels[3])
print("The 20th")
print("Predicted:", decoded_preds[19], " Reference:", decoded_labels[19])
print("The 46th")
print("Predicted:", decoded_preds[45], " Reference:", decoded_labels[45])

The 4th
Predicted: UCRAINA | Google starebbe lavorando ad una funzione che consente di localizzare uno smartphone Pixel anche quando è spento. Secondo il sito 91Mobiles, la funzione potrebbe chiamarsi 'Pixel Power-off Finder', affinchè funzioni  Reference: Google starebbe lavorando a una funzione che consente di localizzare uno smartphone Pixel anche quando è spento. La funzionalità ricalca quella che Apple ha sugli iPhone, utile quando il telefono si smarrisce o viene rubato. #ANSA.
The 20th
Predicted: Un grave incidente stradale si è verificato nella tarda mattinata sull'autostrada A23, nel tratto tra Gemona e Carnia, in direzione Tarvisio. Nell'incidente una ventina di persone sarebbero rimaste ferite, alcune in modo grave  Reference: Un grave incidente stradale si è verificato nella tarda mattinata sull'autostrada A23, nel tratto tra Gemona e Carnia, in direzione Austria. Nell'incidente una ventina di persone sarebbero rimaste ferite, alcune in modo grave. Lo
The 46th
Predicted: L'

### Clean unuseful variables

In [33]:
# List of variable names to delete
variables_to_delete = [
    'tokenizer',
    'predictions',
    'gen_df',
    'total_texts',
    'percentage_conversational',
    'num_conversational',
    'predicted_labels',
    'gen_dataset',
    'max_target_length',
    'tokenized_train',
    'tokenized_val',
    'decoded_labels',
    'decoded_labels_for_bleu',
    'tokenized_test',
    'average_f1',
    'meteor_score',
    'bertscore_output',
    'model',
    'bleu_score',
    'peft_config',
    'training_args',
    'data_collator',
    'trainer',
    'preds',
    'labels',
    'results'
]

# Function to delete variables from the global scope
def delete_variables(var_list):
    for var in var_list:
        try:
            del globals()[var]
            print(f"Deleted variable: {var}")
        except KeyError:
            print(f"Variable '{var}' does not exist and cannot be deleted.")

# Call the function to delete variables
delete_variables(variables_to_delete)

#This clean the GPU
torch.cuda.empty_cache()

# Run garbage collection
gc.collect()

Deleted variable: tokenizer
Deleted variable: predictions
Deleted variable: gen_df
Deleted variable: total_texts
Deleted variable: percentage_conversational
Deleted variable: num_conversational
Deleted variable: predicted_labels
Deleted variable: gen_dataset
Deleted variable: max_target_length
Deleted variable: tokenized_train
Deleted variable: tokenized_val
Deleted variable: decoded_labels
Deleted variable: decoded_labels_for_bleu
Deleted variable: tokenized_test
Deleted variable: average_f1
Deleted variable: meteor_score
Deleted variable: bertscore_output
Deleted variable: model
Deleted variable: bleu_score
Deleted variable: peft_config
Deleted variable: training_args
Deleted variable: data_collator
Deleted variable: trainer
Deleted variable: preds
Deleted variable: labels
Deleted variable: results


368

## Finetune mT5-small

### Tokenization

In [34]:
# Tokenization
tokenizer = MT5Tokenizer.from_pretrained('google/mt5-small')

In [35]:
# Define the preprocessing function
max_input_length = 512
max_target_length = 64

def preprocess_function(examples):
    inputs = examples['journalistic']
    targets = examples['conversational']
    inputs = ['' + inp for inp in inputs]

    model_inputs = tokenizer(
        inputs,
        max_length=max_input_length,
        truncation=True
    )

    labels = tokenizer(
        text_target=targets,
        max_length=max_target_length,
        truncation=True
    )

    labels['input_ids'] = [
        [(token if token != tokenizer.pad_token_id else -100) for token in label]
        for label in labels['input_ids']
    ]

    model_inputs['labels'] = labels['input_ids']
    return model_inputs

In [36]:
# Apply the preprocessing
tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_val = val_dataset.map(preprocess_function, batched=True)
tokenized_test = test_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/1101 [00:00<?, ? examples/s]

Map:   0%|          | 0/138 [00:00<?, ? examples/s]

Map:   0%|          | 0/138 [00:00<?, ? examples/s]

### Training the model

In [37]:
# Load the pre-trained MT5 BASE model
model = MT5ForConditionalGeneration.from_pretrained('google/mt5-small')

# LoRA configuration
peft_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    inference_mode=False,
    r=16,
    lora_alpha=32,
    target_modules=['q', 'k', 'v', 'o'],
    lora_dropout=0.1,
    bias="none"
)

# Apply LoRA to the model
model = get_peft_model(model, peft_config)

# Move the model to the correct device (GPU or CPU)
model.to(device_name)

# Check the number of trainable parameters
model.print_trainable_parameters()

training_args = Seq2SeqTrainingArguments(
    output_dir='./results-mT5-small',                     # Directory to save the results
    save_safetensors=False,
    eval_strategy='steps',                # Evaluation strategy to use
    logging_steps=50,                            # Number of steps between logging
    eval_steps=50,                               # Number of steps between evaluations
    save_steps=50,                             # Number of steps between model saves
    per_device_train_batch_size=8,
    per_device_eval_batch_size=4,              # Batch size per device during evaluation
    learning_rate=1e-3,                         # Learning rate for the optimizer
    num_train_epochs=10,                         # Total number of training epochs
    predict_with_generate=True,                 # Whether to use generate for predictions
    logging_dir='./logs',                       # Directory for storing logs
    load_best_model_at_end=True,                # Load the best model at the end of training
    metric_for_best_model='bleu',               # Metric to use to compare two different models
    overwrite_output_dir=True,
    save_strategy='steps',                      # Save strategy matches evaluation strategy
    save_total_limit=25,
    gradient_checkpointing=False,
    report_to="none",                           # Disable reporting to W&B or other services
    optim="adamw_torch",
    auto_find_batch_size=False,                  # Automatically find best batch size
    lr_scheduler_type="linear",
    greater_is_better=True,
)

# Data collator
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding='longest', return_tensors='pt')

# Prepare trainer
trainer = CustomSeq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    generation_config=generation_config,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)]
)

# Train the Model
trainer.train()

trainable params: 1,376,256 || all params: 301,553,024 || trainable%: 0.4564


Step,Training Loss,Validation Loss,Bleu,Meteor,Bertscore F1
50,13.047,3.393349,5.697258,0.199212,0.672524
100,4.6006,2.679756,12.631769,0.20721,0.684428
150,3.6754,2.578602,23.688076,0.331463,0.721669
200,3.4235,2.633232,21.462843,0.363197,0.721717
250,3.3283,2.532597,24.108165,0.393258,0.735868
300,3.1439,2.541026,24.109238,0.398742,0.737398
350,3.0971,2.353988,24.949443,0.409077,0.741133
400,2.8707,2.337944,25.963548,0.42237,0.746844
450,2.8913,2.178962,28.069828,0.396943,0.750622
500,2.8228,2.118191,28.762849,0.400485,0.752696


TrainOutput(global_step=1200, training_loss=3.2914048512776692, metrics={'train_runtime': 1526.411, 'train_samples_per_second': 7.213, 'train_steps_per_second': 0.904, 'total_flos': 5102602140499968.0, 'train_loss': 3.2914048512776692, 'epoch': 8.695652173913043})

In [38]:
trainer.save_model('../models/IT/mT5-small-news-style-J2C-IT-v1')

### Evaluate the model

In [39]:
# Generate Predictions
predictions = trainer.predict(tokenized_val)
preds = predictions.predictions
preds = np.where(
        (preds >= 0) & (preds < tokenizer.vocab_size),
        preds,
        tokenizer.pad_token_id)

# Extract labels from the predictions object
labels = predictions.label_ids

# Process predictions and labels
decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

# Wrap labels in lists for sacreBLEU
decoded_labels_for_bleu = [[label] for label in decoded_labels]

bleu_score = sacrebleu.compute(predictions=decoded_preds, references=decoded_labels_for_bleu)
meteor_score = meteor.compute(predictions=decoded_preds, references=decoded_labels)
bertscore_output = bertscore.compute(predictions=decoded_preds, references=decoded_labels, lang="it")
average_f1 = np.mean(bertscore_output['f1'])

print(f"SacreBLEU Score: {bleu_score['score']}")
print(f"METEOR Score: {meteor_score['meteor']}")
print(f"BERTScore F1: {average_f1:.4f}")

SacreBLEU Score: 30.484935341878064
METEOR Score: 0.42851745024909255
BERTScore F1: 0.7638


In [40]:
results = perplexity.compute(model_id='LorenzoDeMattei/GePpeTto', add_start_token=False, predictions=decoded_preds)
print(round(results["mean_perplexity"], 2))



  0%|          | 0/9 [00:00<?, ?it/s]

55.28


In [79]:
# Create a Dataset from Generated Texts
gen_df = pd.DataFrame({'text': decoded_preds})
gen_dataset = Dataset.from_pandas(gen_df)

# Tokenize the Generated Texts
gen_dataset = gen_dataset.map(tokenize_function_cls, batched=True, remove_columns=['text'])
gen_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])

# Make Predictions
predictions = trainer_cls.predict(gen_dataset)
predicted_labels = np.argmax(predictions.predictions, axis=1)

# Calculate Conversational Style Percentage
num_conversational = np.sum(predicted_labels == 0)
total_texts = len(predicted_labels)
percentage_conversational = (num_conversational / total_texts) * 100

print(f"Number of Conversational Texts: {num_conversational}/{total_texts}")
print(f"Percentage of Conversational Texts: {percentage_conversational:.2f}%")

Map:   0%|          | 0/138 [00:00<?, ? examples/s]

Number of Conversational Texts: 138/138
Percentage of Conversational Texts: 100.00%


### Print some results with reference

In [42]:
print("The 4th")
print("Predicted:", decoded_preds[3], " Reference:", decoded_labels[3])
print("The 20th")
print("Predicted:", decoded_preds[19], " Reference:", decoded_labels[19])
print("The 46th")
print("Predicted:", decoded_preds[45], " Reference:", decoded_labels[45])

The 4th
Predicted: Il sito 91Mobiles starebbe lavorando ad una funzione che consente di localizzare uno smartphone Pixel anche quando è spento. Secondo il sito 91Mobiles, la funzione potrebbe chiamarsi 'Pixel Power-off Finder', affinchè funzioni  Reference: Google starebbe lavorando a una funzione che consente di localizzare uno smartphone Pixel anche quando è spento. La funzionalità ricalca quella che Apple ha sugli iPhone, utile quando il telefono si smarrisce o viene rubato. #ANSA.
The 20th
Predicted: Il grave incidente stradale si è verificato nella tarda mattinata sull'autostrada A23, nel tratto tra Gemona e Carnia, in direzione Tarvisio. Nell'incidente una ventina di persone sarebbero rimaste ferite, alcune in modo grave  Reference: Un grave incidente stradale si è verificato nella tarda mattinata sull'autostrada A23, nel tratto tra Gemona e Carnia, in direzione Austria. Nell'incidente una ventina di persone sarebbero rimaste ferite, alcune in modo grave. Lo
The 46th
Predicted: L

### Clean unuseful variables

In [43]:
# List of variable names to delete
variables_to_delete = [
    'tokenizer',
    'predictions',
    'gen_df',
    'total_texts',
    'percentage_conversational',
    'num_conversational',
    'predicted_labels',
    'gen_dataset',
    'max_target_length',
    'tokenized_train',
    'tokenized_val',
    'decoded_labels',
    'decoded_labels_for_bleu',
    'tokenized_test',
    'average_f1',
    'meteor_score',
    'bertscore_output',
    'model',
    'bleu_score',
    'peft_config',
    'training_args',
    'data_collator',
    'trainer',
    'preds',
    'labels',
    'results'
]

# Function to delete variables from the global scope
def delete_variables(var_list):
    for var in var_list:
        try:
            del globals()[var]
            print(f"Deleted variable: {var}")
        except KeyError:
            print(f"Variable '{var}' does not exist and cannot be deleted.")

# Call the function to delete variables
delete_variables(variables_to_delete)

#This clean the GPU
torch.cuda.empty_cache()

# Run garbage collection
gc.collect()

Deleted variable: tokenizer
Deleted variable: predictions
Deleted variable: gen_df
Deleted variable: total_texts
Deleted variable: percentage_conversational
Deleted variable: num_conversational
Deleted variable: predicted_labels
Deleted variable: gen_dataset
Deleted variable: max_target_length
Deleted variable: tokenized_train
Deleted variable: tokenized_val
Deleted variable: decoded_labels
Deleted variable: decoded_labels_for_bleu
Deleted variable: tokenized_test
Deleted variable: average_f1
Deleted variable: meteor_score
Deleted variable: bertscore_output
Deleted variable: model
Deleted variable: bleu_score
Deleted variable: peft_config
Deleted variable: training_args
Deleted variable: data_collator
Deleted variable: trainer
Deleted variable: preds
Deleted variable: labels
Deleted variable: results


380

## Knowledge Distillation from Fine-Tuned Model to mT5-small

### Tokenization

In [55]:
# Tokenization
tokenizer = MT5Tokenizer.from_pretrained('google/mt5-small')

In [56]:
# Define the preprocessing function
max_input_length = 512
max_target_length = 64

def preprocess_function(examples):
    inputs = examples['journalistic']
    targets = examples['conversational']
    inputs = ['' + inp for inp in inputs]

    model_inputs = tokenizer(
        inputs,
        max_length=max_input_length,
        truncation=True
    )

    labels = tokenizer(
        text_target=targets,
        max_length=max_target_length,
        truncation=True
    )

    labels['input_ids'] = [
        [(token if token != tokenizer.pad_token_id else -100) for token in label]
        for label in labels['input_ids']
    ]

    model_inputs['labels'] = labels['input_ids']
    return model_inputs

In [57]:
# Apply the preprocessing
tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_val = val_dataset.map(preprocess_function, batched=True)
tokenized_test = test_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/1101 [00:00<?, ? examples/s]

Map:   0%|          | 0/138 [00:00<?, ? examples/s]

Map:   0%|          | 0/138 [00:00<?, ? examples/s]

### Set distillation trainer

In [58]:
class DistillationTrainer(CustomSeq2SeqTrainer):
    def __init__(self, teacher_model, tmp=1.0, alpha_ce=0.5, alpha_distil=0.5, **kwargs):
        super().__init__(**kwargs)
        self.teacher_model = teacher_model
        self.tmp = tmp
        self.alpha_ce = alpha_ce
        self.alpha_distil = alpha_distil

    def compute_loss(self, model, inputs, return_outputs=False):
        if model.training:
            # Training mode: compute distillation loss
            labels = inputs['labels']
            inputs_no_labels = {k: v for k, v in inputs.items() if k != 'labels'}
            outputs_student = model(**inputs_no_labels, labels=labels)
            student_loss = outputs_student.loss
            logits_student = outputs_student.logits

            with torch.no_grad():
                outputs_teacher = self.teacher_model(**inputs_no_labels)
                logits_teacher = outputs_teacher.logits

            # Ensure logits have the same shape
            min_length = min(logits_student.shape[1], logits_teacher.shape[1])
            logits_student = logits_student[:, :min_length, :]
            logits_teacher = logits_teacher[:, :min_length, :]

            # Flatten the logits
            logits_student = logits_student.reshape(-1, logits_student.size(-1))
            logits_teacher = logits_teacher.reshape(-1, logits_teacher.size(-1))

            # Compute distillation loss
            loss_fct = torch.nn.KLDivLoss(reduction="batchmean")
            loss_distillation = loss_fct(
                torch.nn.functional.log_softmax(logits_student / self.tmp, dim=-1),
                torch.nn.functional.softmax(logits_teacher / self.tmp, dim=-1)
            ) * (self.tmp ** 2)

            # Combine the student loss and the distillation loss
            loss = self.alpha_ce * student_loss + self.alpha_distil * loss_distillation

            return (loss, outputs_student) if return_outputs else loss
        else:
            # Evaluation mode: use default loss computation
            return super().compute_loss(model, inputs, return_outputs)

### Training the model

In [59]:
# Load the fine-tuned teacher model
teacher_model = MT5ForConditionalGeneration.from_pretrained("../models/IT/mT5-base-news-style-J2C-IT-v1")
teacher_model.to(device_name)
teacher_model.eval()

# Load the student
student_model = MT5ForConditionalGeneration.from_pretrained("google/mt5-small")
student_model.to(device_name)

# LoRA configuration
peft_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    inference_mode=False,
    r=16,
    lora_alpha=32,
    target_modules=['q', 'k', 'v', 'o'],
    lora_dropout=0.1,
    bias="none"
)

# Apply LoRA to the model
student_model = get_peft_model(student_model, peft_config)

# Load the student
student_model.to(device_name)

# Check the number of trainable parameters
student_model.print_trainable_parameters()

# Initialize the Distillation Trainer
distil_training_args = Seq2SeqTrainingArguments(
    output_dir='./results-mT5-small-distilled',                     # Directory to save the results
    save_safetensors=False,
    eval_strategy='steps',                # Evaluation strategy to use
    logging_steps=50,                            # Number of steps between logging
    eval_steps=50,                               # Number of steps between evaluations
    save_steps=50,                             # Number of steps between model saves
    per_device_train_batch_size=8,
    per_device_eval_batch_size=4,              # Batch size per device during evaluation
    learning_rate=1e-3,                         # Learning rate for the optimizer
    num_train_epochs=10,                         # Total number of training epochs
    predict_with_generate=True,                 # Whether to use generate for predictions
    logging_dir='./logs',                       # Directory for storing logs
    load_best_model_at_end=True,                # Load the best model at the end of training
    metric_for_best_model='bleu',               # Metric to use to compare two different models
    overwrite_output_dir=True,
    save_strategy='steps',                      # Save strategy matches evaluation strategy
    save_total_limit=25,
    gradient_checkpointing=False,
    report_to="none",                           # Disable reporting to W&B or other services
    optim="adamw_torch",
    auto_find_batch_size=False,                  # Automatically find best batch size
    lr_scheduler_type="linear",
    greater_is_better=True,
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=student_model, padding='longest', return_tensors='pt')

distil_trainer = DistillationTrainer(
    teacher_model=teacher_model,
    model=student_model,
    args=distil_training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    generation_config=generation_config,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)],
    tmp=0.8
)

# Train the Student Model
distil_trainer.train()

trainable params: 1,376,256 || all params: 301,553,024 || trainable%: 0.4564


Step,Training Loss,Validation Loss,Bleu,Meteor,Bertscore F1
50,11.8983,3.126115,7.347631,0.179111,0.677072
100,4.0997,2.511096,11.718584,0.204507,0.685626
150,2.6937,2.328475,19.264606,0.303426,0.713055
200,2.3194,2.358594,25.150221,0.366367,0.736306
250,2.1643,2.248984,27.30329,0.392228,0.746678
300,2.057,2.323971,27.541973,0.392864,0.748598
350,2.014,2.182204,29.631539,0.416385,0.757711
400,1.8805,2.222344,28.422335,0.404841,0.753836
450,1.9094,2.108819,28.104069,0.399769,0.751446
500,1.8619,2.104804,30.050692,0.420201,0.759285


TrainOutput(global_step=750, training_loss=2.7847647705078127, metrics={'train_runtime': 1431.0325, 'train_samples_per_second': 7.694, 'train_steps_per_second': 0.964, 'total_flos': 3188679760920576.0, 'train_loss': 2.7847647705078127, 'epoch': 5.434782608695652})

In [60]:
# Save the Fine-tuned Student Model
distil_trainer.save_model('../models/IT/mT5-small-news-style-J2C-distilled-IT-v1')

### Evaluate the model

In [61]:
# Generate Predictions
predictions = distil_trainer.predict(tokenized_val)
preds = predictions.predictions
preds = np.where(
        (preds >= 0) & (preds < tokenizer.vocab_size),
        preds,
        tokenizer.pad_token_id)

# Extract labels from the predictions object
labels = predictions.label_ids

# Process predictions and labels
decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

# Wrap labels in lists for sacreBLEU
decoded_labels_for_bleu = [[label] for label in decoded_labels]

bleu_score = sacrebleu.compute(predictions=decoded_preds, references=decoded_labels_for_bleu)
meteor_score = meteor.compute(predictions=decoded_preds, references=decoded_labels)
bertscore_output = bertscore.compute(predictions=decoded_preds, references=decoded_labels, lang="it")
average_f1 = np.mean(bertscore_output['f1'])

print(f"SacreBLEU Score: {bleu_score['score']}")
print(f"METEOR Score: {meteor_score['meteor']}")
print(f"BERTScore F1: {average_f1:.4f}")

SacreBLEU Score: 30.2949834487984
METEOR Score: 0.42185479608739235
BERTScore F1: 0.7600


In [62]:
results = perplexity.compute(model_id='LorenzoDeMattei/GePpeTto', add_start_token=False, predictions=decoded_preds)
print(round(results["mean_perplexity"], 2))



  0%|          | 0/9 [00:00<?, ?it/s]

53.65


In [81]:
# Create a Dataset from Generated Texts
gen_df = pd.DataFrame({'text': decoded_preds})
gen_dataset = Dataset.from_pandas(gen_df)

# Tokenize the Generated Texts
gen_dataset = gen_dataset.map(tokenize_function_cls, batched=True, remove_columns=['text'])
gen_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])

# Make Predictions
predictions = trainer_cls.predict(gen_dataset)
predicted_labels = np.argmax(predictions.predictions, axis=1)

# Calculate Conversational Style Percentage
num_conversational = np.sum(predicted_labels == 0)
total_texts = len(predicted_labels)
percentage_conversational = (num_conversational / total_texts) * 100

print(f"Number of Conversational Texts: {num_conversational}/{total_texts}")
print(f"Percentage of Conversational Texts: {percentage_conversational:.2f}%")

Map:   0%|          | 0/138 [00:00<?, ? examples/s]

Number of Conversational Texts: 138/138
Percentage of Conversational Texts: 100.00%


### Print some results with reference

In [64]:
print("The 4th")
print("Predicted:", decoded_preds[3], " Reference:", decoded_labels[3])
print("The 20th")
print("Predicted:", decoded_preds[19], " Reference:", decoded_labels[19])
print("The 46th")
print("Predicted:", decoded_preds[45], " Reference:", decoded_labels[45])

The 4th
Predicted: Il sito 91Mobiles starebbe lavorando ad una funzione che consente di localizzare uno smartphone Pixel anche quando è spento. Secondo il sito 91Mobiles, la funzione potrebbe chiamarsi 'Pixel Power-off Finder', affinchè funzione  Reference: Google starebbe lavorando a una funzione che consente di localizzare uno smartphone Pixel anche quando è spento. La funzionalità ricalca quella che Apple ha sugli iPhone, utile quando il telefono si smarrisce o viene rubato. #ANSA.
The 20th
Predicted: Il grave incidente stradale si è verificato nella tarda mattinata sull'autostrada A23, nel tratto tra Gemona e Carnia, in direzione Tarvisio. Nell'incidente una ventina di persone sarebbero rimaste ferite, alcune in modo grave  Reference: Un grave incidente stradale si è verificato nella tarda mattinata sull'autostrada A23, nel tratto tra Gemona e Carnia, in direzione Austria. Nell'incidente una ventina di persone sarebbero rimaste ferite, alcune in modo grave. Lo
The 46th
Predicted: L

### Clean unuseful variables

In [69]:
# List of variable names to delete
variables_to_delete = [
    "tokenizer",
    "predicted_labels",
    "gen_df",
    "num_conversational",
    "predictions",
    "percentage_conversational",
    "total_texts",
    "gen_dataset",
    "max_input_length",
    "max_target_length",
    "tokenized_train",
    "tokenized_val",
    "tokenized_test",
    "teacher_model",
    "student_model",
    "peft_config",
    "distil_training_args",
    "data_collator",
    "distil_trainer",
    "predictions",
    "preds",
    "labels",
    "decoded_preds",
    "decoded_labels",
    "decoded_labels_for_bleu",
    "bleu_score",
    "meteor_score",
    "bertscore_output",
    "average_f1",
    "results"
]

# Function to delete variables from the global scope
def delete_variables(var_list):
    for var in var_list:
        try:
            del globals()[var]
            print(f"Deleted variable: {var}")
        except KeyError:
            print(f"Variable '{var}' does not exist and cannot be deleted.")

# Call the function to delete variables
delete_variables(variables_to_delete)

#This clean the GPU
torch.cuda.empty_cache()

# Run garbage collection
gc.collect()

Variable 'tokenizer' does not exist and cannot be deleted.
Variable 'predicted_labels' does not exist and cannot be deleted.
Variable 'gen_df' does not exist and cannot be deleted.
Variable 'num_conversational' does not exist and cannot be deleted.
Variable 'predictions' does not exist and cannot be deleted.
Variable 'percentage_conversational' does not exist and cannot be deleted.
Variable 'total_texts' does not exist and cannot be deleted.
Variable 'gen_dataset' does not exist and cannot be deleted.
Variable 'max_input_length' does not exist and cannot be deleted.
Variable 'max_target_length' does not exist and cannot be deleted.
Variable 'tokenized_train' does not exist and cannot be deleted.
Variable 'tokenized_val' does not exist and cannot be deleted.
Variable 'tokenized_test' does not exist and cannot be deleted.
Variable 'teacher_model' does not exist and cannot be deleted.
Variable 'student_model' does not exist and cannot be deleted.
Variable 'peft_config' does not exist and 

662