# Master Thesis - Mattia Piazzalunga
In this notebook, the model development for the English dataset is carried out.

*Title*: Bridging a GAP: Text Style Transfer from Journalistic to Conversational for enhanced social media dissemination of news

*Supervisor*: Gabriella Pasi <br>
*Author*: Mattia Piazzalunga

*University*: Bicocca University of Milan <br>
*Department*: Informatics, Systems and Communication <br>
*Course*: Computer Science <br>
*Academic year*: 2023/2024

*Info*: This notebook was run on one of the servers of the DISCo department of the University of Milano Bicocca. Download the files offline if you want to run this.

*For suggestions or questions*: mattiapiazzalunga@outlook.com

## Inizializzazion

### Dowloading libraires

In [1]:
!pip install datasets sacrebleu meteor tiktoken transformers bert-score nltk peft psutil GPUtil torch datasets evaluate langdetect
!pip install --upgrade  nltk

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


### Importing libraries

In [2]:
# Import necessary libraries
from transformers import (
    T5ForConditionalGeneration, T5Tokenizer, DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments, Seq2SeqTrainer, Trainer,
    DataCollatorWithPadding, AutoModelForSequenceClassification,
    TrainingArguments, AutoTokenizer,
    AutoModelForSequenceClassification, EarlyStoppingCallback,
    GenerationConfig
)
from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
    classification_report,
    confusion_matrix
)
from langdetect import detect_langs, LangDetectException
import gc
import sacrebleu
import nltk
import re
import psutil
import GPUtil
import requests
import platform
from evaluate import load
import random
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from peft import LoraConfig, get_peft_model
from peft.utils import TaskType
from datasets import Dataset



In [3]:
# Download additional NLTK resources required for METEOR
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to /home/ubuntu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/ubuntu/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Set the seed

In [4]:
# Set a fixed seed for reproducibility
seed = 1234
torch.manual_seed(seed)
random.seed(seed)
np.random.seed(seed)

### Get Hardware information

In [5]:
#Set the CUDA GPU
if torch.cuda.is_available():
    device_name = torch.device("cuda:0")
else:
    device_name = torch.device('cpu')
print("Running on {}.".format(device_name))

Running on cuda:0.


In [6]:
def get_system_info():
    info = {}
    # Operating System Information
    info['Operating System'] = platform.system()
    info['OS Version'] = platform.version()
    info['Architecture'] = platform.machine()
    # Processor Information
    info['Processor'] = platform.processor()
    # RAM Information
    info['Total RAM'] = f"{round(psutil.virtual_memory().total / (1024.0 **3))} GB"
    # Python Version
    info['Python Version'] = platform.python_version()
    # CUDA and GPU Information
    if torch.cuda.is_available():
        info['CUDA Available'] = True
        info['CUDA Version'] = torch.version.cuda
        gpus = GPUtil.getGPUs()
        gpu_list = []
        for gpu in gpus:
            gpu_info = {
                'Name': gpu.name,
                'Total Memory': f"{gpu.memoryTotal} MB",
                'UUID': gpu.uuid
            }
            gpu_list.append(gpu_info)
        info['GPUs'] = gpu_list
    else:
        info['CUDA Available'] = False
        info['GPUs'] = 'No GPU available'
    return info

# Retrieve and display system information
system_info = get_system_info()
print("\n*** System Information ***")
for key, value in system_info.items():
    print(f"{key}: {value}")


*** System Information ***
Operating System: Linux
OS Version: #127-Ubuntu SMP Fri Jul 5 20:13:28 UTC 2024
Architecture: x86_64
Processor: x86_64
Total RAM: 377 GB
Python Version: 3.10.12
CUDA Available: True
CUDA Version: 12.1
GPUs: [{'Name': 'NVIDIA RTX A6000', 'Total Memory': '49140.0 MB', 'UUID': 'GPU-c4437fe0-d47e-a25f-f056-1322dcd5f6e3'}]


### Make sure you have emptied the GPU

In [7]:
#This clean the GPU
torch.cuda.empty_cache()

# Run garbage collection
gc.collect()

55

### Importing the dataset

In [8]:
df = pd.read_csv("../corpora/J2C_news_EN.csv")

In [9]:
len(df)

5352

### Clean the dataset

In [10]:
#This preprocessing was created with the aim of, as far as possible, imitating that of T5.

# Function to ensure text ends with a period
def ensure_period(text):
    if pd.isna(text):
        return text
    text = text.strip()
    if not text.endswith('.'):
        return text + '.'
    return text

# Apply the function to both columns
df['journalistic'] = df['journalistic'].apply(ensure_period)
df['conversational'] = df['conversational'].apply(ensure_period)

# Remove Rows Containing Dirty Words
dirty_words_url = 'https://raw.githubusercontent.com/LDNOOBW/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words/master/en'
response = requests.get(dirty_words_url)

if response.status_code == 200:
    dirty_words = response.text.splitlines()
    dirty_words = [word.lower() for word in dirty_words if word.strip()]
else:
    print("Failed to fetch dirty words list.")
    dirty_words = []

if dirty_words:
    dirty_words_pattern = re.compile(r'\b(' + '|'.join(re.escape(word) for word in dirty_words) + r')\b', re.IGNORECASE)

    def contains_dirty_words(text):
        if pd.isna(text):
            return False
        return bool(dirty_words_pattern.search(text))

    df = df[~df['journalistic'].apply(contains_dirty_words) & ~df['conversational'].apply(contains_dirty_words)]

# Remove Rows Containing "lorem ipsum"
def contains_phrase(text, phrase="lorem ipsum"):
    if pd.isna(text):
        return False
    return phrase.lower() in text.lower()

df = df[~df['journalistic'].apply(contains_phrase) & ~df['conversational'].apply(contains_phrase)]

# Remove Rows Containing "{"
def contains_curly_bracket(text):
    if pd.isna(text):
        return False
    return '{' in text

df = df[~df['journalistic'].apply(contains_curly_bracket) & ~df['conversational'].apply(contains_curly_bracket)]

# Remove Citation Markers
citation_pattern = re.compile(r'\[\s*(\d+|citation needed|citation|source)\s*\]', re.IGNORECASE)

def remove_citations(text):
    if pd.isna(text):
        return text
    return citation_pattern.sub('', text)

df['journalistic'] = df['journalistic'].apply(remove_citations)
df['conversational'] = df['conversational'].apply(remove_citations)

policy_phrases = [
    "terms of use",
    "privacy policy",
    "cookie policy",
    "use cookies",
    "use of cookies"
]

# Compile a regex pattern for policy phrases
policy_pattern = re.compile(r'\b(' + '|'.join(re.escape(phrase) for phrase in policy_phrases) + r')\b', re.IGNORECASE)

# Function to remove sentences that contain policy phrases
def remove_sentences_with_policies(text):
    if pd.isna(text):
        return text

    # Split text into sentences (basic splitting by '.', '!', and '?')
    sentences = re.split(r'(?<=[.!?])\s+', text)

    # Filter out sentences that contain any policy phrase
    filtered_sentences = [sentence for sentence in sentences if not policy_pattern.search(sentence)]

    # Join the filtered sentences back into a text
    return ' '.join(filtered_sentences)

# Apply the function to both columns
df['journalistic'] = df['journalistic'].apply(remove_sentences_with_policies)
df['conversational'] = df['conversational'].apply(remove_sentences_with_policies)

# Language Filtering
def is_english(text):
    if pd.isna(text):
        return False
    try:
        langs = detect_langs(text)
        if langs and langs[0].lang == 'en' and langs[0].prob >= 0.99:
            return True
    except LangDetectException:
        return False
    return False

df['is_journalist_en'] = df['journalistic'].apply(is_english)
df['is_conversational_en'] = df['conversational'].apply(is_english)

df = df[df['is_journalist_en'] & df['is_conversational_en']]

# Shuffle the dataset using a seed of 42
df = df.sample(frac=1, random_state=seed).reset_index(drop=True)

df = df[["journalistic", "conversational"]]

In [11]:
len(df)

4686

### Split the dataset

In [12]:
# Split the Dataset
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=seed)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=seed)

### Dataset preprocessing

In [13]:
# Convert to Huggingface Dataset
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

## Train the classifier

### Prepare a new dataset

In [14]:
# Data Preparation
journalist_texts = df['journalistic'].dropna().tolist()
conversational_texts = df['conversational'].dropna().tolist()
texts = journalist_texts + conversational_texts
labels = [1] * len(journalist_texts) + [0] * len(conversational_texts)

# Create DataFrame and convert to Hugging Face Dataset
df_classifier = pd.DataFrame({'text': texts, 'label': labels})
df_classifier=df_classifier.sample(frac=1, random_state=42).reset_index(drop=True)

# First split into train and temp (80% train, 20% temp)
train_df_cls, temp_df_cls = train_test_split(df_classifier, test_size=0.2, random_state=seed)

# Then split temp into validation and test sets (each 10% of the original data)
val_df_cls, test_df_cls = train_test_split(temp_df_cls, test_size=0.5, random_state=seed)

In [15]:
# Convert DataFrames to Datasets
train_dataset_cls = Dataset.from_pandas(train_df_cls.reset_index(drop=True))
val_dataset_cls  = Dataset.from_pandas(val_df_cls.reset_index(drop=True))
test_dataset_cls  = Dataset.from_pandas(test_df_cls.reset_index(drop=True))

### Tokenization

In [16]:
# Initialize the Tokenizer
tokenizer_cls = AutoTokenizer.from_pretrained('roberta-base')

# Tokenize the Data
def tokenize_function_cls(examples):
    return tokenizer_cls(examples['text'], truncation=True, max_length=256)

train_dataset_cls = train_dataset_cls.map(tokenize_function_cls, batched=True, remove_columns=['text'])
val_dataset_cls = val_dataset_cls.map(tokenize_function_cls,batched=True, remove_columns=['text'])
test_dataset_cls = test_dataset_cls.map(tokenize_function_cls, batched=True, remove_columns=['text'])

# Format the Datasets for PyTorch
train_dataset_cls.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
val_dataset_cls.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset_cls.set_format(type='torch',columns=['input_ids', 'attention_mask', 'label'])

# Load the Pre-trained Model for Sequence Classification
id2label = {0: "conversational", 1: "journalistic"}
label2id = {"conversational": 0, "journalistic": 1}



Map:   0%|          | 0/7497 [00:00<?, ? examples/s]

Map:   0%|          | 0/937 [00:00<?, ? examples/s]

Map:   0%|          | 0/938 [00:00<?, ? examples/s]

### Model Training

In [17]:
model_cls = AutoModelForSequenceClassification.from_pretrained(
    'roberta-base',
    num_labels=2,
    id2label=id2label,
    label2id=label2id
)

model_cls.to(device_name)

# Define Training Arguments
training_args_cls = TrainingArguments(
    output_dir='./results_cls_EN',
    eval_strategy='steps',              
    logging_steps=50,                          
    eval_steps=50,                             
    save_steps=50,                            
    per_device_train_batch_size=8,
    per_device_eval_batch_size=4,        
    learning_rate=3e-5,                 
    num_train_epochs=10,   
    logging_dir='./logs',                   
    load_best_model_at_end=True,            
    metric_for_best_model='f1', 
    overwrite_output_dir=True,
    save_strategy='steps',                      
    save_total_limit=25,
    gradient_checkpointing=False,
    report_to="none",                    
    optim="adamw_torch",
    auto_find_batch_size=False,    
    lr_scheduler_type="linear",
    greater_is_better=True,
)

# Define Data Collator
data_collator_cls = DataCollatorWithPadding(tokenizer=tokenizer_cls, padding='longest')

# Define Evaluation Metrics
def compute_metrics_cls(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    acc = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='binary', zero_division=1
    )
    return {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

# Initialize the Trainer
trainer_cls = Trainer(
    model=model_cls,
    args=training_args_cls,
    train_dataset=train_dataset_cls,
    eval_dataset=val_dataset_cls,    
    tokenizer=tokenizer_cls,
    data_collator=data_collator_cls,
    compute_metrics=compute_metrics_cls,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)]
)

# Fine-tune the Model
trainer_cls.train()

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
50,0.226,0.047732,0.992529,1.0,0.984375,0.992126
100,0.0941,0.058172,0.992529,1.0,0.984375,0.992126
150,0.119,0.038159,0.993597,0.997748,0.988839,0.993274
200,0.1145,0.064097,0.98826,1.0,0.975446,0.987571
250,0.0705,0.029582,0.995731,1.0,0.991071,0.995516
300,0.1109,0.030453,0.994664,1.0,0.988839,0.994388
350,0.0495,0.026359,0.995731,1.0,0.991071,0.995516
400,0.0505,0.054701,0.989328,1.0,0.977679,0.988713
450,0.0012,0.038362,0.994664,1.0,0.988839,0.994388
500,0.0652,0.030005,0.994664,1.0,0.988839,0.994388


TrainOutput(global_step=500, training_loss=0.09014105221629143, metrics={'train_runtime': 269.5656, 'train_samples_per_second': 278.114, 'train_steps_per_second': 34.797, 'total_flos': 524672222159520.0, 'train_loss': 0.09014105221629143, 'epoch': 0.5330490405117271})

In [18]:
# Save the Fine-tuned Model
trainer_cls.save_model('../models/EN/roberta-base-news-style-CLS-journalistic-conversational-EN-v1')

### Evaluate the model

In [19]:
# Evaluate the Model on the Test Set
test_results_cls = trainer_cls.evaluate(eval_dataset=test_dataset_cls)
print("Test Set Evaluation Metrics:")
for key, value in test_results_cls.items():
    if key.startswith("eval_"):
        print(f"{key.replace('eval_', '').capitalize()}: {value:.4f}")

# 20. Detailed Classification Report
predictions_output = trainer_cls.predict(test_dataset_cls)
predictions = predictions_output.predictions
labels = predictions_output.label_ids
preds = np.argmax(predictions, axis=1)
print("\nClassification Report:")
print(classification_report(
    labels, preds, target_names=["journalistic", "conversational"]
))

# 21. Confusion Matrix
conf_matrix = confusion_matrix(labels, preds)
print("Confusion Matrix:")
print(conf_matrix)

Test Set Evaluation Metrics:
Loss: 0.0580
Accuracy: 0.9915
Precision: 0.9933
Recall: 0.9889
F1: 0.9911
Runtime: 7.0864
Samples_per_second: 132.3660
Steps_per_second: 33.1620

Classification Report:
                precision    recall  f1-score   support

  journalistic       0.99      0.99      0.99       487
conversational       0.99      0.99      0.99       451

      accuracy                           0.99       938
     macro avg       0.99      0.99      0.99       938
  weighted avg       0.99      0.99      0.99       938

Confusion Matrix:
[[484   3]
 [  5 446]]


### Clear all the unuseful variables

In [20]:
# List of variable names to delete
variables_to_delete = [
    "journalist_texts",
    "conversational_texts",
    "texts",
    "preds",
    "predictions_output",
    "predictions",
    "conf_matrix",
    "test_results_cls",
    "labels",
    "df_classifier",
    "dataset_cls",
    "train_df_cls",
    "temp_df_cls",
    "val_df_cls",
    "test_df_cls",
    "train_dataset_cls",
    "val_dataset_cls",
    "test_dataset_cls",
    "id2label",
    "label2id",
    "model_cls",
    "lora_config_cls",
    "training_args_cls",
    "data_collator_cls",
    "save_path"
]

# Function to delete variables from the global scope
def delete_variables(var_list):
    for var in var_list:
        try:
            del globals()[var]
            print(f"Deleted variable: {var}")
        except KeyError:
            print(f"Variable '{var}' does not exist and cannot be deleted.")

# Call the function to delete variables
delete_variables(variables_to_delete)

#This clean the GPU
torch.cuda.empty_cache()

# Run garbage collection
gc.collect()

Deleted variable: journalist_texts
Deleted variable: conversational_texts
Deleted variable: texts
Deleted variable: preds
Deleted variable: predictions_output
Deleted variable: predictions
Deleted variable: conf_matrix
Deleted variable: test_results_cls
Deleted variable: labels
Deleted variable: df_classifier
Variable 'dataset_cls' does not exist and cannot be deleted.
Deleted variable: train_df_cls
Deleted variable: temp_df_cls
Deleted variable: val_df_cls
Deleted variable: test_df_cls
Deleted variable: train_dataset_cls
Deleted variable: val_dataset_cls
Deleted variable: test_dataset_cls
Deleted variable: id2label
Deleted variable: label2id
Deleted variable: model_cls
Variable 'lora_config_cls' does not exist and cannot be deleted.
Deleted variable: training_args_cls
Deleted variable: data_collator_cls
Variable 'save_path' does not exist and cannot be deleted.


736

## Useful declarations

### Sequence2Sequence customization

In [21]:
class CustomSeq2SeqTrainer(Seq2SeqTrainer):
    def __init__(self, generation_config=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.generation_config = generation_config

    def evaluate(self, *args, **kwargs):
        return super().evaluate(generation_config=self.generation_config, *args, **kwargs)

    def predict(self, *args, **kwargs):
        return super().predict(generation_config=self.generation_config, *args, **kwargs)

### Generation config

In [22]:
generation_config = GenerationConfig(
    max_new_tokens=50,
    num_beams=10,
    temperature=0.8,
    top_p=0.85,
    repetition_penalty=1.5,
    do_sample=True,
    length_penalty=0.85
)

### Compute metrics

In [23]:
# Define the Metrics
sacrebleu = load('sacrebleu')
bertscore = load('bertscore')
meteor = load('meteor')


def compute_metrics(eval_preds):
    preds, labels = eval_preds

    # Replace invalid token IDs with pad_token_id
    preds = np.where(
        (preds >= 0) & (preds < tokenizer.vocab_size),
        preds,
        tokenizer.pad_token_id)
    
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Wrap labels in lists for sacrebleu
    decoded_labels_for_bleu = [[label] for label in decoded_labels]

    bleu = sacrebleu.compute(predictions=decoded_preds, references=decoded_labels_for_bleu)

    # Compute METEOR
    meteor_score = meteor.compute(predictions=decoded_preds, references=decoded_labels)

    # Compute BERTScore
    bertscore_output = bertscore.compute(predictions=decoded_preds, references=decoded_labels, lang="en")

    # Aggregate BERTScore
    average_f1 = np.mean(bertscore_output['f1'])

    return {
        'bleu': bleu['score'],
        'meteor': meteor_score['meteor'],
        'bertscore_f1': average_f1
    }

[nltk_data] Downloading package wordnet to /home/ubuntu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/ubuntu/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


## Finetune T5-v1_1-base

### Tokenization

In [24]:
# Tokenization
tokenizer = T5Tokenizer.from_pretrained('google/t5-v1_1-base')

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [25]:
# Define the preprocessing function
max_input_length = 512
max_target_length = 64

def preprocess_function(examples):
    inputs = examples['journalistic']
    targets = examples['conversational']
    inputs = ['' + inp for inp in inputs]

    model_inputs = tokenizer(
        inputs,
        max_length=max_input_length,
        truncation=True
    )

    labels = tokenizer(
        text_target=targets,
        max_length=max_target_length,
        truncation=True
    )

    labels['input_ids'] = [
        [(token if token != tokenizer.pad_token_id else -100) for token in label]
        for label in labels['input_ids']
    ]

    model_inputs['labels'] = labels['input_ids']
    return model_inputs

In [26]:
# Apply the preprocessing
tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_val = val_dataset.map(preprocess_function, batched=True)
tokenized_test = test_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/3748 [00:00<?, ? examples/s]

Map:   0%|          | 0/469 [00:00<?, ? examples/s]

Map:   0%|          | 0/469 [00:00<?, ? examples/s]

### Training the model

In [27]:
# Load the pre-trained T5 base model
model = T5ForConditionalGeneration.from_pretrained('google/t5-v1_1-base')

# LoRA configuration
peft_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    inference_mode=False,
    r=16,
    lora_alpha=32,
    target_modules=['q', 'k', 'v', 'o'],
    lora_dropout=0.1,
    bias="none"
)

# Apply LoRA to the model
model = get_peft_model(model, peft_config)

# Move the model to the correct device (GPU or CPU)
model.to(device_name)

# Check the number of trainable parameters
model.print_trainable_parameters()

training_args = Seq2SeqTrainingArguments(
    output_dir='./results-T5-v1_1-base',                     # Directory to save the results
    save_safetensors=False,
    eval_strategy='steps',                # Evaluation strategy to use
    logging_steps=50,                            # Number of steps between logging
    eval_steps=50,                               # Number of steps between evaluations
    save_steps=50,                             # Number of steps between model saves
    per_device_train_batch_size=16,
    per_device_eval_batch_size=8,              # Batch size per device during evaluation
    learning_rate=5e-4,                         # Learning rate for the optimizer
    num_train_epochs=10,                         # Total number of training epochs
    predict_with_generate=True,                 # Whether to use generate for predictions
    logging_dir='./logs',                       # Directory for storing logs
    load_best_model_at_end=True,                # Load the best model at the end of training
    metric_for_best_model='bleu',               # Metric to use to compare two different models
    overwrite_output_dir=True,
    save_strategy='steps',                      # Save strategy matches evaluation strategy
    save_total_limit=25,
    gradient_checkpointing=False,
    report_to="none",                           # Disable reporting to W&B or other services
    optim="adamw_torch",
    auto_find_batch_size=False,                  # Automatically find best batch size
    lr_scheduler_type="linear",
    greater_is_better=True,
)

# Data collator
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding='longest', return_tensors='pt')

# Prepare trainer
trainer = CustomSeq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    generation_config=generation_config,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)]
)

# Train the Model
trainer.train()

trainable params: 3,538,944 || all params: 251,116,800 || trainable%: 1.4093


Step,Training Loss,Validation Loss,Bleu,Meteor,Bertscore F1
50,11.2072,2.640782,11.065351,0.296306,0.869267
100,3.4351,1.726576,24.126879,0.387254,0.892735
150,2.6984,1.628917,25.191774,0.418936,0.894505
200,2.5253,1.604958,26.827876,0.42545,0.89741
250,2.3741,1.487012,26.099312,0.416679,0.896127
300,2.321,1.513067,27.372873,0.434853,0.898144
350,2.2278,1.487814,25.987117,0.431191,0.896809
400,2.1282,1.48281,26.369834,0.418266,0.896901
450,2.1983,1.445764,25.085899,0.403081,0.894864
500,2.1188,1.449242,24.820913,0.418132,0.89332


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


TrainOutput(global_step=550, training_loss=3.2145127868652343, metrics={'train_runtime': 4119.733, 'train_samples_per_second': 9.098, 'train_steps_per_second': 0.57, 'total_flos': 6104840002338816.0, 'train_loss': 3.2145127868652343, 'epoch': 2.3404255319148937})

In [28]:
trainer.save_model('../models/EN/T5-v1_1-base-news-style-J2C-EN-v1')

### Evaluate the model

In [29]:
# Generate Predictions
predictions = trainer.predict(tokenized_val)
preds = predictions.predictions
preds = np.where(
        (preds >= 0) & (preds < tokenizer.vocab_size),
        preds,
        tokenizer.pad_token_id)

# Extract labels from the predictions object
labels = predictions.label_ids

# Process predictions and labels
decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

# Wrap labels in lists for sacreBLEU
decoded_labels_for_bleu = [[label] for label in decoded_labels]

bleu_score = sacrebleu.compute(predictions=decoded_preds, references=decoded_labels_for_bleu)
meteor_score = meteor.compute(predictions=decoded_preds, references=decoded_labels)
bertscore_output = bertscore.compute(predictions=decoded_preds, references=decoded_labels, lang="en")
average_f1 = np.mean(bertscore_output['f1'])

print(f"SacreBLEU Score: {bleu_score['score']}")
print(f"METEOR Score: {meteor_score['meteor']}")
print(f"BERTScore F1: {average_f1:.4f}")

SacreBLEU Score: 27.27045041546539
METEOR Score: 0.4300642962600747
BERTScore F1: 0.8977


In [30]:
perplexity = load("perplexity")
results = perplexity.compute(model_id='openai-community/gpt2', add_start_token=False, predictions=decoded_preds)
print(round(results["mean_perplexity"], 2))



  0%|          | 0/30 [00:00<?, ?it/s]

50.64


In [31]:
# Create a Dataset from Generated Texts
gen_df = pd.DataFrame({'text': decoded_preds})
gen_dataset = Dataset.from_pandas(gen_df)

# Tokenize the Generated Texts
gen_dataset = gen_dataset.map(tokenize_function_cls, batched=True, remove_columns=['text'])
gen_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])

# Make Predictions
predictions = trainer_cls.predict(gen_dataset)
predicted_labels = np.argmax(predictions.predictions, axis=1)

# Calculate Conversational Style Percentage
num_conversational = np.sum(predicted_labels == 0)
total_texts = len(predicted_labels)
percentage_conversational = (num_conversational / total_texts) * 100

print(f"Number of Conversational Texts: {num_conversational}/{total_texts}")
print(f"Percentage of Conversational Texts: {percentage_conversational:.2f}%")

Map:   0%|          | 0/469 [00:00<?, ? examples/s]

Number of Conversational Texts: 469/469
Percentage of Conversational Texts: 100.00%


### Print some results with reference

In [32]:
print("The 4th")
print("Predicted:", decoded_preds[3], " Reference:", decoded_labels[3])
print("The 20th")
print("Predicted:", decoded_preds[19], " Reference:", decoded_labels[19])
print("The 46th")
print("Predicted:", decoded_preds[45], " Reference:", decoded_labels[45])

The 4th
Predicted: A Shiite militia fighter stands in front of a damaged building in Tikrit, Iraq, in April, just days after Islamic State fighters were driven out. Many of the city's Sunni residents have now returned,  Reference: ISIS is gone and Tikrit's Sunnis and Shiites are confounding expectations by getting along -- for now.
The 20th
Predicted: ly pardoned Robert Downey Jr. on Thursday for a nearly 20-year-old felony drug conviction that led to the Oscar-nominated actor's imprisonment for roughly a year.  Reference: California's governor has pardoned Robert Downey Jr. for a drug conviction that sent the 'Iron Man' actor to prison.
The 46th
Predicted: — You could sense the demons lurking around Los Angeles Dodgers ace Clayton Kershaw once again Sunday evening, in the inning that has haunted him throughout his postseason career.  Reference: Clayton Kershaw has turned this postseason into his playground.


### Clean unuseful variables

In [33]:
# List of variable names to delete
variables_to_delete = [
    'tokenizer',
    'predictions',
    'gen_df',
    'total_texts',
    'percentage_conversational',
    'num_conversational',
    'predicted_labels',
    'gen_dataset',
    'max_target_length',
    'tokenized_train',
    'tokenized_val',
    'decoded_labels',
    'decoded_labels_for_bleu',
    'tokenized_test',
    'average_f1',
    'meteor_score',
    'bertscore_output',
    'model',
    'bleu_score',
    'peft_config',
    'training_args',
    'data_collator',
    'trainer',
    'preds',
    'labels',
    'results'
]

# Function to delete variables from the global scope
def delete_variables(var_list):
    for var in var_list:
        try:
            del globals()[var]
            print(f"Deleted variable: {var}")
        except KeyError:
            print(f"Variable '{var}' does not exist and cannot be deleted.")

# Call the function to delete variables
delete_variables(variables_to_delete)

#This clean the GPU
torch.cuda.empty_cache()

# Run garbage collection
gc.collect()

Deleted variable: tokenizer
Deleted variable: predictions
Deleted variable: gen_df
Deleted variable: total_texts
Deleted variable: percentage_conversational
Deleted variable: num_conversational
Deleted variable: predicted_labels
Deleted variable: gen_dataset
Deleted variable: max_target_length
Deleted variable: tokenized_train
Deleted variable: tokenized_val
Deleted variable: decoded_labels
Deleted variable: decoded_labels_for_bleu
Deleted variable: tokenized_test
Deleted variable: average_f1
Deleted variable: meteor_score
Deleted variable: bertscore_output
Deleted variable: model
Deleted variable: bleu_score
Deleted variable: peft_config
Deleted variable: training_args
Deleted variable: data_collator
Deleted variable: trainer
Deleted variable: preds
Deleted variable: labels
Deleted variable: results


442

## Finetune T5-v1_1-small

### Tokenization

In [34]:
# Tokenization
tokenizer = T5Tokenizer.from_pretrained('google/t5-v1_1-small')

In [35]:
# Define the preprocessing function
max_input_length = 512
max_target_length = 64

def preprocess_function(examples):
    inputs = examples['journalistic']
    targets = examples['conversational']
    inputs = ['' + inp for inp in inputs]

    model_inputs = tokenizer(
        inputs,
        max_length=max_input_length,
        truncation=True
    )

    labels = tokenizer(
        text_target=targets,
        max_length=max_target_length,
        truncation=True
    )

    labels['input_ids'] = [
        [(token if token != tokenizer.pad_token_id else -100) for token in label]
        for label in labels['input_ids']
    ]

    model_inputs['labels'] = labels['input_ids']
    return model_inputs

In [36]:
# Apply the preprocessing
tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_val = val_dataset.map(preprocess_function, batched=True)
tokenized_test = test_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/3748 [00:00<?, ? examples/s]

Map:   0%|          | 0/469 [00:00<?, ? examples/s]

Map:   0%|          | 0/469 [00:00<?, ? examples/s]

### Training the model

In [37]:
# Load the pre-trained T5 base model
model = T5ForConditionalGeneration.from_pretrained('google/t5-v1_1-small')

# LoRA configuration
peft_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    inference_mode=False,
    r=16,
    lora_alpha=32,
    target_modules=['q', 'k', 'v', 'o'],
    lora_dropout=0.1,
    bias="none"
)

# Apply LoRA to the model
model = get_peft_model(model, peft_config)

# Move the model to the correct device (GPU or CPU)
model.to(device_name)

# Check the number of trainable parameters
model.print_trainable_parameters()

training_args = Seq2SeqTrainingArguments(
    output_dir='./results-T5-v1_1-small',                     # Directory to save the results
    save_safetensors=False,
    eval_strategy='steps',                # Evaluation strategy to use
    logging_steps=50,                            # Number of steps between logging
    eval_steps=50,                               # Number of steps between evaluations
    save_steps=50,                             # Number of steps between model saves
    per_device_train_batch_size=16,
    per_device_eval_batch_size=8,              # Batch size per device during evaluation
    learning_rate=5e-4,                         # Learning rate for the optimizer
    num_train_epochs=10,                         # Total number of training epochs
    predict_with_generate=True,                 # Whether to use generate for predictions
    logging_dir='./logs',                       # Directory for storing logs
    load_best_model_at_end=True,                # Load the best model at the end of training
    metric_for_best_model='bleu',               # Metric to use to compare two different models
    overwrite_output_dir=True,
    save_strategy='steps',                      # Save strategy matches evaluation strategy
    save_total_limit=25,
    gradient_checkpointing=False,
    report_to="none",                           # Disable reporting to W&B or other services
    optim="adamw_torch",
    auto_find_batch_size=False,                  # Automatically find best batch size
    lr_scheduler_type="linear",
    greater_is_better=True,
)

# Data collator
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding='longest', return_tensors='pt')

# Prepare trainer
trainer = CustomSeq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    generation_config=generation_config,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)]
)

# Train the Model
trainer.train()

trainable params: 1,376,256 || all params: 78,337,408 || trainable%: 1.7568


Step,Training Loss,Validation Loss,Bleu,Meteor,Bertscore F1
50,7.1529,2.997083,6.380522,0.190161,0.860044
100,4.5737,2.666217,14.9487,0.278397,0.87627
150,4.0051,2.368714,18.529602,0.314737,0.880926
200,3.7593,2.181182,17.223578,0.294111,0.878148
250,3.5768,2.112851,19.412143,0.319765,0.880773
300,3.42,2.09596,18.930158,0.304862,0.879068
350,3.3286,2.036962,20.523591,0.327001,0.881889
400,3.1629,2.048702,20.975837,0.329002,0.882746
450,3.1736,1.988514,22.950289,0.354645,0.885345
500,3.0514,1.987706,24.64772,0.365311,0.888388


TrainOutput(global_step=750, training_loss=3.583602783203125, metrics={'train_runtime': 3586.0794, 'train_samples_per_second': 10.452, 'train_steps_per_second': 0.655, 'total_flos': 2274576096559104.0, 'train_loss': 3.583602783203125, 'epoch': 3.1914893617021276})

In [38]:
trainer.save_model('../models/EN/T5-v1_1-small-news-style-J2C-EN-v1')

### Evaluate the model

In [39]:
# Generate Predictions
predictions = trainer.predict(tokenized_val)
preds = predictions.predictions
preds = np.where(
        (preds >= 0) & (preds < tokenizer.vocab_size),
        preds,
        tokenizer.pad_token_id)

# Extract labels from the predictions object
labels = predictions.label_ids

# Process predictions and labels
decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

# Wrap labels in lists for sacreBLEU
decoded_labels_for_bleu = [[label] for label in decoded_labels]

bleu_score = sacrebleu.compute(predictions=decoded_preds, references=decoded_labels_for_bleu)
meteor_score = meteor.compute(predictions=decoded_preds, references=decoded_labels)
bertscore_output = bertscore.compute(predictions=decoded_preds, references=decoded_labels, lang="en")
average_f1 = np.mean(bertscore_output['f1'])

print(f"SacreBLEU Score: {bleu_score['score']}")
print(f"METEOR Score: {meteor_score['meteor']}")
print(f"BERTScore F1: {average_f1:.4f}")

SacreBLEU Score: 24.52604289951704
METEOR Score: 0.36452211863606526
BERTScore F1: 0.8884


In [40]:
results = perplexity.compute(model_id='openai-community/gpt2', add_start_token=False, predictions=decoded_preds)
print(round(results["mean_perplexity"], 2))



  0%|          | 0/30 [00:00<?, ?it/s]

65.59


In [41]:
# Create a Dataset from Generated Texts
gen_df = pd.DataFrame({'text': decoded_preds})
gen_dataset = Dataset.from_pandas(gen_df)

# Tokenize the Generated Texts
gen_dataset = gen_dataset.map(tokenize_function_cls, batched=True, remove_columns=['text'])
gen_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])

# Make Predictions
predictions = trainer_cls.predict(gen_dataset)
predicted_labels = np.argmax(predictions.predictions, axis=1)

# Calculate Conversational Style Percentage
num_conversational = np.sum(predicted_labels == 0)
total_texts = len(predicted_labels)
percentage_conversational = (num_conversational / total_texts) * 100

print(f"Number of Conversational Texts: {num_conversational}/{total_texts}")
print(f"Percentage of Conversational Texts: {percentage_conversational:.2f}%")

Map:   0%|          | 0/469 [00:00<?, ? examples/s]

Number of Conversational Texts: 469/469
Percentage of Conversational Texts: 100.00%


### Print some results with reference

In [42]:
print("The 4th")
print("Predicted:", decoded_preds[3], " Reference:", decoded_labels[3])
print("The 20th")
print("Predicted:", decoded_preds[19], " Reference:", decoded_labels[19])
print("The 46th")
print("Predicted:", decoded_preds[45], " Reference:", decoded_labels[45])

The 4th
Predicted: in April, just days after Islamic State fighters were driven out. Many of the city's Sunni residents have now returned, but the armed Shiite groups remain. For now, the armed Shiite groups remain  Reference: ISIS is gone and Tikrit's Sunnis and Shiites are confounding expectations by getting along -- for now.
The 20th
Predicted: the governor of California pardoned Robert Downey Jr. on Thursday for a nearly 20-year-old felony drug conviction that led to the Oscar-nominated actor's imprisonment for roughly a year.  Reference: California's governor has pardoned Robert Downey Jr. for a drug conviction that sent the 'Iron Man' actor to prison.
The 46th
Predicted: the demons lurking around Los Angeles Dodgers ace Clayton Kershaw once again Sunday evening, in the inning that has haunted him throughout his postseason career.  Reference: Clayton Kershaw has turned this postseason into his playground.


### Clean unuseful variables

In [43]:
# List of variable names to delete
variables_to_delete = [
    'tokenizer',
    'predictions',
    'gen_df',
    'total_texts',
    'percentage_conversational',
    'num_conversational',
    'predicted_labels',
    'gen_dataset',
    'max_target_length',
    'tokenized_train',
    'tokenized_val',
    'decoded_labels',
    'decoded_labels_for_bleu',
    'tokenized_test',
    'average_f1',
    'meteor_score',
    'bertscore_output',
    'model',
    'bleu_score',
    'peft_config',
    'training_args',
    'data_collator',
    'trainer',
    'preds',
    'labels',
    'results'
]

# Function to delete variables from the global scope
def delete_variables(var_list):
    for var in var_list:
        try:
            del globals()[var]
            print(f"Deleted variable: {var}")
        except KeyError:
            print(f"Variable '{var}' does not exist and cannot be deleted.")

# Call the function to delete variables
delete_variables(variables_to_delete)

#This clean the GPU
torch.cuda.empty_cache()

# Run garbage collection
gc.collect()

Deleted variable: tokenizer
Deleted variable: predictions
Deleted variable: gen_df
Deleted variable: total_texts
Deleted variable: percentage_conversational
Deleted variable: num_conversational
Deleted variable: predicted_labels
Deleted variable: gen_dataset
Deleted variable: max_target_length
Deleted variable: tokenized_train
Deleted variable: tokenized_val
Deleted variable: decoded_labels
Deleted variable: decoded_labels_for_bleu
Deleted variable: tokenized_test
Deleted variable: average_f1
Deleted variable: meteor_score
Deleted variable: bertscore_output
Deleted variable: model
Deleted variable: bleu_score
Deleted variable: peft_config
Deleted variable: training_args
Deleted variable: data_collator
Deleted variable: trainer
Deleted variable: preds
Deleted variable: labels
Deleted variable: results


398

## Knowledge Distillation from Fine-Tuned Model to T5-v1_1-small

### Tokenization

In [44]:
# Tokenization
tokenizer = T5Tokenizer.from_pretrained('google/t5-v1_1-small')

In [45]:
# Define the preprocessing function
max_input_length = 512
max_target_length = 64

def preprocess_function(examples):
    inputs = examples['journalistic']
    targets = examples['conversational']
    inputs = ['' + inp for inp in inputs]

    model_inputs = tokenizer(
        inputs,
        max_length=max_input_length,
        truncation=True
    )

    labels = tokenizer(
        text_target=targets,
        max_length=max_target_length,
        truncation=True
    )

    labels['input_ids'] = [
        [(token if token != tokenizer.pad_token_id else -100) for token in label]
        for label in labels['input_ids']
    ]

    model_inputs['labels'] = labels['input_ids']
    return model_inputs

In [46]:
# Apply the preprocessing
tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_val = val_dataset.map(preprocess_function, batched=True)
tokenized_test = test_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/3748 [00:00<?, ? examples/s]

Map:   0%|          | 0/469 [00:00<?, ? examples/s]

Map:   0%|          | 0/469 [00:00<?, ? examples/s]

### Set distillation trainer

In [47]:
class DistillationTrainer(CustomSeq2SeqTrainer):
    def __init__(self, teacher_model, tmp=1.0, alpha_ce=0.5, alpha_distil=0.5, **kwargs):
        super().__init__(**kwargs)
        self.teacher_model = teacher_model
        self.tmp = tmp
        self.alpha_ce = alpha_ce
        self.alpha_distil = alpha_distil

    def compute_loss(self, model, inputs, return_outputs=False):
        if model.training:
            # Training mode: compute distillation loss
            labels = inputs['labels']
            inputs_no_labels = {k: v for k, v in inputs.items() if k != 'labels'}
            outputs_student = model(**inputs_no_labels, labels=labels)
            student_loss = outputs_student.loss
            logits_student = outputs_student.logits

            with torch.no_grad():
                outputs_teacher = self.teacher_model(**inputs_no_labels)
                logits_teacher = outputs_teacher.logits

            # Ensure logits have the same shape
            min_length = min(logits_student.shape[1], logits_teacher.shape[1])
            logits_student = logits_student[:, :min_length, :]
            logits_teacher = logits_teacher[:, :min_length, :]

            # Flatten the logits
            logits_student = logits_student.reshape(-1, logits_student.size(-1))
            logits_teacher = logits_teacher.reshape(-1, logits_teacher.size(-1))

            # Compute distillation loss
            loss_fct = torch.nn.KLDivLoss(reduction="batchmean")
            loss_distillation = loss_fct(
                torch.nn.functional.log_softmax(logits_student / self.tmp, dim=-1),
                torch.nn.functional.softmax(logits_teacher / self.tmp, dim=-1)
            ) * (self.tmp ** 2)

            # Combine the student loss and the distillation loss
            loss = self.alpha_ce * student_loss + self.alpha_distil * loss_distillation

            return (loss, outputs_student) if return_outputs else loss
        else:
            # Evaluation mode: use default loss computation
            return super().compute_loss(model, inputs, return_outputs)

### Training the model

In [48]:
# Load the fine-tuned teacher model
teacher_model = T5ForConditionalGeneration.from_pretrained("../models/EN/T5-v1_1-base-news-style-J2C-EN-v1")
teacher_model.to(device_name)
teacher_model.eval()

# Load the student
student_model = T5ForConditionalGeneration.from_pretrained("google/t5-v1_1-small")
student_model.to(device_name)

# LoRA configuration
peft_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    inference_mode=False,
    r=16,
    lora_alpha=32,
    target_modules=['q', 'k', 'v', 'o'],
    lora_dropout=0.1,
    bias="none"
)

# Apply LoRA to the model
student_model = get_peft_model(student_model, peft_config)

# Load the student
student_model.to(device_name)

# Check the number of trainable parameters
student_model.print_trainable_parameters()

# Initialize the Distillation Trainer
distil_training_args = Seq2SeqTrainingArguments(
    output_dir='./results-T5-v1_1-small-distilled',                     # Directory to save the results
    save_safetensors=False,
    eval_strategy='steps',                # Evaluation strategy to use
    logging_steps=50,                            # Number of steps between logging
    eval_steps=50,                               # Number of steps between evaluations
    save_steps=50,                             # Number of steps between model saves
    per_device_train_batch_size=16,
    per_device_eval_batch_size=8,              # Batch size per device during evaluation
    learning_rate=5e-4,                         # Learning rate for the optimizer
    num_train_epochs=10,                         # Total number of training epochs
    predict_with_generate=True,                 # Whether to use generate for predictions
    logging_dir='./logs',                       # Directory for storing logs
    load_best_model_at_end=True,                # Load the best model at the end of training
    metric_for_best_model='bleu',               # Metric to use to compare two different models
    overwrite_output_dir=True,
    save_strategy='steps',                      # Save strategy matches evaluation strategy
    save_total_limit=25,
    gradient_checkpointing=False,
    report_to="none",                           # Disable reporting to W&B or other services
    optim="adamw_torch",
    auto_find_batch_size=False,                  # Automatically find best batch size
    lr_scheduler_type="linear",
    greater_is_better=True,
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=student_model, padding='longest', return_tensors='pt')

distil_trainer = DistillationTrainer(
    teacher_model=teacher_model,
    model=student_model,
    args=distil_training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    generation_config=generation_config,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)],
    tmp=0.8
)

# Train the Student Model
distil_trainer.train()

trainable params: 1,376,256 || all params: 78,337,408 || trainable%: 1.7568


Step,Training Loss,Validation Loss,Bleu,Meteor,Bertscore F1
50,8.2495,2.902421,6.487358,0.19013,0.864033
100,5.4608,2.22912,13.62108,0.258174,0.87049
150,4.2808,2.145549,16.819648,0.291458,0.874636
200,3.2003,2.082318,18.500232,0.302763,0.879775
250,2.672,1.95842,21.42464,0.342386,0.886009
300,2.4552,1.928303,23.016766,0.365283,0.888755
350,2.2879,1.939042,22.429352,0.364913,0.886304
400,2.1352,1.942986,23.380039,0.366018,0.889144
450,2.1567,1.944363,21.693231,0.369029,0.885082
500,2.0509,1.907337,22.547474,0.370537,0.888786




TrainOutput(global_step=800, training_loss=2.918199510574341, metrics={'train_runtime': 4206.1114, 'train_samples_per_second': 8.911, 'train_steps_per_second': 0.559, 'total_flos': 2426670787067904.0, 'train_loss': 2.918199510574341, 'epoch': 3.404255319148936})

In [49]:
# Save the Fine-tuned Student Model
distil_trainer.save_model('../models/EN/T5-v1_1-small-news-style-J2C-distilled-EN-v1-1')

### Evaluate the model

In [50]:
# Generate Predictions
predictions = distil_trainer.predict(tokenized_val)
preds = predictions.predictions
preds = np.where(
        (preds >= 0) & (preds < tokenizer.vocab_size),
        preds,
        tokenizer.pad_token_id)

# Extract labels from the predictions object
labels = predictions.label_ids

# Process predictions and labels
decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

# Wrap labels in lists for sacreBLEU
decoded_labels_for_bleu = [[label] for label in decoded_labels]

bleu_score = sacrebleu.compute(predictions=decoded_preds, references=decoded_labels_for_bleu)
meteor_score = meteor.compute(predictions=decoded_preds, references=decoded_labels)
bertscore_output = bertscore.compute(predictions=decoded_preds, references=decoded_labels, lang="en")
average_f1 = np.mean(bertscore_output['f1'])

print(f"SacreBLEU Score: {bleu_score['score']}")
print(f"METEOR Score: {meteor_score['meteor']}")
print(f"BERTScore F1: {average_f1:.4f}")

SacreBLEU Score: 25.013711295179323
METEOR Score: 0.3870211489816429
BERTScore F1: 0.8924


In [51]:
perplexity = load("perplexity")
results = perplexity.compute(model_id='openai-community/gpt2', add_start_token=False, predictions=decoded_preds)
print(round(results["mean_perplexity"], 2))



  0%|          | 0/30 [00:00<?, ?it/s]

57.66


In [52]:
# Create a Dataset from Generated Texts
gen_df = pd.DataFrame({'text': decoded_preds})
gen_dataset = Dataset.from_pandas(gen_df)

# Tokenize the Generated Texts
gen_dataset = gen_dataset.map(tokenize_function_cls, batched=True, remove_columns=['text'])
gen_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])

# Make Predictions
predictions = trainer_cls.predict(gen_dataset)
predicted_labels = np.argmax(predictions.predictions, axis=1)

# Calculate Conversational Style Percentage
num_conversational = np.sum(predicted_labels == 0)
total_texts = len(predicted_labels)
percentage_conversational = (num_conversational / total_texts) * 100

print(f"Number of Conversational Texts: {num_conversational}/{total_texts}")
print(f"Percentage of Conversational Texts: {percentage_conversational:.2f}%")

Map:   0%|          | 0/469 [00:00<?, ? examples/s]

Number of Conversational Texts: 469/469
Percentage of Conversational Texts: 100.00%


### Print some results with reference

In [53]:
print("The 4th")
print("Predicted:", decoded_preds[3], " Reference:", decoded_labels[3])
print("The 20th")
print("Predicted:", decoded_preds[19], " Reference:", decoded_labels[19])
print("The 46th")
print("Predicted:", decoded_preds[45], " Reference:", decoded_labels[45])

The 4th
Predicted: A Shiite militia fighter stands in front of a damaged building in Tikrit, Iraq, in April, just days after Islamic State fighters were driven out.  Reference: ISIS is gone and Tikrit's Sunnis and Shiites are confounding expectations by getting along -- for now.
The 20th
Predicted: The governor of California pardoned Robert Downey Jr. on Thursday for a nearly 20-year-old felony drug conviction that led to the Oscar-nominated actor's imprisonment for roughly a year.  Reference: California's governor has pardoned Robert Downey Jr. for a drug conviction that sent the 'Iron Man' actor to prison.
The 46th
Predicted: The Dodgers ace Clayton Kershaw once again Sunday evening, in the inning that has haunted him throughout his postseason career.  Reference: Clayton Kershaw has turned this postseason into his playground.


### Clean unuseful variables

In [54]:
# List of variable names to delete
variables_to_delete = [
    "tokenizer",
    "predicted_labels",
    "gen_df",
    "num_conversational",
    "predictions",
    "percentage_conversational",
    "total_texts",
    "gen_dataset",
    "max_input_length",
    "max_target_length",
    "tokenized_train",
    "tokenized_val",
    "tokenized_test",
    "teacher_model",
    "student_model",
    "peft_config",
    "distil_training_args",
    "data_collator",
    "distil_trainer",
    "predictions",
    "preds",
    "labels",
    "decoded_preds",
    "decoded_labels",
    "decoded_labels_for_bleu",
    "bleu_score",
    "meteor_score",
    "bertscore_output",
    "average_f1",
    "results"
]

# Function to delete variables from the global scope
def delete_variables(var_list):
    for var in var_list:
        try:
            del globals()[var]
            print(f"Deleted variable: {var}")
        except KeyError:
            print(f"Variable '{var}' does not exist and cannot be deleted.")

# Call the function to delete variables
delete_variables(variables_to_delete)

#This clean the GPU
torch.cuda.empty_cache()

# Run garbage collection
gc.collect()

Deleted variable: tokenizer
Deleted variable: predicted_labels
Deleted variable: gen_df
Deleted variable: num_conversational
Deleted variable: predictions
Deleted variable: percentage_conversational
Deleted variable: total_texts
Deleted variable: gen_dataset
Deleted variable: max_input_length
Deleted variable: max_target_length
Deleted variable: tokenized_train
Deleted variable: tokenized_val
Deleted variable: tokenized_test
Deleted variable: teacher_model
Deleted variable: student_model
Deleted variable: peft_config
Deleted variable: distil_training_args
Deleted variable: data_collator
Deleted variable: distil_trainer
Variable 'predictions' does not exist and cannot be deleted.
Deleted variable: preds
Deleted variable: labels
Deleted variable: decoded_preds
Deleted variable: decoded_labels
Deleted variable: decoded_labels_for_bleu
Deleted variable: bleu_score
Deleted variable: meteor_score
Deleted variable: bertscore_output
Deleted variable: average_f1
Deleted variable: results


420