# Master Thesis - Mattia Piazzalunga
In this notebook, the model development for the English dataset is carried out.

*Title*: Bridging a GAP: Text Style Transfer from Journalistic to Conversational for enhanced social media dissemination of news

*Supervisor*: Gabriella Pasi <br>
*Author*: Mattia Piazzalunga

*University*: Bicocca University of Milan <br>
*Department*: Informatics, Systems and Communication <br>
*Course*: Computer Science <br>
*Academic year*: 2023/2024

*Info*: This notebook was run on one of the servers of the DISCo department of the University of Milano Bicocca. Download the files offline if you want to run this.

*For suggestions or questions*: mattiapiazzalunga@outlook.com

## Initialization

### Downloading libraries

In [1]:
!pip install datasets sacrebleu meteor tiktoken transformers bert-score nltk peft psutil GPUtil torch datasets evaluate langdetect mtranslate
!pip install --upgrade  nltk

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


### Importing libraries

In [2]:
# Import necessary libraries
# Standard library
import gc
import platform
import random
import re

# Third-party
import GPUtil
import nltk
import numpy as np
import pandas as pd
import psutil
import requests
import sacrebleu
import torch
from datasets import Dataset
from evaluate import load
from langdetect import DetectorFactory, LangDetectException, detect_langs
from peft import LoraConfig, get_peft_model
from peft.utils import TaskType
from sklearn.model_selection import train_test_split
from transformers import (
    AutoModelForSequenceClassification, AutoTokenizer, DataCollatorForSeq2Seq,
    DataCollatorWithPadding, EarlyStoppingCallback, GenerationConfig,
    Seq2SeqTrainer, Seq2SeqTrainingArguments, T5ForConditionalGeneration,
    T5Tokenizer, Trainer, TrainingArguments
)



In [3]:
# Download additional NLTK resources required for METEOR
# (the 'wordnet', 'omw-1.4' and 'punkt' packages; the return value of the last call is the cell output)
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to /home/ubuntu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/ubuntu/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Set the seed

In [4]:
# Set a fixed seed for reproducibility
# The same value is passed explicitly as `random_state` to the pandas shuffle and
# the train/validation/test split further down.
seed = 1234
torch.manual_seed(seed)  # PyTorch RNG
random.seed(seed)        # Python standard-library RNG
np.random.seed(seed)     # NumPy legacy global RNG

### Get Hardware information

In [5]:
# Pick the compute device: the first CUDA GPU when one is visible, otherwise the CPU
device_name = torch.device("cuda:0" if torch.cuda.is_available() else 'cpu')
print("Running on {}.".format(device_name))

Running on cuda:0.


In [6]:
def get_system_info():
    """Collect a summary of the machine the notebook is running on.

    Returns:
        dict: OS name/version, architecture, processor, total RAM (rounded to
        whole GB), Python version and CUDA availability. When CUDA is available
        the dict also holds the CUDA version and, under 'GPUs', a list with one
        dict (name, total memory, UUID) per GPU reported by GPUtil; otherwise
        'GPUs' is the string 'No GPU available'.
    """
    total_ram_gb = round(psutil.virtual_memory().total / (1024.0 ** 3))

    # Keys are inserted in the order in which they are printed by the caller
    info = {
        'Operating System': platform.system(),
        'OS Version': platform.version(),
        'Architecture': platform.machine(),
        'Processor': platform.processor(),
        'Total RAM': f"{total_ram_gb} GB",
        'Python Version': platform.python_version(),
    }

    # Without CUDA there is nothing more to query
    if not torch.cuda.is_available():
        info['CUDA Available'] = False
        info['GPUs'] = 'No GPU available'
        return info

    info['CUDA Available'] = True
    info['CUDA Version'] = torch.version.cuda
    info['GPUs'] = [
        {
            'Name': gpu.name,
            'Total Memory': f"{gpu.memoryTotal} MB",
            'UUID': gpu.uuid
        }
        for gpu in GPUtil.getGPUs()
    ]
    return info

# Retrieve and display system information
system_info = get_system_info()
print("\n*** System Information ***")
# One "key: value" line per collected entry, in insertion order
for key, value in system_info.items():
    print(f"{key}: {value}")


*** System Information ***
Operating System: Linux
OS Version: #127-Ubuntu SMP Fri Jul 5 20:13:28 UTC 2024
Architecture: x86_64
Processor: x86_64
Total RAM: 377 GB
Python Version: 3.10.12
CUDA Available: True
CUDA Version: 12.1
GPUs: [{'Name': 'NVIDIA RTX A6000', 'Total Memory': '49140.0 MB', 'UUID': 'GPU-c4437fe0-d47e-a25f-f056-1322dcd5f6e3'}]


### Make sure you have emptied the GPU

In [7]:
# Free the GPU memory cache
torch.cuda.empty_cache()

# Run garbage collection (its return value is shown as the cell output)
gc.collect()

40

### Importing the dataset

In [8]:
# Load the English corpus of journalistic/conversational text pairs
# (the path is relative to the notebook's working directory)
df = pd.read_csv("../corpora/J2C_news_EN.csv")

In [9]:
# Number of rows before cleaning
len(df)

5352

### Clean the dataset

In [10]:
#This preprocessing was created with the aim of, as far as possible, imitating that of T5.

# Function to ensure text ends with a period
def ensure_period(text):
    """Strip surrounding whitespace and make sure the text ends with '.'.

    Missing values (anything `pd.isna` reports as NA) are returned unchanged.
    A period is appended whenever the stripped text does not already end with
    one, including when it ends with other punctuation.
    """
    if pd.isna(text):
        return text
    stripped = text.strip()
    return stripped if stripped.endswith('.') else stripped + '.'

# Apply the function to both columns
# NOTE(review): this runs before citation markers and policy sentences are removed
# further down, so a text ending in e.g. "[1]" gets its period appended after the
# marker and is left ending in " ." once the marker is stripped - confirm the order.
df['journalistic'] = df['journalistic'].apply(ensure_period)
df['conversational'] = df['conversational'].apply(ensure_period)

# Remove Rows Containing Dirty Words
# The word list is fetched from GitHub. A timeout keeps the cell from hanging forever
# on a stalled connection, and network errors are reported instead of aborting the cell.
# In every failure case `dirty_words` stays empty, so the filter below is skipped.
dirty_words_url = 'https://raw.githubusercontent.com/LDNOOBW/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words/master/en'
dirty_words = []

try:
    response = requests.get(dirty_words_url, timeout=30)
except requests.RequestException as exc:
    response = None
    print(f"Failed to fetch dirty words list: {exc}")

if response is not None:
    if response.status_code == 200:
        # Strip each entry so stray whitespace cannot end up inside the regex alternatives
        dirty_words = [word.strip().lower() for word in response.text.splitlines() if word.strip()]
    else:
        print("Failed to fetch dirty words list.")

if dirty_words:
    # One case-insensitive alternation of all (escaped) list entries, delimited by word boundaries
    escaped_alternatives = '|'.join(map(re.escape, dirty_words))
    dirty_words_pattern = re.compile(r'\b(' + escaped_alternatives + r')\b', re.IGNORECASE)

    def contains_dirty_words(text):
        """Return True when `text` matches any entry of the dirty-word pattern; NA counts as clean."""
        return False if pd.isna(text) else dirty_words_pattern.search(text) is not None

    # Keep only the rows where neither column is flagged
    df = df[~(df['journalistic'].apply(contains_dirty_words) | df['conversational'].apply(contains_dirty_words))]

# Remove Rows Containing "lorem ipsum"
def contains_phrase(text, phrase="lorem ipsum"):
    """Case-insensitive substring check; missing values never contain the phrase."""
    if pd.isna(text):
        return False
    needle, haystack = phrase.lower(), text.lower()
    return needle in haystack

# Drop every row where either column contains the placeholder phrase
df = df[~(df['journalistic'].apply(contains_phrase) | df['conversational'].apply(contains_phrase))]

# Remove Rows Containing "{"
def contains_curly_bracket(text):
    """Return True when `text` holds an opening curly bracket; missing values return False."""
    return (not pd.isna(text)) and text.find('{') != -1

# Drop every row where either column contains "{"
df = df[~(df['journalistic'].apply(contains_curly_bracket) | df['conversational'].apply(contains_curly_bracket))]

# Remove Citation Markers
# Matches bracketed numbers and the bracketed words "citation needed", "citation" or "source",
# with optional whitespace inside the brackets, ignoring case.
citation_pattern = re.compile(r'\[\s*(\d+|citation needed|citation|source)\s*\]', re.IGNORECASE)

def remove_citations(text):
    """Delete every citation marker from `text`; missing values are returned unchanged.

    Only the marker itself is removed, the whitespace around it is left as it was.
    """
    return text if pd.isna(text) else citation_pattern.sub('', text)

# Strip citation markers from both text columns
for text_column in ('journalistic', 'conversational'):
    df[text_column] = df[text_column].apply(remove_citations)

# Phrases that mark website boilerplate rather than news content
policy_phrases = [
    "terms of use",
    "privacy policy",
    "cookie policy",
    "use cookies",
    "use of cookies"
]

# Compile a regex pattern for policy phrases
policy_pattern = re.compile(r'\b(' + '|'.join(map(re.escape, policy_phrases)) + r')\b', re.IGNORECASE)

# Function to remove sentences that contain policy phrases
def remove_sentences_with_policies(text):
    """Drop every sentence of `text` that mentions one of the policy phrases.

    Sentences are separated at whitespace that follows '.', '!' or '?'. The
    remaining sentences are re-joined with single spaces; if none remain the
    result is an empty string. Missing values are returned unchanged.
    """
    if pd.isna(text):
        return text

    # Basic sentence splitting: break after terminal punctuation
    kept = (
        sentence
        for sentence in re.split(r'(?<=[.!?])\s+', text)
        if policy_pattern.search(sentence) is None
    )
    return ' '.join(kept)

# Remove policy/boilerplate sentences from both text columns
for text_column in ('journalistic', 'conversational'):
    df[text_column] = df[text_column].apply(remove_sentences_with_policies)

# Language Filtering
# langdetect is non-deterministic unless its factory seed is pinned. Without this, a
# borderline text can land on either side of the probability threshold from one run to
# the next, changing which rows survive the filter (and every split derived from them).
DetectorFactory.seed = seed

def is_english(text, min_prob=0.99):
    """Tell whether langdetect's top guess for `text` is English with enough confidence.

    Args:
        text: The text to classify; missing values are treated as not English.
        min_prob: Minimum probability required for the top-ranked language.

    Returns:
        bool: True only when the top-ranked language is 'en' with probability of
        at least `min_prob`. False for missing values, for texts on which
        langdetect raises LangDetectException, and for everything else.
    """
    if pd.isna(text):
        return False
    try:
        langs = detect_langs(text)
    except LangDetectException:
        return False
    return bool(langs) and langs[0].lang == 'en' and langs[0].prob >= min_prob

# Flag, per column, whether the text is detected as English
df['is_journalist_en'] = df['journalistic'].apply(is_english)
df['is_conversational_en'] = df['conversational'].apply(is_english)

# Keep only the pairs where both sides passed the language check
df = df[df['is_journalist_en'] & df['is_conversational_en']]

# Shuffle the dataset using the notebook-wide seed (`seed`, i.e. 1234)
df = df.sample(frac=1, random_state=seed).reset_index(drop=True)

# Drop the helper flag columns, keeping only the text pair
df = df[["journalistic", "conversational"]]

In [11]:
# Number of rows left after cleaning
len(df)

4678

### Split the dataset

In [12]:
# Split the Dataset
# 80% goes to training; the remaining 20% is halved into validation and test.
# Both calls reuse the notebook-wide seed.
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=seed)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=seed)

In [None]:
# Function to perform back-translation
def back_translate(text, source, target):
    """Translate `text` from `source` to `target` and back again.

    Args:
        text: The text to paraphrase.
        source: Language code of `text`.
        target: Language code of the intermediate (pivot) language.

    Returns:
        The round-tripped text as a `str`, or None when either translation
        call raises (the error is printed, not propagated).

    NOTE(review): `translate` is not imported in the visible cells; it is called
    as `translate(text, to_language, from_language)`, presumably the function
    from the `mtranslate` package installed above - confirm and add the import.
    Until then every call ends in the `except` branch and returns None.
    """
    try:
        # First hop: source -> pivot language
        pivot_text = translate(text, target, source)
        # Second hop: pivot language -> source
        return str(translate(pivot_text, source, target))
    except Exception as e:
        # If translation fails, print the error and return None
        print(f"Error: {e}")
        return None

# Calculate the number of rows to back-translate (25% of total)
total_rows = len(train_df)
target_count = int(0.25 * total_rows)

# Shuffle the rows to randomize which rows will be translated.
# The seed is passed explicitly so the same rows are picked on every run,
# independently of how much of the global NumPy RNG state was consumed before.
df_shuffled = train_df.sample(frac=1, random_state=seed).reset_index(drop=True)

# Successfully back-translated pairs, collected as plain records and turned into
# a DataFrame once at the end (cheaper than growing a DataFrame row by row)
back_translated_records = []

# Iterate through the dataset and perform back-translation on 25% of it
for row in df_shuffled.itertuples(index=False):
    if len(back_translated_records) >= target_count:
        break  # Stop once we have successfully back-translated 25%

    translated_text_1 = back_translate(row.journalistic, 'en', 'de')
    if translated_text_1 is None:
        # The pair would be discarded anyway: skip the second translation request
        continue

    translated_text_2 = back_translate(row.conversational, 'en', 'de')
    if translated_text_2 is None:
        continue

    back_translated_records.append((translated_text_1, translated_text_2))

# Number of pairs that were successfully back-translated
back_translated_count = len(back_translated_records)
back_translated_texts = pd.DataFrame(back_translated_records, columns=["journalistic", "conversational"])

# Combine the original DataFrame with the back-translated DataFrame
# NOTE: this rebinds `train_df`, so re-running the cell augments the already augmented data.
train_df = pd.concat([train_df, back_translated_texts], ignore_index=True)

### Dataset preprocessing

In [18]:
# Convert to Huggingface Dataset
# val_df and test_df keep the shuffled row labels assigned by train_test_split. Without
# preserve_index=False that index is stored as an extra `__index_level_0__` column, so the
# three datasets would not share the same schema.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
val_dataset = Dataset.from_pandas(val_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

## Load the classifier

### Tokenization

In [21]:
# Initialize the Tokenizer
# Loaded from the same local checkpoint directory as the style classifier below
tokenizer_cls = AutoTokenizer.from_pretrained('../models/EN/roberta-base-news-style-CLS-journalistic-conversational-EN-v1')

# Tokenize the Data
def tokenize_function_cls(examples):
    """Tokenize the 'text' field of a batch for the style classifier.

    Inputs longer than 256 tokens are truncated; no padding is applied here.
    """
    batch_texts = examples['text']
    return tokenizer_cls(batch_texts, truncation=True, max_length=256)

# Label maps for the style classifier (class id <-> style name);
# the model itself is loaded in the next cell
id2label = {0: "conversational", 1: "journalistic"}
label2id = {"conversational": 0, "journalistic": 1}



Map:   0%|          | 0/7484 [00:00<?, ? examples/s]

Map:   0%|          | 0/936 [00:00<?, ? examples/s]

Map:   0%|          | 0/936 [00:00<?, ? examples/s]

### Model loading

In [None]:
# Load the style classifier from the local checkpoint directory,
# using the two-class label maps defined above
model_cls = AutoModelForSequenceClassification.from_pretrained(
    '../models/EN/roberta-base-news-style-CLS-journalistic-conversational-EN-v1',
    num_labels=2,
    id2label=id2label,
    label2id=label2id
)

# Move the classifier to the selected device
model_cls.to(device_name)

# Define Training Arguments
# NOTE(review): the Trainer built from these arguments receives no train/eval dataset and
# no compute_metrics, and the visible cells only call `.predict(...)` on it. The training
# options below (including metric_for_best_model='f1') therefore look unused - confirm.
training_args_cls = TrainingArguments(
    output_dir='./results_cls_EN',
    eval_strategy='steps',
    logging_steps=50,
    eval_steps=50,
    save_steps=50,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=4,   # batch size used when predicting with this trainer
    learning_rate=3e-5,
    num_train_epochs=10,
    logging_dir='./logs',           # same log directory as the T5 training arguments
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    overwrite_output_dir=True,
    save_strategy='steps',
    save_total_limit=25,
    gradient_checkpointing=False,
    report_to="none",
    optim="adamw_torch",
    auto_find_batch_size=False,
    lr_scheduler_type="linear",
    greater_is_better=True,
)

# Define Data Collator
# Pads each batch to the length of its longest sequence
data_collator_cls = DataCollatorWithPadding(tokenizer=tokenizer_cls, padding='longest')

# Initialize the Trainer
# No datasets are attached: it is used later to run batched prediction on generated texts
trainer_cls = Trainer(
    model=model_cls,
    args=training_args_cls,
    tokenizer=tokenizer_cls,
    data_collator=data_collator_cls
)

## Useful declarations

### Sequence2Sequence customization

In [26]:
class CustomSeq2SeqTrainer(Seq2SeqTrainer):
    """Seq2SeqTrainer that applies one fixed generation config to evaluate/predict.

    The config given at construction time is forwarded as the `generation_config`
    keyword to every `evaluate` and `predict` call, unless the caller passes its
    own `generation_config`, which then takes precedence.
    """

    def __init__(self, generation_config=None, *args, **kwargs):
        """Store `generation_config` and forward everything else to Seq2SeqTrainer.

        Because `generation_config` precedes `*args`, a first positional argument
        would bind to it; construct the trainer with keyword arguments.
        """
        super().__init__(*args, **kwargs)
        self.generation_config = generation_config

    def evaluate(self, *args, **kwargs):
        """Run evaluation with the stored generation config as the default."""
        # setdefault avoids a "multiple values for keyword argument" TypeError
        # when the caller supplies generation_config explicitly
        kwargs.setdefault('generation_config', self.generation_config)
        return super().evaluate(*args, **kwargs)

    def predict(self, *args, **kwargs):
        """Run prediction with the stored generation config as the default."""
        kwargs.setdefault('generation_config', self.generation_config)
        return super().predict(*args, **kwargs)

### Generation config

In [27]:
# Decoding settings shared by every evaluate/predict call of the seq2seq trainer.
# NOTE(review): max_length=50 is lower than max_target_length (64) used to tokenize the
# references, so a generated text can be cut shorter than its reference - confirm intended.
# NOTE(review): do_sample=True makes generation stochastic, so the reported metrics depend
# on the RNG state - confirm sampling is wanted during evaluation.
generation_config = GenerationConfig(
    max_length=50,
    num_beams=10,
    temperature=0.8,
    top_p=0.85,
    repetition_penalty=1.5,
    do_sample=True,
    length_penalty=0.85
)

### Compute metrics

In [28]:
# Define the Metrics
# NOTE: this rebinds the name `sacrebleu` from the module imported at the top of the notebook
# to the metric object returned by `evaluate.load`; from here on `sacrebleu.compute(...)`
# is the metric call.
sacrebleu = load('sacrebleu')
bertscore = load('bertscore')
meteor = load('meteor')


def compute_metrics(eval_preds):
    """Score generated token ids against the reference label ids.

    Args:
        eval_preds: A `(predictions, labels)` pair of token-id arrays.

    Returns:
        dict: 'bleu' (SacreBLEU score), 'meteor' and 'bertscore_f1' (mean of the
        per-example BERTScore F1 values).

    Relies on the module-level `tokenizer` and on the `sacrebleu`, `meteor` and
    `bertscore` metric objects.
    """
    generated_ids, reference_ids = eval_preds
    pad_id = tokenizer.pad_token_id

    # Ids outside [0, vocab_size) are swapped for the pad id before decoding
    in_vocab = (generated_ids >= 0) & (generated_ids < tokenizer.vocab_size)
    hypotheses = tokenizer.batch_decode(
        np.where(in_vocab, generated_ids, pad_id),
        skip_special_tokens=True,
    )

    # -100 marks ignored label positions; turn them back into pad ids
    references = tokenizer.batch_decode(
        np.where(reference_ids != -100, reference_ids, pad_id),
        skip_special_tokens=True,
    )

    # SacreBLEU takes a list of references per prediction
    bleu_result = sacrebleu.compute(
        predictions=hypotheses,
        references=[[reference] for reference in references],
    )
    meteor_result = meteor.compute(predictions=hypotheses, references=references)
    bertscore_result = bertscore.compute(predictions=hypotheses, references=references, lang="en")

    return {
        'bleu': bleu_result['score'],
        'meteor': meteor_result['meteor'],
        'bertscore_f1': np.mean(bertscore_result['f1']),
    }

[nltk_data] Downloading package wordnet to /home/ubuntu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/ubuntu/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


## Finetune T5-v1_1-base

### Tokenization

In [29]:
# Tokenization
# Tokenizer of the base checkpoint that is fine-tuned below; it is also the
# module-level `tokenizer` read by compute_metrics
tokenizer = T5Tokenizer.from_pretrained('google/t5-v1_1-base')

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [30]:
# Define the preprocessing function
# Token limits for the source (journalistic) and target (conversational) side
max_input_length = 512
max_target_length = 64

def preprocess_function(examples):
    """Tokenize a batch of text pairs into model inputs and labels.

    The 'journalistic' texts become the encoder inputs (an empty task prefix is
    prepended) and the 'conversational' texts become the labels. Both sides are
    truncated to their respective maximum length, and any pad token id among the
    labels is replaced by -100.
    """
    journalistic_texts = examples['journalistic']
    conversational_texts = examples['conversational']

    task_prefix = ''
    sources = [task_prefix + article for article in journalistic_texts]
    model_inputs = tokenizer(sources, max_length=max_input_length, truncation=True)

    encoded_targets = tokenizer(
        text_target=conversational_texts,
        max_length=max_target_length,
        truncation=True,
    )

    pad_id = tokenizer.pad_token_id
    model_inputs['labels'] = [
        [-100 if token == pad_id else token for token in sequence]
        for sequence in encoded_targets['input_ids']
    ]
    return model_inputs

In [31]:
# Apply the preprocessing
# batched=True hands preprocess_function whole batches (lists of texts) at a time
tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_val = val_dataset.map(preprocess_function, batched=True)
tokenized_test = test_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/6394 [00:00<?, ? examples/s]

Map:   0%|          | 0/468 [00:00<?, ? examples/s]

Map:   0%|          | 0/468 [00:00<?, ? examples/s]

### Training the model

In [32]:
# Load the pre-trained T5 base model
model = T5ForConditionalGeneration.from_pretrained('google/t5-v1_1-base')

# LoRA configuration
# Adapters are attached to the modules named 'q', 'k', 'v' and 'o'
peft_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    inference_mode=False,
    r=16,
    lora_alpha=32,
    target_modules=['q', 'k', 'v', 'o'],
    lora_dropout=0.1,
    bias="none"
)

# Apply LoRA to the model
model = get_peft_model(model, peft_config)

# Move the model to the correct device (GPU or CPU)
model.to(device_name)

# Check the number of trainable parameters
model.print_trainable_parameters()

# Training arguments: log, evaluate and checkpoint every 50 steps, and keep the
# checkpoint with the highest BLEU
training_args = Seq2SeqTrainingArguments(
    output_dir='./results-T5-v1_1-base',                     # Directory to save the results
    save_safetensors=False,
    eval_strategy='steps',                # Evaluation strategy to use
    logging_steps=50,                            # Number of steps between logging
    eval_steps=50,                               # Number of steps between evaluations
    save_steps=50,                             # Number of steps between model saves
    per_device_train_batch_size=16,             # Batch size per device during training
    per_device_eval_batch_size=8,              # Batch size per device during evaluation
    learning_rate=5e-4,                         # Learning rate for the optimizer
    num_train_epochs=10,                         # Total number of training epochs
    predict_with_generate=True,                 # Whether to use generate for predictions
    logging_dir='./logs',                       # Directory for storing logs
    load_best_model_at_end=True,                # Load the best model at the end of training
    metric_for_best_model='bleu',               # Metric to use to compare two different models
    overwrite_output_dir=True,
    save_strategy='steps',                      # Save strategy matches evaluation strategy
    save_total_limit=25,                        # Upper bound on the number of checkpoints kept
    gradient_checkpointing=False,
    report_to="none",                           # Disable reporting to W&B or other services
    optim="adamw_torch",
    auto_find_batch_size=False,                  # Keep the batch sizes above fixed (no automatic search)
    lr_scheduler_type="linear",
    greater_is_better=True,                     # A higher BLEU is better
)

# Data collator
# Pads every batch to its longest sequence and returns PyTorch tensors
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding='longest', return_tensors='pt')

# Prepare trainer
# Validation data drives the periodic evaluation; the shared generation config is applied
# to every evaluate/predict call, and early stopping is configured with a patience of 5
trainer = CustomSeq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    generation_config=generation_config,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)]
)

# Train the Model
trainer.train()

trainable params: 3,538,944 || all params: 251,116,800 || trainable%: 1.4093


Step,Training Loss,Validation Loss,Bleu,Meteor,Bertscore F1
50,11.5343,2.690338,10.3283,0.283595,0.867931
100,3.8318,1.783537,21.76988,0.361556,0.891967
150,3.2045,1.635976,23.235603,0.372078,0.890903
200,2.8853,1.618549,23.68627,0.383432,0.891272
250,2.6773,1.586405,23.414571,0.386755,0.890082
300,2.7158,1.551044,25.038004,0.391156,0.892325
350,2.6708,1.520558,24.748406,0.405787,0.893901
400,2.6128,1.49657,25.068835,0.401225,0.893784
450,2.5244,1.56676,24.828156,0.402015,0.892404
500,2.4621,1.490664,25.208959,0.409426,0.894398


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


TrainOutput(global_step=750, training_loss=3.2695113016764323, metrics={'train_runtime': 5612.9667, 'train_samples_per_second': 11.391, 'train_steps_per_second': 0.713, 'total_flos': 8343200179814400.0, 'train_loss': 3.2695113016764323, 'epoch': 1.875})

In [34]:
#trainer.save_model('../models/EN/T5-v1_1-base-news-style-journalistic-to-conversational-EN-v1')

### Evaluate the model

In [35]:
# Generate Predictions
# NOTE(review): this scores the validation split (`tokenized_val`), i.e. the split the trainer
# also evaluates on during training; `tokenized_test` is never scored in the visible cells -
# confirm this is intended.
predictions = trainer.predict(tokenized_val)
preds = predictions.predictions
# Replace ids outside [0, vocab_size) with the pad id before decoding
preds = np.where(
        (preds >= 0) & (preds < tokenizer.vocab_size),
        preds,
        tokenizer.pad_token_id)

# Extract labels from the predictions object
labels = predictions.label_ids

# Process predictions and labels
decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
# -100 marks ignored label positions; turn them back into pad ids so they decode to nothing
labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

# Wrap labels in lists for sacreBLEU
decoded_labels_for_bleu = [[label] for label in decoded_labels]

# Same three metrics as compute_metrics, recomputed here on the decoded texts
bleu_score = sacrebleu.compute(predictions=decoded_preds, references=decoded_labels_for_bleu)
meteor_score = meteor.compute(predictions=decoded_preds, references=decoded_labels)
bertscore_output = bertscore.compute(predictions=decoded_preds, references=decoded_labels, lang="en")
average_f1 = np.mean(bertscore_output['f1'])

print(f"SacreBLEU Score: {bleu_score['score']}")
print(f"METEOR Score: {meteor_score['meteor']}")
print(f"BERTScore F1: {average_f1:.4f}")

SacreBLEU Score: 25.140368333444396
METEOR Score: 0.4089050709547752
BERTScore F1: 0.8943


In [36]:
# Perplexity of the generated texts, scored with GPT-2 and reported as the mean
# over all predictions, rounded to two decimals
perplexity = load("perplexity")
results = perplexity.compute(model_id='openai-community/gpt2', add_start_token=False, predictions=decoded_preds)
print(round(results["mean_perplexity"], 2))



  0%|          | 0/30 [00:00<?, ?it/s]

49.84


In [37]:
# Create a Dataset from Generated Texts
gen_df = pd.DataFrame({'text': decoded_preds})
gen_dataset = Dataset.from_pandas(gen_df)

# Tokenize the Generated Texts
gen_dataset = gen_dataset.map(tokenize_function_cls, batched=True, remove_columns=['text'])
gen_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])

# Make Predictions
# NOTE: this rebinds `predictions`, replacing the seq2seq prediction output from the evaluation cell
predictions = trainer_cls.predict(gen_dataset)
predicted_labels = np.argmax(predictions.predictions, axis=1)

# Calculate Conversational Style Percentage
# Class id 0 is "conversational" in the id2label map of the classifier
num_conversational = np.sum(predicted_labels == 0)
total_texts = len(predicted_labels)
percentage_conversational = (num_conversational / total_texts) * 100

print(f"Number of Conversational Texts: {num_conversational}/{total_texts}")
print(f"Percentage of Conversational Texts: {percentage_conversational:.2f}%")

Map:   0%|          | 0/468 [00:00<?, ? examples/s]

Number of Conversational Texts: 457/468
Percentage of Conversational Texts: 97.65%


### Print some results with reference

In [None]:
# Show a few predictions next to their references (heading, 0-based position)
for heading, position in (("The 4th", 3), ("The 20th", 19), ("The 46th", 45)):
    print(heading)
    print("Predicted:", decoded_preds[position], " Reference:", decoded_labels[position])

### Clean up unused variables

In [None]:
# List of variable names to delete
# These are the notebook globals created by the T5 fine-tuning and evaluation cells
variables_to_delete = [
    'tokenizer',
    'predictions',
    'gen_df',
    'total_texts',
    'percentage_conversational',
    'num_conversational',
    'predicted_labels',
    'gen_dataset',
    'max_target_length',
    'tokenized_train',
    'tokenized_val',
    'decoded_labels',
    'decoded_labels_for_bleu',
    'tokenized_test',
    'average_f1',
    'meteor_score',
    'bertscore_output',
    'model',
    'bleu_score',
    'peft_config',
    'training_args',
    'data_collator',
    'trainer',
    'preds',
    'labels',
    'results'
]

# Function to delete variables from the global scope
def delete_variables(var_list):
    """Remove each named variable from this module's globals, reporting the outcome.

    Names that are not defined are reported and skipped instead of raising.
    """
    namespace = globals()
    for name in var_list:
        if name in namespace:
            del namespace[name]
            print(f"Deleted variable: {name}")
        else:
            print(f"Variable '{name}' does not exist and cannot be deleted.")

# Call the function to delete variables
delete_variables(variables_to_delete)

# Free the GPU memory cache now that the references above are gone
torch.cuda.empty_cache()

# Run garbage collection
gc.collect()