# Text Similarity with GPT-2 (version 2)

## 1. Fine-tuning GPT-2 for Text Similarity

In [None]:
import torch

# Select the compute device: the GPU when CUDA is usable, otherwise the CPU
cuda_available = torch.cuda.is_available()
device = torch.device("cuda" if cuda_available else "cpu")
print(cuda_available)
print(torch.cuda.device_count())
# torch.cuda.current_device() initialises CUDA and raises on CPU-only
# machines, so it is only queried when a GPU is actually present.
print(torch.cuda.current_device() if cuda_available else None)
print(device)

In [None]:
from datasets import Dataset, DatasetDict
import pandas as pd
import torch
from torch.utils.data import DataLoader
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, Trainer, TrainingArguments, GPT2LMHeadModel
from datasets import load_dataset, DatasetDict, concatenate_datasets
import random

# Run configuration
load_local_model = False  # True -> start from the local "./gpt2-romanian" checkpoint
ratio = 0.05  # fraction of the dataset's examples used to build training pairs

# Prefer the GPU when CUDA is available; fall back to the CPU otherwise
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Fetch the "eda" configuration of the readerbench Romanian fake-news dataset
dataset = load_dataset("readerbench/ro_fake_news", "eda")

# Tag keywords of interest (not referenced by the rest of the visible code)
considered_tag_words = ['5g', 'vaccinare', 'controlul']

# Build labeled headline/body pairs: one similar pair plus random dissimilar ones
def preprocess_data(example, all_bodies=None, num_dissimilar=5):
    """Create similarity training pairs for a single dataset example.

    The example's own headline and body form the positive pair (label 1.0).
    Negative pairs (label 0.0) join the same headline with bodies drawn at
    random from ``all_bodies``; the example's own body is never used as a
    negative.

    Args:
        example: Mapping with ``'headline'`` and ``'body'`` strings. It is
            updated in place with ``'text'`` and ``'label'`` keys.
        all_bodies: Optional sequence of candidate bodies for the negative
            pairs. When empty or ``None`` only the positive pair is returned.
        num_dissimilar: Maximum number of negative pairs to generate.

    Returns:
        A list whose first element is ``example`` and whose remaining
        elements are ``{'text': ..., 'label': 0.0}`` dicts.
    """
    own_body = example['body']
    example['text'] = example['headline'] + " " + own_body

    # Label the pair as similar
    example['label'] = 1.0

    dissimilar_examples = []
    if all_bodies and num_dissimilar > 0:
        # Draw one spare candidate so that discarding the example's own body
        # (normally a member of all_bodies) still leaves enough negatives.
        # Without the filter the true pair could reappear labeled 0.0.
        candidates = random.sample(all_bodies, min(num_dissimilar + 1, len(all_bodies)))
        other_bodies = [body for body in candidates if body != own_body]
        dissimilar_examples = [
            {'text': example['headline'] + " " + other_body, 'label': 0.0}
            for other_body in other_bodies[:num_dissimilar]
        ]
    return [example] + dissimilar_examples


from itertools import chain, islice

# Collect every body in the dataset; this is the pool for dissimilar pairs
all_bodies = [item['body'] for subset in dataset.values() for item in subset]

# Cap the number of source examples at `ratio` of the whole dataset.
# The cap is an integer so the count announced below is exactly the count
# processed (a float cap made the loop run one example past int(limit)).
total_examples = sum(len(subset) for subset in dataset.values())
limit = int(ratio * total_examples)

print(f"Processing {limit} similar examples.")

# Walk the splits as one stream, stop after `limit` examples, and flatten
# the pairs produced for each example into a single list
processed_data = []
for example in islice(chain.from_iterable(dataset.values()), limit):
    processed_data.extend(preprocess_data(example, all_bodies=all_bodies))

print(f"Processed {len(processed_data)} examples.")

# Tabulate the generated pairs
processed_df = pd.DataFrame(processed_data)

# The first 90% of the rows are used for training, the remainder for validation
split_index = int(len(processed_df) * 0.9)
train_df = processed_df.iloc[:split_index]
validation_df = processed_df.iloc[split_index:]

# Wrap each frame as a Dataset
train_dataset = Dataset.from_pandas(train_df)
validation_dataset = Dataset.from_pandas(validation_df)

# Bundle both splits under their split names
combined_dataset = DatasetDict({"train": train_dataset, "validation": validation_dataset})

# Load the GPT-2 tokenizer and a sequence-classification model with a single
# output. Both the local and the hub checkpoint are loaded through
# GPT2ForSequenceClassification: the local branch previously used
# GPT2LMHeadModel, which has no classification head to fit the float
# similarity labels.
model_source = "./gpt2-romanian" if load_local_model else "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_source)
model = GPT2ForSequenceClassification.from_pretrained(model_source, num_labels=1)

trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Number of trainable parameters: {trainable_params}")

# Reuse the EOS token as the pad token for both the tokenizer and the model
# (GPT-2's tokenizer ships without a pad token)
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.eos_token_id

# Move the model to the selected device (GPU when available, otherwise CPU)
model.to(device)

# Tokenization step applied to each example
def tokenize_data(example):
    """Tokenize ``example['text']`` and attach the example's label.

    The text is truncated and padded to a fixed length of 512 tokens.

    Returns:
        The tokenizer output with an extra ``'labels'`` entry copied from
        ``example['label']``.
    """
    tokenized = tokenizer(
        example['text'],
        max_length=512,
        padding="max_length",
        truncation=True,
    )
    tokenized['labels'] = example['label']
    return tokenized


# Tokenize both splits
train_data = combined_dataset["train"].map(tokenize_data)
eval_data = combined_dataset["validation"].map(tokenize_data)

# Expose only the model inputs and labels, as torch tensors
for split_data in (train_data, eval_data):
    split_data.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# One directory for training checkpoints and the final model. The loading
# code in section 2 reads from "../gpt2-similarity", so the final model must
# be saved there too (it was previously written to "./gpt2-similarity").
output_dir = "../gpt2-similarity"

# Define training arguments
training_args = TrainingArguments(
    output_dir=output_dir,
    evaluation_strategy="epoch",  # evaluate on the validation split after each epoch
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    num_train_epochs=10,
    weight_decay=0.01,
)

# Define the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
)

# Fine-tune the model
trainer.train()

# Save the model and tokenizer next to the checkpoints
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

## 2. Loading and Testing the Fine-Tuned Model

In [None]:
import torch
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification

# Restore the fine-tuned tokenizer and classifier from disk
model_dir = "../gpt2-similarity"
tokenizer = GPT2Tokenizer.from_pretrained(model_dir)
model = GPT2ForSequenceClassification.from_pretrained(model_dir)

# Run on the GPU when one is available, otherwise on the CPU
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
model.to(device)

# Score how well a prompt matches a reference text with the fine-tuned model
def calculate_similarity(prompt, ground_truth_text):
    """Return the model's similarity score for ``prompt`` and a reference text.

    The two strings are joined with a space, tokenized with truncation and
    padding to 512 tokens, and passed through the model without gradient
    tracking.

    Args:
        prompt: Text placed first in the model input.
        ground_truth_text: Reference text appended after the prompt.

    Returns:
        A float in ``[0.0, 1.0]``; higher means more similar.
    """
    inputs = tokenizer(prompt + " " + ground_truth_text, return_tensors="pt", truncation=True, padding="max_length", max_length=512)
    inputs = {key: value.to(device) for key, value in inputs.items()}  # Move inputs to the model's device
    with torch.no_grad():
        outputs = model(**inputs)
    # The model is fine-tuned with num_labels=1 on float 0.0/1.0 labels, which
    # Transformers trains as a regression, so the raw output already
    # approximates the similarity label. A sigmoid would squash the 0..1
    # range into roughly 0.50..0.73, so the output is clamped instead.
    similarity_score = outputs.logits.squeeze().clamp(0.0, 1.0).item()
    return similarity_score

def test_similarity(prompt, ground_truth_text):
    """Print the similarity score of ``prompt`` against ``ground_truth_text``.

    The score comes from ``calculate_similarity`` and is printed with two
    decimal places.
    """
    similarity = calculate_similarity(prompt, ground_truth_text)
    message = f"Similarity score between prompt and ground truth: {similarity:.2f}"
    print(message)

# Three custom prompts are scored against a single reference passage
prompt1 = "Covid-19 are legatura cu reteaua 5G, conform unor teorii ale conspiratiei."
prompt2 = "Vaccinarea este o conspiratie."
prompt3 = "Politica este implicata in controlul populatiei."
ground_truth_text = "In Regatul Unit se răspândește un val de teorii ale conspirației despre o potentiala legătură între rețeaua 5G și răspândirea virusului COVID-19. Aceste teorii au fost deja demontate de experți, însă mulți oameni continuă să le creadă."

# Print one similarity line per prompt, in order
for prompt in (prompt1, prompt2, prompt3):
    test_similarity(prompt, ground_truth_text)

## 3. Checking for GPU Availability

In [None]:
import torch
cuda_available = torch.cuda.is_available()
print(cuda_available)  # Should return True if a GPU is available
print(torch.cuda.device_count())  # Number of GPUs available
# torch.cuda.current_device() raises on machines without CUDA, so the index
# is only queried when a GPU is present; None is printed otherwise.
print(torch.cuda.current_device() if cuda_available else None)  # Index of the current GPU