In [1]:
# !pip install --upgrade transformers
# !pip install --upgrade transformers accelerate peft bitsandbytes
# !pip install datasets
# !pip install scikit-learn
# !pip install evaluate
# !pip install protobuf
#!pip install SentencePiece 

In [2]:
import pandas as pd
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForSeq2SeqLM
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
import evaluate
import re

In [3]:
# Load the MRPC dataset
dataset = load_dataset("glue", "mrpc")

In [4]:
#test = pd.DataFrame(dataset["test"])

In [5]:
# Define a function to remove noise
def remove_noise(text):
    # Remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    
    # Remove special patterns like "< .SPX >", "< .IXIC >"
    text = re.sub(r'< \.[A-Z]+ >', '', text)
    
    # Remove ellipsis (...)
    text = re.sub(r'\.\s*\.\s*\.+', ' ', text)
    
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    
    # Remove special characters like "â€™", "Â½", "Â£", etc.
    text = re.sub(r'[â€™Â½Â£]', '', text)
    
    # Remove single alphabets (e.g., "C")
    text = re.sub(r'\b\w\b', '', text)
    
    # Remove equal (=) sign at the end
    text = re.sub(r'=$', '', text)
    
    # Remove double hyphens (--)
    text = re.sub(r'--+', '', text)
    
    # Remove unwanted quotes
    text = re.sub(r'["“”]', '', text)

    # Fix short words with apostrophes (e.g., "'re" -> "are")
    text = re.sub(r"\s+'re\b", " are", text)
    text = re.sub(r"\b're\b", "are", text)
    text = re.sub(r"\b've\b", "have", text)
    text = re.sub(r"\b'll\b", "will", text)
    text = re.sub(r"\b'd\b", "would", text)
    text = re.sub(r"\b'm\b", "am", text)
    text = re.sub(r"\b's\b", "is", text)
    text = re.sub(r"\b'n\b", "and", text)
    text = text.lower()
    
    return text

def remove_noise_batch(examples):
    examples["cleaned_sentence1"] = [remove_noise(sentence) for sentence in examples["sentence1"]]
    examples["cleaned_sentence2"] = [remove_noise(sentence) for sentence in examples["sentence2"]]
    return examples

In [6]:
test=dataset['test'].map(remove_noise_batch, batched=True)
test

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx', 'cleaned_sentence1', 'cleaned_sentence2'],
    num_rows: 1725
})

In [7]:
# test = test.select(range(10))
# test

In [8]:
# Load pre-trained models and tokenizers
# encoder_model_name = "roberta-base"  # Encoder-based model
# encoder_tokenizer = AutoTokenizer.from_pretrained(encoder_model_name)
# encoder_model = AutoModelForSequenceClassification.from_pretrained(encoder_model_name, num_labels=2)

encoder_decoder_model_name = "google-t5/t5-base"  # Encoder-decoder-based model
encoder_decoder_tokenizer = T5Tokenizer.from_pretrained(encoder_decoder_model_name)
encoder_decoder_model = T5ForConditionalGeneration.from_pretrained(encoder_decoder_model_name)  #AutoModelForSeq2SeqLM.from_pretrained(encoder_decoder_model_name)


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [9]:
# Define a function to create prompts
# def create_prompt(sentence1, sentence2):
#     return f"Are these sentences paraphrases? Sentence 1: {sentence1}. Sentence 2: {sentence2}."

#A paraphrase conveys the same meaning using different words while maintaining the core information. A non-paraphrase has a different meaning or significantly alters the information.  
#Are the following sentences paraphrases? Sentence 1: {sentence1}. Sentence 2: {sentence2}"""
"""
            Example 1:
            Sentence 1: "The cat is on the mat."
            Sentence 2: "The mat has a cat on it."
            Answer: paraphrase

            Example 2:
            Sentence 1: "The dog is barking."
            Sentence 2: "The cat is meowing."
            Answer: non-paraphrase"""
#Respond strictly with one word: **paraphrase** or **non-paraphrase**

def create_prompt(sentence1, sentence2):
    #print("Sentence1:",sentence1)
    #print("Sentence1:",sentence2)
    return f"""Is the following pair of sentences a paraphrase? Sentence 1: {sentence1} Sentence 2: {sentence2} Answer:"""


In [10]:
# Function to predict paraphrase using encoder-decoder-based model
def predict_paraphrase_encoder_decoder(sentence1, sentence2):
    prompt = create_prompt(sentence1, sentence2)
    #print("Prompt:",prompt)
    inputs = encoder_decoder_tokenizer(prompt, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = encoder_decoder_model.generate(**inputs, max_length=5, num_beams=10,temperature=0.1,do_sample=True)
        response = encoder_decoder_tokenizer.decode(outputs[0], skip_special_tokens=True)
    #return "Paraphrase" if response.lower() == "yes" else "Not Paraphrase"
    #print(f"Response:{response}")
    #print(response)
    #print("-----------------------------------------")
    return response  

In [11]:
results = []
for example in test:
    sentence1 = example["cleaned_sentence1"]
    sentence2 = example["cleaned_sentence2"]
    label = example["label"]

    prediction = predict_paraphrase_encoder_decoder(sentence1, sentence2)

    results.append({
        "sentence1": sentence1,
        "sentence2": sentence2,
        "label": label,
        "predicted_label": prediction
    })
    results_df = pd.DataFrame(results)
    #print(results_df)
    #predicted_label = "Paraphrase" if prediction.lower() == "yes" else "Not Paraphrase"

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [16]:
results_df.to_csv("T5_result.csv")

In [12]:
# # Function to predict paraphrase using encoder-based model
# def predict_paraphrase_encoder(sentence1, sentence2):
#     prompt = create_prompt(sentence1, sentence2)
#     inputs = encoder_tokenizer(prompt, return_tensors="pt", truncation=True, padding=True)
#     with torch.no_grad():
#         outputs = encoder_model(**inputs)
#         logits = outputs.logits
#         predictions = torch.argmax(logits, dim=-1)
#     #return "Paraphrase" if predictions.item() == 1 else "Not Paraphrase"
#     return predictions.item()

In [13]:
# Evaluate on the MRPC validation set
def evaluate_on_mrpc(model_type, tokenizer, model, dataset):
    correct = 0
    total = 0
    for example in dataset["test"]:
        sentence1 = example["sentence1"]
        sentence2 = example["sentence2"]
        label = "Paraphrase" if example["label"] == 1 else "Not Paraphrase"
        
        if model_type == "encoder":
            prediction = predict_paraphrase_encoder(sentence1, sentence2)
        elif model_type == "encoder_decoder":
            prediction = predict_paraphrase_encoder_decoder(sentence1, sentence2)
        else:
            raise ValueError("Invalid model type.")
        
        if prediction == label:
            correct += 1
        total += 1
    
    accuracy = correct / total
    return accuracy


In [14]:
# # Evaluate encoder-based model
# encoder_accuracy = evaluate_on_mrpc("encoder", encoder_tokenizer, encoder_model, dataset)
# print(f"Encoder-Based Model Accuracy: {encoder_accuracy:.4f}")

# # Evaluate encoder-decoder-based model
# encoder_decoder_accuracy = evaluate_on_mrpc("encoder_decoder", encoder_decoder_tokenizer, encoder_decoder_model, dataset)
# print(f"Encoder-Decoder-Based Model Accuracy: {encoder_decoder_accuracy:.4f}")