In [None]:
#downloading the reviews.txt from gdrive in Kaggle
!gdown --id 1PYJfWCT5YB5BJFR65S7bX8ktguURP1VH

In [None]:
#installing the required dependencies
!pip install transformers peft datasets evaluate accelerate bitsandbytes langdetect googletrans==4.0.0-rc1

In [None]:
!pip install rouge_score

In [3]:
#clearing the cuda cache to ensure that heavy models like pegasus can be finetuned
import torch
torch.cuda.empty_cache()
torch.cuda.ipc_collect()

In [28]:
#importing the required packages and dependencies
import torch
from transformers import PegasusTokenizer, PegasusForConditionalGeneration, TrainingArguments, Trainer, pipeline
from transformers import T5Tokenizer, T5ForConditionalGeneration, BartTokenizer, BartForConditionalGeneration, MT5Tokenizer, MT5ForConditionalGeneration
from datasets import Dataset
from peft import get_peft_model, LoraConfig, TaskType
import evaluate
from langdetect import detect
from googletrans import Translator
import logging
import datasets
import transformers
import warnings
warnings.filterwarnings("ignore")

transformers.logging.set_verbosity_error()
datasets.logging.set_verbosity_error()
logging.getLogger("transformers").setLevel(logging.ERROR)

#initialising the translator to translate hindi comments to english
translator = Translator()

#using kaggle's GPUs for finetuning purpose
torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'

#initialising the bart tokenizer and model to be used for generation of pseudo labels
bart_tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
bart_model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn").to(torch_device)

def generate_pseudo_label_bart(text):
    """Function for generating the pseudo labels using BART"""
    inputs = bart_tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(torch_device)
    output = bart_model.generate(**inputs, max_length=100, min_length=20)
    return bart_tokenizer.decode(output[0], skip_special_tokens=True)

def load_and_preprocess(model_name):
    """
    Function for loading and pre-processing the dataset which
    includes translation, tokenization as well as pseudo-label
    generation.
    """
    with open("reviews.txt", "r", encoding="utf-8") as file:
        reviews = file.readlines()
    
    reviews = [review.strip() for review in reviews if review.strip()]
    processed_reviews = []

    #translating the reviews
    for review in reviews:
        try:
            lang = detect(review)
            if lang == "hi":
                translated_text = translator.translate(review, src="hi", dest="en").text
                processed_reviews.append(f"Original: {review}\nTranslated: {translated_text}")
            else:
                processed_reviews.append(review)
        except:
            processed_reviews.append(review)

    dataset = Dataset.from_dict({"text": processed_reviews})

    #initilising the model and tokenizer based on model_name
    if model_name == "pegasus":
        tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-xsum")
        model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-xsum").to(torch_device)
    elif model_name == "mt5":
        tokenizer = MT5Tokenizer.from_pretrained("google/mt5-small")
        model = MT5ForConditionalGeneration.from_pretrained("google/mt5-small").to(torch_device)
    else:
        tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-small")
        model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-small").to(torch_device)

    def generate_pseudo_label(example):
        """Function for generating the pseudo labels"""
        input_text = example["text"]
        summary = generate_pseudo_label_bart(input_text)
    
        tokenized_input = tokenizer(input_text, return_tensors="pt", padding="max_length", truncation=True, max_length=512)
        tokenized_summary = tokenizer(summary, return_tensors="pt", padding="max_length", truncation=True, max_length=128)
    
        labels = tokenized_summary["input_ids"].squeeze().tolist()
        labels = [label if label != tokenizer.pad_token_id else -100 for label in labels]
    
        return {
            "input_ids": tokenized_input["input_ids"].squeeze().tolist(),
            "attention_mask": tokenized_input["attention_mask"].squeeze().tolist(),
            "labels": labels
        }
    dataset = dataset.map(generate_pseudo_label, batched=False)

    
    if dataset is None or len(dataset) == 0:
        raise ValueError("Dataset mapping failed. Check data preprocessing.")
    
    print(f"Dataset size: {len(dataset)}")
    print(f"Tokenizer loaded: {tokenizer}")
    
    return dataset.train_test_split(test_size=0.1), tokenizer

def load_lora_model(model_name):
    """
    Function for loading the model and setting up LoRA
    config based on the model_name sent when the function
    is called.
    """
    if model_name == "pegasus":
        model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-xsum").to(torch_device)
    elif model_name == "mt5":
        model = MT5ForConditionalGeneration.from_pretrained("google/mt5-small").to(torch_device)
    else:
        model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-small").to(torch_device)

    if model_name == "pegasus":
        lora_config = LoraConfig(
            task_type=TaskType.SEQ_2_SEQ_LM,
            inference_mode=False,
            r=8,
            lora_alpha=32,
            lora_dropout=0.1,
            target_modules=["q_proj", "v_proj"]
        )
    elif model_name == "mt5":
        lora_config = LoraConfig(
            task_type=TaskType.SEQ_2_SEQ_LM,
            inference_mode=False,
            r=8,
            lora_alpha=32,
            lora_dropout=0.1,
            target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "wi", "wo"]
        )
    else:
        lora_config = LoraConfig(
            task_type=TaskType.SEQ_2_SEQ_LM,
            inference_mode=False,
            r=8,
            lora_alpha=32,
            lora_dropout=0.1,
            target_modules=["encoder.block.0.layer.0.SelfAttention.q", 
                    "encoder.block.0.layer.0.SelfAttention.v", 
                    "encoder.block.0.layer.1.DenseReluDense.wi"]
        )
    
    model = get_peft_model(model, lora_config)
    return model

def fine_tune(model_name):
    """
    Function for fine-tuning using LoRA for the
    specified model_name.
    """
    dataset, tokenizer = load_and_preprocess(model_name)
    model = load_lora_model(model_name)
    
    training_args = TrainingArguments(
        output_dir="./results",
        num_train_epochs=20,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        per_device_eval_batch_size=16,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir="./logs",
        logging_steps=10,
        report_to="none",
        fp16=True,
    )
    
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset["train"],
        eval_dataset=dataset["test"],
    )
    
    trainer.train()
    model.save_pretrained(f"./fine_tuned_lora_{model_name}")
    tokenizer.save_pretrained(f"./fine_tuned_lora_{model_name}")
    
    return model, tokenizer

In [13]:
def evaluate_model(model_name):
    """
    Function for evaluating the fine-tuned model using ROUGE
    scores with BART pseudo-labels.
    """
    if model_name == "pegasus":
        model = PegasusForConditionalGeneration.from_pretrained("./fine_tuned_lora_pegasus").to(torch_device)
    elif model_name == "mt5":
        model = MT5ForConditionalGeneration.from_pretrained("./fine_tuned_lora_mt5").to(torch_device)
    else:
        model = T5ForConditionalGeneration.from_pretrained("./fine_tuned_lora_flant5").to(torch_device)

    #get dataset and tokenizer
    dataset, tokenizer = load_and_preprocess(model_name)
    
    rouge = evaluate.load("rouge")
    predictions = []
    references = []
    
    for example in dataset["test"]["text"]:
        input_text = "summarize: " + example
        inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=512).to(torch_device)
        output = model.generate(**inputs)
        summary = tokenizer.decode(output[0], skip_special_tokens=True)
        
        predictions.append(summary)
        references.append(generate_pseudo_label_bart(example))
    
    scores = rouge.compute(predictions=predictions, references=references)
    print(f"ROUGE Evaluation Scores for {model_name} with BART pseudo-labels: {scores}")
    return scores


def evaluate_pretrained_model(model_name):
    """
    Function for evaluating the pre-trained model using ROUGE
    scores with BART pseudo-labels.
    """
    if model_name == "pegasus":
        model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-xsum").to(torch_device)
        tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-xsum")
    elif model_name == "mt5":
        model = MT5ForConditionalGeneration.from_pretrained("google/mt5-small").to(torch_device)
        tokenizer = MT5Tokenizer.from_pretrained("google/mt5-small")
    else:
        model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-small").to(torch_device)
        tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-small")
    rouge = evaluate.load("rouge")
    with open("reviews.txt", "r", encoding="utf-8") as file:
        reviews = file.readlines()
    
    reviews = [review.strip() for review in reviews if review.strip()]    
    predictions = []
    references = []
    
    for review in reviews:
        input_text = review
        inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=512).to(torch_device)
        output = model.generate(**inputs)
        summary = tokenizer.decode(output[0], skip_special_tokens=True)
        
        predictions.append(summary)
        references.append(generate_pseudo_label_bart(input_text))
    
    scores = rouge.compute(predictions=predictions, references=references)
    print(f"Pretrained {model_name} ROUGE Scores:", scores)
    return scores

def get_top_reviews():
    """
    Function for getting the top 5 negative and positive
    reviews by using distilbert for sentiment analysis.
    """
    sentiment_model = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
    with open("reviews.txt", "r", encoding="utf-8") as file:
        reviews = file.readlines()
    
    reviews = [review.strip() for review in reviews if review.strip()]
    
    sentiment_results = sentiment_model(reviews)
    
    scored_reviews = [(review, result["label"], result["score"]) for review, result in zip(reviews, sentiment_results)]
    
    positive_reviews = sorted([r for r in scored_reviews if r[1] == "POSITIVE"], key=lambda x: x[2], reverse=True)[:5]
    negative_reviews = sorted([r for r in scored_reviews if r[1] == "NEGATIVE"], key=lambda x: x[2], reverse=True)[:5]
    
    print("\nTop 5 Positive Reviews:")
    for review in positive_reviews:
        print(f"Score: {review[2]:.2f} | Review: {review[0]}")
    
    print("\nTop 5 Negative Reviews:")
    for review in negative_reviews:
        print(f"Score: {review[2]:.2f} | Review: {review[0]}")
    
    return positive_reviews, negative_reviews

In [None]:
fine_tune("pegasus")
pretrained_pegasus_scores = evaluate_pretrained_model("pegasus")
finetuned_pegasus_scores = evaluate_model("pegasus")

In [41]:
print(f"The ROUGE score for fine-tuned pegasus: {finetuned_pegasus_scores}")

The ROUGE score for fine-tuned pegasus: {'rouge1': 0.33613923458043454, 'rouge2': 0.2616487813643875, 'rougeL': 0.3241700244419172, 'rougeLsum': 0.3228415488320868}


In [42]:
print(f"The ROUGE score for pre-trained pegasus: {pretrained_pegasus_scores}")

The ROUGE score for pre-trained pegasus: {'rouge1': 0.2907037477675651, 'rouge2': 0.1863888422657444, 'rougeL': 0.2569151688079649, 'rougeLsum': 0.25798304063359023}


In [None]:
fine_tune("mt5")
pretrained_mt5_scores = evaluate_pretrained_model("mt5")
finetuned_mt5_scores = evaluate_model("mt5")

In [48]:
print(f"The ROUGE score for fine-tuned MT5: {finetuned_mt5_scores}")

The ROUGE score for fine-tuned MT5: {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}


In [49]:
print(f"The ROUGE score for pre-trained MT5: {pretrained_mt5_scores}")

The ROUGE score for pre-trained MT5: {'rouge1': 0.003497536945812808, 'rouge2': 0.0022407407407407406, 'rougeL': 0.003968253968253968, 'rougeLsum': 0.004053092501368363}


In [None]:
fine_tune("flant5")
pretrained_flant5_scores = evaluate_pretrained_model("flant5")
finetuned_flant5_scores = evaluate_model("flant5")

In [45]:
print(f"The ROUGE score for fine-tuned Flan-T5: {finetuned_flant5_scores}")

The ROUGE score for fine-tuned Flan-T5: {'rouge1': 0.2988225644999838, 'rouge2': 0.202988084904916, 'rougeL': 0.29381187578422596, 'rougeLsum': 0.2906362346546678}


In [46]:
print(f"The ROUGE score for pre-trained Flan-T5: {pretrained_flant5_scores}")

The ROUGE score for pre-trained Flan-T5: {'rouge1': 0.4482300915700921, 'rouge2': 0.3799694526109791, 'rougeL': 0.43608958796747216, 'rougeLsum': 0.4362428089990562}


In [37]:
top_5_pos_reviews, top_5_neg_reviews = get_top_reviews()


Top 5 Positive Reviews:
Score: 1.00 | Review: Love this app
Score: 1.00 | Review: Its awesome app
Score: 1.00 | Review: Perfect app for influencer. Very simple , easy to use, I love it
Score: 1.00 | Review: Nice
Score: 1.00 | Review: This is good for creater

Top 5 Negative Reviews:
Score: 1.00 | Review: I have been applying for so many Collabs And not even a single reply Neither the app is that gud to use like it's very basic.. no help centre.. no way to get info on the applications.. no progress shown.. just a big waste app I feel 🥱
Score: 1.00 | Review: Worst application. It doesn't let me connect my Instagram neither my YouTube. Nothing works.
Score: 1.00 | Review: I'm not even able to connect my account to Instagram this application is showing error.not a good app
Score: 1.00 | Review: Not good enough...
Score: 1.00 | Review: Wasted my time... 2 months and not even a single Collab. I also raised an issue that I'm having trouble in connecting my Youtube to the app.. they asked me 