In [27]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
import torch
from sklearn.model_selection import train_test_split
import re

In [28]:
import torch
from transformers import pipeline
from datasets import Dataset
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
from bert_score import score as bert_score
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

In [29]:
# Relative path of the CSV file that the next cell reads with pd.read_csv.
dataset_path = "taskcsv/formatted_data.csv"

In [30]:
# Load the full CSV into a DataFrame.
df = pd.read_csv(dataset_path)

# Work on a reproducible random subset of at most 500 rows. The sample size is
# capped at len(df) because DataFrame.sample (without replacement) raises a
# ValueError when asked for more rows than the frame contains.
df_sampled = df.sample(n=min(500, len(df)), random_state=42)

# Round-trip through a Hugging Face Dataset and back to pandas; both
# intermediate names are kept in case other cells use them.
dataset = Dataset.from_pandas(df_sampled)

dataset_df = dataset.to_pandas()

dataset_df = dataset_df.reset_index(drop=True)

# 70/30 train/test split with a fixed seed; each part is re-indexed from 0
# before being converted back into a Dataset.
train_df, test_df = train_test_split(dataset_df, test_size=0.3, random_state=42)
train_dataset = Dataset.from_pandas(train_df.reset_index(drop=True))
test_dataset = Dataset.from_pandas(test_df.reset_index(drop=True))

In [None]:
# Directory passed to from_pretrained() for both the tokenizer and the model.
# NOTE(review): this is an absolute, machine-specific path; consider making it
# configurable so the notebook can run on another machine.
checkpoint = "C:/xampp/htdocs/taskapp/llm/merged_model"
# Device string the model is moved to after loading.
device = "cpu"

In [31]:
# Load the tokenizer stored alongside the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Load the causal language model, then place it on the configured device.
model = AutoModelForCausalLM.from_pretrained(checkpoint)
model = model.to(device)

In [None]:
# Put the model into evaluation mode before it is used for generation.
model.eval()

In [33]:
def generate_subtasks(task_title, model, tokenizer, num_subtasks, detail_level):
    """Ask the model to break a task into a fixed number of subtasks.

    A planning prompt is built from ``task_title``, the model generates a
    continuation, and every line of that continuation that starts with a
    ``"<number>. "`` prefix is taken as one subtask (with the prefix removed).

    Args:
        task_title: Text of the task to break down; inserted into the prompt.
        model: Object exposing ``generate(...)`` and a ``device`` attribute
            (the notebook passes a causal language model).
        tokenizer: Tokenizer matching ``model``. Its ``pad_token`` is set to
            its ``eos_token`` as a side effect of this call.
        num_subtasks: Number of subtasks to return.
        detail_level: ``"low"`` (fewer than 5 words per subtask) or ``"high"``
            (10-20 words per subtask).

    Returns:
        A list of ``num_subtasks`` strings. Extra parsed subtasks are dropped;
        if fewer were parsed, the list is padded with ``"Complete the task."``.

    Raises:
        ValueError: If ``detail_level`` is neither ``"low"`` nor ``"high"``.
    """
    detail_instructions = {
        "low": "Each subtask should be very short, with fewer than 5 words.",
        "high": "Each subtask should be detailed, with 10-20 words per subtask.",
    }
    # Fail early with a clear message instead of hitting an unbound local
    # variable while the prompt is being formatted.
    if detail_level not in detail_instructions:
        raise ValueError(
            f"Unsupported detail_level {detail_level!r}; expected 'low' or 'high'."
        )
    detail_instruction = detail_instructions[detail_level]

    prompt = (
        f"You are a task planner. Break down the following task into exactly {num_subtasks} clear and actionable steps. "
        f"{detail_instruction} Each subtask should be practical, specific, and easy to follow. The subtasks should be ordered logically and focus on accomplishing the task in a methodical way. "
        "Avoid any filler, general explanations, or placeholders. The goal is for someone to be able to follow these steps and complete the task without needing further clarification.\n\n"
        f"Task: {task_title}\n\n"
        "Subtasks:"
    )

    # The tokenizer is called with padding=True, which needs a pad token; the
    # end-of-sequence token is reused for that purpose.
    tokenizer.pad_token = tokenizer.eos_token

    with torch.no_grad():
        inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to(model.device)

        # `early_stopping` is deliberately not passed: with num_beams=1 there
        # is no beam search for it to act on.
        outputs = model.generate(
            **inputs,
            num_return_sequences=1,
            pad_token_id=tokenizer.eos_token_id,
            no_repeat_ngram_size=2,
            num_beams=1,
            max_new_tokens=256,
        )

    # Decode only the tokens produced after the prompt. This assumes the model
    # returns the prompt tokens followed by the continuation (the behaviour of
    # decoder-only models) and avoids searching the decoded text for the
    # "Subtasks:" marker, which breaks if the task title contains that marker
    # or the prompt was truncated.
    prompt_length = inputs["input_ids"].shape[1]
    generated_text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    # Keep only lines of the form "<number>. <text>" and drop the numbering.
    numbered_line = re.compile(r"^\d+\.\s+")
    subtasks = []
    for line in generated_text.split("\n"):
        line = line.strip()
        match = numbered_line.match(line)
        if match:
            subtasks.append(line[match.end():].strip())

    # Enforce the requested length: truncate, then pad with a generic filler.
    subtasks = subtasks[:num_subtasks]
    subtasks.extend(["Complete the task."] * (num_subtasks - len(subtasks)))

    return subtasks

In [38]:
def validate_model_with_generated_subtasks(model, tokenizer, test_dataset, num_subtasks=5, detail_level="low"):
    """Generate subtasks for every test example and score them against references.

    For each example, subtasks are generated from its ``instruction`` and joined
    with newlines; the result is compared with the example's ``output`` using
    sentence-level BLEU (smoothing method 4), ROUGE-1 / ROUGE-L F-measure and
    BERTScore F1. Averages are printed and returned.

    Examples whose reference ``output`` is not a non-blank string are skipped
    before generation, and examples whose generation raises are reported and
    skipped, so one bad row does not abort the run.

    Args:
        model: Model forwarded to ``generate_subtasks``.
        tokenizer: Tokenizer forwarded to ``generate_subtasks``.
        test_dataset: Iterable of dict-like examples exposing ``column_names``;
            must contain ``instruction`` and ``output`` columns.
        num_subtasks: Number of subtasks requested per example.
        detail_level: Detail level forwarded to ``generate_subtasks``.

    Returns:
        Tuple ``(generated_subtasks, ground_truth_subtasks, avg_bleu,
        avg_rouge, avg_bert)`` where the first two are parallel lists of
        strings, ``avg_rouge`` is a dict keyed by ``'rouge1'`` and ``'rougeL'``,
        and the other averages are floats.

    Raises:
        ValueError: If a required column is missing, or if no example could be
            generated and scored.
    """
    if not all(field in test_dataset.column_names for field in ["instruction", "output"]):
        raise ValueError("Test dataset must contain 'instruction' and 'output' columns.")

    generated_subtasks = []
    ground_truth_subtasks = []

    with torch.no_grad():
        for i, example in enumerate(test_dataset):
            try:
                task = example['instruction']
                ground_truth = example['output']

                # A missing/NaN or blank reference cannot be scored; skip it
                # before spending time on generation rather than crashing in
                # the metric loop afterwards.
                if not isinstance(ground_truth, str) or not ground_truth.strip():
                    print(f"Skipping example {i}: reference output is missing or not text")
                    continue

                generated_subtask = generate_subtasks(task, model, tokenizer, num_subtasks, detail_level)

                generated_subtask_text = "\n".join(generated_subtask)

                generated_subtasks.append(generated_subtask_text)
                ground_truth_subtasks.append(ground_truth)

                if i % 10 == 0:
                    print(f"Processed {i}/{len(test_dataset)} examples")

            except Exception as e:
                # Best-effort evaluation: report the failure and move on.
                print(f"Error processing example {i}: {e}")
                continue

    # Without at least one scored pair the averages below are undefined.
    if not generated_subtasks:
        raise ValueError(
            "No test examples could be scored: the dataset is empty or every example was skipped or failed."
        )

    print("Calculating metrics...")
    rouge_scorer_obj = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
    smooth = SmoothingFunction()
    bleu_scores = []
    rouge_scores = []
    for gen, gt in zip(generated_subtasks, ground_truth_subtasks):
        # BLEU on whitespace tokens, with the reference as the single reference.
        bleu_scores.append(
            sentence_bleu([gt.split()], gen.split(), smoothing_function=smooth.method4)
        )

        # ROUGE: the scorer takes (target, prediction).
        rouge_scores.append(rouge_scorer_obj.score(gt, gen))

    # BERTScore takes (candidates, references); only F1 is kept.
    P, R, F1 = bert_score(generated_subtasks, ground_truth_subtasks, lang="en")
    bert_scores = F1.tolist()

    avg_bleu = sum(bleu_scores) / len(bleu_scores)
    avg_rouge = {
        key: sum(score[key].fmeasure for score in rouge_scores) / len(rouge_scores)
        for key in ['rouge1', 'rougeL']
    }
    avg_bert = sum(bert_scores) / len(bert_scores)

    print("\nValidation Results:")
    print(f"BLEU Score: {avg_bleu:.4f}")
    print(f"ROUGE Scores: {avg_rouge}")
    print(f"BERTScore: {avg_bert:.4f}")

    return generated_subtasks, ground_truth_subtasks, avg_bleu, avg_rouge, avg_bert

In [None]:
# Run the evaluation on the held-out split with the function's default
# settings (num_subtasks=5, detail_level="low") and keep all returned values.
generated_subtasks, ground_truth_subtasks, avg_bleu, avg_rouge, avg_bert = validate_model_with_generated_subtasks(model, tokenizer, test_dataset)

In [None]:
# Print the complete lists of generated and reference texts for inspection.
# NOTE(review): this prints every example; consider showing only a few entries.
print(f"\nGenerated Subtasks:\n{generated_subtasks}")
print(f"\nGround Truth Subtasks:\n{ground_truth_subtasks}")