In [16]:
import nltk
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from peft import PeftModel
from nltk.translate.bleu_score import sentence_bleu
from rouge import Rouge
from nltk.tokenize import word_tokenize

In [17]:
#Test prompts and two edge cases
test_prompts = [
    #Official prompts from Section 7
    "Create a new Git branch and switch to it.",
    "Compress the folder reports into reports.tar.gz.",
    "List all Python files in the current directory recursively.",
    "Set up a virtual environment and install requests.",
    "Fetch only the first ten lines of a file named output.log.",
    #Custom edge cases
    "Find and kill all processes using more than 1GB of memory.",
    "Schedule a cron job to run cleanup.sh every Sunday at 2AM."
]

In [18]:
#Defining model and adapter path
base_model_id = "Qwen/Qwen2-0.5B"
adapter_path = "lora-qwen2-adapter"

#Loading tokenizer & base model
tokenizer = AutoTokenizer.from_pretrained(base_model_id)
base_model = AutoModelForCausalLM.from_pretrained(base_model_id)
gen_base = pipeline("text-generation", model=base_model, tokenizer=tokenizer)

#Loading fine-tuned model
tuned_model = PeftModel.from_pretrained(base_model, adapter_path)
gen_tuned = pipeline("text-generation", model=tuned_model, tokenizer=tokenizer)

#Generate responses
results =[]
for prompt in test_prompts:
  base_out = gen_base(f"Step-by-step plan for: {prompt}\n", max_new_tokens=100)[0]["generated_text"]
  tuned_out = gen_tuned(f"Step-by-step plan for: {prompt}\n", max_new_tokens=100)[0]["generated_text"]
  results.append((prompt, base_out, tuned_out))

Device set to use cpu
Device set to use cpu


In [19]:
#Downloading punkt
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [21]:
#Defining scoring function first
def score_plan_quality(response: str) -> int:
    steps = response.count("\n")
    if steps < 2:
      return 0
    if any(word in response.lower() for word in ["step", "run", "command", "execute"]):
        return 2
    return 1

#Computing BLEU & ROUGE-L
rouge = Rouge()
bleu_scores = []
rouge_scores = []
plan_scores = []

for prompt, base_out, tuned_out in results:
  ref = word_tokenize(base_out)
  hyp = word_tokenize(tuned_out)
  bleu = sentence_bleu([ref], hyp)
  rouge_score = rouge.get_scores(tuned_out, base_out)[0]["rouge-l"]["f"]
  plan_quality = score_plan_quality(tuned_out)

  bleu_scores.append(bleu)
  rouge_scores.append(rouge_score)
  plan_scores.append(plan_quality)

print("\nMetric Comparison")
for i, (prompt, base_out, tuned_out) in enumerate(results):
  print(f"Prompt {i+1}: {prompt}")
  print(f"BLEU: {bleu_scores[i]:.2f} | ROUGE-L: {rouge_scores[i]:.2f}")
  print(f"Plan quality: {plan_scores[i]} / 2\n")


Metric Comparison
Prompt 1: Create a new Git branch and switch to it.
BLEU: 0.18 | ROUGE-L: 0.39
Plan quality: 2 / 2

Prompt 2: Compress the folder reports into reports.tar.gz.
BLEU: 0.12 | ROUGE-L: 0.29
Plan quality: 2 / 2

Prompt 3: List all Python files in the current directory recursively.
BLEU: 0.21 | ROUGE-L: 0.36
Plan quality: 2 / 2

Prompt 4: Set up a virtual environment and install requests.
BLEU: 0.17 | ROUGE-L: 0.39
Plan quality: 2 / 2

Prompt 5: Fetch only the first ten lines of a file named output.log.
BLEU: 0.04 | ROUGE-L: 0.42
Plan quality: 0 / 2

Prompt 6: Find and kill all processes using more than 1GB of memory.
BLEU: 0.20 | ROUGE-L: 0.40
Plan quality: 2 / 2

Prompt 7: Schedule a cron job to run cleanup.sh every Sunday at 2AM.
BLEU: 0.18 | ROUGE-L: 0.44
Plan quality: 2 / 2

