In [1]:
import sys
import os
import numpy as np

# Make the project root importable so `training.agent_trainer` resolves below.
# "..." is an ordinary path component here (only "." and ".." are special), so
# abspath("...") is "<cwd>/..." and its dirname is the current working directory;
# the second dirname() therefore appends the *parent of the cwd* to sys.path.
# NOTE(review): "..." looks like a stand-in for __file__, which a notebook does
# not define -- this only works when the kernel is started from the notebook's folder.
SCRIPT_DIR = os.path.dirname(os.path.abspath("..."))
sys.path.append(os.path.dirname(SCRIPT_DIR))

from training.agent_trainer import END_KEY, RESPONSE_KEY_NL, CHAT_START_KEY
from datasets import load_from_disk
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    PreTrainedModel,
    PreTrainedTokenizer,
)
from transformers.data.metrics.squad_metrics import compute_exact, compute_f1 

In [2]:

# Load the dataset saved on disk and keep only its 'test' split.
test_data = load_from_disk("/opt/home/bo_ling/dataset/modeling_data_v2.hf")['test']
# Bare last expression so the notebook displays the split's features and row count.
test_data

Dataset({
    features: ['text', 'ticket_uuid'],
    num_rows: 6282
})

In [3]:
import torch

# Directory holding the fine-tuned checkpoint (weights and tokenizer files).
local_output_dir = "/opt/home/bo_ling/modeling_data_v1_2048_checkpoint-53600"

# Use GPU index 3 whenever CUDA is usable, otherwise run on the CPU.
if torch.cuda.is_available():
    device = "cuda:3"
else:
    device = "cpu"

# Tokenizer and model are both read from the same checkpoint directory.
tokenizer = AutoTokenizer.from_pretrained(local_output_dir, padding_side="left")
model = AutoModelForCausalLM.from_pretrained(local_output_dir, trust_remote_code=True)
model = model.to(device)

In [4]:
# Encode the end-of-turn marker and print the ids to check how it tokenizes.
# NOTE(review): `eos_token_id` is not read by the other visible cells;
# generate_agent_response() re-encodes END_KEY itself.
eos_token_id = tokenizer.encode(END_KEY)
print(END_KEY, eos_token_id)

<|endofsentence|> [50400]


In [5]:
def generate_agent_response(
    texts: str,
    model: PreTrainedModel,
    tokenizer: PreTrainedTokenizer,
    do_sample: bool = True,
    max_new_tokens: int = 256,
    top_p: float = 0.92,
    top_k: int = 0,
    **kwargs,
) -> str:
    """Generate the model's continuation of a conversation prompt.

    Args:
        texts: Prompt text (the conversation so far).
        model: Causal language model used for generation.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        do_sample: Forwarded to ``model.generate``.
        max_new_tokens: Upper bound on the number of generated tokens.
        top_p: Forwarded to ``model.generate``.
        top_k: Forwarded to ``model.generate``.
        **kwargs: Any further keyword arguments for ``model.generate``.

    Returns:
        The decoded newly generated tokens only (the prompt is cut off), with
        surrounding whitespace stripped. The end marker is not removed here;
        callers that need clean text must strip END_KEY themselves.
    """
    # Put the prompt on the device of the model that was passed in, rather than
    # on the notebook-global `device`, so the function works with any model.
    input_ids = tokenizer(texts, return_tensors="pt").input_ids.to(model.device)

    # First token id of the end marker; generation stops when it is produced.
    end_key_token_id = tokenizer.encode(END_KEY)[0]
    gen_tokens = model.generate(
        input_ids,
        pad_token_id=tokenizer.pad_token_id,
        # Ensure generation stops once the model emits END_KEY.
        eos_token_id=end_key_token_id,
        do_sample=do_sample,
        max_new_tokens=max_new_tokens,
        top_p=top_p,
        top_k=top_k,
        **kwargs,
    )[0].cpu()

    # The output sequence starts with the prompt tokens; keep only what follows.
    question_size = len(input_ids[0])
    decoded = tokenizer.decode(gen_tokens[question_size:]).strip()

    return decoded

In [6]:
def print_generated_chats(chat_str):
    """Replay a conversation and print the model's responses next to the reference turns.

    The conversation is split into turns on END_KEY (the "\\n\\n###\\n" separator
    is treated as END_KEY too). Every turn is printed; for each turn that starts
    with RESPONSE_KEY_NL the model is additionally asked to continue the
    conversation up to that point, and its output is printed between banner
    lines when it also starts with the response key. The prompt for later turns
    is always built from the reference turns, never from generated text.

    Args:
        chat_str: Full conversation text.

    Uses the notebook globals ``model`` and ``tokenizer``.
    """
    chats = chat_str.replace("\n\n###\n", END_KEY).split(END_KEY)
    curr_text = chats[0] + END_KEY
    # Newlines are ignored when checking whether the output opens with the response key.
    response_prefix = RESPONSE_KEY_NL.replace("\n", "")
    print(chats[0])
    for chat in chats[1:]:
        print(chat)
        if chat.startswith(RESPONSE_KEY_NL):
            generated = generate_agent_response(curr_text, model, tokenizer)
            # Remove the end marker as an exact suffix. str.strip(END_KEY) would
            # treat END_KEY as a *set of characters* and also eat legitimate
            # leading/trailing letters of the response.
            if generated.endswith(END_KEY):
                generated = generated[: -len(END_KEY)].rstrip()
            if generated.replace("\n", "").startswith(response_prefix):
                print("\n")
                print("*"*40 + "START OUTPUT GENERATED" + "*"*40)
                print(generated)
                print("*"*40 + "END OUTPUT GENERATED" + "*"*40)
                print("\n")
        curr_text += chat + END_KEY

In [7]:
def compute_metrics(chat_str):
    """Score the model's generated responses against one reference conversation.

    The conversation is split into turns on END_KEY (the "\\n\\n###\\n" separator
    is treated as END_KEY too). For every reference turn that starts with
    RESPONSE_KEY_NL the model continues the conversation so far (built from the
    reference turns only) and the output is compared with the reference turn.

    Args:
        chat_str: Full conversation text.

    Returns:
        dict with
          - "scores": one entry per reference response turn, each
            {"exact", "f1", "type"}; type is "active" when the model produced a
            response and "silent" (scored 0) when it did not,
          - "avg_f1_score" / "avg_exact_score": means over all response turns,
            silent ones included; 0.0 when there are no response turns,
          - "missed": number of silent turns.

    Uses the notebook globals ``model`` and ``tokenizer``.
    """
    chats = chat_str.replace("\n\n###\n", END_KEY).split(END_KEY)
    curr_text = chats[0] + END_KEY
    # Newlines are ignored when checking whether the output opens with the response key.
    response_prefix = RESPONSE_KEY_NL.replace("\n", "")
    scores = []
    total_missed = 0
    total_f1 = 0
    total_exact = 0
    for chat in chats[1:]:
        if chat.startswith(RESPONSE_KEY_NL):
            generated = generate_agent_response(curr_text, model, tokenizer)
            # Remove the end marker as an exact suffix. str.strip(END_KEY) would
            # treat END_KEY as a *set of characters* and also eat legitimate
            # leading/trailing letters of the response, skewing the scores.
            if generated.endswith(END_KEY):
                generated = generated[: -len(END_KEY)].rstrip()
            if generated.replace("\n", "").startswith(response_prefix):
                # Reference first, prediction second: squad_metrics' (a_gold, a_pred) order.
                exact_score = compute_exact(chat, generated)
                f1_score = compute_f1(chat, generated)
                scores.append({"exact": exact_score, "f1": f1_score, "type": "active"})
                total_f1 += f1_score
                total_exact += exact_score
            else:
                scores.append({"exact": 0.0, "f1": 0.0, "type": "silent"})
                total_missed += 1
        curr_text += chat + END_KEY

    # One score entry is appended per response turn, so this is the turn count.
    total = len(scores)
    return {"scores": scores,
            # Guard against conversations that contain no response turn at all.
            "avg_f1_score": total_f1/total if total else 0.0,
            "avg_exact_score": total_exact/total if total else 0.0,
            "missed": total_missed,
           }
# Try the metric on the first test conversation; the result is displayed as the cell output.
# NOTE(review): generation samples by default and no seed is set in the visible
# cells, so re-running presumably gives different numbers -- confirm.
compute_metrics(test_data[0]['text'])

{'scores': [{'exact': 0, 'f1': 0.3125, 'type': 'active'},
  {'exact': 0, 'f1': 0.6666666666666666, 'type': 'active'},
  {'exact': 1, 'f1': 1.0, 'type': 'active'},
  {'exact': 0.0, 'f1': 0.0, 'type': 'silent'},
  {'exact': 0, 'f1': 0.14545454545454545, 'type': 'active'},
  {'exact': 0, 'f1': 0.21505376344086022, 'type': 'active'},
  {'exact': 0, 'f1': 0.22857142857142854, 'type': 'active'},
  {'exact': 0.0, 'f1': 0.0, 'type': 'silent'},
  {'exact': 0, 'f1': 0.3768115942028986, 'type': 'active'}],
 'avg_f1_score': 0.32722866648182214,
 'avg_exact_score': 0.1111111111111111,
 'missed': 2}

In [None]:
import json

# Score every test conversation and periodically dump all results so far to disk.
statistics = {}  # NOTE(review): not read by any visible cell
outputs = []
count = 0
CHECKPOINT_EVERY = 100


def _save_outputs(outputs, count):
    """Write all results collected so far to a JSON file named after `count`."""
    with open(f'/opt/home/bo_ling/co_model_eval/agent_v1_2048L_{count}C.json', 'w') as f:
        json.dump(outputs, f)


for count, d in enumerate(test_data, start=1):
    scores = compute_metrics(d['text'])
    scores["ticket_uuid"] = d["ticket_uuid"]
    outputs.append(scores)
    if count % CHECKPOINT_EVERY == 0:
        _save_outputs(outputs, count)

# Without this, results after the last multiple of CHECKPOINT_EVERY never reach disk.
if count % CHECKPOINT_EVERY != 0:
    _save_outputs(outputs, count)

In [None]:
# NOTE(review): looks like a leftover kernel-liveness check with no effect on the
# analysis -- presumably safe to delete.
print(1)

In [10]:
# Read back the results file written after 400 conversations of the 2048L run.
results_path = '/opt/home/bo_ling/co_model_eval/agent_v1_2048L_400C.json'
with open(results_path, 'r') as results_file:
    outputs = json.load(results_file)

In [12]:
# Average the per-conversation metrics over all loaded results.
# Printed values: mean missed rate, mean F1, mean exact match.
assert outputs, "no evaluation results loaded"
missed_rate = 0
f1_score = 0.0
exact_score = 0.0
for d in outputs:
    # A conversation with no scored turns contributes 0 instead of dividing by zero.
    if d['scores']:
        missed_rate += d['missed']/len(d['scores'])
    f1_score += d['avg_f1_score']
    exact_score += d['avg_exact_score']
print(missed_rate/len(outputs), f1_score/len(outputs), exact_score/len(outputs))

0.286946690840082 0.43090462370548976 0.2495591078133538


In [13]:
# Same summary for the 1536L run: load its 400-conversation results file and
# print mean missed rate, mean F1, mean exact match.
with open('/opt/home/bo_ling/co_model_eval/agent_v1_1536L_400C.json', 'r') as f:
    outputs = json.load(f)
assert outputs, "no evaluation results loaded"
missed_rate = 0
f1_score = 0.0
exact_score = 0.0
for d in outputs:
    # A conversation with no scored turns contributes 0 instead of dividing by zero.
    if d['scores']:
        missed_rate += d['missed']/len(d['scores'])
    f1_score += d['avg_f1_score']
    exact_score += d['avg_exact_score']
print(missed_rate/len(outputs), f1_score/len(outputs), exact_score/len(outputs))

0.2904717562978118 0.43366429365621634 0.2532833605226647
