# Manual Test

In [1]:
import rouge_score
from typing_extensions import TypedDict
from typing import List
from agents.DOCSQNA import graph
class Message(TypedDict):
    role:str
    content:str

class QnaState(TypedDict):
    messages: List[Message]

def get_answer(question:str):
    state = QnaState(messages=[Message(role='user',content=question)])
    answer = graph.invoke(state)['answer']
    return answer

In [2]:
from langchain_community.document_loaders import PyMuPDFLoader
import re
def split_by_numbered_items(text, keep_separators=True):
    pattern = r'(\d+\.)'
    
    if keep_separators:
        chunks = re.split(pattern, text)
        result = []
        for i in range(1, len(chunks), 2):
            if i + 1 < len(chunks):
                chunk = chunks[i] + chunks[i + 1]
                result.append(chunk.strip())
            else:
                result.append(chunks[i].strip())
        
        if chunks[0].strip():
            result.insert(0, chunks[0].strip())
            
    else:
        chunks = re.split(pattern, text)
        result = []
        for chunk in chunks:
            stripped = chunk.strip()
            if stripped and not re.match(r'\d+\.$', stripped):
                if "\n \n" in stripped:
                    stripped = stripped.split("\n \n")[0].strip()
                if stripped:
                    result.append(stripped)
    
    return result

loader = PyMuPDFLoader("docs/FAQ Dexa Medica.pdf")
docs = loader.load()
chunk_text = split_by_numbered_items(''.join([doc.page_content for doc in docs]), keep_separators=False)
qna_result = [
    {
        "question": chunk.split('?', 1)[0].strip() + '?',
        "answer": chunk.split('?', 1)[1].strip() if '?' in chunk else ''
    }
    for chunk in chunk_text if '?' in chunk
]

In [3]:
import pandas as pd

df = pd.DataFrame.from_records(qna_result)

In [4]:
df

Unnamed: 0,question,answer
0,Apa itu Dexa Medica?,Dexa Medica adalah salah satu perusahaan farma...
1,Apa visi dari Dexa Medica?,Visi Dexa Medica adalah menjadi perusahaan ter...
2,Apa saja nilai inti Dexa Medica?,Nilai inti Dexa Medica adalah: \no Strive for ...
3,Apa arti nama 'Dexa'?,"Nama ""Dexa"" berasal dari bahasa Yunani ""δέκα"" ..."
4,Apa saja kompetensi inti Dexa Medica?,Kompetensi inti Dexa Medica meliputi: \no Peng...
5,Apa itu OMAI?,OMAI (Obat Modern Asli Indonesia) adalah obat ...
6,Apa yang dimaksud dengan OGBdexa?,OGBdexa adalah lini produk Obat Generik Berlog...
7,Berapa jumlah obat yang telah diluncurkan Dexa...,Lebih dari 50 obat baru telah diluncurkan dala...
8,Apakah Dexa Medica memproduksi obat herbal?,"Ya, melalui divisi DLBS (Dexa Laboratories of ..."
9,Apa itu DLBS?,DLBS adalah divisi riset Dexa Medica yang foku...


In [5]:
df.to_csv('qna.csv')

In [None]:
df['llm_answer'] = df['question'].apply(get_answer)

In [4]:
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

def compute_rouge_scores(row):
    reference = row['answer']
    hypothesis = row['llm_answer']
    # Compute ROUGE-L F1 score
    scores = scorer.score(reference, hypothesis)
    return scores['rougeL'].fmeasure

df['rouge_l_f1'] = df.apply(compute_rouge_scores, axis=1)

In [5]:
for col in df.columns:
    print(f"{col}: {df.iloc[15][col]}")

question: Apa yang dicari Dexa Medica dalam calon karyawan?
answer: Profesionalisme, dedikasi, integritas, kemampuan berinovasi, dan semangat kolaborasi.
llm_answer: Dexa Medica mencari profesionalisme, dedikasi, integritas, kemampuan berinovasi, dan semangat kolaborasi dalam calon karyawan.
rouge_l_f1: 0.7272727272727273


In [6]:
df.rouge_l_f1.mean()

0.943746023455189

In [7]:
for col, val in df.iloc[15].items():
    print(f"{col}: {val}")

question: Apa yang dicari Dexa Medica dalam calon karyawan?
answer: Profesionalisme, dedikasi, integritas, kemampuan berinovasi, dan semangat kolaborasi.
llm_answer: Dexa Medica mencari profesionalisme, dedikasi, integritas, kemampuan berinovasi, dan semangat kolaborasi dalam calon karyawan.
rouge_l_f1: 0.7272727272727273


# Automatic Test

In [1]:
from typing_extensions import TypedDict
from typing import List
class Message(TypedDict):
    role: str
    content: str

class QnaState(TypedDict):
    messages: List[Message]

In [None]:
from typing import Dict, Any
def prepare_graph_input(example: Dict[str, Any]) -> QnaState:
    print(example)
    question = example.get("question", "")
    
    return QnaState(
        messages=[
            Message(role="user", content=question)
        ]
    )

def graph_wrapper(example: Dict[str, Any]) -> Dict[str, str]:
    try:
        # Convert example to graph input format
        graph_input = prepare_graph_input(example)
        
        # Invoke the graph - it returns dict with 'answer' key
        result = graph.invoke(graph_input)
        
        # Graph already returns 'answer' field, so just pass it through
        return {"answer": result.get("answer", "")}
    
    except Exception as e:
        print(f"Error processing example: {e}")
        return {"answer": ""}

In [5]:
from langsmith import evaluate, Client
from rouge_score import rouge_scorer
from agents.DOCSQNA import graph

client = Client()
dataset_name = "ds-crushing-mountain-99"

def exact_match(outputs: dict, reference_outputs: dict) -> bool:
    return outputs.get("answer", "") == reference_outputs.get("answer", "")

def rouge_l_f1_evaluator(outputs: dict, reference_outputs: dict) -> float:
    """
    Compute ROUGE-L F1 score between model output and reference answer.
    Assumes outputs and reference_outputs are dicts with 'answer' keys.
    """
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    output_text = outputs.get("answer", "")
    reference_text = reference_outputs.get("answer", "")
    scores = scorer.score(reference_text, output_text)
    return scores['rougeL'].fmeasure

evaluate(
    graph_wrapper,
    data=dataset_name,
    evaluators=[exact_match, rouge_l_f1_evaluator],
    experiment_prefix="ds-crushing-mountain-99 experiment"
)

View the evaluation results for experiment: 'ds-crushing-mountain-99 experiment-cbc4ed0e' at:
https://smith.langchain.com/o/a9b0e3b6-35e5-411a-86ec-d368c8730760/datasets/0d84d8fd-bee0-4947-95d7-e65dbdbe4fb7/compare?selectedSessions=2a770dce-cc9d-4bac-8156-f41b2378ee86




0it [00:00, ?it/s]

{'question': 'Apakah Dexa Medica terdaftar di Bursa Efek Indonesia?'}


0it [00:08, ?it/s]


KeyboardInterrupt: 