In [1]:
import requests
import glob
import os
from pathlib import Path
import re
import json
from general_purpose_agent import GeneralPurposeAgent
from parser.HDSD_excel_to_markdown import process_excel_to_markdown as hdsd_excel_extractor
from parser.docx_extractor import extract_docx_with_images as hdsd_docx_extractor
from parser.qa_docx_extractor import extract_text_and_images as qa_docx_extractor
from parser.qa_xlsx_extractor import process_excel_to_markdown as qa_excel_extractor
import pandas as pd
import pickle

In [3]:
def read_hdsd_excel(file_path):
    with open(file_path, "rb") as f:
        results = hdsd_excel_extractor(file_path, f.read())
    combined_content = []
    for result in results:
        combined_content.append(f"## Sheet {result['sheet_name']}:")
        combined_content.append(result["content"])
        combined_content.append("")  # Th√™m d√≤ng tr·ªëng gi·ªØa c√°c sheet
    
    merged_results = {
        "content": "\n".join(combined_content),
        "images": [img for result in results for img in result["images"]]
    }
    return merged_results["content"]


def read_hdsd_docx(file_path):
    return hdsd_docx_extractor(file_path)

def read_qa_docx(file_path):
    with open(file_path, "rb") as f:
        return qa_docx_extractor(file_path, f)['data']

def read_qa_excel(file_path):
    with open(file_path, "rb") as f:
        results = qa_excel_extractor(file_path, f.read())
    combined_content = []
    for result in results:
        combined_content.append(f"## Sheet {result['sheet_name']}:")
        combined_content.append(result["content"])
        combined_content.append("")  # Th√™m d√≤ng tr·ªëng gi·ªØa c√°c sheet
    
    merged_results = {
        "content": "\n".join(combined_content),
        "images": [img for result in results for img in result["images"]]
    }
    
    return merged_results['content']

def extract_json(response_text):
    pattern = r"```json\s*([\[{].*?[\]}])\s*```"
    match = re.search(pattern, response_text, re.DOTALL)
    if match:
        return json.loads(match.group(1))
    return None

In [7]:
xlsx_system_prompt = """B·∫°n l√† agent ƒë·ªÉ T·∫°o C·∫∑p C√¢u H·ªèi & Tr·∫£ L·ªùi (FAQ) ƒë·ªÉ Ki·ªÉm Tra Hi·ªáu NƒÉng H·ªá Th·ªëng H∆∞·ªõng D·∫´n Nghi·ªáp V·ª• Ng√¢n H√†ng

## M·ª•c ti√™u
Sinh c√°c c·∫∑p FAQ (Frequently Asked Questions) ƒë·ªÉ ki·ªÉm th·ª≠ kh·∫£ nƒÉng hi·ªÉu v√† tr·∫£ l·ªùi ƒë√∫ng c·ªßa h·ªá th·ªëng h·ªó tr·ª£ ng∆∞·ªùi d√πng nghi·ªáp v·ª• ng√¢n h√†ng, d·ª±a tr√™n d·ªØ li·ªáu h∆∞·ªõng d·∫´n s·ª≠ d·ª•ng h·ªá th·ªëng ACL - Loan Servicing (Secured Lending).

## ƒê·ªãnh d·∫°ng ƒë·∫ßu v√†o:
- ƒê·ªãnh d·∫°ng ƒë·∫ßu v√†o s·∫Ω l√† n·ªôi dung t√†i li·ªáu excel:
```
# Sheet 0: Sheet t·ªïng quan
[N·ªôi dung sheet 0]

# Sheet 1
[N·ªôi dung sheet 1]

...
```

## Nhi·ªám v·ª• c·ªßa b·∫°n
ƒê·ªëi v·ªõi m·ªói b∆∞·ªõc nghi·ªáp v·ª• (t·ª´ c√°c sheet chi ti·∫øt):

1. Sinh m·ªôt c·∫∑p FAQ g·ªìm:
   - C√¢u h·ªèi: L√† phi√™n b·∫£n di·ªÖn ƒë·∫°t l·∫°i (paraphrased) c·ªßa n·ªôi dung g·ªëc, nh∆∞ng v·∫´n gi·ªØ nguy√™n √Ω nghƒ©a
   - C√¢u tr·∫£ l·ªùi: Gi·ªØ nguy√™n n·ªôi dung g·ªëc nh∆∞ trong file

## ƒê·ªãnh d·∫°ng ƒë·∫ßu ra
K·∫øt qu·∫£ tr·∫£ v·ªÅ ph·∫£i ·ªü d·∫°ng JSON theo c·∫•u tr√∫c sau:

```json
{{
  "index": {{
    "question": "C√¢u h·ªèi ƒë∆∞·ª£c tgr",
    "answer": "C√¢u tr·∫£ l·ªùi gi·ªØ nguy√™n t·ª´ file g·ªëc",
    "ref": ["T√™n file", "T√™n sheet", "STT ho·∫∑c d√≤ng li√™n quan"]
  }},
  ...
}}

## L∆∞u √Ω:
- Sheet 0 lu√¥n l√† sheet t·ªïng qua, QA v√≠ d·ª• s·∫Ω l√† quy tr√¨nh t·ªïng quan (ti√™u ƒë·ªÅ sheet) v√† c√°c quy tr√¨nh b√™n trong n√≥ (n·ªôi dung sheet)
- C√°c sheet kh√°c l√† sheet c·ª• th·ªÉ v·ªÅ b∆∞·ªõc c·ªßa t·ª´ng quy tr√¨nh con trong quy tr√¨nh t·ªïng quan
- C√°c c√¢u h·ªèi n√™n l√† n·ªôi dung c·ªßa c√°c sheet
- S·ª≠a l·ªói ch√≠nh t·∫£ ti·∫øng Vi·ªát (vi·∫øt thi·∫øu d·∫•u, c√°c t·ª´ li·ªÅn nhau kh√¥ng c√°ch ra,...)
  - V√≠ d·ª•: "Ki·ªÉmtra Danh m·ª•ch·ªìs∆°m√†Ng∆∞·ªùigiaod·ªãch" -> "Ki·ªÉm tra danh m·ª•c ch·ªì s∆° m√† ng∆∞·ªùi giao d·ªãch"
""".strip()

In [8]:
docx_system_prompt = """
# üìÑ System Prompt: Tr√≠ch xu·∫•t Q&A t·ª´ n·ªôi dung `.docx` ƒë√£ tr√≠ch xu·∫•t h√¨nh ·∫£nh

B·∫°n l√† m·ªôt agent chuy√™n t·∫°o b·ªô **Q&A (FAQ)** t·ª´ c√°c t√†i li·ªáu `.docx` ƒë√£ ƒë∆∞·ª£c convert sang text, bao g·ªìm c·∫£ h√¨nh ·∫£nh ƒë∆∞·ª£c nh√∫ng d∆∞·ªõi d·∫°ng image markdown `![alt](path)`).

## Instructions:

1. **B·ªè qua c√°c ph·∫ßn kh√¥ng li√™n quan**:
   - M·ª•c l·ª•c d·∫°ng li·ªát k√™ "C√¢u 1: ‚Ä¶", "C√¢u 2: ‚Ä¶" ·ªü ƒë·∫ßu t√†i li·ªáu.
   - Header/footer l·∫∑p l·∫°i gi·ªØa c√°c trang (n·∫øu c√≤n t·ªìn t·∫°i).
   - C√°c ch√∫ th√≠ch h√†nh ch√≠nh (v√≠ d·ª•: ‚ÄúT√†i li·ªáu l∆∞u h√†nh n·ªôi b·ªô‚Äù, ‚ÄúTrang X/Y‚Äù, ‚Ä¶).

2. **X√°c ƒë·ªãnh question**:
   - M·ªói khi xu·∫•t hi·ªán c·ª•m `"C√¢u N:"`, n·ªôi dung ngay sau d·∫•u `:` ch√≠nh l√† **question**.
   - N·∫øu kh√¥ng c√≥ t·ª´ kh√≥a `"C√¢u N:"`, h√£y t·ª± ƒë·ªông ph√°t hi·ªán c√¢u h·ªèi d·ª±a tr√™n:
     - D√≤ng ƒë·∫ßu mang t√≠nh ƒë·ªãnh h∆∞·ªõng ("L√†m th·∫ø n√†o", "Khi g·∫∑p l·ªói", "Tr∆∞·ªùng h·ª£p n√†y x·ª≠ l√Ω th·∫ø n√†o", v.v.).
     - C√°c ƒëo·∫°n in ƒë·∫≠m, g·∫°ch ƒë·∫ßu d√≤ng m·ªü ƒë·∫ßu cho m·ªôt t√¨nh hu·ªëng nghi·ªáp v·ª• c·ª• th·ªÉ.
   - C√¢u h·ªèi c·∫ßn r√µ nghƒ©a, kh√¥ng vi·∫øt t·∫Øt, v√† ph√π h·ª£p ng·ªØ c·∫£nh nghi·ªáp v·ª•.

3. **X√°c ƒë·ªãnh answer**:
   - T·∫•t c·∫£ n·ªôi dung n·∫±m **sau c√¢u h·ªèi ƒë√≥** cho ƒë·∫øn tr∆∞·ªõc c√¢u h·ªèi k·∫ø ti·∫øp (ho·∫∑c ƒë·∫øn h·∫øt file n·∫øu l√† c√¢u cu·ªëi c√πng).
   - N·∫øu c√≥ ·∫£nh minh h·ªça, gi·ªØ nguy√™n to√†n b·ªô image tag n·∫±m trong v√πng tr·∫£ l·ªùi.

4. **ƒê√°nh s·ªë th·ª© t·ª± `<index>`** t∆∞∆°ng ·ª©ng v·ªõi th·ª© t·ª± xu·∫•t hi·ªán (t·ª± tƒÉng d·∫ßn t·ª´ `1`).

5. **Kh√¥ng t·ª± ch·∫ø th√™m c√¢u h·ªèi ho·∫∑c tr·∫£ l·ªùi ngo√†i n·ªôi dung file.** Ch·ªâ di·ªÖn gi·∫£i l·∫°i n·∫øu n·ªôi dung g·ªëc qu√° r·ªùi r·∫°c.

## C·∫•u tr√∫c ph·∫£n h·ªìi:
```json
{{
  "<index>": {{
    "question": "<n·ªôi dung c√¢u h·ªèi>",
    "answer":  "<n·ªôi dung tr·∫£ l·ªùi, gi·ªØ nguy√™n image tag n·∫øu c√≥>"
  }},
  ...
}}
```

## V√≠ d·ª• ph·∫£n h·ªìi:
```json
{{
  "1": {{
    "question": "Khi g·∫∑p l·ªói 'Kh√¥ng th·ªÉ x√°c minh th√¥ng tin kh√°ch h√†ng', c·∫ßn x·ª≠ l√Ω nh∆∞ th·∫ø n√†o?",
    "answer": "Ki·ªÉm tra l·∫°i m√£ s·ªë kh√°ch h√†ng ho·∫∑c CMND/CCCD. N·∫øu ƒë√∫ng, li√™n h·ªá b·ªô ph·∫≠n ki·ªÉm so√°t n·ªôi b·ªô ƒë·ªÉ x√°c minh d·ªØ li·ªáu. <img src=\"data/images/verify_error.png\"></image>"
  }},
  "2": {{
    "question": "L√†m th·∫ø n√†o ƒë·ªÉ in h·ª£p ƒë·ªìng t√≠n d·ª•ng ƒë√£ duy·ªát?",
    "answer": "Truy c·∫≠p menu 'H·ª£p ƒë·ªìng' > 'Tra c·ª©u h·ª£p ƒë·ªìng'. Ch·ªçn h·ª£p ƒë·ªìng c·∫ßn in v√† nh·∫•n 'In h·ª£p ƒë·ªìng'. <img src=\"data/images/print_contract.png\"></image>"
  }}
}}
```

## L∆∞u √Ω:
- C√¢u h·ªèi - tr·∫£ l·ªùi c·∫ßn r√µ r√†ng, ƒë√∫ng logic v√† nghi·ªáp v·ª•.
- B·∫£o to√†n to√†n b·ªô image tag (kh√¥ng ƒë·ªïi t√™n, kh√¥ng lo·∫°i b·ªè).
- Format JSON ph·∫£i h·ª£p l·ªá ƒë·ªÉ `json.loads(..., ensure_ascii=False)` kh√¥ng b√°o l·ªói.
- N·∫øu ph·∫ßn tr·∫£ l·ªùi c√≥ nhi·ªÅu b∆∞·ªõc, tr√¨nh b√†y theo th·ª© t·ª± ƒë·ªÉ d·ªÖ hi·ªÉu (kh√¥ng c·∫ßn format b·∫£ng).

"""

In [9]:
agent = GeneralPurposeAgent(xlsx_system_prompt)

In [49]:
# QA docx
file_path = Path("data/Q&A_Cam_nang_ACL_lending_23052025.docx")
output_dir = Path(f"output/{file_path.stem}")
os.makedirs(output_dir, exist_ok=True)
data = read_qa_docx(str(file_path))

response = agent.get_response(f"T·∫°o FAQ gi√∫p t√¥i, ƒë√¢y l√† n·ªôi dung c·ªßa t√†i li·ªáu {file_path.stem}:\n{data}")
with open(output_dir / "response.pkl", "wb") as f:
    pickle.dump(response, f)
json_response = extract_json(response["text"])
with open(output_dir / "json_response.json", "wb") as f:
    json.dump(json_response, f, indent=4, ensure_ascii=False)

KeyboardInterrupt: 

In [50]:
# QA excel
file_path = Path("data/Q&A_Testcase_ACL_ChatBot_v1.xlsx")
output_dir = Path(f"output/{file_path.stem}")
os.makedirs(output_dir, exist_ok=True)
data = read_qa_excel(str(file_path))

response = agent.get_response(f"T·∫°o FAQ gi√∫p t√¥i, ƒë√¢y l√† n·ªôi dung c·ªßa t√†i li·ªáu Q&A: {file_path.name}:\n{data}")
with open(output_dir / "response.pkl", "wb") as f:
    pickle.dump(response, f)
json_response = extract_json(response["text"])
with open(output_dir / "json_response.json", "wb") as f:
    json.dump(json_response, f, indent=4, ensure_ascii=False)

2025-06-03 00:55:12,637 - INFO - B·∫Øt ƒë·∫ßu x·ª≠ l√Ω n·ªôi dung file Excel
2025-06-03 00:55:12,641 - INFO - ƒêang t·∫£i workbook...
2025-06-03 00:55:12,752 - INFO - ƒê√£ t·∫£i workbook th√†nh c√¥ng. S·ªë sheet: 1
2025-06-03 00:55:12,753 - INFO - ƒêang x·ª≠ l√Ω sheet: Master
2025-06-03 00:55:12,797 - INFO - Sheet Master: 207 h√†ng, 7 c·ªôt c√≥ d·ªØ li·ªáu
2025-06-03 00:55:12,801 - INFO - ƒêang x·ª≠ l√Ω header c·ªßa sheet Master
2025-06-03 00:55:12,806 - INFO - ƒêang x·ª≠ l√Ω d·ªØ li·ªáu c·ªßa sheet Master
2025-06-03 00:55:12,834 - INFO - ƒêang t·∫°o markdown cho sheet Master
2025-06-03 00:55:12,835 - INFO - ƒê√£ x·ª≠ l√Ω xong sheet Master. S·ªë ·∫£nh: 1
2025-06-03 00:55:12,840 - INFO - ƒê√£ x·ª≠ l√Ω xong file Excel. T·ªïng s·ªë sheet: 1


NameError: name 'pickle' is not defined

In [10]:
def process_file(file_path):
    print(f"Processing {file_path}")
    output_dir = Path(f"output/{file_path.stem}")
    os.makedirs(output_dir, exist_ok=True)
    data = read_hdsd_excel(str(file_path))

    response = agent.get_response(f"T·∫°o FAQ gi√∫p t√¥i, ƒë√¢y l√† n·ªôi dung c·ªßa t√†i li·ªáu HDSD: {file_path.name}:\n{data}")
    with open(output_dir / "response.pkl", "wb") as f:
        pickle.dump(response, f)
    json_response = extract_json(response["text"])
    with open(output_dir / "json_response.json", "w", encoding="utf-8") as f:
        json.dump(json_response, f, indent=4, ensure_ascii=False)
    print(f"Done {file_path}")

# X·ª≠ l√Ω t·∫•t c·∫£ c√°c file .xlsx trong th∆∞ m·ª•c new_data
input_dir = Path("new_data")
for file_path in input_dir.glob("*.xlsx"):
    process_file(file_path)

Processing new_data/RM - HDSD ACL_KHCN_BSS_nga.ct_No_pic.xlsx


2025-06-11 14:02:00,785 - INFO - B·∫Øt ƒë·∫ßu x·ª≠ l√Ω n·ªôi dung file Excel
2025-06-11 14:02:00,785 - INFO - ƒêang t·∫£i workbook...
2025-06-11 14:02:01,794 - INFO - ƒê√£ t·∫£i workbook th√†nh c√¥ng. S·ªë sheet: 18
2025-06-11 14:02:01,795 - INFO - ƒêang x·ª≠ l√Ω sheet: 0
2025-06-11 14:02:01,798 - INFO - Sheet 0: 26 h√†ng, 9 c·ªôt c√≥ d·ªØ li·ªáu
2025-06-11 14:02:01,799 - INFO - ƒêang x·ª≠ l√Ω header c·ªßa sheet 0
2025-06-11 14:02:01,799 - INFO - ƒêang x·ª≠ l√Ω d·ªØ li·ªáu c·ªßa sheet 0
2025-06-11 14:02:01,800 - INFO - ƒêang t·∫°o markdown cho sheet 0
2025-06-11 14:02:01,800 - INFO - ƒê√£ x·ª≠ l√Ω xong sheet 0. S·ªë ·∫£nh: 0
2025-06-11 14:02:01,801 - INFO - ƒêang x·ª≠ l√Ω sheet: 1
2025-06-11 14:02:01,801 - INFO - Sheet 1: 3 h√†ng, 30 c·ªôt c√≥ d·ªØ li·ªáu
2025-06-11 14:02:01,801 - INFO - ƒêang x·ª≠ l√Ω header c·ªßa sheet 1
2025-06-11 14:02:01,802 - INFO - ƒêang x·ª≠ l√Ω d·ªØ li·ªáu c·ªßa sheet 1
2025-06-11 14:02:01,802 - INFO - ƒêang t·∫°o markdown cho sheet 1
2025-06-11 14:02:01,803 -

image map: 0, dict_keys([])
image map: 2, dict_keys(['R3', 'AD3'])
image map: 0, dict_keys([])
image map: 0, dict_keys([])
image map: 0, dict_keys([])
image map: 0, dict_keys([])
image map: 0, dict_keys([])
image map: 1, dict_keys(['D17'])
image map: 0, dict_keys([])
image map: 1, dict_keys(['D28'])
image map: 0, dict_keys([])
image map: 0, dict_keys([])
image map: 0, dict_keys([])


2025-06-11 14:02:02,002 - INFO - ƒê√£ x·ª≠ l√Ω xong sheet 10. S·ªë ·∫£nh: 0
2025-06-11 14:02:02,003 - INFO - ƒêang x·ª≠ l√Ω sheet: 11
2025-06-11 14:02:02,008 - INFO - Sheet 11: 16 h√†ng, 4 c·ªôt c√≥ d·ªØ li·ªáu
2025-06-11 14:02:02,010 - INFO - ƒêang x·ª≠ l√Ω header c·ªßa sheet 11
2025-06-11 14:02:02,011 - INFO - ƒêang x·ª≠ l√Ω d·ªØ li·ªáu c·ªßa sheet 11
2025-06-11 14:02:02,012 - INFO - ƒêang t·∫°o markdown cho sheet 11
2025-06-11 14:02:02,018 - INFO - ƒê√£ x·ª≠ l√Ω xong sheet 11. S·ªë ·∫£nh: 0
2025-06-11 14:02:02,019 - INFO - ƒêang x·ª≠ l√Ω sheet: 12
2025-06-11 14:02:02,028 - INFO - Sheet 12: 18 h√†ng, 4 c·ªôt c√≥ d·ªØ li·ªáu
2025-06-11 14:02:02,033 - INFO - ƒêang x·ª≠ l√Ω header c·ªßa sheet 12
2025-06-11 14:02:02,035 - INFO - ƒêang x·ª≠ l√Ω d·ªØ li·ªáu c·ªßa sheet 12
2025-06-11 14:02:02,036 - INFO - ƒêang t·∫°o markdown cho sheet 12
2025-06-11 14:02:02,040 - INFO - ƒê√£ x·ª≠ l√Ω xong sheet 12. S·ªë ·∫£nh: 0
2025-06-11 14:02:02,041 - INFO - ƒêang x·ª≠ l√Ω sheet: 13.1
2025-06-11 14:02:0

image map: 0, dict_keys([])
image map: 1, dict_keys(['D17'])
image map: 0, dict_keys([])
image map: 0, dict_keys([])
image map: 0, dict_keys([])


2025-06-11 14:07:02,217 - INFO - B·∫Øt ƒë·∫ßu x·ª≠ l√Ω n·ªôi dung file Excel
2025-06-11 14:07:02,219 - INFO - ƒêang t·∫£i workbook...
2025-06-11 14:07:02,284 - INFO - ƒê√£ t·∫£i workbook th√†nh c√¥ng. S·ªë sheet: 7
2025-06-11 14:07:02,287 - INFO - ƒêang x·ª≠ l√Ω sheet: 0
2025-06-11 14:07:02,293 - INFO - Sheet 0: 13 h√†ng, 8 c·ªôt c√≥ d·ªØ li·ªáu
2025-06-11 14:07:02,296 - INFO - ƒêang x·ª≠ l√Ω header c·ªßa sheet 0
2025-06-11 14:07:02,298 - INFO - ƒêang x·ª≠ l√Ω d·ªØ li·ªáu c·ªßa sheet 0
2025-06-11 14:07:02,301 - INFO - ƒêang t·∫°o markdown cho sheet 0
2025-06-11 14:07:02,306 - INFO - ƒê√£ x·ª≠ l√Ω xong sheet 0. S·ªë ·∫£nh: 0
2025-06-11 14:07:02,309 - INFO - ƒêang x·ª≠ l√Ω sheet: 1
2025-06-11 14:07:02,311 - INFO - Sheet 1: 11 h√†ng, 2 c·ªôt c√≥ d·ªØ li·ªáu
2025-06-11 14:07:02,315 - INFO - ƒêang x·ª≠ l√Ω header c·ªßa sheet 1
2025-06-11 14:07:02,323 - INFO - ƒêang x·ª≠ l√Ω d·ªØ li·ªáu c·ªßa sheet 1
2025-06-11 14:07:02,328 - INFO - ƒêang t·∫°o markdown cho sheet 1
2025-06-11 14:07:02,330 - 

Done new_data/RM - HDSD ACL_KHCN_BSS_nga.ct_No_pic.xlsx
Processing new_data/VK - HDSD ACL_edit_final.xlsx
image map: 0, dict_keys([])
image map: 0, dict_keys([])
image map: 0, dict_keys([])
image map: 0, dict_keys([])
image map: 0, dict_keys([])


2025-06-11 14:07:02,415 - INFO - ƒê√£ x·ª≠ l√Ω xong sheet 4. S·ªë ·∫£nh: 0
2025-06-11 14:07:02,420 - INFO - ƒêang x·ª≠ l√Ω sheet: 5
2025-06-11 14:07:02,425 - INFO - Sheet 5: 16 h√†ng, 2 c·ªôt c√≥ d·ªØ li·ªáu
2025-06-11 14:07:02,430 - INFO - ƒêang x·ª≠ l√Ω header c·ªßa sheet 5
2025-06-11 14:07:02,437 - INFO - ƒêang x·ª≠ l√Ω d·ªØ li·ªáu c·ªßa sheet 5
2025-06-11 14:07:02,443 - INFO - ƒêang t·∫°o markdown cho sheet 5
2025-06-11 14:07:02,449 - INFO - ƒê√£ x·ª≠ l√Ω xong sheet 5. S·ªë ·∫£nh: 0
2025-06-11 14:07:02,454 - INFO - ƒêang x·ª≠ l√Ω sheet: 6
2025-06-11 14:07:02,460 - INFO - Sheet 6: 10 h√†ng, 2 c·ªôt c√≥ d·ªØ li·ªáu
2025-06-11 14:07:02,462 - INFO - ƒêang x·ª≠ l√Ω header c·ªßa sheet 6
2025-06-11 14:07:02,464 - INFO - ƒêang x·ª≠ l√Ω d·ªØ li·ªáu c·ªßa sheet 6
2025-06-11 14:07:02,469 - INFO - ƒêang t·∫°o markdown cho sheet 6
2025-06-11 14:07:02,475 - INFO - ƒê√£ x·ª≠ l√Ω xong sheet 6. S·ªë ·∫£nh: 0
2025-06-11 14:07:02,477 - INFO - ƒê√£ x·ª≠ l√Ω xong file Excel. T·ªïng s·ªë sheet: 7


image map: 0, dict_keys([])
image map: 2, dict_keys(['B9', 'B10'])


2025-06-11 14:08:49,749 - INFO - B·∫Øt ƒë·∫ßu x·ª≠ l√Ω n·ªôi dung file Excel
2025-06-11 14:08:49,750 - INFO - ƒêang t·∫£i workbook...


Done new_data/VK - HDSD ACL_edit_final.xlsx
Processing new_data/RM - HDSD ACL_KHDN_edit_final.xlsx


2025-06-11 14:08:50,118 - INFO - ƒê√£ t·∫£i workbook th√†nh c√¥ng. S·ªë sheet: 17
2025-06-11 14:08:50,118 - INFO - ƒêang x·ª≠ l√Ω sheet: 0
2025-06-11 14:08:50,121 - INFO - Sheet 0: 21 h√†ng, 8 c·ªôt c√≥ d·ªØ li·ªáu
2025-06-11 14:08:50,125 - INFO - ƒêang x·ª≠ l√Ω header c·ªßa sheet 0
2025-06-11 14:08:50,126 - INFO - ƒêang x·ª≠ l√Ω d·ªØ li·ªáu c·ªßa sheet 0
2025-06-11 14:08:50,127 - INFO - ƒêang t·∫°o markdown cho sheet 0
2025-06-11 14:08:50,127 - INFO - ƒê√£ x·ª≠ l√Ω xong sheet 0. S·ªë ·∫£nh: 0
2025-06-11 14:08:50,128 - INFO - ƒêang x·ª≠ l√Ω sheet: 1
2025-06-11 14:08:50,129 - INFO - Sheet 1: 8 h√†ng, 3 c·ªôt c√≥ d·ªØ li·ªáu
2025-06-11 14:08:50,130 - INFO - ƒêang x·ª≠ l√Ω header c·ªßa sheet 1
2025-06-11 14:08:50,131 - INFO - ƒêang x·ª≠ l√Ω d·ªØ li·ªáu c·ªßa sheet 1
2025-06-11 14:08:50,132 - INFO - ƒêang t·∫°o markdown cho sheet 1
2025-06-11 14:08:50,132 - INFO - ƒê√£ x·ª≠ l√Ω xong sheet 1. S·ªë ·∫£nh: 0
2025-06-11 14:08:50,133 - INFO - ƒêang x·ª≠ l√Ω sheet: 2
2025-06-11 14:08:50,134 - IN

image map: 0, dict_keys([])
image map: 0, dict_keys([])
image map: 0, dict_keys([])
image map: 0, dict_keys([])
image map: 0, dict_keys([])
image map: 0, dict_keys([])
image map: 0, dict_keys([])
image map: 0, dict_keys([])
image map: 0, dict_keys([])
image map: 0, dict_keys([])
image map: 0, dict_keys([])
image map: 0, dict_keys([])
image map: 0, dict_keys([])
image map: 0, dict_keys([])
image map: 0, dict_keys([])
image map: 0, dict_keys([])
image map: 0, dict_keys([])


JSONDecodeError: Invalid control character at: line 49 column 1056 (char 11073)

In [53]:
# HDSD excel
file_path = Path("data/RM - HDSD ACL_KHCN.xlsx")
output_dir = Path(f"output/{file_path.stem}")
os.makedirs(output_dir, exist_ok=True)
data = read_hdsd_excel(str(file_path))

response = agent.get_response(f"T·∫°o FAQ gi√∫p t√¥i, ƒë√¢y l√† n·ªôi dung c·ªßa t√†i li·ªáu HDSD: {file_path.name}:\n{data}")
with open(output_dir / "response.pkl", "wb") as f:
    pickle.dump(response, f)
json_response = extract_json(response["text"])
with open(output_dir / "json_response.json", "wb") as f:
    json.dump(json_response, f, indent=4, ensure_ascii=False)


2025-06-03 02:06:19,116 - INFO - B·∫Øt ƒë·∫ßu x·ª≠ l√Ω n·ªôi dung file Excel
2025-06-03 02:06:19,121 - INFO - ƒêang t·∫£i workbook...
2025-06-03 02:06:19,739 - INFO - ƒê√£ t·∫£i workbook th√†nh c√¥ng. S·ªë sheet: 21
2025-06-03 02:06:19,740 - INFO - ƒêang x·ª≠ l√Ω sheet: 0
2025-06-03 02:06:19,741 - INFO - Sheet 0: 26 h√†ng, 8 c·ªôt c√≥ d·ªØ li·ªáu
2025-06-03 02:06:19,741 - INFO - ƒêang x·ª≠ l√Ω header c·ªßa sheet 0
2025-06-03 02:06:19,742 - INFO - ƒêang x·ª≠ l√Ω d·ªØ li·ªáu c·ªßa sheet 0
2025-06-03 02:06:19,743 - INFO - ƒêang t·∫°o markdown cho sheet 0
2025-06-03 02:06:19,743 - INFO - ƒê√£ x·ª≠ l√Ω xong sheet 0. S·ªë ·∫£nh: 0
2025-06-03 02:06:19,744 - INFO - ƒêang x·ª≠ l√Ω sheet: 1
2025-06-03 02:06:19,745 - INFO - Sheet 1: 9 h√†ng, 4 c·ªôt c√≥ d·ªØ li·ªáu
2025-06-03 02:06:19,745 - INFO - ƒêang x·ª≠ l√Ω header c·ªßa sheet 1
2025-06-03 02:06:19,746 - INFO - ƒêang x·ª≠ l√Ω d·ªØ li·ªáu c·ªßa sheet 1
2025-06-03 02:06:19,747 - INFO - ƒêang t·∫°o markdown cho sheet 1
2025-06-03 02:06:19,751 - 

TypeError: a bytes-like object is required, not 'str'

In [None]:
# HDSD docx
agent.change_system_prompt(docx_system_prompt)
file_path = Path("data/2. PL01 - C√°c quy tr√¨nh nghi·ªáp v·ª• c·ªßa s·∫£n ph·∫©m ti·ªÅn g·ª≠i d√†nh cho KH c√° nh√¢n (2).docx")
output_dir = Path(f"output/{file_path.stem}")
os.makedirs(output_dir, exist_ok=True)
data = read_hdsd_docx(str(file_path))

response = agent.get_response(f"T·∫°o FAQ gi√∫p t√¥i, ƒë√¢y l√† n·ªôi dung c·ªßa t√†i li·ªáu HDSD: {file_path.name}:\n{data}")
with open(output_dir / "response.pkl", "wb") as f:
    pickle.dump(response, f)
json_response = extract_json(response["text"])
with open(output_dir / "json_response.json", "wb") as f:
    json.dump(json_response, f, indent=4, ensure_ascii=False)


In [23]:
data_df = {'question': [],
           'answer': [],
           'ref': []}

for index, data in json_response.items():
    data_df['question'].append(data['question'])
    data_df['answer'].append(data['answer'])
    data_df['ref'].append(data['ref'])

In [25]:
df = pd.DataFrame(data_df)
df

Unnamed: 0,question,answer,ref
0,L√†m th·∫ø n√†o ƒë·ªÉ ki·ªÉm tra xem kh√°ch h√†ng ƒë√£ t·ªìn ...,Ch·ªçn C·∫•p t√≠n d·ª•ng B√°n l·∫ª/ Cho vay c√≥ ƒë·∫£m b·∫£o/ ...,"[H∆∞·ªõng d·∫´n s·ª≠ d·ª•ng ACL - Loan Servicing, Sheet..."
1,Quy tr√¨nh t√¨m ki·∫øm kh√°ch h√†ng ƒë√£ t·ªìn t·∫°i tr√™n ...,Ch·ªçn Hambeger menu. Ch·ªçn C·∫•p t√≠n d·ª•ng B√°n l·∫ª/Q...,"[H∆∞·ªõng d·∫´n s·ª≠ d·ª•ng ACL - Loan Servicing, Sheet..."
2,"Khi t·∫°o m·ªõi th√¥ng tin kh√°ch h√†ng, nh·ªØng ƒëi·ªÉm n...",M·ªô t s·ªë t√™n tr∆∞·ªùng c·∫ßn l∆∞u √Ω: T√™n kh√°ch h√†ng: ...,"[H∆∞·ªõng d·∫´n s·ª≠ d·ª•ng ACL - Loan Servicing, Sheet..."
3,C·∫ßn nh·∫≠p nh·ªØng th√¥ng tin g√¨ khi t·∫°o m·ªõi kh√°ch ...,M·ªôt s·ªë t√™n tr∆∞·ªùng c·∫ßn l∆∞u √Ω: Th√¥ng tin qu·ªëc t·ªã...,"[H∆∞·ªõng d·∫´n s·ª≠ d·ª•ng ACL - Loan Servicing, Sheet..."
4,C√°c b∆∞·ªõc t·∫°o s·ªë t·ªù tr√¨nh t√≠n d·ª•ng trong h·ªá th·ªë...,ACL menu/Cho vay c√≥ b·∫£o ƒë·∫£m/T·∫°o m·ªõi/T·∫°o m·ªõi kh...,"[H∆∞·ªõng d·∫´n s·ª≠ d·ª•ng ACL - Loan Servicing, Sheet..."
5,L√†m c√°ch n√†o ƒë·ªÉ t√¨m ki·∫øm kh√°ch h√†ng khi t·∫°o t·ªù...,- Ch·ªçn T√¨m ki·ªÅm Kh√°ch h√†ng c√† nh√¢n/Kh√°ch h√†ng ...,"[H∆∞·ªõng d·∫´n s·ª≠ d·ª•ng ACL - Loan Servicing, Sheet..."
6,"Khi t·∫°o t·ªù tr√¨nh t√≠n d·ª•ng, c·∫ßn nh·∫≠p nh·ªØng th√¥n...","- Nh·∫≠p c√°c th√¥ng tin S·∫£n ph·∫©m vay, S·ªë ti·ªÅn vay...","[H∆∞·ªõng d·∫´n s·ª≠ d·ª•ng ACL - Loan Servicing, Sheet..."
7,L√†m th·∫ø n√†o ƒë·ªÉ th√™m ng∆∞·ªùi li√™n quan v√†o t·ªù tr√¨...,Tab Th√¥ng tin ng∆∞·ªùi li√™n quan - ƒê·ªÉ th√™m Ng∆∞·ªùi ...,"[H∆∞·ªõng d·∫´n s·ª≠ d·ª•ng ACL - Loan Servicing, Sheet..."
8,Quy tr√¨nh th√™m ng∆∞·ªùi ƒë·ªìng vay v√†o h·ªì s∆° t√≠n d·ª•...,Tab Ng∆∞·ªùi ƒë·ªìng vay - N·∫øu c√≥ Ng∆∞·ªùi ƒë·ªìng vay th√¨...,"[H∆∞·ªõng d·∫´n s·ª≠ d·ª•ng ACL - Loan Servicing, Sheet..."
9,C√°ch nh·∫≠p ngu·ªìn thu nh·∫≠p c·ªßa kh√°ch h√†ng trong ...,Nh·∫≠p ngu·ªìn tr·∫£ n·ª£ c·ªßa KH t·∫°i Tab Thu nh·∫≠p - B∆∞...,"[H∆∞·ªõng d·∫´n s·ª≠ d·ª•ng ACL - Loan Servicing, Sheet..."


In [26]:
df.to_excel("data/faq_demo.xlsx", sheet_name="faq")

In [None]:
data_dir = Path("output/2. PL01 - C√°c quy tr√¨nh nghi·ªáp v·ª• c·ªßa s·∫£n ph·∫©m ti·ªÅn g·ª≠i d√†nh cho KH c√° nh√¢n (2)")
data_df = {'question': [],
           'answer': [],
           'ref': []
           }

with open(data_dir / "results.json", "r") as f:
    json_response = json.load(f)

for index, data in json_response.items():
    data_df['question'].append(data['question'])
    data_df['answer'].append(data['answer'])
    # data_df['ref'].append(data['ref'])

print(len(data_df['question']), len(data_df['answer']))
df = pd.DataFrame(data_df)
df.to_excel(data_dir / "faq_demo.xlsx", sheet_name="faq")

13 13


In [12]:
import json
filepath = "output/RM - HDSD ACL_KHCN_BSS_nga.ct_No_pic/json_response.json"
with open(filepath, "r") as f:
    data = json.load(f)
questions = []
true_answers = []
refs = []
for index, item in data.items():
    questions.append(item['question'])
    true_answers.append(item['answer'])
    refs.append(item['ref'])

df = pd.DataFrame({
    'Question': questions,
    'True Answer': true_answers,
    'Refs': refs
})
df.to_excel("output/RM - HDSD ACL_KHCN_BSS_nga.ct_No_pic/RM - HDSD ACL_KHCN_BSS_nga.ct_No_pic_QA.xlsx", index=False)

In [1]:
print("B·∫°n l√† m·ªôt chuy√™n gia ƒë√°nh gi√° ch·∫•t l∆∞·ª£ng c√¢u tr·∫£ l·ªùi c·ªßa chatbot. H√£y ƒë√°nh gi√° c√¢u tr·∫£ l·ªùi c·ªßa agent h·ªó tr·ª£ ng√¢n h√†ng VIB so v·ªõi c√¢u tr·∫£ l·ªùi chu·∫©n (true answer) d·ª±a tr√™n c√°c ti√™u ch√≠ sau:\n\n## Ti√™u ch√≠ ƒë√°nh gi√°\n1. ƒê·ªô bao ph·ªß th√¥ng tin (Information Coverage):\n- ƒêi·ªÉm 8-10: C√¢u tr·∫£ l·ªùi bao g·ªìm t·∫•t c·∫£ th√¥ng tin quan tr·ªçng c√≥ trong true answer, kh√¥ng b·ªè s√≥t b·∫•t k·ª≥ th√¥ng tin n√†o\n- ƒêi·ªÉm 5-7: C√¢u tr·∫£ l·ªùi bao g·ªìm m·ªôt ph·∫ßn th√¥ng tin quan tr·ªçng t·ª´ true answer, thi·∫øu m·ªôt s·ªë th√¥ng tin\n- ƒêi·ªÉm 0-4: C√¢u tr·∫£ l·ªùi thi·∫øu h·∫ßu h·∫øt th√¥ng tin quan tr·ªçng t·ª´ true answer\n- L∆∞u √Ω: Ti√™u ch√≠ n√†y ch·ªâ c·∫ßn xem x√©t th√¥ng tin c√≥ trong true answer hay ch∆∞a, n·∫øu th·ª´a th√¨ b·ªè qua, kh√¥ng c·∫ßn tr·ª´ ƒëi·ªÉm v√¨ s·∫Ω ƒë√°nh gi√° ·ªü ti√™u ch√≠ kh√°c, b·ªè qua c·∫£ c√°ch tr√¨nh b√†y, ƒë√°nh s·ªë\n\n2. ƒê·ªô ch√≠nh x√°c v√† li√™n quan c·ªßa th√¥ng tin (Information Accuracy and Relevance):\n- ƒêi·ªÉm 8-10: Th√¥ng tin b·ªï sung (n·∫øu c√≥) ƒë·ªÅu ch√≠nh x√°c, li√™n quan ƒë·∫øn c√¢u h·ªèi v√† h·ªó tr·ª£ ng∆∞·ªùi d√πng hi·ªÉu r√µ h∆°n\n- ƒêi·ªÉm 5-7: Th√¥ng tin b·ªï sung (n·∫øu c√≥) ch√≠nh x√°c nh∆∞ng kh√¥ng ho√†n to√†n li√™n quan ho·∫∑c c√≥ th·ªÉ g√¢y hi·ªÉu nh·∫ßm nh·∫π\n- ƒêi·ªÉm 0-4: Th√¥ng tin b·ªï sung kh√¥ng ch√≠nh x√°c, kh√¥ng li√™n quan ƒë·∫øn c√¢u h·ªèi ho·∫∑c g√¢y hi·ªÉu sai l·ªách nghi√™m tr·ªçng\n\n3. ƒê·ªãnh d·∫°ng v√† c·∫•u tr√∫c (Format):\n- ƒêi·ªÉm 8-10: C√¢u tr·∫£ l·ªùi ƒë∆∞·ª£c ƒë·ªãnh d·∫°ng r√µ r√†ng, c√≥ xu·ªëng d√≤ng khi t√°ch √Ω, kh√¥ng c√≥ c√¢u di·ªÖn gi·∫£i ho·∫∑c h·∫≠u t·ªë kh√¥ng c·∫ßn thi·∫øt\n- ƒêi·ªÉm 5-7: C√¢u tr·∫£ l·ªùi c√≥ ƒë·ªãnh d·∫°ng nh∆∞ng ch∆∞a t·ªëi ∆∞u\n- ƒêi·ªÉm 0-4: C√¢u tr·∫£ l·ªùi kh√¥ng c√≥ ƒë·ªãnh d·∫°ng r√µ r√†ng\n\n4. Ng√¥n ng·ªØ v√† phong c√°ch (Language):\n- ƒêi·ªÉm 8-10: S·ª≠ d·ª•ng ng√¥n ng·ªØ l·ªãch s·ª±, chuy√™n nghi·ªáp, ph√π h·ª£p v·ªõi ng·ªØ c·∫£nh ng√¢n h√†ng\n- ƒêi·ªÉm 5-7: Ng√¥n ng·ªØ ch·∫•p nh·∫≠n ƒë∆∞·ª£c nh∆∞ng ch∆∞a th·∫≠t s·ª± chuy√™n nghi·ªáp\n- ƒêi·ªÉm 0-4: Ng√¥n ng·ªØ kh√¥ng ph√π h·ª£p\n\n5. X·ª≠ l√Ω tr∆∞·ªùng h·ª£p kh√¥ng t√¨m th·∫•y c√¢u tr·∫£ l·ªùi (Handling Unknown):\n- ƒêi·ªÉm 8-10: ƒê·ªÅ xu·∫•t chuy·ªÉn ƒë·∫øn Admin m·ªôt c√°ch ph√π h·ª£p khi kh√¥ng t√¨m th·∫•y c√¢u tr·∫£ l·ªùi\n- ƒêi·ªÉm 5-7: C√≥ ƒë·ªÅ xu·∫•t chuy·ªÉn ƒë·∫øn Admin nh∆∞ng ch∆∞a r√µ r√†ng\n- ƒêi·ªÉm 0-4: Kh√¥ng x·ª≠ l√Ω tr∆∞·ªùng h·ª£p kh√¥ng t√¨m th·∫•y c√¢u tr·∫£ l·ªùi\n\nH√£y ƒë√°nh gi√° c√¢u tr·∫£ l·ªùi c·ªßa agent theo c√°c ti√™u ch√≠ tr√™n v√† cho ƒëi·ªÉm t·ªïng th·ªÉ t·ª´ 0-10.\n\n## Input Format:\n- C√¢u h·ªèi: [c√¢u h·ªèi t·ª´ ng∆∞·ªùi d√πng]\n- True Answer: [c√¢u tr·∫£ l·ªùi chu·∫©n t·ª´ FAQ]\n- Agent Answer: [c√¢u tr·∫£ l·ªùi c·ªßa agent]\n\n## Output Format:\nƒê·∫ßu ra c·ªßa ph·∫£n h·ªìi ph·∫£i l√† m·ªôt json schema nh∆∞ sau:\n```json\n{{\n  \"information_coverage\": [ƒêi·ªÉm s·ªë cho ƒë·ªô bao ph·ªß th√¥ng tin (Information Coverage)],\n  \"hallucination_control\": [ƒêi·ªÉm s·ªë ki·ªÉm so√°t Hallucination],\n  \"format_and_structure\": [ƒêi·ªÉm s·ªë cho d·∫°ng v√† c·∫•u tr√∫c (Format and Structure)],\n  \"language_and_style\": [ƒêi·ªÉm s·ªë cho ng√¥n ng·ªØ v√† phong c√°ch (Language)],\n  \"handling_unknown\": [ƒêi·ªÉm s·ªë cho x·ª≠ l√Ω tr∆∞·ªùng h·ª£p kh√¥ng t√¨m th·∫•y c√¢u tr·∫£ l·ªùi (Handling Unknown)],\n  \"comments\": [Nh·∫≠n x√©t, g√≥p √Ω v√† c·∫£i thi·ªán cho t·ª´ng ti√™u ch√≠ ƒëi·ªÉm]\n}}\n```\n\n## L∆∞u √Ω quan tr·ªçng:\n- N·∫øu trong c√¢u tr·∫£ l·ªùi c√≥ m√¥ t·∫£ cho c√°c b∆∞·ªõc, h√£y b·ªè qua, ƒë√¢y kh√¥ng ph·∫£i ƒëi·ªÉm tr·ª´\n- B·ªè qua c√°c c√¢u k·∫øt th√∫c nh∆∞ \"Qu√Ω kh√°ch c√≥ c·∫ßn h·ªó tr·ª£ g√¨ th√™m kh√¥ng\", ƒë√¢y kh√¥ng ph·∫£i l√† ƒëi·ªÉm tr·ª´\n- ƒê∆∞a ra ƒëi·ªÉm ngang h√†ng v·ªõi ti√™u ch√≠ ƒë√≥, v√≠ d·ª•: ƒê·ªô bao ph·ªß th√¥ng tin (Information Coverage): 10\n- ƒêi·ªÉm t·ªïng th·ªÉ l√† ƒëi·ªÉm cu·ªëi c√πng, kh√¥ng ƒë∆∞a ra c√°ch t√≠nh, v√≠ d·ª•: ƒêi·ªÉm t·ªïng th·ªÉ: 9.8\n- Nh·∫≠n x√©t, g√≥p √Ω v√† c·∫£i thi·ªán cho t·ª´ng ti√™u ch√≠ ƒëi·ªÉm s·∫Ω l√† json v·ªõi key l√† ti√™u ch√≠, value l√† nh·∫≠n x√©t, g√≥p √Ω v√† c·∫£i thi·ªán cho ti√™u ch√≠ ƒë√≥")

B·∫°n l√† m·ªôt chuy√™n gia ƒë√°nh gi√° ch·∫•t l∆∞·ª£ng c√¢u tr·∫£ l·ªùi c·ªßa chatbot. H√£y ƒë√°nh gi√° c√¢u tr·∫£ l·ªùi c·ªßa agent h·ªó tr·ª£ ng√¢n h√†ng VIB so v·ªõi c√¢u tr·∫£ l·ªùi chu·∫©n (true answer) d·ª±a tr√™n c√°c ti√™u ch√≠ sau:

## Ti√™u ch√≠ ƒë√°nh gi√°
1. ƒê·ªô bao ph·ªß th√¥ng tin (Information Coverage):
- ƒêi·ªÉm 8-10: C√¢u tr·∫£ l·ªùi bao g·ªìm t·∫•t c·∫£ th√¥ng tin quan tr·ªçng c√≥ trong true answer, kh√¥ng b·ªè s√≥t b·∫•t k·ª≥ th√¥ng tin n√†o
- ƒêi·ªÉm 5-7: C√¢u tr·∫£ l·ªùi bao g·ªìm m·ªôt ph·∫ßn th√¥ng tin quan tr·ªçng t·ª´ true answer, thi·∫øu m·ªôt s·ªë th√¥ng tin
- ƒêi·ªÉm 0-4: C√¢u tr·∫£ l·ªùi thi·∫øu h·∫ßu h·∫øt th√¥ng tin quan tr·ªçng t·ª´ true answer
- L∆∞u √Ω: Ti√™u ch√≠ n√†y ch·ªâ c·∫ßn xem x√©t th√¥ng tin c√≥ trong true answer hay ch∆∞a, n·∫øu th·ª´a th√¨ b·ªè qua, kh√¥ng c·∫ßn tr·ª´ ƒëi·ªÉm v√¨ s·∫Ω ƒë√°nh gi√° ·ªü ti√™u ch√≠ kh√°c, b·ªè qua c·∫£ c√°ch tr√¨nh b√†y, ƒë√°nh s·ªë

2. ƒê·ªô ch√≠nh x√°c v√† li√™n quan c·ªßa th√¥ng tin (Information Accuracy a