# Load the two markdown documents. Blank (whitespace-only) lines are dropped
# and every remaining line is trimmed and re-terminated with a single "\n".
with open("./test-en.md", "r", encoding="utf-8") as doc1, open("./test-bm.md", "r", encoding="utf-8") as doc2:
    doc1_content = "".join(
        f"{text}\n" for text in map(str.strip, doc1) if text
    )
    doc2_content = "".join(
        f"{text}\n" for text in map(str.strip, doc2) if text
    )

In [None]:
from helpers.document_processor import DocumentProcessor
from langchain.prompts import ChatPromptTemplate
from langchain_core.messages import HumanMessage, SystemMessage
import base64
from langfuse.callback import CallbackHandler

# NOTE: document_processor is not used in this cell. An earlier variant passed
# the raw PDF bytes to document_processor.extract_filtered_content(...)
# instead of base64-encoding them.
document_processor = DocumentProcessor()

# Paths use forward slashes: the previous backslash form relied on invalid
# escape sequences ("\m", "\W") and only resolved on Windows.
with open("./multilingual_docs/Wise Card/Wise - Eng T&C.pdf", "rb") as doc1_file:
    # Base64 text of the English PDF.
    doc1_content = base64.b64encode(doc1_file.read()).decode("utf-8")

with open("./multilingual_docs/Wise Card/Wise - BM T&S.pdf", "rb") as doc2_file:
    # Base64 text of the Bahasa Malaysia PDF.
    doc2_content = base64.b64encode(doc2_file.read()).decode("utf-8")

# Prompt text asking the model to list every discrepancy between the English
# and Bahasa Malaysia versions of a document and to answer with a JSON object
# containing "total" and a list of "flags".
# NOTE(review): the JSON example uses doubled braces ({{ }}), which is the
# escaping used by str.format-style templates. This string is handed to
# SystemMessage(content=...) unchanged, so confirm whether the doubled braces
# are meant to reach the model as written.
instruction = """You are a meticulous linguistic analyst comparing English and Bahasa Malaysia versions of a legal document.

Your task is to create a comprehensive list of ALL discrepancies between the given two documents.

## KEY ANALYSIS REQUIREMENTS ##

1. Examine both documents word-by-word, comparing each sentence in both languages
2. Create a separate flag for EACH distinct discrepancy, even within the same sentence
3. Pay special attention to:
   - Missing words in either language (e.g., "HLB QR Pay" vs just "QR Pay")
   - Spelling errors (e.g., "mengunakan" instead of "menggunakan")
   - Formatting and numbering differences
   - Brand names and technical terms
4. Both documents should reflect the exact same meaning, so if even a word is missing in one version, it should be flagged
5. When highlighting differences:
   - Highlight ONLY the specific word or element that differs
   - For missing words, highlight the word in the version where it exists

After your analysis, provide a JSON object with this structure:

{{
  "total": "Total number of discrepancies found",
  "flags": [
    {{
      "location": "Precise location in the document",
      "doc1": {{
        "content": "Content from Document 1 with <span style=\"color: red\">highlighted difference</span>"
      }},
      "doc2": {{
        "content": "Content from Document 2 with <span style=\"color: red\">highlighted difference</span>"
      }},
      "explanation": "Brief explanation of this specific discrepancy"
    }}
  ]
}}

Important: Your analysis should identify dozens of discrepancies, including multiple separate discrepancies within the same sentence when they exist."""

# NOTE(review): system_message is built here but is not part of `prompt`
# below, so the comparison instructions are not sent with this request.
# Confirm whether that is intentional.
system_message = SystemMessage(content=instruction)

# Single user turn: the PDF as a base64 file part, followed by the
# PDF-to-markdown conversion request.
message = HumanMessage(content=[
    # The payload is doc2_content (the Bahasa Malaysia PDF). The filename was
    # previously "english", which mislabelled the attachment; it now matches
    # the data actually sent.
    {"type": "file", "source_type": "base64", "data": doc2_content, "mime_type": "application/pdf", "filename": "wise-tnc-bm.pdf"},
    {"type": "text", "text": "Change this whole document into markdown file. Maintain the whole document's structure like numbers, bullet point. Do not change any thing except changing the tables into flattened comma separated values. Do not change the content of the document. Just change the format into markdown. Only response with the markdown file. Do not add any other text."},
])

prompt = ChatPromptTemplate.from_messages([message])

In [9]:
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
# Load environment variables from a local .env file before creating the client.
load_dotenv()

model = ChatOpenAI(model="gpt-4.1")

# prompt -> chat model -> plain-string output.
chain = prompt | model | StrOutputParser()

# The prompt has no template variables, so the input mapping is empty; the
# Langfuse callback handler is attached to this run with user_id "test".
result = chain.invoke(
    {},
    config={"callbacks": [CallbackHandler(user_id="test")]},
)

In [10]:
# Show the raw string returned by the chain.
print(result)

```markdown
# HongLeongBank

## TERMA DAN SYARAT KAD KREDIT WISE HONG LEONG (English Version)

Dikemas Kini 10 Disember 2024

Terma dan Syarat (“T&S”) Kad Kredit WISE HLB (“Kad WISE”) hendaklah dibaca bersama dengan Perjanjian Pemegang Kad (“Perjanjian”) Hong Leong Bank Berhad ("HLB"). Selain daripada variasi yang dinyatakan di bawah, semua terma dan syarat Perjanjian hendaklah diterima pakai. Sekiranya terdapat percanggahan atau ketidakselarasan antara terma dan syarat Perjanjian dengan terma dan syarat T&S ini T&S ini ini hendaklah diutamakan setakat mana yang berkenaan dengan Kad WISE. Dengan menerima Kad WISE, Pemegang Kad bersetuju untuk terikat dengan T&S ini dan Perjanjian.

### 1. Program Pulangan Tunai (“Program”)

(a) Pemegang Kad Utama WISE (“Pemegang Kad Utama”) akan menerima pulangan tunai sehingga 15% (“Pulangan Tunai”), tertakluk kepada yang berikut:  
(i) perbelanjaan minimum Ringgit Malaysia Satu Ribu (RM1,000) dari Transaksi Runcit Layak (seperti ditakrif dalam Klausa

In [18]:
import json
import re

# Extract the JSON payload from the model response: use the contents of a
# ```json fenced block when one is present, otherwise try the whole response.
json_str = result
match = re.search(r"```json\s*(\{.*\})\s*```", result, re.DOTALL)
if match:
    json_str = match.group(1)

try:
    result_dict = json.loads(json_str)
except json.JSONDecodeError as e:
    print(f"Error decoding JSON: {e}")
else:
    # Valid JSON is not necessarily an object (it can be a list, string or
    # number), so guard before calling .get() to avoid an AttributeError.
    flags = result_dict.get("flags") if isinstance(result_dict, dict) else None
    if flags:  # key present and non-empty
        print(json.dumps(result_dict, indent=2))
    else:
        print("No flags found in the result.")

No flags found in the result.


import base64
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
from langchain_core.messages import HumanMessage
from langchain_core.output_parsers import StrOutputParser
from langchain.prompts import ChatPromptTemplate

# Load environment variables from a local .env file before creating the client.
load_dotenv()

# Forward slashes throughout: the previous literal mixed "/" with backslashes
# that formed invalid escape sequences ("\S", "\h") and only resolved on
# Windows.
with open("./multilingual_docs/Sutera/hlb-sutera-credit-card-tnc-en.pdf", "rb") as f:
    data = f.read()

# The PDF is sent inline as base64 text.
base64_string = base64.b64encode(data).decode("utf-8")

llm = ChatOpenAI(model="gpt-4o")

# Single user turn: the conversion request followed by the PDF file part.
message = HumanMessage(content=[
    {"type": "text", "text": "Can you give me the markdown format for this pdf file? And flatten all the tables in this document."},
    {"type": "file", "source_type": "base64", "data": base64_string, "mime_type": "application/pdf", "filename": "hlb-sutera-credit-card-tnc-en.pdf"}
])

prompt = ChatPromptTemplate.from_messages([message])

# prompt -> chat model -> plain-string output; the prompt has no template
# variables, so the input mapping is empty.
chain = prompt | llm | StrOutputParser()
result = chain.invoke({})