# Load the data

In [39]:
import os
import huggingface_hub
import matplotlib.pyplot as plt
from dotenv import load_dotenv
from datasets import load_dataset

In [2]:
load_dotenv()
huggingface_hub.login(os.getenv("HF_TOKEN"))
dataset = load_dataset("nhhsag12/vlsp_2025_multimodal_rag")
dataset


Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


DatasetDict({
    train: Dataset({
        features: ['id', 'image_id', 'question', 'relevant_articles', 'question_type', 'choices', 'answer', 'image'],
        num_rows: 530
    })
})

In [3]:
data_train = dataset["train"]
print(len(data_train))

530


# Analyze the current dataset

## Question type

In [4]:
question_types = set()
for data in data_train:
    question_types.add(data["question_type"])
print(question_types)

{'Multiple choice', 'Yes/No'}


In [5]:
multiple_choice_count = 0
yes_no_count = 0
for data in data_train:
    if data["question_type"] == "Multiple choice":
        multiple_choice_count += 1
    else:
        yes_no_count += 1
print(f"Number of Multiple choice questions: {multiple_choice_count}")
print(f"Number of Yes/No questions: {yes_no_count}")

Number of Multiple choice questions: 376
Number of Yes/No questions: 154


## Relevant articles analysis

### Convert the string representation of list into list data type in relevant articles field

In [33]:
from copy import deepcopy
import ast
processed_data_train = []
for idx, data in enumerate(data_train):
    new_record = deepcopy(data)

    # Convert the string representation of the list of the dictionary elements to the list type
    new_record["relevant_articles"] = ast.literal_eval(new_record["relevant_articles"])

    processed_data_train.append(new_record)


### Analysis

In [35]:
max_relevant_articles = 0
min_relevant_articles = 999

for data in processed_data_train:
    max_relevant_articles = max(max_relevant_articles, len(data["relevant_articles"]))
    min_relevant_articles = min(min_relevant_articles, len(data["relevant_articles"]))

print(f"Max number of relevant articles: {max_relevant_articles}")
print(f"Min number of relevant articles: {min_relevant_articles}")

Max number of relevant articles: 8
Min number of relevant articles: 1


In [36]:
article_counts = [0 for _ in range(max_relevant_articles + 1)]
for data in processed_data_train:
    article_counts[len(data["relevant_articles"])] += 1

for i in range(1, 9):
    print(f"Number of articles with {i} relevant articles: {article_counts[i]}")

Number of articles with 1 relevant articles: 78
Number of articles with 2 relevant articles: 337
Number of articles with 3 relevant articles: 39
Number of articles with 4 relevant articles: 48
Number of articles with 5 relevant articles: 11
Number of articles with 6 relevant articles: 14
Number of articles with 7 relevant articles: 1
Number of articles with 8 relevant articles: 2


In [37]:
for data in processed_data_train:
    if len(data["relevant_articles"]) == 7:
        print(data)
        break

{'id': 'train_399', 'image_id': 'train_6_31', 'question': 'Từ 6:00 đến 22:00, tất cả các loại xe (cơ giới và thô sơ) không được phép rẽ vào đường Hồng Hà trên đoạn đường trong ảnh, đúng hay sai?', 'relevant_articles': [{'law_id': 'QCVN 41:2024/BGTVT', 'article_id': '22'}, {'law_id': 'QCVN 41:2024/BGTVT', 'article_id': 'B.23'}, {'law_id': 'QCVN 41:2024/BGTVT', 'article_id': '36'}, {'law_id': 'QCVN 41:2024/BGTVT', 'article_id': 'E.14'}, {'law_id': 'QCVN 41:2024/BGTVT', 'article_id': '41'}, {'law_id': 'QCVN 41:2024/BGTVT', 'article_id': 'F.5'}, {'law_id': 'QCVN 41:2024/BGTVT', 'article_id': 'F.10'}], 'question_type': 'Yes/No', 'choices': None, 'answer': 'Sai', 'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=465x406 at 0x7489C90E7A70>}


## Number of words in questions

In [6]:
import torch
from transformers import AutoProcessor, AutoModel, BitsAndBytesConfig, AutoTokenizer

bnb_config = BitsAndBytesConfig(load_in_4bit=True)
model = AutoModel.from_pretrained("google/siglip2-large-patch16-512",
                                  quantization_config=bnb_config,
                                  device_map="auto",
                                  attn_implementation="sdpa")
processor = AutoProcessor.from_pretrained("google/siglip2-base-patch16-224")


PackageNotFoundError: No package metadata was found for bitsandbytes

# Analyze the current database

In [46]:
import json

db_path = "../data/VLSP 2025 - MLQA-TSR Data Release/law_db/vlsp2025_law.json"
with open(db_path, "r") as f:
    db = json.load(f)

print(f"Number of records: {len(db)}")



Number of records: 2


In [50]:
for record in db:
    print(f"Record ID: {record['id']}")
    print(f"Law title: {record['title']}")
    print(f"Number of articles: {len(record['articles'])}")
    print("-"*10)

Record ID: QCVN 41:2024/BGTVT
Law title: QUY CHUẨN KỸ THUẬT QUỐC GIA VỀ BÁO HIỆU ĐƯỜNG BỘ
Number of articles: 310
----------
Record ID: 36/2024/QH15
Law title: LUẬT TRẬT TỰ, AN TOÀN GIAO THÔNG ĐƯỜNG BỘ
Number of articles: 89
----------


In [108]:
min_image_cnt = 999
max_image_cnt = 0

def count_image_tags(text):
    """
    Counts the occurrences of the <<IMAGE: ..../IMAGE>> format in a given text.

    Args:
        text: The input string to search within.

    Returns:
        The number of times the image tag format appears in the text.
    """
    # The regular expression to match the image tag format
    # <<IMAGE: any_characters_non_greedy /IMAGE>>
    pattern = r"<<IMAGE:.*?/IMAGE>>"

    # Find all occurrences of the pattern
    matches = re.findall(pattern, text)

    # Return the number of matches
    return len(matches)


for record in db:
    for article in record["articles"]:
        image_num = count_image_tags(article["text"])
        min_image_cnt = min(min_image_cnt, image_num)
        max_image_cnt = max(max_image_cnt, image_num)
        print(f"Record ID: {record['id']}")
        print(f"Law title: {record['title']}")
        print(f"Article ID: {article['id']}")
        # print(f"Number of articles: {len(record['articles'])}")
        print(f"Number of images: {image_num}")

print(f"Maximum number of images in a article: {max_image_cnt}")
print(f"Minimum number of images in a article: {min_image_cnt}")

Record ID: QCVN 41:2024/BGTVT
Law title: QUY CHUẨN KỸ THUẬT QUỐC GIA VỀ BÁO HIỆU ĐƯỜNG BỘ
Article ID: 1
Number of images: 0
Record ID: QCVN 41:2024/BGTVT
Law title: QUY CHUẨN KỸ THUẬT QUỐC GIA VỀ BÁO HIỆU ĐƯỜNG BỘ
Article ID: 2
Number of images: 0
Record ID: QCVN 41:2024/BGTVT
Law title: QUY CHUẨN KỸ THUẬT QUỐC GIA VỀ BÁO HIỆU ĐƯỜNG BỘ
Article ID: 3
Number of images: 0
Record ID: QCVN 41:2024/BGTVT
Law title: QUY CHUẨN KỸ THUẬT QUỐC GIA VỀ BÁO HIỆU ĐƯỜNG BỘ
Article ID: 4
Number of images: 0
Record ID: QCVN 41:2024/BGTVT
Law title: QUY CHUẨN KỸ THUẬT QUỐC GIA VỀ BÁO HIỆU ĐƯỜNG BỘ
Article ID: 5
Number of images: 0
Record ID: QCVN 41:2024/BGTVT
Law title: QUY CHUẨN KỸ THUẬT QUỐC GIA VỀ BÁO HIỆU ĐƯỜNG BỘ
Article ID: 6
Number of images: 0
Record ID: QCVN 41:2024/BGTVT
Law title: QUY CHUẨN KỸ THUẬT QUỐC GIA VỀ BÁO HIỆU ĐƯỜNG BỘ
Article ID: 7
Number of images: 0
Record ID: QCVN 41:2024/BGTVT
Law title: QUY CHUẨN KỸ THUẬT QUỐC GIA VỀ BÁO HIỆU ĐƯỜNG BỘ
Article ID: 8
Number of images: 0
Record I

In [109]:
txt = ""

for record in db[0]["articles"]:
    if record["id"] == "M.4":
        txt = record["text"]
        print(txt)
        break

Biển số I.401: Bắt đầu đường ưu tiên

<<IMAGE: image1132.jpg /IMAGE>>
Biển số I.402: Hết đoạn đường ưu tiên

<<IMAGE: image1134.jpg /IMAGE>>
Biển số I.405a: Đường cụt

<<IMAGE: image1136.jpg /IMAGE>>
<<TABLE: <table border="0" cellpadding="0" cellspacing="0" class="MsoNormalTable" style="width:100.0%;border-collapse:collapse;mso-padding-alt:0cm 0cm 0cm 0cm" width="100%">
<tr style="mso-yfti-irow:0;mso-yfti-firstrow:yes">
<td style="width:25.7%;border-top:solid black 1.0pt;border-left:
  solid black 1.0pt;border-bottom:none;border-right:none;mso-border-top-alt:
  solid black .5pt;mso-border-left-alt:solid black .5pt;background:white;
  padding:0cm 0cm 0cm 0cm" width="25%">
<p align="center" class="MsoNormal" style="margin-top:6.0pt;text-align:center;
  mso-layout-grid-align:none;text-autospace:none"><span lang="EN" style="color:black;mso-color-alt:windowtext;mso-ansi-language:EN">Lo</span><span lang="VI" style="color:black;mso-color-alt:windowtext;mso-ansi-language:VI">ại
  đường</span>

In [110]:
import re
from bs4 import BeautifulSoup

def convert_text_to_markdown(text):
    """
    Chuyển đổi một chuỗi văn bản có cấu trúc cụ thể sang định dạng Markdown.

    Hàm này xử lý các tiêu đề, danh sách, in đậm, hình ảnh và bảng HTML được nhúng.

    Args:
        text: Chuỗi văn bản đầu vào cần chuyển đổi.

    Returns:
        Một chuỗi đã được định dạng Markdown.
    """
    lines = text.split('\n')
    markdown_lines = []
    i = 0
    while i < len(lines):
        line = lines[i].strip()

        # Bỏ qua các dòng trống
        if not line:
            i += 1
            continue

        # Xử lý tiêu đề chính (ví dụ: G1.1. Nhóm...)
        if re.match(r'^G\d+\.\d+\.\s.*', line):
            markdown_lines.append(f"# {line}\n---")
            i += 1
            continue

        # Xử lý tiêu đề phụ (ví dụ: a. Vạch 1.1:)
        if re.match(r'^[a-z]\.\s.*', line):
            # Xóa dấu hai chấm ở cuối nếu có để tiêu đề gọn gàng hơn
            if line.endswith(':'):
                line = line[:-1]
            markdown_lines.append(f"## {line}")
            i += 1
            continue

        keywords = ['Ý nghĩa sử dụng:', 'Quy cách:', 'Minh họa:', 'Lưu ý:', 'Ghi chú:', 'Qui cách:']
        flag = False
        for keyword in keywords:
            if line.startswith(keyword):
                flag = True
                markdown_lines.append(f"**{keyword}** {line[len(keyword):].strip()}")
                i += 1
                break
        if flag:
            continue

        # Xử lý danh sách
        if line.startswith('-') or line.startswith('+') or line.startswith('*'):
            markdown_lines.append(f"* {line[1:].strip()}")
            i += 1
            continue

        # Xử lý hình ảnh (ví dụ: <<IMAGE: image498.jpg /IMAGE>>)
        if line.startswith('<<IMAGE:'):
            match = re.search(r'<<IMAGE:\s*(.*?)\s*/IMAGE>>', line)
            if match:
                image_filename = match.group(1)
                # Chú thích hình ảnh thường ở dòng tiếp theo
                caption = ""
                # if i + 1 < len(lines) and lines[i+1].strip().startswith("Hình G."):
                #     caption = lines[i+1].strip()
                #     i += 1 # Bỏ qua dòng chú thích đã xử lý
                markdown_lines.append(f"![{caption}]({image_filename})")
            i += 1
            continue

        # Xử lý bảng HTML
        if line.startswith('<<TABLE:'):
            table_html_content = ""
            # Trích xuất nội dung HTML của bảng
            while i < len(lines) and ' /TABLE>>' not in lines[i]:
                table_html_content += lines[i].replace('<<TABLE:', '') + "\n"
                i += 1
            table_html_content += lines[i].replace(' /TABLE>>', '') # Thêm dòng cuối cùng

            # Sử dụng BeautifulSoup để phân tích HTML
            soup = BeautifulSoup(table_html_content, 'html.parser')
            table = soup.find('table')
            if table:
                headers = [header.get_text(strip=True) for header in table.find_all('td')[:4]] # Giả sử 4 cột đầu tiên là header

                # Tạo hàng tiêu đề Markdown
                markdown_lines.append(f"| {' | '.join(headers)} |")
                markdown_lines.append(f"|{':---:|' * len(headers)}")

                # Xử lý các hàng dữ liệu
                rows = table.find_all('tr')[1:] # Bỏ qua hàng tiêu đề trong HTML
                for row in rows:
                    cols = [col.get_text(strip=True) for col in row.find_all('td')]
                    markdown_lines.append(f"| {' | '.join(cols)} |")

            i += 1
            continue

        # Xử lý các dòng văn bản thông thường
        markdown_lines.append(line)
        i += 1

    return '\n'.join(markdown_lines)

In [111]:
processed_text = convert_text_to_markdown(txt)
print(processed_text)

Biển số I.401: Bắt đầu đường ưu tiên
![](image1132.jpg)
Biển số I.402: Hết đoạn đường ưu tiên
![](image1134.jpg)
Biển số I.405a: Đường cụt
![](image1136.jpg)
| Loại
  đường | Đường
  đôi
  ngoài đô thị | Đường
  thông
  thường | Đường
  đô thị |
|:---:|:---:|:---:|:---:|
| z(mm) | 100 | 75 | 50 |
Biển số I.405b: Đường cụt
![](image1138.jpg)
Biển số I.405c: Đường cụt
![](image1140.jpg)
Biển số I.406: Được ưu tiên qua đường hẹp
![](image1142.jpg)
| Loại
  đường | Đường
  đôi
  ngoài đô thị | Đường
  thông thường | Đường
  đô thị |
|:---:|:---:|:---:|:---:|
| z(mm) | 100 | 75 | 50 |
Biển số I.407a: Đường một chiều
![](image1144.jpg)
Biển số I.407b: Đường một chiều
![](image1146.jpg)
Biển số I.407c: Đường một chiều
![](image1148.jpg)
| Loại
  đường | Đường
  đôi
  ngoài đô thị | Đường
  thông
  thường | Đường
  đô thị |
|:---:|:---:|:---:|:---:|
| z(mm) | 100 | 75 | 50 |
Biển số I.408: Nơi đỗ xe
![](image1150.jpg)
Biển số I.408a: Nơi đỗ xe một phần trên hè phố
![](image1152.jpg)
Biển số I.

In [80]:
html_table = """
<table border="0" cellpadding="0" cellspacing="0" class="MsoNormalTable" style="width:100.0%;border-collapse:collapse;mso-padding-alt:0cm 0cm 0cm 0cm" width="100%">
<tr style="mso-yfti-irow:0;mso-yfti-firstrow:yes">
<td style="width:18.48%;border-top:solid black 1.0pt;
  border-left:solid black 1.0pt;border-bottom:none;border-right:none;
  mso-border-top-alt:solid black .5pt;mso-border-left-alt:solid black .5pt;
  background:white;padding:0cm 0cm 0cm 0cm" valign="top" width="18%">
<p align="center" class="MsoNormal" style="margin-top:6.0pt;text-align:center;
  mso-layout-grid-align:none;text-autospace:none"><span lang="EN" style="color:black;mso-ansi-language:EN">T</span><span lang="VI" style="color:black;mso-ansi-language:VI">ốc độ hạn chế lớn nhất</span><span lang="VI" style="color:black"> </span><span lang="VI" style="color:black;
  mso-ansi-language:VI">hoặc tốc độ V<sub>85</sub> (km/h)</span><span lang="EN" style="mso-ansi-language:EN"><o:p></o:p></span></p>
</td>
<td style="width:32.66%;border-top:solid black 1.0pt;
  border-left:solid black 1.0pt;border-bottom:none;border-right:none;
  mso-border-top-alt:solid black .5pt;mso-border-left-alt:solid black .5pt;
  background:white;padding:0cm 0cm 0cm 0cm" valign="top" width="32%">
<p align="center" class="MsoNormal" style="margin-top:6.0pt;text-align:center;
  mso-layout-grid-align:none;text-autospace:none"><span lang="EN" style="color:black;mso-ansi-language:EN">T</span><span lang="VI" style="color:black;mso-ansi-language:VI">ầm nh</span><span lang="EN-US" style="color:black">ìn vư</span><span lang="VI" style="color:black;mso-ansi-language:
  VI">ợt xe tối thiểu (m) (chỉ d</span><span lang="EN-US" style="color:black">ùng
  đ</span><span lang="VI" style="color:black;mso-ansi-language:VI">ể x</span><span lang="EN-US" style="color:black">ác đ</span><span lang="VI" style="color:black;
  mso-ansi-language:VI">ịnh v</span><span lang="EN-US" style="color:black">ùng c</span><span lang="VI" style="color:black;mso-ansi-language:VI">ấm vượt)</span><span lang="EN" style="mso-ansi-language:EN"><o:p></o:p></span></p>
</td>
<td style="width:21.02%;border-top:solid black 1.0pt;
  border-left:solid black 1.0pt;border-bottom:none;border-right:none;
  mso-border-top-alt:solid black .5pt;mso-border-left-alt:solid black .5pt;
  background:white;padding:0cm 0cm 0cm 0cm" valign="top" width="21%">
<p align="center" class="MsoNormal" style="margin-top:6.0pt;text-align:center;
  mso-layout-grid-align:none;text-autospace:none"><span lang="EN" style="color:black;mso-ansi-language:EN">T</span><span lang="VI" style="color:black;mso-ansi-language:VI">ốc độ hạn chế</span><span lang="VI" style="color:black"> </span><span lang="VI" style="color:black;mso-ansi-language:
  VI">lớn nhất hoặc tốc độ V<sub>85</sub> (km/h)</span><span lang="EN" style="mso-ansi-language:EN"><o:p></o:p></span></p>
</td>
<td style="width:27.84%;border:solid black 1.0pt;
  border-bottom:none;mso-border-top-alt:solid black .5pt;mso-border-left-alt:
  solid black .5pt;mso-border-right-alt:solid black .5pt;background:white;
  padding:0cm 0cm 0cm 0cm" valign="top" width="27%">
<p align="center" class="MsoNormal" style="margin-top:6.0pt;text-align:center;
  mso-layout-grid-align:none;text-autospace:none"><span lang="EN" style="color:black;mso-ansi-language:EN">T</span><span lang="VI" style="color:black;mso-ansi-language:VI">ầm nh</span><span lang="EN-US" style="color:black">ìn vư</span><span lang="VI" style="color:black;mso-ansi-language:
  VI">ợt xe tối thiểu (m)</span><span lang="VI" style="color:black"> </span><span lang="VI" style="color:black;mso-ansi-language:VI">(chỉ d</span><span lang="EN-US" style="color:black">ùng đ</span><span lang="VI" style="color:black;
  mso-ansi-language:VI">ể x</span><span lang="EN-US" style="color:black">ác đ</span><span lang="VI" style="color:black;mso-ansi-language:VI">ịnh</span><span lang="VI" style="color:black"> </span><span lang="VI" style="color:black;mso-ansi-language:
  VI">v</span><span lang="EN-US" style="color:black">ùng c</span><span lang="VI" style="color:black;mso-ansi-language:VI">ấm vượt)</span><span lang="EN" style="mso-ansi-language:EN"><o:p></o:p></span></p>
</td>
</tr>
<tr style="mso-yfti-irow:1">
<td style="width:18.48%;border-top:solid black 1.0pt;
  border-left:solid black 1.0pt;border-bottom:none;border-right:none;
  mso-border-top-alt:solid black .5pt;mso-border-left-alt:solid black .5pt;
  background:white;padding:0cm 0cm 0cm 0cm" valign="top" width="18%">
<p align="center" class="MsoNormal" style="margin-top:6.0pt;text-align:center;
  mso-layout-grid-align:none;text-autospace:none"><span lang="EN" style="color:black;mso-ansi-language:EN">30</span><span lang="EN" style="mso-ansi-language:EN"><o:p></o:p></span></p>
</td>
<td style="width:32.66%;border-top:solid black 1.0pt;
  border-left:solid black 1.0pt;border-bottom:none;border-right:none;
  mso-border-top-alt:solid black .5pt;mso-border-left-alt:solid black .5pt;
  background:white;padding:0cm 0cm 0cm 0cm" valign="top" width="32%">
<p align="center" class="MsoNormal" style="margin-top:6.0pt;text-align:center;
  mso-layout-grid-align:none;text-autospace:none"><span lang="EN" style="color:black;mso-ansi-language:EN">120</span><span lang="EN" style="mso-ansi-language:EN"><o:p></o:p></span></p>
</td>
<td style="width:21.02%;border-top:solid black 1.0pt;
  border-left:solid black 1.0pt;border-bottom:none;border-right:none;
  mso-border-top-alt:solid black .5pt;mso-border-left-alt:solid black .5pt;
  background:white;padding:0cm 0cm 0cm 0cm" valign="top" width="21%">
<p align="center" class="MsoNormal" style="margin-top:6.0pt;text-align:center;
  mso-layout-grid-align:none;text-autospace:none"><span lang="EN" style="color:black;mso-ansi-language:EN">80</span><span lang="EN" style="mso-ansi-language:EN"><o:p></o:p></span></p>
</td>
<td style="width:27.84%;border:solid black 1.0pt;
  border-bottom:none;mso-border-top-alt:solid black .5pt;mso-border-left-alt:
  solid black .5pt;mso-border-right-alt:solid black .5pt;background:white;
  padding:0cm 0cm 0cm 0cm" valign="top" width="27%">
<p align="center" class="MsoNormal" style="margin-top:6.0pt;text-align:center;
  mso-layout-grid-align:none;text-autospace:none"><span lang="EN" style="color:black;mso-ansi-language:EN">245</span><span lang="EN" style="mso-ansi-language:EN"><o:p></o:p></span></p>
</td>
</tr>
<tr style="mso-yfti-irow:2">
<td style="width:18.48%;border-top:solid black 1.0pt;
  border-left:solid black 1.0pt;border-bottom:none;border-right:none;
  mso-border-top-alt:solid black .5pt;mso-border-left-alt:solid black .5pt;
  background:white;padding:0cm 0cm 0cm 0cm" valign="top" width="18%">
<p align="center" class="MsoNormal" style="margin-top:6.0pt;text-align:center;
  mso-layout-grid-align:none;text-autospace:none"><span lang="EN" style="color:black;mso-ansi-language:EN">40</span><span lang="EN" style="mso-ansi-language:EN"><o:p></o:p></span></p>
</td>
<td style="width:32.66%;border-top:solid black 1.0pt;
  border-left:solid black 1.0pt;border-bottom:none;border-right:none;
  mso-border-top-alt:solid black .5pt;mso-border-left-alt:solid black .5pt;
  background:white;padding:0cm 0cm 0cm 0cm" valign="top" width="32%">
<p align="center" class="MsoNormal" style="margin-top:6.0pt;text-align:center;
  mso-layout-grid-align:none;text-autospace:none"><span lang="EN" style="color:black;mso-ansi-language:EN">140</span><span lang="EN" style="mso-ansi-language:EN"><o:p></o:p></span></p>
</td>
<td style="width:21.02%;border-top:solid black 1.0pt;
  border-left:solid black 1.0pt;border-bottom:none;border-right:none;
  mso-border-top-alt:solid black .5pt;mso-border-left-alt:solid black .5pt;
  background:white;padding:0cm 0cm 0cm 0cm" valign="top" width="21%">
<p align="center" class="MsoNormal" style="margin-top:6.0pt;text-align:center;
  mso-layout-grid-align:none;text-autospace:none"><span lang="EN" style="color:black;mso-ansi-language:EN">90</span><span lang="EN" style="mso-ansi-language:EN"><o:p></o:p></span></p>
</td>
<td style="width:27.84%;border:solid black 1.0pt;
  border-bottom:none;mso-border-top-alt:solid black .5pt;mso-border-left-alt:
  solid black .5pt;mso-border-right-alt:solid black .5pt;background:white;
  padding:0cm 0cm 0cm 0cm" valign="top" width="27%">
<p align="center" class="MsoNormal" style="margin-top:6.0pt;text-align:center;
  mso-layout-grid-align:none;text-autospace:none"><span lang="EN" style="color:black;mso-ansi-language:EN">280</span><span lang="EN" style="mso-ansi-language:EN"><o:p></o:p></span></p>
</td>
</tr>
<tr style="mso-yfti-irow:3">
<td style="width:18.48%;border-top:solid black 1.0pt;
  border-left:solid black 1.0pt;border-bottom:none;border-right:none;
  mso-border-top-alt:solid black .5pt;mso-border-left-alt:solid black .5pt;
  background:white;padding:0cm 0cm 0cm 0cm" valign="top" width="18%">
<p align="center" class="MsoNormal" style="margin-top:6.0pt;text-align:center;
  mso-layout-grid-align:none;text-autospace:none"><span lang="EN" style="color:black;mso-ansi-language:EN">50</span><span lang="EN" style="mso-ansi-language:EN"><o:p></o:p></span></p>
</td>
<td style="width:32.66%;border-top:solid black 1.0pt;
  border-left:solid black 1.0pt;border-bottom:none;border-right:none;
  mso-border-top-alt:solid black .5pt;mso-border-left-alt:solid black .5pt;
  background:white;padding:0cm 0cm 0cm 0cm" valign="top" width="32%">
<p align="center" class="MsoNormal" style="margin-top:6.0pt;text-align:center;
  mso-layout-grid-align:none;text-autospace:none"><span lang="EN" style="color:black;mso-ansi-language:EN">160</span><span lang="EN" style="mso-ansi-language:EN"><o:p></o:p></span></p>
</td>
<td style="width:21.02%;border-top:solid black 1.0pt;
  border-left:solid black 1.0pt;border-bottom:none;border-right:none;
  mso-border-top-alt:solid black .5pt;mso-border-left-alt:solid black .5pt;
  background:white;padding:0cm 0cm 0cm 0cm" valign="top" width="21%">
<p align="center" class="MsoNormal" style="margin-top:6.0pt;text-align:center;
  mso-layout-grid-align:none;text-autospace:none"><span lang="EN" style="color:black;mso-ansi-language:EN">100</span><span lang="EN" style="mso-ansi-language:EN"><o:p></o:p></span></p>
</td>
<td style="width:27.84%;border:solid black 1.0pt;
  border-bottom:none;mso-border-top-alt:solid black .5pt;mso-border-left-alt:
  solid black .5pt;mso-border-right-alt:solid black .5pt;background:white;
  padding:0cm 0cm 0cm 0cm" valign="top" width="27%">
<p align="center" class="MsoNormal" style="margin-top:6.0pt;text-align:center;
  mso-layout-grid-align:none;text-autospace:none"><span lang="EN" style="color:black;mso-ansi-language:EN">320</span><span lang="EN" style="mso-ansi-language:EN"><o:p></o:p></span></p>
</td>
</tr>
<tr style="mso-yfti-irow:4">
<td style="width:18.48%;border-top:solid black 1.0pt;
  border-left:solid black 1.0pt;border-bottom:none;border-right:none;
  mso-border-top-alt:solid black .5pt;mso-border-left-alt:solid black .5pt;
  background:white;padding:0cm 0cm 0cm 0cm" valign="top" width="18%">
<p align="center" class="MsoNormal" style="margin-top:6.0pt;text-align:center;
  mso-layout-grid-align:none;text-autospace:none"><span lang="EN" style="color:black;mso-ansi-language:EN">60</span><span lang="EN" style="mso-ansi-language:EN"><o:p></o:p></span></p>
</td>
<td style="width:32.66%;border-top:solid black 1.0pt;
  border-left:solid black 1.0pt;border-bottom:none;border-right:none;
  mso-border-top-alt:solid black .5pt;mso-border-left-alt:solid black .5pt;
  background:white;padding:0cm 0cm 0cm 0cm" valign="top" width="32%">
<p align="center" class="MsoNormal" style="margin-top:6.0pt;text-align:center;
  mso-layout-grid-align:none;text-autospace:none"><span lang="EN" style="color:black;mso-ansi-language:EN">180</span><span lang="EN" style="mso-ansi-language:EN"><o:p></o:p></span></p>
</td>
<td style="width:21.02%;border-top:solid black 1.0pt;
  border-left:solid black 1.0pt;border-bottom:none;border-right:none;
  mso-border-top-alt:solid black .5pt;mso-border-left-alt:solid black .5pt;
  background:white;padding:0cm 0cm 0cm 0cm" valign="top" width="21%">
<p align="center" class="MsoNormal" style="margin-top:6.0pt;text-align:center;
  mso-layout-grid-align:none;text-autospace:none"><span lang="EN" style="color:black;mso-ansi-language:EN">110</span><span lang="EN" style="mso-ansi-language:EN"><o:p></o:p></span></p>
</td>
<td style="width:27.84%;border:solid black 1.0pt;
  border-bottom:none;mso-border-top-alt:solid black .5pt;mso-border-left-alt:
  solid black .5pt;mso-border-right-alt:solid black .5pt;background:white;
  padding:0cm 0cm 0cm 0cm" valign="top" width="27%">
<p align="center" class="MsoNormal" style="margin-top:6.0pt;text-align:center;
  mso-layout-grid-align:none;text-autospace:none"><span lang="EN" style="color:black;mso-ansi-language:EN">355</span><span lang="EN" style="mso-ansi-language:EN"><o:p></o:p></span></p>
</td>
</tr>
<tr style="mso-yfti-irow:5;mso-yfti-lastrow:yes">
<td style="width:18.48%;border:solid black 1.0pt;
  border-right:none;mso-border-top-alt:solid black .5pt;mso-border-left-alt:
  solid black .5pt;mso-border-bottom-alt:solid black .5pt;background:white;
  padding:0cm 0cm 0cm 0cm" valign="top" width="18%">
<p align="center" class="MsoNormal" style="margin-top:6.0pt;text-align:center;
  mso-layout-grid-align:none;text-autospace:none"><span lang="EN" style="color:black;mso-ansi-language:EN">70</span><span lang="EN" style="mso-ansi-language:EN"><o:p></o:p></span></p>
</td>
<td style="width:32.66%;border:solid black 1.0pt;
  border-right:none;mso-border-top-alt:solid black .5pt;mso-border-left-alt:
  solid black .5pt;mso-border-bottom-alt:solid black .5pt;background:white;
  padding:0cm 0cm 0cm 0cm" valign="top" width="32%">
<p align="center" class="MsoNormal" style="margin-top:6.0pt;text-align:center;
  mso-layout-grid-align:none;text-autospace:none"><span lang="EN" style="color:black;mso-ansi-language:EN">210</span><span lang="EN" style="mso-ansi-language:EN"><o:p></o:p></span></p>
</td>
<td style="width:21.02%;border:solid black 1.0pt;
  border-right:none;mso-border-top-alt:solid black .5pt;mso-border-left-alt:
  solid black .5pt;mso-border-bottom-alt:solid black .5pt;background:white;
  padding:0cm 0cm 0cm 0cm" valign="top" width="21%">
<p align="center" class="MsoNormal" style="margin-top:6.0pt;text-align:center;
  mso-layout-grid-align:none;text-autospace:none"><span lang="EN" style="color:black;mso-ansi-language:EN">120</span><span lang="EN" style="mso-ansi-language:EN"><o:p></o:p></span></p>
</td>
<td style="width:27.84%;border:solid black 1.0pt;
  mso-border-alt:solid black .5pt;background:white;padding:0cm 0cm 0cm 0cm" valign="top" width="27%">
<p align="center" class="MsoNormal" style="margin-top:6.0pt;text-align:center;
  mso-layout-grid-align:none;text-autospace:none"><span lang="EN" style="color:black;mso-ansi-language:EN">395</span><span lang="EN" style="mso-ansi-language:EN"><o:p></o:p></span></p>
</td>
</tr>
</table>
"""



In [82]:
from bs4 import BeautifulSoup
import re

def html_table_to_markdown(html_content):
    """
    Convert HTML table to Markdown format
    """
    # Parse the HTML
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find the table
    table = soup.find('table')
    if not table:
        return "No table found in the HTML content"

    # Extract all rows
    rows = table.find_all('tr')

    markdown_rows = []

    for i, row in enumerate(rows):
        # Get all cells (td or th)
        cells = row.find_all(['td', 'th'])

        # Extract text from each cell and clean it up
        cell_texts = []
        for cell in cells:
            # Get text and clean up whitespace
            text = cell.get_text(strip=True)
            # Remove extra whitespace and line breaks
            text = re.sub(r'\s+', ' ', text)
            cell_texts.append(text)

        # Create markdown row
        if cell_texts:  # Only add non-empty rows
            markdown_row = '| ' + ' | '.join(cell_texts) + ' |'
            markdown_rows.append(markdown_row)

            # Add separator after header row (first row)
            if i == 0:
                separator = '|' + '---|' * len(cell_texts)
                markdown_rows.append(separator)

    return '\n'.join(markdown_rows)

def clean_html_content(html_content):
    """
    Clean up HTML content by removing MS Word specific tags and formatting
    """
    # Remove MS Word specific attributes and tags
    html_content = re.sub(r'style="[^"]*"', '', html_content)
    html_content = re.sub(r'class="[^"]*"', '', html_content)
    html_content = re.sub(r'mso-[^:]*:[^;]*;?', '', html_content)
    html_content = re.sub(r'<o:p[^>]*>.*?</o:p>', '', html_content)
    html_content = re.sub(r'lang="[^"]*"', '', html_content)
    html_content = re.sub(r'valign="[^"]*"', '', html_content)
    html_content = re.sub(r'width="[^"]*"', '', html_content)
    html_content = re.sub(r'align="[^"]*"', '', html_content)

    return html_content

In [93]:
# clean_html_table = clean_html_content(html_table)
markdown_table = html_table_to_markdown(html_table)
print(markdown_table)

| Tốc độ hạn chế lớn nhấthoặc tốc độ V85(km/h) | Tầm nhìn vượt xe tối thiểu (m) (chỉ dùng để xác định vùng cấm vượt) | Tốc độ hạn chếlớn nhất hoặc tốc độ V85(km/h) | Tầm nhìn vượt xe tối thiểu (m)(chỉ dùng để xác địnhvùng cấm vượt) |
|---|---|---|---|
| 30 | 120 | 80 | 245 |
| 40 | 140 | 90 | 280 |
| 50 | 160 | 100 | 320 |
| 60 | 180 | 110 | 355 |
| 70 | 210 | 120 | 395 |


In [96]:
from bs4 import BeautifulSoup
import re

def html_table_to_markdown(html_content):
    """
    Convert HTML table to Markdown format
    """
    # Parse the HTML
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find the table
    table = soup.find('table')
    if not table:
        return "No table found in the HTML content"

    # Extract all rows
    rows = table.find_all('tr')

    markdown_rows = []

    for i, row in enumerate(rows):
        # Get all cells (td or th)
        cells = row.find_all(['td', 'th'])

        # Extract text from each cell and clean it up
        cell_texts = []
        for cell in cells:
            # Get text and clean up whitespace
            text = cell.get_text(strip=True)
            # Remove extra whitespace and line breaks
            text = re.sub(r'\s+', ' ', text)
            cell_texts.append(text)

        # Create markdown row
        if cell_texts:  # Only add non-empty rows
            markdown_row = '| ' + ' | '.join(cell_texts) + ' |'
            markdown_rows.append(markdown_row)

            # Add separator after header row (first row)
            if i == 0:
                separator = '|' + '---|' * len(cell_texts)
                markdown_rows.append(separator)

    return '\n'.join(markdown_rows)

def clean_html_content(html_content):
    """
    Clean up HTML content by removing MS Word specific tags and formatting
    """
    # Remove MS Word specific attributes and tags
    html_content = re.sub(r'style="[^"]*"', '', html_content)
    html_content = re.sub(r'class="[^"]*"', '', html_content)
    html_content = re.sub(r'mso-[^:]*:[^;]*;?', '', html_content)
    html_content = re.sub(r'<o:p[^>]*>.*?</o:p>', '', html_content)
    html_content = re.sub(r'lang="[^"]*"', '', html_content)
    html_content = re.sub(r'valign="[^"]*"', '', html_content)
    html_content = re.sub(r'width="[^"]*"', '', html_content)
    html_content = re.sub(r'align="[^"]*"', '', html_content)

    return html_content

# Example usage with your HTML content
html_content = '''<table border="0" cellpadding="0" cellspacing="0" class="MsoNormalTable" style="width:100.0%;border-collapse:collapse;mso-padding-alt:0cm 0cm 0cm 0cm" width="100%">
<tr style="mso-yfti-irow:0;mso-yfti-firstrow:yes">
<td style="width:18.48%;border-top:solid black 1.0pt;
  border-left:solid black 1.0pt;border-bottom:none;border-right:none;
  mso-border-top-alt:solid black .5pt;mso-border-left-alt:solid black .5pt;
  background:white;padding:0cm 0cm 0cm 0cm" valign="top" width="18%">
<p align="center" class="MsoNormal" style="margin-top:6.0pt;text-align:center;
  mso-layout-grid-align:none;text-autospace:none"><span lang="EN" style="color:black;mso-ansi-language:EN">T</span><span lang="VI" style="color:black;mso-ansi-language:VI">ốc độ hạn chế lớn nhất</span><span lang="VI" style="color:black"> </span><span lang="VI" style="color:black;
  mso-ansi-language:VI">hoặc tốc độ V<sub>85</sub> (km/h)</span><span lang="EN" style="mso-ansi-language:EN"><o:p></o:p></span></p>
</td>
<td style="width:32.66%;border-top:solid black 1.0pt;
  border-left:solid black 1.0pt;border-bottom:none;border-right:none;
  mso-border-top-alt:solid black .5pt;mso-border-left-alt:solid black .5pt;
  background:white;padding:0cm 0cm 0cm 0cm" valign="top" width="32%">
<p align="center" class="MsoNormal" style="margin-top:6.0pt;text-align:center;
  mso-layout-grid-align:none;text-autospace:none"><span lang="EN" style="color:black;mso-ansi-language:EN">T</span><span lang="VI" style="color:black;mso-ansi-language:VI">ầm nhìn vượt xe tối thiểu (m) (chỉ dùng để xác định vùng cấm vượt)</span><span lang="EN" style="mso-ansi-language:EN"><o:p></o:p></span></p>
</td>
<td style="width:21.02%;border-top:solid black 1.0pt;
  border-left:solid black 1.0pt;border-bottom:none;border-right:none;
  mso-border-top-alt:solid black .5pt;mso-border-left-alt:solid black .5pt;
  background:white;padding:0cm 0cm 0cm 0cm" valign="top" width="21%">
<p align="center" class="MsoNormal" style="margin-top:6.0pt;text-align:center;
  mso-layout-grid-align:none;text-autospace:none"><span lang="EN" style="color:black;mso-ansi-language:EN">T</span><span lang="VI" style="color:black;mso-ansi-language:VI">ốc độ hạn chế lớn nhất hoặc tốc độ V<sub>85</sub> (km/h)</span><span lang="EN" style="mso-ansi-language:EN"><o:p></o:p></span></p>
</td>
<td style="width:27.84%;border:solid black 1.0pt;
  border-bottom:none;mso-border-top-alt:solid black .5pt;mso-border-left-alt:
  solid black .5pt;mso-border-right-alt:solid black .5pt;background:white;
  padding:0cm 0cm 0cm 0cm" valign="top" width="27%">
<p align="center" class="MsoNormal" style="margin-top:6.0pt;text-align:center;
  mso-layout-grid-align:none;text-autospace:none"><span lang="EN" style="color:black;mso-ansi-language:EN">T</span><span lang="VI" style="color:black;mso-ansi-language:VI">ầm nhìn vượt xe tối thiểu (m) (chỉ dùng để xác định vùng cấm vượt)</span><span lang="EN" style="mso-ansi-language:EN"><o:p></o:p></span></p>
</td>
</tr>
<tr style="mso-yfti-irow:1">
<td style="width:18.48%;border-top:solid black 1.0pt;
  border-left:solid black 1.0pt;border-bottom:none;border-right:none;
  mso-border-top-alt:solid black .5pt;mso-border-left-alt:solid black .5pt;
  background:white;padding:0cm 0cm 0cm 0cm" valign="top" width="18%">
<p align="center" class="MsoNormal" style="margin-top:6.0pt;text-align:center;
  mso-layout-grid-align:none;text-autospace:none"><span lang="EN" style="color:black;mso-ansi-language:EN">30</span><span lang="EN" style="mso-ansi-language:EN"><o:p></o:p></span></p>
</td>
<td style="width:32.66%;border-top:solid black 1.0pt;
  border-left:solid black 1.0pt;border-bottom:none;border-right:none;
  mso-border-top-alt:solid black .5pt;mso-border-left-alt:solid black .5pt;
  background:white;padding:0cm 0cm 0cm 0cm" valign="top" width="32%">
<p align="center" class="MsoNormal" style="margin-top:6.0pt;text-align:center;
  mso-layout-grid-align:none;text-autospace:none"><span lang="EN" style="color:black;mso-ansi-language:EN">120</span><span lang="EN" style="mso-ansi-language:EN"><o:p></o:p></span></p>
</td>
<td style="width:21.02%;border-top:solid black 1.0pt;
  border-left:solid black 1.0pt;border-bottom:none;border-right:none;
  mso-border-top-alt:solid black .5pt;mso-border-left-alt:solid black .5pt;
  background:white;padding:0cm 0cm 0cm 0cm" valign="top" width="21%">
<p align="center" class="MsoNormal" style="margin-top:6.0pt;text-align:center;
  mso-layout-grid-align:none;text-autospace:none"><span lang="EN" style="color:black;mso-ansi-language:EN">80</span><span lang="EN" style="mso-ansi-language:EN"><o:p></o:p></span></p>
</td>
<td style="width:27.84%;border:solid black 1.0pt;
  border-bottom:none;mso-border-top-alt:solid black .5pt;mso-border-left-alt:
  solid black .5pt;mso-border-right-alt:solid black .5pt;background:white;
  padding:0cm 0cm 0cm 0cm" valign="top" width="27%">
<p align="center" class="MsoNormal" style="margin-top:6.0pt;text-align:center;
  mso-layout-grid-align:none;text-autospace:none"><span lang="EN" style="color:black;mso-ansi-language:EN">245</span><span lang="EN" style="mso-ansi-language:EN"><o:p></o:p></span></p>
</td>
</tr>
<tr style="mso-yfti-irow:2">
<td style="width:18.48%;border-top:solid black 1.0pt;
  border-left:solid black 1.0pt;border-bottom:none;border-right:none;
  mso-border-top-alt:solid black .5pt;mso-border-left-alt:solid black .5pt;
  background:white;padding:0cm 0cm 0cm 0cm" valign="top" width="18%">
<p align="center" class="MsoNormal" style="margin-top:6.0pt;text-align:center;
  mso-layout-grid-align:none;text-autospace:none"><span lang="EN" style="color:black;mso-ansi-language:EN">40</span><span lang="EN" style="mso-ansi-language:EN"><o:p></o:p></span></p>
</td>
<td style="width:32.66%;border-top:solid black 1.0pt;
  border-left:solid black 1.0pt;border-bottom:none;border-right:none;
  mso-border-top-alt:solid black .5pt;mso-border-left-alt:solid black .5pt;
  background:white;padding:0cm 0cm 0cm 0cm" valign="top" width="32%">
<p align="center" class="MsoNormal" style="margin-top:6.0pt;text-align:center;
  mso-layout-grid-align:none;text-autospace:none"><span lang="EN" style="color:black;mso-ansi-language:EN">140</span><span lang="EN" style="mso-ansi-language:EN"><o:p></o:p></span></p>
</td>
<td style="width:21.02%;border-top:solid black 1.0pt;
  border-left:solid black 1.0pt;border-bottom:none;border-right:none;
  mso-border-top-alt:solid black .5pt;mso-border-left-alt:solid black .5pt;
  background:white;padding:0cm 0cm 0cm 0cm" valign="top" width="21%">
<p align="center" class="MsoNormal" style="margin-top:6.0pt;text-align:center;
  mso-layout-grid-align:none;text-autospace:none"><span lang="EN" style="color:black;mso-ansi-language:EN">90</span><span lang="EN" style="mso-ansi-language:EN"><o:p></o:p></span></p>
</td>
<td style="width:27.84%;border:solid black 1.0pt;
  border-bottom:none;mso-border-top-alt:solid black .5pt;mso-border-left-alt:
  solid black .5pt;mso-border-right-alt:solid black .5pt;background:white;
  padding:0cm 0cm 0cm 0cm" valign="top" width="27%">
<p align="center" class="MsoNormal" style="margin-top:6.0pt;text-align:center;
  mso-layout-grid-align:none;text-autospace:none"><span lang="EN" style="color:black;mso-ansi-language:EN">280</span><span lang="EN" style="mso-ansi-language:EN"><o:p></o:p></span></p>
</td>
</tr>
<tr style="mso-yfti-irow:3">
<td style="width:18.48%;border-top:solid black 1.0pt;
  border-left:solid black 1.0pt;border-bottom:none;border-right:none;
  mso-border-top-alt:solid black .5pt;mso-border-left-alt:solid black .5pt;
  background:white;padding:0cm 0cm 0cm 0cm" valign="top" width="18%">
<p align="center" class="MsoNormal" style="margin-top:6.0pt;text-align:center;
  mso-layout-grid-align:none;text-autospace:none"><span lang="EN" style="color:black;mso-ansi-language:EN">50</span><span lang="EN" style="mso-ansi-language:EN"><o:p></o:p></span></p>
</td>
<td style="width:32.66%;border-top:solid black 1.0pt;
  border-left:solid black 1.0pt;border-bottom:none;border-right:none;
  mso-border-top-alt:solid black .5pt;mso-border-left-alt:solid black .5pt;
  background:white;padding:0cm 0cm 0cm 0cm" valign="top" width="32%">
<p align="center" class="MsoNormal" style="margin-top:6.0pt;text-align:center;
  mso-layout-grid-align:none;text-autospace:none"><span lang="EN" style="color:black;mso-ansi-language:EN">160</span><span lang="EN" style="mso-ansi-language:EN"><o:p></o:p></span></p>
</td>
<td style="width:21.02%;border-top:solid black 1.0pt;
  border-left:solid black 1.0pt;border-bottom:none;border-right:none;
  mso-border-top-alt:solid black .5pt;mso-border-left-alt:solid black .5pt;
  background:white;padding:0cm 0cm 0cm 0cm" valign="top" width="21%">
<p align="center" class="MsoNormal" style="margin-top:6.0pt;text-align:center;
  mso-layout-grid-align:none;text-autospace:none"><span lang="EN" style="color:black;mso-ansi-language:EN">100</span><span lang="EN" style="mso-ansi-language:EN"><o:p></o:p></span></p>
</td>
<td style="width:27.84%;border:solid black 1.0pt;
  border-bottom:none;mso-border-top-alt:solid black .5pt;mso-border-left-alt:
  solid black .5pt;mso-border-right-alt:solid black .5pt;background:white;
  padding:0cm 0cm 0cm 0cm" valign="top" width="27%">
<p align="center" class="MsoNormal" style="margin-top:6.0pt;text-align:center;
  mso-layout-grid-align:none;text-autospace:none"><span lang="EN" style="color:black;mso-ansi-language:EN">320</span><span lang="EN" style="mso-ansi-language:EN"><o:p></o:p></span></p>
</td>
</tr>
<tr style="mso-yfti-irow:4">
<td style="width:18.48%;border-top:solid black 1.0pt;
  border-left:solid black 1.0pt;border-bottom:none;border-right:none;
  mso-border-top-alt:solid black .5pt;mso-border-left-alt:solid black .5pt;
  background:white;padding:0cm 0cm 0cm 0cm" valign="top" width="18%">
<p align="center" class="MsoNormal" style="margin-top:6.0pt;text-align:center;
  mso-layout-grid-align:none;text-autospace:none"><span lang="EN" style="color:black;mso-ansi-language:EN">60</span><span lang="EN" style="mso-ansi-language:EN"><o:p></o:p></span></p>
</td>
<td style="width:32.66%;border-top:solid black 1.0pt;
  border-left:solid black 1.0pt;border-bottom:none;border-right:none;
  mso-border-top-alt:solid black .5pt;mso-border-left-alt:solid black .5pt;
  background:white;padding:0cm 0cm 0cm 0cm" valign="top" width="32%">
<p align="center" class="MsoNormal" style="margin-top:6.0pt;text-align:center;
  mso-layout-grid-align:none;text-autospace:none"><span lang="EN" style="color:black;mso-ansi-language:EN">180</span><span lang="EN" style="mso-ansi-language:EN"><o:p></o:p></span></p>
</td>
<td style="width:21.02%;border-top:solid black 1.0pt;
  border-left:solid black 1.0pt;border-bottom:none;border-right:none;
  mso-border-top-alt:solid black .5pt;mso-border-left-alt:solid black .5pt;
  background:white;padding:0cm 0cm 0cm 0cm" valign="top" width="21%">
<p align="center" class="MsoNormal" style="margin-top:6.0pt;text-align:center;
  mso-layout-grid-align:none;text-autospace:none"><span lang="EN" style="color:black;mso-ansi-language:EN">110</span><span lang="EN" style="mso-ansi-language:EN"><o:p></o:p></span></p>
</td>
<td style="width:27.84%;border:solid black 1.0pt;
  border-bottom:none;mso-border-top-alt:solid black .5pt;mso-border-left-alt:
  solid black .5pt;mso-border-right-alt:solid black .5pt;background:white;
  padding:0cm 0cm 0cm 0cm" valign="top" width="27%">
<p align="center" class="MsoNormal" style="margin-top:6.0pt;text-align:center;
  mso-layout-grid-align:none;text-autospace:none"><span lang="EN" style="color:black;mso-ansi-language:EN">355</span><span lang="EN" style="mso-ansi-language:EN"><o:p></o:p></span></p>
</td>
</tr>
<tr style="mso-yfti-irow:5;mso-yfti-lastrow:yes">
<td style="width:18.48%;border:solid black 1.0pt;
  border-right:none;mso-border-top-alt:solid black .5pt;mso-border-left-alt:
  solid black .5pt;mso-border-bottom-alt:solid black .5pt;background:white;
  padding:0cm 0cm 0cm 0cm" valign="top" width="18%">
<p align="center" class="MsoNormal" style="margin-top:6.0pt;text-align:center;
  mso-layout-grid-align:none;text-autospace:none"><span lang="EN" style="color:black;mso-ansi-language:EN">70</span><span lang="EN" style="mso-ansi-language:EN"><o:p></o:p></span></p>
</td>
<td style="width:32.66%;border:solid black 1.0pt;
  border-right:none;mso-border-top-alt:solid black .5pt;mso-border-left-alt:
  solid black .5pt;mso-border-bottom-alt:solid black .5pt;background:white;
  padding:0cm 0cm 0cm 0cm" valign="top" width="32%">
<p align="center" class="MsoNormal" style="margin-top:6.0pt;text-align:center;
  mso-layout-grid-align:none;text-autospace:none"><span lang="EN" style="color:black;mso-ansi-language:EN">210</span><span lang="EN" style="mso-ansi-language:EN"><o:p></o:p></span></p>
</td>
<td style="width:21.02%;border:solid black 1.0pt;
  border-right:none;mso-border-top-alt:solid black .5pt;mso-border-left-alt:
  solid black .5pt;mso-border-bottom-alt:solid black .5pt;background:white;
  padding:0cm 0cm 0cm 0cm" valign="top" width="21%">
<p align="center" class="MsoNormal" style="margin-top:6.0pt;text-align:center;
  mso-layout-grid-align:none;text-autospace:none"><span lang="EN" style="color:black;mso-ansi-language:EN">120</span><span lang="EN" style="mso-ansi-language:EN"><o:p></o:p></span></p>
</td>
<td style="width:27.84%;border:solid black 1.0pt;
  mso-border-alt:solid black .5pt;background:white;padding:0cm 0cm 0cm 0cm" valign="top" width="27%">
<p align="center" class="MsoNormal" style="margin-top:6.0pt;text-align:center;
  mso-layout-grid-align:none;text-autospace:none"><span lang="EN" style="color:black;mso-ansi-language:EN">395</span><span lang="EN" style="mso-ansi-language:EN"><o:p></o:p></span></p>
</td>
</tr>
</table>'''

# Convert to markdown
markdown_table = html_table_to_markdown(html_content)
print("Markdown Table:")
print(markdown_table)

# If you want to read from a file instead:
def convert_html_file_to_markdown(file_path):
    """
    Convert HTML table from a file to markdown
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            html_content = file.read()
        return html_table_to_markdown(html_content)
    except FileNotFoundError:
        return f"File {file_path} not found"
    except Exception as e:
        return f"Error reading file: {str(e)}"

# Example usage:
# markdown_result = convert_html_file_to_markdown('your_table.html')
# print(markdown_result)

Markdown Table:
| Tốc độ hạn chế lớn nhấthoặc tốc độ V85(km/h) | Tầm nhìn vượt xe tối thiểu (m) (chỉ dùng để xác định vùng cấm vượt) | Tốc độ hạn chế lớn nhất hoặc tốc độ V85(km/h) | Tầm nhìn vượt xe tối thiểu (m) (chỉ dùng để xác định vùng cấm vượt) |
|---|---|---|---|
| 30 | 120 | 80 | 245 |
| 40 | 140 | 90 | 280 |
| 50 | 160 | 100 | 320 |
| 60 | 180 | 110 | 355 |
| 70 | 210 | 120 | 395 |
