In [6]:
import re


def tokenize(s):
    """Split a bracketed tree string into a flat list of tokens.

    Every '(' and every ')' becomes its own token. Any other maximal run of
    characters that are neither whitespace nor parentheses becomes one token.
    Whitespace itself is discarded.

    Args:
        s: The text to split.

    Returns:
        list[str]: The tokens in the order they appear in ``s``.
    """
    return [match.group(0) for match in re.finditer(r'\(|\)|[^\s()]+', s)]


def parse(tokens):
    """Consume one expression from the front of ``tokens`` and return it.

    If the first token is not '(', that token is returned as a string.
    Otherwise the tokens up to the matching ')' are parsed recursively and
    returned as a list. A missing closing ')' is tolerated: parsing stops
    when the tokens run out.

    Args:
        tokens: Mutable list of token strings. The tokens that make up the
            returned expression are removed from its front; any remaining
            tokens are left in place.

    Returns:
        str | list: A single token, or a (possibly nested) list of parsed
        sub-expressions.

    Raises:
        ValueError: If ``tokens`` is empty on entry.
    """
    if len(tokens) < 1:
        raise ValueError("Không có token nào!")

    head = tokens.pop(0)
    if head == '(':
        children = []
        while True:
            if not tokens:
                # Ran out of input before a closing ')': return what we have.
                return children
            if tokens[0] == ')':
                del tokens[0]  # drop the closing ')'
                return children
            children.append(parse(tokens))
    return head


def extract_leaves(tree):
    """Collect ``(token, tag)`` pairs from the leaves of a parsed tree.

    A leaf is a two-element list of strings ``[label, token]``. The label is
    reduced to its base tag by dropping everything from the first ``'-'``
    onwards (``'N-H'`` -> ``'N'``). A label that starts with ``'-'`` (for
    example ``'-NONE-'``) has an empty base part, so the whole label is used
    as the tag instead.

    Args:
        tree: Nested lists of strings. Any value that is not a list yields
            no leaves.

    Returns:
        list[tuple[str, str]]: ``(token, tag)`` pairs in left-to-right order,
        excluding leaves tagged ``-NONE-``, ``LBKT`` or ``RBKT``.
    """
    if not isinstance(tree, list):
        return []

    is_leaf = (
        len(tree) == 2
        and isinstance(tree[0], str)
        and isinstance(tree[1], str)
    )
    if not is_leaf:
        leaves = []
        for subtree in tree:
            leaves.extend(extract_leaves(subtree))
        return leaves

    label, token = tree
    # Splitting '-NONE-' on '-' gives '' as the first piece, which would never
    # match the exclusion list below and would emit an empty tag. Falling back
    # to the whole label keeps such tags intact so the filter works.
    tag = label.split('-', 1)[0] or label
    if tag in ('-NONE-', 'LBKT', 'RBKT'):
        return []
    return [(token, tag)]


def convert_line_to_pos(line):
    """Turn one bracketed tree line into a list of ``(token, tag)`` pairs.

    The line is tokenized, parsed into a nested list, and its leaves are
    extracted. This is best-effort: any ``Exception`` raised along the way is
    printed and an empty list is returned instead of propagating.

    Args:
        line: Text of a single bracketed tree.

    Returns:
        list: The pairs returned by ``extract_leaves``, or ``[]`` on failure.
    """
    try:
        return extract_leaves(parse(tokenize(line)))
    except Exception as err:
        print(f"Lỗi khi parse dòng: {err}")
        return []


def read_treebank_and_convert_to_pos(input_file_path, output_file_path):
    """Convert a file of bracketed trees into ``token/tag`` lines.

    Each input line is stripped; empty lines and lines starting with ``//``
    are skipped. Every remaining line is converted with
    ``convert_line_to_pos``, and if that yields any pairs they are written to
    the output file as space-separated ``token/tag`` items on one line.

    An ``Exception`` raised while handling a line is printed together with
    the line number and the first 100 characters of the line, and processing
    continues with the next line.

    Args:
        input_file_path: Path of the UTF-8 text file to read.
        output_file_path: Path of the UTF-8 text file to write; it is opened
            in ``'w'`` mode, so existing content is replaced.
    """
    with open(input_file_path, 'r', encoding='utf-8') as infile, \
            open(output_file_path, 'w', encoding='utf-8') as outfile:
        for line_num, line in enumerate(infile, start=1):
            try:
                line = line.strip()
                if line == '' or line.startswith('//'):
                    continue

                pairs = convert_line_to_pos(line)
                if not pairs:
                    # Nothing extracted (or the line failed to parse).
                    continue

                rendered = [f"{word}/{tag}" for word, tag in pairs]
                outfile.write(" ".join(rendered) + "\n")
            except Exception as err:
                print(f"Lỗi khi xử lý dòng {line_num}: {err}")
                print(f"Nội dung dòng: {line[:100]}...")

In [7]:

# Input treebank file and the output file for the token/tag lines.
# NOTE(review): both are absolute, machine-specific paths; adjust them before
# running this cell on another machine.
input_file = r'C:\Users\09398\Subject\Corpus\vn_treebank.txt'
output_file = r'C:\Users\09398\Subject\Corpus\vn_pos.txt'
read_treebank_and_convert_to_pos(input_file, output_file)
# Printed whenever the call above returns. Per-line failures are caught and
# printed inside the conversion functions, so this message does not by itself
# mean every line was converted.
print(f"Đã chuyển đổi dữ liệu từ {input_file} sang {output_file}")

Đã chuyển đổi dữ liệu từ C:\Users\09398\Subject\Corpus\vn_treebank.txt sang C:\Users\09398\Subject\Corpus\vn_pos.txt
