In [34]:
from transformers import AutoTokenizer

# Checkpoint names, keyed by source-language code, that are handed to
# AutoTokenizer.from_pretrained below (one "<lang>-en" checkpoint per language).
model_name_map = {
    "de": "Helsinki-NLP/opus-mt-de-en",
    "zh": "Helsinki-NLP/opus-mt-zh-en",
    "fr": "Helsinki-NLP/opus-mt-fr-en",
    "ar": "Helsinki-NLP/opus-mt-ar-en",
    "ru": "Helsinki-NLP/opus-mt-ru-en",
    "es": "Helsinki-NLP/opus-mt-es-en"
}

# NOTE: despite its name, this maps language code -> tokenizer; no model is loaded here.
model_map = {}

# `model` is the checkpoint name string from model_name_map, not a model object.
# Loading may hit the network or a local cache depending on the environment.
for lang, model in model_name_map.items():
    tokenizer = AutoTokenizer.from_pretrained(model)
    model_map[lang] = tokenizer

In [35]:
# Separators tried by split_sentence for each source language, in priority
# order (coarsest first: newline, sentence end, clause marks, finally a space).
# NOTE(review): the "zh" entry lists ASCII ':' and ',' rather than the
# full-width variants; confirm this matches the corpus.
split_symbols_map = {
    "zh": ['\n', '。', '；', ':', ',', ' '],
    "de": ['\n', '.', ';', ':', ',', ' '],
    "fr": ['\n', '.', ';', ':', ',', ' '],
    "ar": ['\n', '،', ';', '.', ':', ' '],
    "ru": ['\n', '.', ';', ':', ',', ' '],
    "es": ['\n', '.', ';', ':', ',', ' '],
}

# Separators used when a language has no entry in split_symbols_map.
_DEFAULT_SPLIT_SYMBOLS = ['\n', '.', ';', ':', ',', ' ']


def split_sentence(sentence, tokenizer, lang, token_limit=512):
    """
    Split one piece of text into chunks of at most `token_limit` tokens.

    Each separator configured for `lang` is tried in order. For a separator,
    the text is split and the pieces are greedily re-joined (with the same
    separator) while the running token estimate stays within the limit. The
    first separator for which every resulting chunk really fits is used.

    Args:
        sentence: Text to split.
        tokenizer: Object whose ``encode(text)`` returns a sequence of tokens.
        lang: Language code used to look up separators; unknown codes fall
            back to a generic punctuation list instead of failing.
        token_limit: Maximum number of tokens allowed per chunk.

    Returns:
        A list of dicts with keys ``content`` (chunk text), ``token_total``
        (the encoded token sequence of the chunk; callers take ``len()`` of
        it) and ``split_symbol`` (the separator that must be re-inserted
        between consecutive chunks to rebuild the text). If no separator
        produces chunks that all fit, a single-element list holding the whole
        text with ``split_symbol == ""`` is returned; that chunk may exceed
        `token_limit`.
    """
    split_symbols = split_symbols_map.get(lang, _DEFAULT_SPLIT_SYMBOLS)

    for split_symbol in split_symbols:
        chunks = sentence.split(split_symbol)

        # The separator does not occur in the text; try the next one.
        if len(chunks) == 1:
            continue

        # Loop-invariant token cost of re-inserting the separator between pieces.
        symbol_tokens = len(tokenizer.encode(split_symbol))

        chunked_results = []
        current_content = ""
        current_tokens = 0

        for chunk in chunks:
            chunk_tokens = len(tokenizer.encode(chunk))
            added_tokens = chunk_tokens + (symbol_tokens if current_content else 0)

            # Merge the piece into the current chunk while the estimate fits.
            if current_tokens + added_tokens <= token_limit:
                current_content += (split_symbol if current_content else "") + chunk
                current_tokens += added_tokens
            else:
                # Otherwise flush the current chunk and start a new one.
                if current_content:
                    chunked_results.append({
                        "content": current_content,
                        "token_total": tokenizer.encode(current_content),
                        "split_symbol": split_symbol,
                    })
                current_content = chunk
                current_tokens = chunk_tokens

        # Flush the trailing chunk.
        if current_content:
            chunked_results.append({
                "content": current_content,
                "token_total": tokenizer.encode(current_content),
                "split_symbol": split_symbol,
            })

        # The running count above is only an estimate (pieces are encoded
        # separately), so verify against the real encoding of each chunk.
        if chunked_results and all(
            len(result["token_total"]) <= token_limit for result in chunked_results
        ):
            return chunked_results

    # No separator worked. Keep `token_total` a token sequence here as well, so
    # every return path has the same shape and `len()` on it never fails.
    return [{"content": sentence, "token_total": tokenizer.encode(sentence), "split_symbol": ""}]

def split_text_into_chunks(text, tokenizer, lang, token_limit=512):
    """
    Split `text` into chunk records of at most `token_limit` tokens.

    The text is first split into paragraphs on blank lines ("\\n\\n"). A
    paragraph that fits within the limit becomes one record; a longer one is
    handed to `split_sentence` and each returned piece becomes a record.

    Args:
        text: Full document text.
        tokenizer: Object whose ``encode(text)`` returns a sequence of tokens.
        lang: Language code forwarded to `split_sentence`.
        token_limit: Maximum number of tokens per record.

    Returns:
        A list of dicts with keys ``id`` ("<paragraph index>" for whole
        paragraphs, "<paragraph index>-<piece index>" for pieces),
        ``content``, ``token_total`` (always an int count) and
        ``split_symbol`` ("\\n\\n" for whole paragraphs, otherwise the
        separator reported by `split_sentence`).
    """
    results = []

    for index, para in enumerate(text.split("\n\n")):
        # Only the length is used here; the tokenizer may warn about
        # over-long inputs, which is exactly the case handled below.
        token_count = len(tokenizer.encode(para))

        # A paragraph that does not exceed the limit is kept whole
        # (inclusive bound, matching the check used inside split_sentence).
        if token_count <= token_limit:
            results.append({
                "id": str(index),
                "content": para,
                "token_total": token_count,
                "split_symbol": "\n\n"
            })
            continue

        # Otherwise split the paragraph further.
        for idx, chunk in enumerate(split_sentence(para, tokenizer, lang, token_limit)):
            # split_sentence may report token_total as a token sequence or as
            # an int count; normalise to an int so len() never hits an int.
            token_total = chunk["token_total"]
            if not isinstance(token_total, int):
                token_total = len(token_total)

            results.append({
                "id": f"{index}-{idx}",
                "content": chunk["content"],
                "token_total": token_total,
                "split_symbol": chunk["split_symbol"]
            })

    return results

def reconstruct_text(chunks):
    """
    Rebuild paragraph strings from a sequence of chunk records.

    Consecutive chunks whose ``id`` shares the same part before the first "-"
    are treated as pieces of one paragraph. A paragraph made of a single chunk
    is returned as that chunk's ``content``; otherwise the pieces are
    concatenated, each piece after the first non-empty content being preceded
    by its own ``split_symbol``.

    Returns:
        A list with one entry per reconstructed paragraph, in input order.
    """
    # Pass 1: bucket consecutive chunks by the parent part of their id.
    groups = []
    last_parent = None
    for piece in chunks:
        parent = piece["id"].split("-")[0]
        if not groups or parent != last_parent:
            groups.append([])
            last_parent = parent
        groups[-1].append(piece)

    # Pass 2: stitch every bucket back into a single string.
    paragraphs = []
    for group in groups:
        if len(group) == 1:
            paragraphs.append(group[0]["content"])
            continue

        merged = ""
        for piece in group:
            # The separator is only inserted once some text has accumulated.
            if merged:
                merged += piece["split_symbol"]
            merged += piece["content"]
        paragraphs.append(merged)

    return paragraphs

In [36]:
import re

import json

def clean_paragraph(paragraph):
    """
    Flatten one paragraph, converting ASCII-art table rows to tab-separated text.

    Every line is stripped and classified:
      * rule lines (``+---+``, ``----``, ``====``, ``____``) are dropped; they
        end the current output line and flush the pending table row;
      * blank table rows (``|   |   |``) flush the pending table row;
      * table content rows (``| a | b |``) are accumulated cell by cell, so a
        cell wrapped over several physical lines is concatenated;
      * any other line is appended to the running text after a space.

    Flushed rows are written with cells joined by tabs. Finally runs of
    newlines collapse to one newline and runs of spaces/tabs to one space.

    NOTE(review): wrapped cell fragments are concatenated with no separator,
    which glues words together for space-delimited languages; confirm whether
    that is intended.

    Args:
        paragraph: Raw paragraph text (may contain newlines).

    Returns:
        The cleaned paragraph with surrounding whitespace stripped.
    """
    para = ''
    table = []  # cells of the table row currently being assembled

    for line in paragraph.split('\n'):
        line = line.strip()

        # Table border or other horizontal rule.
        if re.match(r'^(\+[-=+]+\+|-+|=+|_+)$', line):
            if not para.endswith('\n'):
                para += '\n'
            if table:
                para += '\t'.join(table)
                table = []

        # Blank row inside a table.
        elif re.match(r'^\|( +\|)+$', line):
            para += '\t'.join(table) + ' '
            table = []

        # Content row inside a table.
        elif re.match(r'^\|([^|]+\|)+$', line):
            # Drop exactly the leading and trailing pipe. Slicing to -2 would
            # also cut the last character of the last cell.
            cells = line[1:-1].split('|')
            if not table:
                table = cells
            else:
                # Append each new fragment to the cell in the same column;
                # whichever side is shorter contributes an empty string.
                table = [
                    (table[i].strip() if i < len(table) else '')
                    + (cells[i].strip() if i < len(cells) else '')
                    for i in range(max(len(table), len(cells)))
                ]

        # Ordinary body text.
        else:
            para += ' ' + line

    # Flush a table row still pending at the end of the paragraph.
    if table:
        if not para.endswith('\n'):
            para += '\n'
        para += '\t'.join(table)

    return re.sub(r'[ \t]{2,}', ' ', re.sub(r'\n{2,}', '\n', para)).strip()


def preprocess(text, dash_ratio=0.2):
    """
    Clean a document paragraph by paragraph and neutralise dash-heavy lines.

    The text is split on blank lines, each paragraph is passed through
    `clean_paragraph`, and paragraphs that end up empty or equal to "[]" are
    dropped. Then, in every remaining paragraph, each line in which "-" makes
    up at least `dash_ratio` of the characters has its dashes replaced by
    spaces; a line left blank by that replacement is removed, as are empty
    lines.

    Args:
        text: Raw document text.
        dash_ratio: Minimum share of "-" characters (0-1) for a line to be
            rewritten. Defaults to 0.2, i.e. 20%.

    Returns:
        The surviving paragraphs joined by blank lines.
    """
    cleaned_paragraphs = [clean_paragraph(para) for para in text.split("\n\n")]
    cleaned_paragraphs = [
        para for para in cleaned_paragraphs if para.strip() != "" and para != "[]"
    ]

    processed_paragraphs = []
    for para in cleaned_paragraphs:
        # Build a new list instead of deleting while iterating, which would
        # skip the line that follows every removed line.
        kept_lines = []
        for line in para.splitlines():
            # splitlines() can yield empty strings (e.g. "\r\r"); there is
            # nothing to keep and the ratio below would divide by zero.
            if not line:
                continue

            if line.count("-") / len(line) >= dash_ratio:
                line = line.replace("-", " ")
                if line.strip() == "":
                    continue

            kept_lines.append(line)

        processed_paragraphs.append("\n".join(kept_lines))

    return "\n\n".join(para for para in processed_paragraphs if para.strip() != "")

In [37]:
from tqdm import tqdm

def process_row(row):
    """
    Turn one dataset row into a flat list of chunk records.

    Every key of `row` other than "en" and "record" is treated as a source
    language code whose value is that language's text. Blank texts are
    skipped. Each remaining text is run through `preprocess`, split with
    `split_text_into_chunks` using the tokenizer stored in `model_map` for
    that language, and every resulting chunk is tagged with the row's
    ``record`` value, ``source`` (the language code) and ``target`` ("en").

    Returns:
        The list of tagged chunk dicts for all languages in the row.
    """
    record_id = row["record"]
    collected = []

    for source_lang, raw_text in row.items():
        # Skip the metadata / target columns first, then blank texts.
        if source_lang in ("en", "record") or raw_text.strip() == "":
            continue

        pieces = split_text_into_chunks(
            preprocess(raw_text),
            model_map[source_lang],
            source_lang,
        )

        for piece in pieces:
            piece.update(record=record_id, source=source_lang, target="en")
            collected.append(piece)

    return collected

In [38]:
def unique_items_generator(data_list):
    """
    Yield items from `data_list`, suppressing repeats of short contents.

    An item is yielded unless its ``content`` has already been remembered.
    Only contents shorter than 10 characters are remembered, so longer
    contents are never filtered out, even when repeated.

    NOTE(review): de-duplicating short contents only is presumably
    deliberate (e.g. boilerplate fragments); confirm with the author.
    """
    remembered_short = set()

    for entry in data_list:
        is_new = entry['content'] not in remembered_short
        if is_new:
            yield entry

        # Remember the content after yielding, and only when it is short.
        body = entry['content']
        if len(body) < 10:
            remembered_short.add(body)

In [39]:
import datasets

# Load only the first 100 rows of the train split ("train[:100]") to keep this run small.
dataset = datasets.load_dataset("bot-yaya/undl_text", split="train[:100]")

Resolving data files:   0%|          | 0/98 [00:00<?, ?it/s]

In [40]:
from multiprocessing import Pool
from tqdm import tqdm

# Parallel variant, kept disabled:
# with Pool(8) as p:
#     result_total = list(tqdm(p.imap(process_row, dataset), total=len(dataset)))

# flattened_result = [item for sublist in result_total for item in sublist]

# Sequential run: collect the chunk records of every row into one flat list.
results = []
# enumerate() has no length, so tqdm cannot infer a total from it; pass the
# dataset length explicitly so the bar shows progress and an ETA.
for index, row in tqdm(enumerate(dataset), total=len(dataset)):
    results.extend(process_row(row))



0it [00:00, ?it/s][A[AToken indices sequence length is longer than the specified maximum sequence length for this model (748 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (887 > 512). Running this sequence through the model will result in indexing errors


9it [00:00, 71.71it/s][A[AToken indices sequence length is longer than the specified maximum sequence length for this model (1009 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (955 > 512). Running this sequence through the model will result in indexing errors


17it [00:00, 18.80it/s][A[AToken indices sequence length is longer than the specified maximum sequence length for this model (610 > 512). Running this sequence through the model will result in indexing errors




In [43]:
from datasets import Features, Value

# Explicit schema for the chunk records; the keys match those produced by
# split_text_into_chunks plus the tags added in process_row.
features = Features({
    'id': Value('string'),            # "<paragraph>" or "<paragraph>-<piece>"
    'content': Value('string'),       # chunk text
    'token_total': Value('int32'),    # token count of the chunk
    'split_symbol': Value('string'),  # separator used when re-joining chunks
    'record': Value('string'),        # value of the source row's "record" field
    'source': Value('string'),        # source language code
    'target': Value('string'),        # target language code ("en" in process_row)
})

def generator_wrapper(data_to_process):
    """
    Bind `data_to_process` into a zero-argument generator factory.

    The returned callable takes no arguments and, on every call, returns a
    fresh `unique_items_generator` over `data_to_process`.
    """
    def make_unique_stream():
        fresh_stream = unique_items_generator(data_to_process)
        return fresh_stream

    return make_unique_stream

# Build a Dataset from the de-duplicating generator over `results`, using the
# explicit `features` schema; the bare name below displays its summary.
new_dataset = datasets.Dataset.from_generator(generator_wrapper(results), features=features)
new_dataset

Dataset({
    features: ['id', 'content', 'token_total', 'split_symbol', 'record', 'source', 'target'],
    num_rows: 68352
})

In [51]:
# Inspect the largest chunks. The threshold here is 500, while the splitter's
# default token_limit is 512, so a hit is not necessarily over the limit.
test_dataset = new_dataset.filter(lambda row: row["token_total"] > 500)
test_dataset

Dataset({
    features: ['id', 'content', 'token_total', 'split_symbol', 'record', 'source', 'target'],
    num_rows: 6
})

In [49]:
# Preview the first four filtered rows (slicing returns a dict of column lists).
# NOTE(review): this cell's execution count (49) precedes the filter cell's (51);
# re-run top to bottom to confirm the shown output matches the current filter.
test_dataset[0: 4]

{'id': ['118-0', '93', '340', '76'],
 'content': ['Статья 10. Обязательствораскрывать информации\t\nДля арбитров и кандидатов варбитры\tДля судей и кандидатов надолжность судьи\n1. Кандидат и арбитр раскрываютинформацию о любыхобстоятельствах, которые могутвызвать оправданные сомнения[,\xa0в\xa0том числе с точки зрениясторон в споре,] в ихнезависимости илибеспристрастности.\t<То же> за исключением оговорки«с точки зрения сторон в споре»,которая не будет применяться ксудьям и кандидатам на должностьсудьи2. Раскрываются, в частности,следующие сведения о:\t*Судья, раскрывает, в частности,сведения\xa0о: a) любых финансовых, деловых,профессиональных и личныхотношениях за последние пять летс:\ta) любых финансовых, деловых,профессиональных и личныхотношениях за последние пять летс: i) любой из сторон в споре илиорганизацией, указанной сторонойв споре;\ti) любой из сторон в споре илиорганизацией, указанной сторонойв споре; ii) законным(-ыми)представителем(-ями) любой изсторон в рамках разбират