In [1]:
ls

MD_traductor.ipynb  README.md  full.md  miner.ipynb  output.md  output_2.md


In [2]:
from transformers import M2M100Tokenizer, M2M100ForConditionalGeneration
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
import re
from tqdm import tqdm

# Device selection: use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the translation model and its tokenizer (Russian source, English target)
# and move the model to the selected device.
model_name = "facebook/m2m100_418M"
tokenizer = M2M100Tokenizer.from_pretrained(model_name, src_lang="ru", tgt_lang="en")
model = M2M100ForConditionalGeneration.from_pretrained(model_name).to(device)

# Load the model and tokenizer for typo correction (using a small T5 variant as an example).
# NOTE: this is currently disabled. `correct_latex_in_text` reads `correction_tokenizer`
# and `correction_model`, so these lines must be re-enabled before that function is called.
#correction_model_name = "google/flan-t5-base"  # Or a smaller version like "google/flan-t5-small"
#correction_tokenizer = AutoTokenizer.from_pretrained(correction_model_name)
#correction_model = AutoModelForSeq2SeqLM.from_pretrained(correction_model_name).to(device)

# Regex patterns for Markdown constructs that the translator has to recognise.
# Markdown image with an EMPTY alt text only, i.e. `![](target)`; group 1 is the target.
image_pattern = r"!\[\]\((.*?)\)"
# Inline math: `$...$` with at least one non-`$` character in between.
latex_inline_pattern = r"\$[^$]+\$"
# Display math: `$$...$$`, non-greedy, may span several lines.
latex_block_pattern = r"\$\$[\s\S]*?\$\$"
# Fenced code block: triple backticks to the next triple backticks, may span several lines.
code_block_pattern = r"```[\s\S]*?```"
# ATX header: 1-6 `#` characters, whitespace, then the header text (single line).
header_pattern = r"^(#{1,6})\s+(.*)$"

# State used to preserve LaTeX blocks:
# `latex_blocks` maps a placeholder key ("<LATEX_BLOCK_n>") to the original `$$...$$` text,
# `block_counter` is the next unused placeholder number (module-wide, never reset here).
latex_blocks = {}
block_counter = 0

def translate_text(text):
    """Translate `text` from Russian to English with the module-level M2M100 model.

    The input is tokenized as a single sequence truncated to 1024 tokens, so
    anything beyond that limit is silently left out of the translation.

    Returns the decoded first generated sequence with special tokens removed.
    """
    tokenizer.src_lang = "ru"
    encoded = tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=1024,
    ).to(device)
    # Forcing the first generated token to the English language id selects the target language.
    english_id = tokenizer.get_lang_id("en")
    generated = model.generate(**encoded, forced_bos_token_id=english_id, max_length=1024)
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

def preserve_latex_blocks(text):
    """Replace every `$$...$$` block in `text` with a unique placeholder.

    Each replaced block is recorded in the module-level `latex_blocks` dict
    under its placeholder key ("<LATEX_BLOCK_n>"), and the module-level
    `block_counter` is advanced so keys stay unique across calls.

    Returns the text with placeholders in place of the display-math blocks.
    """
    def stash(found):
        # Rebinds the module-level counter, hence the `global` declaration.
        global block_counter
        placeholder = f"<LATEX_BLOCK_{block_counter}>"
        block_counter += 1
        latex_blocks[placeholder] = found.group(0)
        return placeholder

    return re.sub(latex_block_pattern, stash, text)

def restore_latex_blocks(text):
    """Put the original `$$...$$` blocks back in place of their placeholders.

    Every key recorded in the module-level `latex_blocks` dict is substituted,
    in insertion order; keys that do not occur in `text` leave it unchanged.
    """
    restored = text
    for placeholder in latex_blocks:
        restored = restored.replace(placeholder, latex_blocks[placeholder])
    return restored

def split_paragraph(paragraph, patterns=None):
    """Split `paragraph` into plain-text fragments and protected spans.

    The paragraph is scanned once, left to right, with a single alternation
    built from `patterns`. Each match becomes its own list element and the
    text between matches becomes the remaining elements, so
    `"".join(result) == paragraph` always holds.

    A single pass is used instead of one pass per pattern so that a span which
    has already been recognised is never cut up again by a later pattern (for
    example a fenced code block that happens to contain two `$` characters).

    Args:
        paragraph: Markdown text to split.
        patterns: Optional list of regex strings, highest priority first; when
            two patterns match at the same position the earlier one wins. The
            patterns are joined into one expression, so they must not rely on
            numbered back-references. Defaults to fenced code blocks, images,
            display math and inline math, in that order.

    Returns:
        List of substrings in document order; an empty list for empty input.
    """
    if patterns is None:
        patterns = [code_block_pattern, image_pattern, latex_block_pattern, latex_inline_pattern]
    if not patterns:
        return [paragraph] if paragraph else []

    # Each alternative gets its own non-capturing group so the `|` cannot leak into it.
    combined = re.compile("|".join(f"(?:{pattern})" for pattern in patterns), re.DOTALL)

    parts = []
    cursor = 0
    for match in combined.finditer(paragraph):
        start, end = match.span()
        if start == end:
            # A zero-width match carries no text; ignore it.
            continue
        if cursor < start:
            parts.append(paragraph[cursor:start])
        parts.append(match.group(0))
        cursor = end
    if cursor < len(paragraph):
        parts.append(paragraph[cursor:])
    return parts

def is_russian(text):
    """Tell whether `text` contains at least one Russian Cyrillic letter.

    Returns the `re.Match` for the first such letter (truthy), or `None` when
    there is none, so the result can be used directly in a condition.
    """
    # `ё` / `Ё` are listed explicitly: their code points (U+0451 / U+0401) lie
    # outside the contiguous а-я / А-Я ranges, so the ranges alone miss them.
    return re.search(r"[а-яА-ЯёЁ]", text)

def correct_latex_in_text(text):
    """Ask the correction model to fix typos in every LaTeX formula in `text`.

    Every `$...$` / `$$...$$` span is sent to the correction model with a fixed
    prompt. If the model's answer contains a delimited formula, that formula
    replaces the original; otherwise the original formula (stripped of
    surrounding whitespace inside its delimiters) is kept.

    NOTE: relies on the module-level `correction_tokenizer` and
    `correction_model`, whose loading code is commented out in the setup
    section; calling this on text containing formulas fails until they exist.

    Returns the text with each formula replaced by its corrected version.
    """
    formula_re = re.compile(r"(\${1,2})([^\$]+?)\1", re.DOTALL)
    found_formulas = list(formula_re.finditer(text))

    replacements = {}
    for found in tqdm(found_formulas, desc="Corrigiendo LaTeX"):
        delimiter, formula = found.groups()
        prompt = f"Correct any typos or syntactic errors in this LaTeX formula without changing its mathematical content:\n\n{delimiter}{formula}{delimiter}"

        encoded = correction_tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).to(device)
        generated = correction_model.generate(**encoded, max_length=512)
        answer = correction_tokenizer.decode(generated[0], skip_special_tokens=True).strip()

        # Keep only the delimited formula from the answer; fall back to the original if none is found.
        extracted = re.search(r"(\${1,2})([^\$]+?)\1", answer)
        if extracted is None:
            replacement = f"{delimiter}{formula.strip()}{delimiter}"
        else:
            marker, body = extracted.groups()
            replacement = f"{marker}{body}{marker}"

        replacements[found.group(0)] = replacement

    # Apply the replacements to the original content.
    for original, fixed in replacements.items():
        text = text.replace(original, fixed)

    return text

def _is_protected_span(part):
    """Return True when `part` is exactly one image, LaTeX span or fenced code block."""
    protected_patterns = (image_pattern, latex_block_pattern, latex_inline_pattern, code_block_pattern)
    return any(re.fullmatch(pattern, part, re.DOTALL) for pattern in protected_patterns)


def _translate_keeping_whitespace(fragment):
    """Translate `fragment` and re-attach its original leading/trailing whitespace."""
    core = fragment.strip()
    if not core:
        return fragment
    lead_len = len(fragment) - len(fragment.lstrip())
    trail_start = len(fragment.rstrip())
    # The model is not relied on to reproduce surrounding whitespace, yet that
    # whitespace is what separates the translation from neighbouring formulas.
    return fragment[:lead_len] + translate_text(core).strip() + fragment[trail_start:]


def _translate_fragments(text):
    """Translate the Russian prose in `text`, leaving protected spans untouched."""
    text = preserve_latex_blocks(text)

    pieces = []
    for part in split_paragraph(text):
        # Protected spans are checked BEFORE the Cyrillic test: a formula or
        # code block containing Cyrillic must not be sent to the translator.
        if _is_protected_span(part):
            pieces.append(part)
            continue
        # Keep the "<LATEX_BLOCK_n>" placeholders out of the translator as well;
        # if the model altered one, the block could never be restored.
        for piece in re.split(r"(<LATEX_BLOCK_\d+>)", part):
            if piece in latex_blocks or not is_russian(piece):
                pieces.append(piece)
            else:
                pieces.append(_translate_keeping_whitespace(piece))

    return restore_latex_blocks("".join(pieces))


def process_paragraph(paragraph):
    """Translate one Markdown paragraph from Russian to English.

    A paragraph that is a single ATX header line keeps its `#` prefix and only
    its text is translated. Any other paragraph is split into plain text and
    protected spans (images, inline/display LaTeX, fenced code); only plain
    text containing Cyrillic letters is translated, everything else is copied
    verbatim.

    Args:
        paragraph: One blank-line-delimited chunk of the Markdown document.

    Returns:
        The translated paragraph. Non-header paragraphs are stripped of
        leading and trailing whitespace.
    """
    header_match = re.match(header_pattern, paragraph)
    if header_match:
        level, text = header_match.groups()
        return f"{level} {_translate_fragments(text)}"

    return _translate_fragments(paragraph).strip()

def main(input_file="Homework_ActorCritic.md", output_file="Homework_ActorCritic_eng.md"):
    """Translate a Russian Markdown file to English, paragraph by paragraph.

    Args:
        input_file: Path of the UTF-8 Markdown file to read
            (e.g. "full.md" was used for earlier runs).
        output_file: Path the translated Markdown is written to
            (e.g. "output.md" for earlier runs). Overwritten if it exists.
    """
    with open(input_file, "r", encoding="utf-8") as f:
        content = f.read()

    # Paragraphs are delimited by blank lines. NOTE: a fenced code block that
    # itself contains a blank line is therefore spread over several paragraphs.
    paragraphs = content.split("\n\n")
    translated_paragraphs = [
        process_paragraph(paragraph)
        for paragraph in tqdm(paragraphs, desc="Translating markdown paragraphs")
    ]

    translated_content = "\n\n".join(translated_paragraphs)

    with open(output_file, "w", encoding="utf-8") as f:
        f.write(translated_content)

    # Report the file that was actually written rather than a fixed name.
    print(f"Translation complete! Check '{output_file}' for the result.")

if __name__ == "__main__":
    main()

Using device: cuda


Translating markdown paragraphs:   0%|          | 0/188 [00:00<?, ?it/s]2025-04-05 04:25:41.154590: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Translating markdown paragraphs: 100%|██████████| 188/188 [02:16<00:00,  1.37it/s]

Translation complete! Check 'output.md' for the result.





In [4]:
ls

Homework_ActorCritic.md  README.md  miner.ipynb  output_2.md
MD_traductor.ipynb       full.md    output.md
