In [1]:
# ============================================
# Article Translator — File Picker + URL (AzureChatOpenAI)
# Requisitos (instale o que precisar):
# !pip install beautifulsoup4 pdfminer.six python-docx langchain-openai ipywidgets requests
# !jupyter nbextension enable --py widgetsnbextension
# ============================================

import os
from pathlib import Path
from typing import Optional, List

# HTTP e parsing de HTML
import requests
from bs4 import BeautifulSoup

# PDFs
try:
    from pdfminer.high_level import extract_text as pdf_extract_text
except Exception:
    pdf_extract_text = None

# DOCX
try:
    import docx
except Exception:
    docx = None

# Widgets (UI)
try:
    import ipywidgets as widgets
    from IPython.display import display, clear_output
except Exception:
    widgets = None

In [13]:
from dotenv import load_dotenv
load_dotenv()          # carrega o .env para o ambiente



True

In [None]:
# --- DEPENDÊNCIAS ---
from langchain_openai import AzureChatOpenAI

# --- DEFINA SUAS CREDENCIAIS AQUI (ou use env vars já definidas) ---
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT", "https://SEU-RECURSO.openai.azure.com/")
AZURE_OPENAI_API_KEY  = os.getenv("AZURE_OPENAI_API_KEY",  "SUA_CHAVE")
AZURE_OPENAI_API_VER  = os.getenv("AZURE_OPENAI_API_VERSION", "2024-02-15-preview")
AZURE_DEPLOYMENT_NAME = os.getenv("AZURE_OPENAI_DEPLOYMENT", "gpt-4o-mini")

def _mask(s: str, keep: int = 4) -> str:
    if not s: return ""
    return (s[:keep] + "..." + s[-keep:]) if len(s) > keep*2 else "***"

def validate_azure_config():
    # prints úteis p/ debug (sem vazar a chave inteira)
    print("Endpoint:", AZURE_OPENAI_ENDPOINT)
    print("API Key:", _mask(AZURE_OPENAI_API_KEY))
    print("API Version:", AZURE_OPENAI_API_VER)
    print("Deployment:", AZURE_DEPLOYMENT_NAME)

    if not AZURE_OPENAI_ENDPOINT or "azure.com" not in AZURE_OPENAI_ENDPOINT:
        raise ValueError("AZURE_OPENAI_ENDPOINT inválido. Ex: https://SEU-RECURSO.openai.azure.com/")
    if not AZURE_OPENAI_API_KEY or AZURE_OPENAI_API_KEY.strip().lower() in ("", "sua_chave"):
        raise ValueError("AZURE_OPENAI_API_KEY ausente.")
    if not AZURE_OPENAI_API_VER:
        raise ValueError("AZURE_OPENAI_API_VERSION ausente.")
    if not AZURE_DEPLOYMENT_NAME:
        raise ValueError("AZURE_OPENAI_DEPLOYMENT (deployment_name) ausente. Use o NOME do deployment no Azure.")

def get_client() -> AzureChatOpenAI:
    """
    Instancia o cliente SOMENTE após validar configs.
    Corrige o erro 'Missing credentials' e facilita o debug.
    """
    validate_azure_config()
    return AzureChatOpenAI(
        azure_endpoint=AZURE_OPENAI_ENDPOINT,
        api_key=AZURE_OPENAI_API_KEY,
        api_version=AZURE_OPENAI_API_VER,
        deployment_name=AZURE_DEPLOYMENT_NAME,
        max_retries=0,
        temperature=0.2,
    )

# Exemplo de uso (teste rápido):
try:
    client = get_client()
    resp = client.invoke([
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Diga apenas 'ok'."}
    ])
    print("Teste OK ->", resp.content)
except Exception as e:
    print("Falha no teste do cliente:", e)


Endpoint: https://SEU-RECURSO.openai.azure.com/
API Key: SUA_...HAVE
API Version: 2024-02-15-preview
Deployment: gpt-4o-mini
Falha no teste do cliente: AZURE_OPENAI_API_KEY ausente.


In [8]:
# ========= FUNÇÕES AUXILIARES =========

def chunk_text(text: str, max_chars: int = 6000) -> List[str]:
    """
    Divide um texto longo em blocos menores de até `max_chars`,
    tentando quebrar em limites de parágrafo/linha quando possível.
    """
    text = text or ""
    if len(text) <= max_chars:
        return [text]

    chunks = []
    start = 0
    while start < len(text):
        end = min(start + max_chars, len(text))
        # tenta recuar até o fim de parágrafo/linha
        cut = text.rfind("\n\n", start, end)
        if cut == -1:
            cut = text.rfind("\n", start, end)
        if cut == -1 or cut <= start + int(max_chars * 0.5):
            cut = end
        chunks.append(text[start:cut].strip())
        start = cut
    return [c for c in chunks if c]


def translate_text(text: str, target_lang: str = "pt-BR") -> str:
    """
    Traduz `text` para `target_lang` usando AzureChatOpenAI, com chunking.
    """
    text = (text or "").strip()
    if not text:
        return ""

    system_msg = (
        f"You are a professional translator. Translate the user text into {target_lang}, "
        "preserving meaning, tone, and formatting when possible. Avoid adding commentary."
    )

    pieces = chunk_text(text, max_chars=6000)
    outputs: List[str] = []

    for idx, piece in enumerate(pieces, 1):
        messages = [
            {"role": "system", "content": system_msg},
            {"role": "user", "content": piece},
        ]
        try:
            # LangChain's AzureChatOpenAI aceita .invoke(messages)
            resp = client.invoke(messages)
            # resp.content contém o texto na maioria das versões
            outputs.append(getattr(resp, "content", str(resp)))
        except Exception as e:
            outputs.append(f"[Translation error on chunk {idx}: {e}]")

    return "\n".join(outputs).strip()


def extract_text_from_url(url: str) -> Optional[str]:
    """
    Extrai texto “legível” de uma URL (HTML).
    Tenta focar em <article>, <main> ou <section>. Se não achar, pega o texto da página inteira.
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        soup = BeautifulSoup(r.text, "html.parser")

        candidates = []
        for tag in ("article", "main", "section"):
            el = soup.find(tag)
            if el:
                candidates.append(el.get_text(separator=" ", strip=True))
        if candidates:
            return max(candidates, key=len)

        return soup.get_text(separator=" ", strip=True)
    except Exception as e:
        return f"[URL extraction error: {e}]"


def extract_text_from_file(path: Path) -> Optional[str]:
    """
    Lê e extrai texto de TXT/MD, HTML/HTM, PDF (pdfminer.six), DOCX (python-docx).
    Para outros formatos, tenta abrir como texto puro (UTF-8).
    """
    if not path or not path.exists():
        return None
    ext = path.suffix.lower()
    try:
        if ext in (".txt", ".md"):
            return path.read_text(encoding="utf-8", errors="ignore")

        if ext in (".html", ".htm"):
            html = path.read_text(encoding="utf-8", errors="ignore")
            soup = BeautifulSoup(html, "html.parser")
            return soup.get_text(separator=" ", strip=True)

        if ext == ".pdf":
            if pdf_extract_text is None:
                return "[PDF support missing. Please install pdfminer.six]"
            return pdf_extract_text(str(path))

        if ext == ".docx":
            if docx is None:
                return "[DOCX support missing. Please install python-docx]"
            d = docx.Document(str(path))
            return "\n".join(p.text for p in d.paragraphs)

        # fallback: tenta como texto
        return path.read_text(encoding="utf-8", errors="ignore")

    except Exception as e:
        return f"[File extraction error for {path.name}: {e}]"


def save_translation(text: str, lang: str, original_name: str = "document") -> Path:
    """
    Salva a tradução em ./translated/<original>_<lang>.txt
    """
    out_dir = Path("translated")
    out_dir.mkdir(exist_ok=True)
    stem = Path(original_name).stem or "document"
    out_path = out_dir / f"{stem}_{lang.replace('/', '-')}.txt"
    out_path.write_text(text, encoding="utf-8")
    return out_path


In [10]:


# =============== UI (Jupyter com ipywidgets) ===============
if widgets is None:
    print("ipywidgets não disponível neste ambiente.")
    print("Use diretamente as funções:")
    print("- extract_text_from_file(Path('arquivo.ext'))")
    print("- extract_text_from_url('https://...')")
    print("- translate_text(texto, 'pt-BR')")
else:
    upload = widgets.FileUpload(accept='.txt,.md,.html,.htm,.pdf,.docx', multiple=False)
    lang = widgets.Text(value='pt-BR', description='Idioma alvo:', placeholder='ex.: pt-BR, en, es')
    url_in = widgets.Text(value='', description='URL (opcional):', placeholder='https://exemplo.com/artigo')
    save_chk = widgets.Checkbox(value=True, description='Salvar tradução em arquivo')
    run_btn = widgets.Button(description='Traduzir', button_style='primary')
    out = widgets.Output()

    def _save_uploaded_file(up: widgets.FileUpload) -> Optional[Path]:
        """
        Salva o arquivo enviado via FileUpload e retorna o caminho salvo.
        Lida com variações de estrutura do `upload.value` em versões diferentes do ipywidgets.
        """
        if not up.value:
            return None

        save_dir = Path("uploads")
        save_dir.mkdir(exist_ok=True)

        # Formato A (dict): {'nome.ext': {'content': b'...', 'metadata': {...}}}
        if isinstance(up.value, dict):
            # pega o primeiro item
            (fname, meta), = up.value.items()
            content = meta.get('content', b'')
            fpath = save_dir / fname
            with open(fpath, "wb") as f:
                f.write(content)
            return fpath

        # Formato B (lista de dicts): [{'name': 'nome.ext', 'content': b'...'}, ...]
        if isinstance(up.value, (list, tuple)) and len(up.value) > 0:
            meta = up.value[0]
            fname = meta.get('name', 'uploaded_file.bin')
            content = meta.get('content', b'')
            fpath = save_dir / fname
            with open(fpath, "wb") as f:
                f.write(content)
            return fpath

        return None

    def on_run_clicked(_):
        with out:
            clear_output()
            # Checagem rápida de credenciais Azure
            if not AZURE_OPENAI_ENDPOINT or not AZURE_OPENAI_API_KEY or not AZURE_DEPLOYMENT_NAME:
                print("⚠️ Defina AZURE_OPENAI_ENDPOINT, AZURE_OPENAI_API_KEY e AZURE_DEPLOYMENT_NAME.")
                return

            text = ""
            origin_name = "document"

            # 1) Preferência: arquivo enviado
            fpath = _save_uploaded_file(upload)
            if fpath and fpath.exists():
                origin_name = fpath.name
                print(f"📄 Arquivo salvo em: {fpath}")
                text = extract_text_from_file(fpath)

            # 2) Alternativa: URL (se não houver arquivo)
            elif url_in.value.strip():
                print(f"🌐 Extraindo de URL: {url_in.value.strip()}")
                text = extract_text_from_url(url_in.value.strip())
                origin_name = "from_url"

            else:
                print("Envie um arquivo OU informe uma URL.")
                return

            if not text:
                print("Nenhum texto extraído.")
                return
            if isinstance(text, str) and text.startswith("[") and "error" in text.lower():
                print(text)
                return

            print(f"✅ Texto extraído ({len(text)} caracteres).")
            print("⏳ Traduzindo (AzureChatOpenAI)...")
            translated = translate_text(text, lang.value.strip() or "pt-BR")

            print("\n--- Tradução ---\n")
            print(translated[:2000] + ("...\n[conteúdo truncado]" if len(translated) > 2000 else ""))

            if save_chk.value and translated:
                out_file = save_translation(translated, lang.value.strip() or "pt-BR", origin_name)
                print(f"\n💾 Tradução salva em: {out_file.resolve()}")

    run_btn.on_click(on_run_clicked)

    ui = widgets.VBox([
        widgets.HBox([upload, lang]),
        url_in,
        widgets.HBox([save_chk, run_btn]),
        out
    ])
    display(ui)


ipywidgets não disponível neste ambiente.
Use diretamente as funções:
- extract_text_from_file(Path('arquivo.ext'))
- extract_text_from_url('https://...')
- translate_text(texto, 'pt-BR')


In [11]:
texto = extract_text_from_url("https://dev.to/dasha_tsion/6-mistakes-that-made-me-a-better-leader-3lnp")
traducao = translate_text(texto, "pt-BR")
print(traducao)


[Translation error on chunk 1: Connection error.]
