# Getting text from website

In [89]:
import requests
from bs4 import BeautifulSoup

def url_text_extractor(url):
    """Extract text from online article."""
    response = requests.get(url)

    if response.status_code == 200:
        soup = BeautifulSoup(response.text, "html.parser")

        for script_or_style in soup(['script', 'style']):
            script_or_style.decompose()
        
        textSep = soup.get_text(separator= ' ')

        # Text cleaner
        textLines = (line.strip() for line in textSep.splitlines())
        textParts = (phrase.strip() for line in textLines for phrase in line.split("  "))
        cleanText = '\n'.join(part for part in textParts if part)

        return cleanText

    else:
        print(f"Failed to fetch the URL. Status code: {response.status_code}")
        return None
    
    text = soup.get_text()
    return text


url_text_extractor('https://dev.to/u11d/selecting-the-appropriate-docker-base-image-2126')



'Selecting the appropriate Docker base image - DEV Community\nForem Feed\nFollow new Subforems to improve your feed\nDEV Community\nFollow\nA space to discuss and keep up software development and manage your software career\nGamers Forem\nFollow\nAn inclusive community for gaming enthusiasts\nFuture\nFollow\nNews and discussion of science and technology such as AI, VR, cryptocurrency, quantum computing, and more.\nMusic Forem\nFollow\nFrom composing and gigging to gear, hot music takes, and everything in between.\nVibe Coding Forem\nFollow\nDiscussing AI software development, and showing off what we\'re building.\nOpen Forem\nFollow\nA general discussion space for the Forem community. If it doesn\'t have a home elsewhere, it belongs here\nPopcorn Movies and TV\nFollow\nMovie and TV enthusiasm, criticism and everything in-between.\nDUMB DEV Community\nFollow\nMemes and software development shitposting\nDesign Community\nFollow\nWeb design, graphic design and everything in-between\nGolf 

# Azure client

In [133]:
from langchain_openai.chat_models.azure import AzureChatOpenAI

from pathlib import Path
from dotenv import load_dotenv
import os


#env_path = Path.cwd().parents[0] / ".env"
load_dotenv()

api_key = os.getenv("AZURE_API_KEY")
endpoint = os.getenv("AZURE_ENDPOINT")


client = AzureChatOpenAI(
    azure_endpoint = endpoint,
    api_key = api_key,
    api_version = "2025-01-01-preview",
    deployment_name = "gpt-4o-mini",
    max_retries = 0
)

print("✅ Azure Chat client initialized successfully.")

✅ Azure Chat client initialized successfully.


# Translator

In [134]:
def article_translator(text, lang):
    messages = [
        ("system", "You act as a text translator"),
        ("user", f"Translate the {text} into language {lang} and response it in Markdown")
    ]

    response = client.invoke(messages)
    print(response.content)
    return response.content



# URL Article Translator

In [135]:
url = 'https://dev.to/u11d/selecting-the-appropriate-docker-base-image-2126'
text = url_text_extractor(url)
techArticle = article_translator(text, "pt-br")


# Selecionando a Imagem Base do Docker Apropriada

## DEV Community

### Feed do Forem
Siga novos Subforems para melhorar seu feed

**DEV Community**  
Um espaço para discutir e acompanhar o desenvolvimento de software e gerenciar sua carreira em software.

**Gamers Forem**  
Uma comunidade inclusiva para entusiastas de jogos.

**Futuro**  
Notícias e discussões sobre ciência e tecnologia, como IA, VR, criptomoeda, computação quântica e muito mais.

**Música Forem**  
De composição e apresentações até equipamentos, opiniões quentes sobre música e tudo mais.

**Vibe Coding Forem**  
Discutindo desenvolvimento de software em IA e mostrando o que estamos construindo.

**Open Forem**  
Um espaço de discussão geral para a comunidade Forem. Se não tiver um lar em outro lugar, pertence aqui.

**Filmes e TV Popcorn**  
Entusiasmo, crítica e tudo mais sobre filmes e TV.

**DUMB DEV Community**  
Memes e shitposting sobre desenvolvimento de software.

**Comunidade de Design**  
Design de website

# Export to Markdown

In [136]:
import os
import re
from datetime import datetime
from bs4 import BeautifulSoup
import requests

def export_translated_article(url, techArticle, output_dir="..\data\output"):
    """
    Exports the translated text of a web article to a Markdown (.md) file.
    The filename is automatically generated based on the article title.

    Args:
        url (str): The original article URL.
        techArticle (str): The translated content to be saved.
        output_dir (str, optional): Directory to save the Markdown file.
                                    Default is 'translations/'.
    """

    # 1. Get the article title from the webpage
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        title = soup.title.string.strip() if soup.title else "untitled_article"
    except Exception:
        title = "untitled_article"

    # 2. Sanitize the title for use as filename
    safe_title = re.sub(r'[\\/*?:"<>|]', "", title)
    safe_title = re.sub(r"\s+", "_", safe_title).lower()

    # 3. Define output directory and file path
    os.makedirs(output_dir, exist_ok=True)
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    filename = f"{safe_title}_{timestamp}.md"
    filepath = os.path.join(output_dir, filename)

    # 4. Write to Markdown file
    with open(filepath, "w", encoding="utf-8") as f:
        f.write(f"# {title}\n\n")
        f.write(f"**Original URL:** {url}\n\n")
        f.write(f"**Translation Date:** {datetime.now().strftime('%Y-%m-%d')}\n\n---\n\n")
        f.write(techArticle)

    print(f"Translation exported successfully to: {filepath}")

  def export_translated_article(url, techArticle, output_dir="..\data\output"):


In [137]:
export_translated_article(url, techArticle)

Translation exported successfully to: ..\data\output\selecting_the_appropriate_docker_base_image_-_dev_community_2025-10-16_02-24-53.md
