In [None]:
%pip install beautifulsoup4 markdownify


In [3]:
import os
import json
from bs4 import BeautifulSoup
from datetime import datetime

# Function to extract content from the <article> element
def extract_article_content(soup):
    article = soup.find("article", class_="devsite-article")
    if not article:
        return None  # Return None if the <article> element is not found
    return md(str(article), strip=["style", "script"])  # Convert to Markdown, excluding unnecessary tags

# Function to extract the title
def extract_title(soup):
    title_tag = soup.find("title")
    return title_tag.string if title_tag else "No Title Found"

# Function to extract publish date
def extract_publish_date(soup):
    date_tag = soup.find("p", class_="gargardate")
    if date_tag:
        try:
            return str(datetime.strptime(date_tag.get_text(strip=True), "%A, %B %d, %Y"))
        except ValueError:
            return date_tag.get_text(strip=True)
    return "Publish Date Not Found"

# Function to extract JSON-LD data
def extract_json_ld(soup):
    json_ld_data = []
    for script in soup.find_all("script", {"type": "application/ld+json"}):
        try:
            json_data = json.loads(script.string)
            json_ld_data.append(json_data)
        except (json.JSONDecodeError, TypeError):
            continue
    return json_ld_data

# Function to extract important SEO metadata
def extract_seo_metadata(soup):
    seo_metadata = {}
    # Extract meta tags like description, keywords, Open Graph tags, etc.
    for meta_tag in soup.find_all("meta"):
        name = meta_tag.get("name") or meta_tag.get("property")
        content = meta_tag.get("content")
        if name and content:
            seo_metadata[name] = content
    # Extract canonical URL
    canonical_tag = soup.find("link", {"rel": "canonical"})
    if canonical_tag and canonical_tag.get("href"):
        seo_metadata["canonical"] = canonical_tag["href"]
    return seo_metadata

# Function to process a single HTML file and save as Markdown
def process_html_to_markdown(file_path, output_dir):
    with open(file_path, "r", encoding="utf-8") as file:
        html_content = file.read()

    soup = BeautifulSoup(html_content, "html.parser")

    # Extract data
    title = extract_title(soup)
    publish_date = extract_publish_date(soup)
    article_content = extract_article_content(soup)
    json_ld_data = extract_json_ld(soup)
    seo_metadata = extract_seo_metadata(soup)

    if not article_content:
        print(f"No <article> element found in {file_path}. Skipping.")
        return

    # Combine extracted data into Markdown format
    markdown = f"# {title}\n\n"
    markdown += f"**Published Date:** {publish_date}\n\n"
    markdown += "## SEO Metadata\n\n"
    markdown += json.dumps(seo_metadata, indent=4) + "\n\n"
    if json_ld_data:
        markdown += "## JSON-LD Data\n\n"
        markdown += json.dumps(json_ld_data, indent=4) + "\n\n"
    markdown += "## Article Content\n\n"
    markdown += article_content

    # Save as Markdown file
    output_file_name = f"{os.path.splitext(os.path.basename(file_path))[0]}.md"
    output_file_path = os.path.join(output_dir, output_file_name)
    with open(output_file_path, "w", encoding="utf-8") as md_file:
        md_file.write(markdown)

    print(f"Saved: {output_file_path}")

# Process all HTML files in the _html directory
input_dir = "_html"
output_dir = "markdown_files"

os.makedirs(output_dir, exist_ok=True)  # Ensure output directory exists

for file_name in os.listdir(input_dir):
    if file_name.endswith(".html"):  # Process only HTML files
        file_path = os.path.join(input_dir, file_name)
        print(f"Processing: {file_path}")
        process_html_to_markdown(file_path, output_dir)

print("All files processed.")


FileNotFoundError: [Errno 2] No such file or directory: '_html'

In [10]:
import requests
from markdownify import markdownify as md
from bs4 import BeautifulSoup

url = "https://www.bing.com/search?q=cobol"  # Replace with your target URL

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}   
response = requests.get(url, headers=headers)


# Example URL
print(response.text)
soup = BeautifulSoup(response.text, "html.parser")
markdown_content = md(response.text) if response.text else "No content found"

print("Converted Content:")
print(markdown_content[:500] + "...")  # Print first 500 characters

<!DOCTYPE html><html dir="ltr" lang="en" xml:lang="en" xmlns="http://www.w3.org/1999/xhtml" xmlns:Web="http://schemas.live.com/Web/"><script type="text/javascript" nonce="1V3F/hX28jPKPoOWIzskcfx1i+0opbG3Xm8lG1DlgWo=" >//<![CDATA[
si_ST=new Date
//]]></script><head><!--pc--><title>cobol - Search</title><meta content="text/html; charset=utf-8" http-equiv="content-type" /><meta name="referrer" content="origin-when-cross-origin" /><meta name="SystemEntropyOriginTrialToken" content="A1L3tx5CzccqjN3lK6st/fXMwhf9EeokCPf8XCt0DVI8JPbg37BWq0zKvlqgkdm8YEUbthoGkC/xdR1+iIz4txAAAABxeyJvcmlnaW4iOiJodHRwczovL3d3dy5iaW5nLmNvbTo0NDMiLCJmZWF0dXJlIjoiTXNVc2VyQWdlbnRMYXVuY2hOYXZUeXBlIiwiZXhwaXJ5IjoxNzM5NzI0MzExLCJpc1N1YmRvbWFpbiI6dHJ1ZX0=" http-equiv="origin-trial" /><meta property="og:description" content="Intelligent search from Bing makes it easier to quickly find what you’re looking for and rewards you." /><meta property="og:site_name" content="Bing" /><meta property="og:title" content="cobol - Bing" /

In [11]:
print(markdown_content)

cobol - Search[Skip to content](#) [Profile Picture](javascript:void(0);)

* [All](/?scope=web&FORM=HDRSC1)
* [Images](/images/search?q=cobol&FORM=HDRSC2)
* [Videos](/videos/search?q=cobol&FORM=HDRSC3)
* [Maps](/maps?q=cobol&FORM=HDRSC4)
* [News](/news/search?q=cobol&FORM=HDRSC6)
* [Shopping](/shop?q=cobol&FORM=SHOPTB)
* [More](javascript:void(0);)
  + [Flights](/travel/search?q=cobol&m=flights&FORM=FBSCOP)
  + [Travel](/travel/search?q=cobol&m=travel&FORM=THSCOP)
  + [Hotels](/travel/search?q=cobol&m=hotels&FORM=HTSCOP)
  + [Real Estate](/homes?FORM=000060)
* [Tools](javascript:void(0);)
About 318,000 resultsOpen links in new tab[Any time](javascript:) 

1. [wikipedia.orghttps://en.wikipedia.org › wiki › COBOL](https://en.wikipedia.org/wiki/COBOL)
   
   [**COBOL** - **Wikipedia**](https://en.wikipedia.org/wiki/COBOL)
   -----------------------------------------------