FOOL

In [None]:
from bs4 import BeautifulSoup

# Load your HTML string or file
with open("fool.htm", "r", encoding="utf-8") as f:
    html = f.read()

soup = BeautifulSoup(html, "html.parser")

# Find the main article container
article_body = soup.find("div", class_="article-body")

# Extract all text from <p> and <h2> tags inside it
if article_body:
    paragraphs = article_body.find_all(['p', 'h2'])
    article_text = "\n\n".join(tag.get_text(strip=True) for tag in paragraphs)
    print(article_text)
else:
    print("No article-body div found.")


In [None]:
from bs4 import BeautifulSoup

def extract_article_text(html_path: str) -> str:
    """
    Extracts readable article text from a Motley Fool-style HTML file.

    Args:
        html_path (str): Path to the HTML file.

    Returns:
        str: Cleaned article text with paragraphs and section headings.
    """
    try:
        with open(html_path, 'r', encoding='utf-8') as f:
            html = f.read()

        soup = BeautifulSoup(html, 'html.parser')
        article_body = soup.find("div", class_="article-body")

        if article_body:
            tags = article_body.find_all(['p', 'h2'])
            text = "\n\n".join(tag.get_text(strip=True) for tag in tags)
            return text
        else:
            return "[!] No <div class='article-body'> found in the HTML."

    except Exception as e:
        return f"[!] Error reading or parsing file: {e}"


In [None]:
text = extract_article_text("fool.htm")
print(text)


In [None]:
import json
from bs4 import BeautifulSoup

def extract_article_text_from_json(json_path: str) -> str:
    """
    Extracts article text from a JSON file that contains raw HTML under the 'html' key.

    Args:
        json_path (str): Path to the JSON file.

    Returns:
        str: Cleaned article text or an error message.
    """
    try:
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        html = data.get('html')
        if not html:
            return "[!] No 'html' key found in the JSON."

        soup = BeautifulSoup(html, 'html.parser')
        article_body = soup.find("div", class_="article-body")

        if article_body:
            tags = article_body.find_all(['p', 'h2'])
            text = "\n\n".join(tag.get_text(strip=True) for tag in tags)
            return text
        else:
            return "[!] No <div class='article-body'> found in the HTML."

    except Exception as e:
        return f"[!] Error processing JSON file: {e}"


In [None]:
article_text = extract_article_text_from_json("data/www_fool_com_54277.json")
print(article_text)


BENZINGA

In [None]:
import re
from bs4 import BeautifulSoup

def extract_benzinga_article_text(html: str) -> str:
    """
    Extracts main article text from Benzinga-style HTML, flattens for NLP.
    """
    soup = BeautifulSoup(html, "html.parser")

    # Main content container for the article body
    body_div = soup.find("div", {"id": "article-body"})
    if not body_div:
        return ""

    # Extract paragraphs and headings
    tags = body_div.find_all(["p", "h2", "li"])
    text_parts = [tag.get_text(strip=True) for tag in tags]

    # Join and clean
    raw_text = " ".join(text_parts)
    clean_text = re.sub(r'\s+', ' ', raw_text).strip()

    return clean_text


In [None]:
with open("benzinga.htm", "r", encoding="utf-8") as f:
    html_content = f.read()

text = extract_benzinga_article_text(html_content)
print(text[:5000])  # Preview first 500 chars


In [3]:
import re
from bs4 import BeautifulSoup

def extract_zacks_article_text(html: str) -> str:
    """
    Extracts main article text from Zacks-style HTML and removes promotional ad blocks.
    """
    soup = BeautifulSoup(html, "html.parser")
    body_div = soup.find("div", {"id": "comtext"})
    if not body_div:
        return ""

    # Extract paragraph, heading, and list elements
    tags = body_div.find_all(["p", "h2", "li"])
    text_parts = [tag.get_text(strip=True) for tag in tags]

    # Combine into a single string
    raw_text = " ".join(text_parts)
    clean_text = re.sub(r'\s+', ' ', raw_text).strip()

    # Remove text between known ad break markers
    ad_pattern = r'-{5,}.*?-{5,}'
    clean_text = re.sub(ad_pattern, '', clean_text)

    return clean_text.strip()




In [5]:
# Test locally with saved file
with open("zacks.htm", "r", encoding="utf-8") as f:
    html_content = f.read()

text = extract_zacks_article_text(html_content)
print(text)  # Preview first 5000 chars

There’s a new sheriff in town, or at least that’s what the budget bill says.President Trump is back in the saddle, and he’s riding in with a “Big, Beautiful Bill” that aims to reshape the fiscal landscape of America. Whether you love it or loathe it, one thing’s certain: there’s money to be made.This isn’t about political ideology. This is about dollars and cents. And if you’re the kind of investor who likes to get ahead of the curve, now’s the time to pay attention.Today, I’m breaking down exactly how Trump’s latest budget proposal can translate into a market-moving opportunity. We’re going to zero in on the sectors that are poised to benefit and understand why policy creates profit.Let’s get into it.Understanding the Budget BoomTrump’s budget isn’t a “skinny bill” or some placeholder draft. It’s a fireworks show of federal spending. This is fiscal stimulus with a red hat and a bullhorn. Infrastructure, defense, border security, energy independence, you name it, there’s a line item fo

# FINAL


Imports

In [51]:
import re
from bs4 import BeautifulSoup
import json
import os
from datetime import datetime


## HTML Extraction Helper
fool

In [52]:
def extract_fool_article_text(html: str) -> str:
    soup = BeautifulSoup(html, 'html.parser')
    article_body = soup.find("div", class_="article-body")
    if article_body:
        tags = article_body.find_all(['p', 'h2'])
        flat_text = ' '.join(tag.get_text(strip=True) for tag in tags)
        return re.sub(r'\s+', ' ', flat_text).strip()
    return ""


benzinga

In [53]:

def extract_benzinga_article_text(html: str) -> str:
    soup = BeautifulSoup(html, 'html.parser')
    article_body = soup.find("div", id="article-body")
    if article_body:
        tags = article_body.find_all(['p', 'h2', 'li'])
        flat_text = ' '.join(tag.get_text(strip=True) for tag in tags)
        return re.sub(r'\s+', ' ', flat_text).strip()
    return ""

dynamic dispatcher


In [54]:
def extract_article_text_by_source(url: str, html: str) -> str:
    if "fool.com" in url:
        return extract_fool_article_text(html)
    elif "benzinga.com" in url:
        return extract_benzinga_article_text(html)
    else:
        return ""


full file processing

In [56]:
def process_json_file(input_path: str) -> str:
    try:
        with open(input_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        html = data.get("html", "")
        url = data.get("url", "")
        scraping_timestamp = data.get("timestamp", "")

        article_text = extract_article_text_by_source(url, html)
        parsed_data = {
            "url": url,
            "scrapingTimestamp": scraping_timestamp,
            "parsingTimestamp": datetime.utcnow().isoformat() + "Z",
            "articleText": article_text
        }

        # Create the new filename
        dirname, filename = os.path.split(input_path)
        cleaned_filename = filename.replace("scrape_raw_", "scrape_cleaned_")
        output_path = os.path.join(dirname, cleaned_filename)

        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(parsed_data, f, ensure_ascii=False, indent=2)

        return output_path

    except Exception as e:
        return f"Error processing {input_path}: {e}"



Batch Processing

In [57]:
def process_all_json_files(folder_path: str):
    for filename in os.listdir(folder_path):
        if filename.endswith(".json") and not filename.endswith("_parsed.json"):
            full_path = os.path.join(folder_path, filename)
            output = process_json_file(full_path)
            print(f"✅ Processed: {filename} → {output}")


Run Function

In [58]:
# TODO: Adjust path
process_all_json_files("./data")


✅ Processed: scrape_cleaned_0.json → ./data\scrape_cleaned_0.json
✅ Processed: scrape_cleaned_1.json → ./data\scrape_cleaned_1.json
✅ Processed: scrape_cleaned_10.json → ./data\scrape_cleaned_10.json
✅ Processed: scrape_cleaned_29585.json → ./data\scrape_cleaned_29585.json
✅ Processed: scrape_cleaned_29590.json → ./data\scrape_cleaned_29590.json
✅ Processed: scrape_cleaned_7837.json → ./data\scrape_cleaned_7837.json
✅ Processed: scrape_cleaned_7848.json → ./data\scrape_cleaned_7848.json
✅ Processed: scrape_cleaned_9623.json → ./data\scrape_cleaned_9623.json
✅ Processed: scrape_cleaned_9663.json → ./data\scrape_cleaned_9663.json
✅ Processed: scrape_cleaned_9686.json → ./data\scrape_cleaned_9686.json
✅ Processed: scrape_cleaned_9707.json → ./data\scrape_cleaned_9707.json
✅ Processed: scrape_cleaned_9742.json → ./data\scrape_cleaned_9742.json
✅ Processed: scrape_cleaned_9767.json → ./data\scrape_cleaned_9767.json
✅ Processed: scrape_cleaned_9800.json → ./data\scrape_cleaned_9800.json
✅ Pr