FOOL

In [None]:
from bs4 import BeautifulSoup

# Load the saved article page from disk
with open("fool.htm", "r", encoding="utf-8") as f:
    html = f.read()

soup = BeautifulSoup(html, "html.parser")

# Find the main article container
article_body = soup.find("div", class_="article-body")

# Extract the text of every <p> and <h2> tag inside it
if article_body:
    paragraphs = article_body.find_all(['p', 'h2'])
    # get_text(strip=True) strips each text node and joins them with an empty
    # separator, which glues words together around inline tags such as links.
    # Take the raw text and normalise whitespace per element instead.
    cleaned = (" ".join(tag.get_text().split()) for tag in paragraphs)
    # Drop elements that contain no text so they do not add blank paragraphs.
    article_text = "\n\n".join(piece for piece in cleaned if piece)
    print(article_text)
else:
    print("No article-body div found.")


In [None]:
from bs4 import BeautifulSoup

def extract_article_text(html_path: str) -> str:
    """
    Extract readable article text from a saved HTML file.

    The text is taken from the <p> and <h2> tags inside the first
    <div class="article-body">. Each element becomes one paragraph and
    paragraphs are separated by a blank line.

    Args:
        html_path (str): Path to the HTML file (read as UTF-8).

    Returns:
        str: The article text. No exception is raised on failure; a message
        starting with "[!]" is returned instead (container not found, file
        unreadable, parser error).
    """
    try:
        with open(html_path, 'r', encoding='utf-8') as f:
            html = f.read()

        soup = BeautifulSoup(html, 'html.parser')
        article_body = soup.find("div", class_="article-body")
        if not article_body:
            return "[!] No <div class='article-body'> found in the HTML."

        paragraphs = []
        for tag in article_body.find_all(['p', 'h2']):
            # get_text(strip=True) would strip every text node and join them
            # with no separator, merging words around inline tags (links,
            # bold). Use the raw text and collapse whitespace instead.
            paragraph = " ".join(tag.get_text().split())
            if paragraph:  # skip elements without any text
                paragraphs.append(paragraph)
        return "\n\n".join(paragraphs)

    except Exception as e:
        # Deliberate best-effort: callers print the returned message.
        return f"[!] Error reading or parsing file: {e}"


In [None]:
# Try the file-based extractor on the saved page. Failures are returned as a
# "[!] ..." message string rather than raised, so check what gets printed.
text = extract_article_text("fool.htm")
print(text)


In [None]:
import json
from bs4 import BeautifulSoup

def extract_article_text_from_json(json_path: str) -> str:
    """
    Extract article text from a JSON file that stores raw HTML under 'html'.

    The text is taken from the <p> and <h2> tags inside the first
    <div class="article-body"> of that HTML. Each element becomes one
    paragraph and paragraphs are separated by a blank line.

    Args:
        json_path (str): Path to the JSON file (read as UTF-8).

    Returns:
        str: The article text. No exception is raised on failure; a message
        starting with "[!]" is returned instead. The "No 'html' key" message
        is also returned when the key is present but its value is empty.
    """
    try:
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        html = data.get('html')
        if not html:
            return "[!] No 'html' key found in the JSON."

        soup = BeautifulSoup(html, 'html.parser')
        article_body = soup.find("div", class_="article-body")
        if not article_body:
            return "[!] No <div class='article-body'> found in the HTML."

        # get_text(strip=True) joins the stripped text nodes with an empty
        # separator and so merges words around inline tags; normalise the
        # raw text of each element instead.
        paragraphs = (
            " ".join(tag.get_text().split())
            for tag in article_body.find_all(['p', 'h2'])
        )
        # Elements without text are dropped so they add no blank paragraphs.
        return "\n\n".join(paragraph for paragraph in paragraphs if paragraph)

    except Exception as e:
        # Deliberate best-effort: callers print the returned message.
        return f"[!] Error processing JSON file: {e}"


In [None]:
# Run the JSON-based extractor on one scraped record. Failures are returned
# as a "[!] ..." message string rather than raised, so check the output.
article_text = extract_article_text_from_json("data/www_fool_com_54277.json")
print(article_text)


Fool Timestamp


In [34]:
from datetime import datetime, timezone
from bs4 import BeautifulSoup

def extract_fool_publish_date(html: str) -> str:
    """
    Extract the publish time from <meta property="article:published_time">.

    Args:
        html (str): Raw HTML of the article page.

    Returns:
        str: The publish time converted to UTC and formatted as ISO 8601
        without an offset and with microseconds, e.g.
        "2024-03-14T13:33:13.000000". Returns "" when the HTML is empty, the
        meta tag is missing, or its content attribute is blank.

    Raises:
        ValueError: If the content attribute is not an ISO 8601 timestamp.
    """
    if not html:
        return ""

    soup = BeautifulSoup(html, "html.parser")

    meta_tag = soup.find("meta", attrs={"property": "article:published_time"})
    raw = (meta_tag.get("content") or "").strip() if meta_tag else ""
    if not raw:
        return ""

    # datetime.fromisoformat() only accepts a trailing "Z" from Python 3.11
    # on; spell it as an explicit UTC offset so older interpreters parse it.
    if raw.endswith("Z"):
        raw = raw[:-1] + "+00:00"

    dt = datetime.fromisoformat(raw)
    if dt.tzinfo is None:
        # astimezone() treats a naive value as the machine's local time, so
        # the result would differ between machines. Assume UTC instead.
        dt = dt.replace(tzinfo=timezone.utc)

    dt_utc = dt.astimezone(timezone.utc)
    return dt_utc.replace(tzinfo=None).isoformat(timespec="microseconds")


In [38]:
import json

# Load one scraped record and pull the publish date out of its raw HTML
with open("data/scrape_raw_9999.json", "r", encoding="utf-8") as f:
    data = json.load(f)

html = data.get("html", "")  # empty string when the record has no "html" field
pubDate = extract_fool_publish_date(html)
print("Publishing Date:", pubDate)



Publishing Date: 2024-03-14T13:33:13.000000


BENZINGA

In [None]:
import re
from bs4 import BeautifulSoup

def extract_benzinga_article_text(html: str) -> str:
    """
    Extract the main article text from Benzinga-style HTML, flattened for NLP.

    Collects the text of the <p>, <h2> and <li> tags inside
    <div id="article-body"> and returns it as one line with single spaces.

    Args:
        html (str): Raw HTML of the article page.

    Returns:
        str: The flattened article text, or "" when the HTML is empty or the
        container is not found.
    """
    if not html:
        return ""

    soup = BeautifulSoup(html, "html.parser")

    # Main content container for the article body
    body_div = soup.find("div", {"id": "article-body"})
    if not body_div:
        return ""

    wanted = ("p", "h2", "li")
    text_parts = []
    for tag in body_div.find_all(wanted):
        # find_all() also returns matches nested inside other matches (for
        # example a <p> inside an <li>); the outer tag's text already
        # includes them, so skip them to avoid duplicated sentences.
        nested = False
        for ancestor in tag.parents:
            if ancestor is body_div:
                break
            if ancestor.name in wanted:
                nested = True
                break
        if nested:
            continue
        # get_text(strip=True) would join the stripped text nodes with no
        # separator and merge words around inline tags; keep the raw text
        # and let the whitespace collapse below tidy it up.
        text_parts.append(tag.get_text())

    # Join and clean
    raw_text = " ".join(text_parts)
    clean_text = re.sub(r'\s+', ' ', raw_text).strip()

    return clean_text


In [None]:
# Test locally with the saved Benzinga page
with open("benzinga.htm", "r", encoding="utf-8") as f:
    html_content = f.read()

text = extract_benzinga_article_text(html_content)
print(text[:5000])  # Preview the first 5000 characters


In [39]:
from bs4 import BeautifulSoup
from datetime import datetime

def extract_benzinga_publish_date(html: str) -> str:
    """
    Read the publish time shown on a Benzinga article page.

    The text of the first <span class="article-date"> is parsed with the
    fixed layout 'June 21, 2025 3:23 AM' and returned in ISO 8601 form.
    No time zone is attached or converted: the result is the naive
    timestamp exactly as displayed on the page.

    Returns "" when the span is absent. A ValueError from strptime
    propagates when the span text does not match the expected layout.
    """
    date_layout = "%B %d, %Y %I:%M %p"

    date_span = BeautifulSoup(html, 'html.parser').find("span", class_="article-date")
    if date_span:
        displayed = date_span.get_text(strip=True)
        return datetime.strptime(displayed, date_layout).isoformat()

    return ""


In [42]:
# Test the publish-date extractor on the saved Benzinga page
with open("benzinga.htm", "r", encoding="utf-8") as f:
    html_content = f.read()

text = extract_benzinga_publish_date(html_content)
print(text)  # `text` holds the ISO 8601 publish date string here, not article text

2024-01-01T12:00:00


In [None]:
import re
from bs4 import BeautifulSoup

def extract_zacks_article_text(html: str) -> str:
    """
    Extract the main article text from Zacks-style HTML without ad blocks.

    Collects the text of the <p>, <h2> and <li> tags inside
    <div id="comtext">, flattens it to one line with single spaces, and
    removes every span enclosed by two runs of five or more dashes.

    Args:
        html (str): Raw HTML of the article page.

    Returns:
        str: The cleaned article text, or "" when the HTML is empty or the
        container is not found.
    """
    if not html:
        return ""

    soup = BeautifulSoup(html, "html.parser")
    body_div = soup.find("div", {"id": "comtext"})
    if not body_div:
        return ""

    # Extract paragraph, heading, and list elements
    wanted = ("p", "h2", "li")
    text_parts = []
    for tag in body_div.find_all(wanted):
        # find_all() also returns matches nested inside other matches (for
        # example a <p> inside an <li>); the outer tag's text already
        # includes them, so skip them to avoid duplicated sentences.
        nested = False
        for ancestor in tag.parents:
            if ancestor is body_div:
                break
            if ancestor.name in wanted:
                nested = True
                break
        if nested:
            continue
        # get_text(strip=True) would join the stripped text nodes with no
        # separator and merge words around inline tags; keep the raw text.
        text_parts.append(tag.get_text())

    # Combine into a single string
    raw_text = " ".join(text_parts)
    clean_text = re.sub(r'\s+', ' ', raw_text).strip()

    # Remove text between known ad break markers (non-greedy, so each
    # opening marker pairs with the nearest closing one)
    ad_pattern = r'-{5,}.*?-{5,}'
    clean_text = re.sub(ad_pattern, '', clean_text)

    # Cutting a block out leaves the spaces on both sides of it behind;
    # collapse them again so the result keeps single spacing.
    return re.sub(r'\s+', ' ', clean_text).strip()




In [None]:
# Test locally with saved file
with open("zacks.htm", "r", encoding="utf-8") as f:
    html_content = f.read()

text = extract_zacks_article_text(html_content)
print(text)  # Prints the full cleaned text (no truncation is applied)

# FINAL


Imports

In [None]:
import re
from bs4 import BeautifulSoup
import json
import os
from datetime import datetime


## HTML Extraction Helper
fool

In [None]:
def extract_fool_article_text(html: str) -> str:
    """
    Extract the article text from Motley Fool HTML as one flat line.

    Collects the text of the <p> and <h2> tags inside the first
    <div class="article-body"> and joins it with single spaces.

    Args:
        html (str): Raw HTML of the article page.

    Returns:
        str: The flattened article text, or "" when the HTML is empty or the
        container is not found.
    """
    if not html:
        return ""

    soup = BeautifulSoup(html, 'html.parser')
    article_body = soup.find("div", class_="article-body")
    if not article_body:
        return ""

    tags = article_body.find_all(['p', 'h2'])
    # get_text(strip=True) would strip every text node and join them with no
    # separator, merging words around inline tags (links, bold). Keep the raw
    # text; the whitespace collapse below takes care of the trimming.
    flat_text = ' '.join(tag.get_text() for tag in tags)
    return re.sub(r'\s+', ' ', flat_text).strip()


benzinga

In [None]:

def extract_benzinga_article_text(html: str) -> str:
    """
    Extract the article text from Benzinga HTML as one flat line.

    Collects the text of the <p>, <h2> and <li> tags inside
    <div id="article-body"> and joins it with single spaces.

    Args:
        html (str): Raw HTML of the article page.

    Returns:
        str: The flattened article text, or "" when the HTML is empty or the
        container is not found.
    """
    if not html:
        return ""

    soup = BeautifulSoup(html, 'html.parser')
    article_body = soup.find("div", id="article-body")
    if not article_body:
        return ""

    wanted = ('p', 'h2', 'li')

    def is_nested(tag) -> bool:
        # True when another wanted tag sits between `tag` and the container;
        # that outer tag's text already contains this one's text.
        for ancestor in tag.parents:
            if ancestor is article_body:
                return False
            if ancestor.name in wanted:
                return True
        return False

    # get_text(strip=True) would join the stripped text nodes with no
    # separator and merge words around inline tags, so take the raw text and
    # rely on the whitespace collapse below.
    flat_text = ' '.join(
        tag.get_text() for tag in article_body.find_all(wanted) if not is_nested(tag)
    )
    return re.sub(r'\s+', ' ', flat_text).strip()

dynamic dispatcher


In [None]:
def extract_article_text_by_source(url: str, html: str) -> str:
    """
    Pick the site-specific extractor based on the host of `url`.

    The decision is made on the URL's hostname (the domain itself or any of
    its subdomains), not on a substring of the whole URL, so a different
    site that merely mentions "fool.com" in its path or query string is not
    misrouted.

    Args:
        url (str): URL the page was scraped from; a missing scheme is
            tolerated (e.g. "www.fool.com/...").
        html (str): Raw HTML of the page.

    Returns:
        str: The extracted article text, or "" when the URL is missing,
        cannot be parsed, or belongs to an unsupported site.
    """
    # Imported locally so this cell does not rely on another cell's imports.
    from urllib.parse import urlparse

    if not isinstance(url, str) or not url.strip():
        return ""

    candidate = url.strip()
    if "://" not in candidate:
        # Without a scheme urlparse() puts the host into the path; a leading
        # "//" makes it treat the first segment as the network location.
        candidate = "//" + candidate.lstrip("/")

    try:
        host = (urlparse(candidate).hostname or "").lower()
    except ValueError:
        # urlparse() rejects some malformed hosts (e.g. broken IPv6 literals)
        return ""

    def served_by(domain: str) -> bool:
        return host == domain or host.endswith("." + domain)

    if served_by("fool.com"):
        return extract_fool_article_text(html)
    elif served_by("benzinga.com"):
        return extract_benzinga_article_text(html)
    else:
        return ""


full file processing

In [None]:
def process_json_file(input_path: str) -> str:
    """
    Parse one scraped JSON record and write the cleaned result next to it.

    Reads "html", "url" and "timestamp" from the input JSON, extracts the
    article text for the URL's source, and writes a JSON file with the keys
    "url", "scrapingTimestamp", "parsingTimestamp" and "articleText" into the
    same directory. The output name is the input name with "scrape_raw_"
    replaced by "scrape_cleaned_"; when the input name has no "scrape_raw_"
    part, "scrape_cleaned_" is prepended instead so the input file is never
    overwritten.

    Args:
        input_path (str): Path to the scraped JSON file.

    Returns:
        str: Path of the written file, or a message starting with
        "Error processing <input_path>: " when anything fails (no exception
        is raised).
    """
    # Imported locally so this cell does not rely on `timezone` having been
    # imported by another cell.
    from datetime import timezone

    try:
        with open(input_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        html = data.get("html", "")
        url = data.get("url", "")
        scraping_timestamp = data.get("timestamp", "")

        article_text = extract_article_text_by_source(url, html)

        # datetime.utcnow() is deprecated; build the same naive-UTC value
        # from an aware "now" so the "...Z" format stays unchanged.
        parsed_at = datetime.now(timezone.utc).replace(tzinfo=None)
        parsed_data = {
            "url": url,
            "scrapingTimestamp": scraping_timestamp,
            "parsingTimestamp": parsed_at.isoformat() + "Z",
            "articleText": article_text
        }

        # Create the new filename
        dirname, filename = os.path.split(input_path)
        cleaned_filename = filename.replace("scrape_raw_", "scrape_cleaned_")
        if cleaned_filename == filename:
            # Nothing was replaced, so the output path would equal the input
            # path and the source record would be overwritten.
            cleaned_filename = "scrape_cleaned_" + filename
        output_path = os.path.join(dirname, cleaned_filename)

        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(parsed_data, f, ensure_ascii=False, indent=2)

        return output_path

    except Exception as e:
        # Deliberate best-effort: the batch loop reports the message and
        # carries on with the next file.
        return f"Error processing {input_path}: {e}"



Batch Processing

In [None]:
def process_all_json_files(folder_path: str):
    """
    Run process_json_file on every raw JSON record in a folder.

    Files are handled in sorted name order. Files that do not end in ".json"
    are ignored, and so are files named "scrape_cleaned_*" or "*_parsed.json":
    the cleaned files are this pipeline's own output, and processing them
    again would replace their article text with an empty one.

    Args:
        folder_path (str): Directory containing the scraped JSON files.

    Returns:
        None. One status line is printed per processed file.
    """
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".json"):
            continue
        if filename.startswith("scrape_cleaned_") or filename.endswith("_parsed.json"):
            continue

        full_path = os.path.join(folder_path, filename)
        output = process_json_file(full_path)

        # process_json_file reports failures through its return value, so a
        # failure must not be printed as a success.
        if output.startswith(f"Error processing {full_path}: "):
            print(f"❌ {output}")
        else:
            print(f"✅ Processed: {filename} → {output}")


Run Function

In [None]:
# TODO: Adjust path
# Runs the batch over every matching .json file in the folder; each result is
# written as a new JSON file into that same folder.
process_all_json_files("./data")
