# Imports

In [None]:
import os
import re
from datetime import datetime, timezone

import boto3
from bs4 import BeautifulSoup
from pyspark.sql import SparkSession

# Connection to MinIO

In [None]:
# Connection settings for MinIO (S3-compatible) and the Spark master.
# Every value can be overridden through an environment variable so that
# credentials and hosts do not have to be edited into the notebook.
# NOTE(review): the fallback values are the previously hardcoded ones, kept only
# so existing runs behave the same — move the credentials out of the notebook.
s3_endpoint_url = os.environ.get("S3_ENDPOINT_URL", "http://172.29.16.105:9000")
s3_access_key_id = os.environ.get("S3_ACCESS_KEY_ID", "bdenggroup3")
s3_secret_access_key = os.environ.get("S3_SECRET_ACCESS_KEY", "bdenggroup3")
bucket_name = os.environ.get("S3_BUCKET_NAME", "bdenggroup3")

spark_master_url = os.environ.get("SPARK_MASTER_URL", "spark://localhost:7077")

# The s3a settings point Hadoop's S3A connector at the same endpoint and
# credentials; path-style access is enabled explicitly.
spark = (
    SparkSession.builder
    .appName("PySpark Access Logs Minio")
    .master(spark_master_url)
    .config("spark.hadoop.fs.s3a.endpoint", s3_endpoint_url)
    .config("spark.hadoop.fs.s3a.access.key", s3_access_key_id)
    .config("spark.hadoop.fs.s3a.secret.key", s3_secret_access_key)
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .getOrCreate()
)

# Parser

## HTML Extraction Helper
### fool.com

In [None]:
def extract_fool_article_data(html: str) -> tuple[str, str]:
    """Extract body text and publish date from a fool.com article page.

    Args:
        html: Raw HTML of the article page.

    Returns:
        A ``(article_text, publish_date)`` pair, as produced by
        ``extract_fool_article_text`` and ``extract_fool_publish_date``.
    """
    # Text is extracted first, then the date — same order as before.
    return extract_fool_article_text(html), extract_fool_publish_date(html)


In [None]:
def extract_fool_article_text(html: str) -> str:
    """Return the flattened body text of a fool.com article.

    Collects the ``<p>`` and ``<h2>`` elements inside
    ``<div class="article-body">`` and joins their text into a single
    whitespace-normalised line.

    Args:
        html: Raw HTML of the article page.

    Returns:
        The article text on one line, or "" when no article body is found.
    """
    soup = BeautifulSoup(html, 'html.parser')
    article_body = soup.find("div", class_="article-body")
    if article_body is None:
        return ""

    # get_text(strip=True) strips every text node and concatenates them with no
    # separator, which glues words together around inline tags such as <a> or
    # <b> ("see <a>this</a> now" -> "seethisnow"). Take the raw text instead and
    # let the whitespace normalisation below do the trimming.
    pieces = (tag.get_text() for tag in article_body.find_all(['p', 'h2']))
    return re.sub(r'\s+', ' ', ' '.join(pieces)).strip()


In [None]:
def extract_fool_publish_date(html: str) -> str:
    """Return the publish date of a fool.com article as a naive UTC ISO string.

    The date is read from the ``<meta property="article:published_time">`` tag,
    converted to UTC and rendered without offset, with microsecond precision
    (e.g. ``2024-11-04T15:00:00.000000``).

    Args:
        html: Raw HTML of the article page.

    Returns:
        The formatted timestamp, or "" when the meta tag is missing, empty or
        does not hold a parseable ISO-8601 value.
    """
    soup = BeautifulSoup(html, "html.parser")

    meta_tag = soup.find("meta", attrs={"property": "article:published_time"})
    raw_value = meta_tag.get("content") if meta_tag else None
    if not raw_value:
        return ""

    raw_value = raw_value.strip()
    # datetime.fromisoformat() only accepts a trailing "Z" from Python 3.11 on;
    # spell the UTC offset out so older interpreters parse it as well.
    if raw_value.endswith(("Z", "z")):
        raw_value = raw_value[:-1] + "+00:00"

    try:
        dt = datetime.fromisoformat(raw_value)
    except ValueError as e:
        # A malformed date must not abort the whole parsing task.
        print(f"Date parse error: {e}")
        return ""

    # NOTE: astimezone() treats a value without offset as local time of the
    # machine running this code.
    dt_utc = dt.astimezone(timezone.utc)
    return dt_utc.replace(tzinfo=None).isoformat(timespec="microseconds")

### benzinga.com

In [None]:
def extract_benzinga_article_data(html: str) -> tuple[str, str]:
    """Extract body text and publish date from a benzinga.com article page.

    Args:
        html: Raw HTML of the article page.

    Returns:
        A ``(article_text, publish_date)`` pair, as produced by
        ``extract_benzinga_article_text`` and ``extract_benzinga_publish_date``.
    """
    # Text is extracted first, then the date — same order as before.
    return extract_benzinga_article_text(html), extract_benzinga_publish_date(html)

In [None]:
def extract_benzinga_article_text(html: str) -> str:
    """Return the flattened body text of a benzinga.com article.

    Collects the ``<p>``, ``<h2>`` and ``<li>`` elements inside
    ``<div id="article-body">`` and joins their text into a single
    whitespace-normalised line.

    Args:
        html: Raw HTML of the article page.

    Returns:
        The article text on one line, or "" when no article body is found.
    """
    soup = BeautifulSoup(html, 'html.parser')
    article_body = soup.find("div", id="article-body")
    if article_body is None:
        return ""

    block_tags = ('p', 'h2', 'li')

    def is_nested(tag) -> bool:
        # True when `tag` lies inside another collected element of the body,
        # e.g. the <p> of "<li><p>text</p></li>" or an <li> of a nested list.
        for parent in tag.parents:
            if parent is article_body:
                return False
            if parent.name in block_tags:
                return True
        return False

    # find_all() is recursive, so nested matches would contribute their text
    # twice (once on their own, once through the enclosing element); keep only
    # the outermost matches.
    # get_text() without strip=True keeps the original spacing around inline
    # tags — strip=True would glue neighbouring words together.
    pieces = [
        tag.get_text()
        for tag in article_body.find_all(list(block_tags))
        if not is_nested(tag)
    ]
    return re.sub(r'\s+', ' ', ' '.join(pieces)).strip()

In [None]:
def extract_benzinga_publish_date(html: str) -> str:
    """Return the publish date of a benzinga.com article as an ISO string.

    The date is read from ``<span class="article-date">`` and is expected in
    the form ``"November 04, 2024 10:30 AM"``. It is rendered as a naive
    ISO-8601 timestamp with microsecond precision; no timezone conversion is
    applied.

    Args:
        html: Raw HTML of the article page.

    Returns:
        The formatted timestamp, or "" when the span is missing or its text
        does not match the expected format.
    """
    soup = BeautifulSoup(html, 'html.parser')
    date_span = soup.find("span", class_="article-date")
    if not date_span:
        return ""

    raw_date = date_span.get_text(strip=True)

    try:
        dt = datetime.strptime(raw_date, "%B %d, %Y %I:%M %p")
    except ValueError as e:
        # An unexpected date layout must not abort the whole parsing task.
        print(f"Date parse error: {e}")
        return ""
    return dt.isoformat(timespec="microseconds")

### zacks.com

In [None]:
def extract_zacks_article_text(html: str) -> str:
    """Return the flattened body text of a zacks.com article.

    Collects the ``<p>``, ``<h2>`` and ``<li>`` elements inside
    ``<div id="comtext">``, removes ad blocks delimited by runs of five or more
    dashes, and joins the rest into a single whitespace-normalised line.

    Args:
        html: Raw HTML of the article page.

    Returns:
        The article text on one line, or "" when no article body is found.
    """
    soup = BeautifulSoup(html, 'html.parser')
    article_body = soup.find("div", id="comtext")
    if article_body is None:
        return ""

    block_tags = ('p', 'h2', 'li')

    def is_nested(tag) -> bool:
        # True when `tag` lies inside another collected element of the body,
        # e.g. the <p> of "<li><p>text</p></li>" or an <li> of a nested list.
        for parent in tag.parents:
            if parent is article_body:
                return False
            if parent.name in block_tags:
                return True
        return False

    # Keep only the outermost matches: find_all() is recursive, so nested
    # matches would otherwise contribute their text twice.
    # get_text() without strip=True keeps the original spacing around inline
    # tags — strip=True would glue neighbouring words together.
    pieces = [
        tag.get_text()
        for tag in article_body.find_all(list(block_tags))
        if not is_nested(tag)
    ]

    # Collapse whitespace BEFORE removing ad blocks: "." does not match a
    # newline, so a block containing one would otherwise survive.
    flat_text = re.sub(r'\s+', ' ', ' '.join(pieces))
    flat_text = re.sub(r'-{5,}.*?-{5,}', '', flat_text)  # Remove ad block content
    return re.sub(r'\s+', ' ', flat_text).strip()

In [None]:
def extract_zacks_publish_date(html: str) -> str:
    """Return the publish date of a zacks.com article as an ISO string.

    The date is read from the ``<time>`` tag inside ``<p class="byline">`` and
    is expected in the form ``"November 04, 2024"``. It is rendered as a naive
    ISO-8601 timestamp with microsecond precision (time of day is midnight).

    Args:
        html: Raw HTML of the article page.

    Returns:
        The formatted timestamp, or "" when the byline / time tag is missing or
        its text does not match the expected format.
    """
    soup = BeautifulSoup(html, "html.parser")

    # Look for <p class="byline"> and then find the <time> tag inside
    byline = soup.find("p", class_="byline")
    time_tag = byline.find("time") if byline else None
    raw_date = time_tag.text.strip() if time_tag else ""

    if raw_date:
        try:
            # Example: "November 04, 2024"
            dt = datetime.strptime(raw_date, "%B %d, %Y")
            return dt.isoformat(timespec="microseconds")
        except ValueError as e:
            print(f"Date parse error: {e}")

    # No usable date: report "unknown" the same way the fool.com and
    # benzinga.com parsers do. Returning the current time here would store a
    # made-up publish date that cannot be told apart from a real one.
    return ""

In [None]:
def extract_zacks_article_data(html: str) -> tuple[str, str]:
    """Extract body text and publish date from a zacks.com article page.

    Args:
        html: Raw HTML of the article page.

    Returns:
        A ``(article_text, publish_date)`` pair, as produced by
        ``extract_zacks_article_text`` and ``extract_zacks_publish_date``.
    """
    # Text is extracted first, then the date — same order as before.
    return extract_zacks_article_text(html), extract_zacks_publish_date(html)

### Dynamic Dispatcher

In [None]:
def extract_article_data_by_source(url: str, html: str) -> tuple[str, str]:
    """Dispatch to the site-specific parser that matches the article URL.

    The source is recognised by a substring match on the URL
    ("fool.com", "benzinga.com", "zacks.com"), checked in that order.

    Args:
        url: URL the article was scraped from.
        html: Raw HTML of the article page.

    Returns:
        A ``(article_text, publish_date)`` pair. For a URL that matches no
        known source (including an empty or missing URL) both elements are "".
    """
    url = url or ""  # tolerate None so the substring checks cannot raise

    if "fool.com" in url:
        return extract_fool_article_data(html)
    elif "benzinga.com" in url:
        return extract_benzinga_article_data(html)
    elif "zacks.com" in url:
        return extract_zacks_article_data(html)
    else:
        # Must be a pair: callers unpack the result into two names, and a bare
        # "" would make that unpacking raise ValueError.
        return "", ""


## Processing Function for Spark workers

In [None]:
def process_key(key):
    """Parse one raw scrape object and store the result under ``parsed/``.

    Downloads the JSON object ``key`` from the bucket, extracts article text
    and publish date from its ``html`` field with the parser matching its
    ``url``, and uploads the parsed record as JSON to the key obtained by
    replacing ``raw/scrape_raw_`` with ``parsed/parsed_``.

    Args:
        key: Object key of the raw scrape file; must contain
            ``raw/scrape_raw_``.

    Returns:
        A status string of the form ``"parsed <key> → <new_key>"``.

    Raises:
        ValueError: If ``key`` does not contain ``raw/scrape_raw_``; writing
            would then target the raw object itself.
    """
    # Local imports, as in the original implementation of this worker function.
    import boto3
    import json
    from datetime import datetime, timezone

    new_key = key.replace("raw/scrape_raw_", "parsed/parsed_")
    if new_key == key:
        # Without the expected prefix the output key equals the input key and
        # put_object() below would overwrite the raw scrape.
        raise ValueError(f"not a raw scrape key: {key}")

    s3 = boto3.client("s3",
                      endpoint_url=s3_endpoint_url,
                      aws_access_key_id=s3_access_key_id,
                      aws_secret_access_key=s3_secret_access_key)

    obj = s3.get_object(Bucket=bucket_name, Key=key)
    body = obj['Body']
    try:
        data = json.load(body)
    finally:
        body.close()  # release the underlying HTTP connection

    # "or" also covers fields that are present but null in the JSON.
    html = data.get("html") or ""
    url = data.get("url") or ""
    scraping_timestamp = data.get("timestamp", "")

    extracted = extract_article_data_by_source(url, html)
    # Tolerate an empty result for sources without a parser instead of
    # failing on the unpacking.
    article_text, published_date = extracted if extracted else ("", "")

    # Same "<naive UTC ISO>Z" format as before, without the deprecated utcnow().
    parsing_timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z"

    parsed_data = {
        "url": url,
        "scrapingTimestamp": scraping_timestamp,
        "parsingTimestamp": parsing_timestamp,
        "articleTimestamp": published_date,
        "articleText": article_text
    }

    s3.put_object(
        Bucket=bucket_name,
        Key=new_key,
        Body=json.dumps(parsed_data, ensure_ascii=False, indent=2).encode("utf-8"),
        ContentType="application/json"
    )

    return f"parsed {key} → {new_key}"

In [None]:
# -- Step 1: List keys from S3 in the 'raw/' folder
def list_s3_keys(bucket_name, prefix, s3_client):
    """List the raw scrape JSON keys stored under ``prefix``.

    Args:
        bucket_name: Bucket to list.
        prefix: Key prefix passed to the ``list_objects_v2`` paginator.
        s3_client: Client object providing ``get_paginator``.

    Returns:
        All keys, in listing order, that start with ``raw/scrape_raw_`` and
        end with ``.json``.
    """
    pages = s3_client.get_paginator('list_objects_v2').paginate(
        Bucket=bucket_name, Prefix=prefix
    )
    return [
        entry['Key']
        for page in pages
        for entry in page.get('Contents', [])  # a page may have no 'Contents'
        if entry['Key'].startswith("raw/scrape_raw_") and entry['Key'].endswith(".json")
    ]

In [None]:
# -- Step 2: Build a driver-side boto3 client, list the raw keys and
#            distribute the parsing over the Spark cluster
s3 = boto3.client(
    "s3",
    endpoint_url=s3_endpoint_url,
    aws_access_key_id=s3_access_key_id,
    aws_secret_access_key=s3_secret_access_key,
)

all_keys = list_s3_keys(bucket_name, "raw/", s3)

# Cap the run at the first 100000 keys; each key is handled by process_key
# and the resulting status strings are gathered back on the driver.
rdd = spark.sparkContext.parallelize(all_keys[:100000])
saved_keys = rdd.map(process_key).collect()

In [None]:
# Shut down the Spark session created at the top of the notebook.
spark.stop()