Imports

In [None]:
!pip install boto3

In [None]:
import os
import json
import re
from datetime import datetime
from bs4 import BeautifulSoup
from io import BytesIO
import boto3
from pyspark.sql import SparkSession

# Connection to MinIO

In [None]:
# MinIO (S3-compatible) connection settings for the s3a filesystem.
# Each value can be overridden through an environment variable; the fallbacks
# reproduce the values this notebook used before.
# NOTE(review): the fallback credentials are still stored in the notebook --
# drop them once the environment variables are provisioned.
minio_endpoint = os.environ.get("MINIO_ENDPOINT", "http://172.29.16.105:9000")
minio_access_key = os.environ.get("MINIO_ACCESS_KEY", "bdenggroup3")
minio_secret_key = os.environ.get("MINIO_SECRET_KEY", "bdenggroup3")

spark = (
    SparkSession.builder
    .appName("PySpark Access Logs Minio")
    .master("spark://localhost:7077")
    .config("spark.hadoop.fs.s3a.endpoint", minio_endpoint)
    .config("spark.hadoop.fs.s3a.access.key", minio_access_key)
    .config("spark.hadoop.fs.s3a.secret.key", minio_secret_key)
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .getOrCreate()
)

# Parser

## HTML Extraction Helper
Extractor for fool.com (The Motley Fool) articles

In [None]:
def extract_fool_article_text(html: str) -> str:
    """Return the flattened body text of a fool.com article page.

    Finds the ``<div class="article-body">`` container and joins the text of
    its ``<p>`` and ``<h2>`` descendants into one whitespace-normalized string.

    Args:
        html: HTML markup of the page.

    Returns:
        The article text on a single line, or ``""`` when ``html`` is empty or
        the container is not present.
    """
    if not html:
        return ""

    soup = BeautifulSoup(html, 'html.parser')
    article_body = soup.find("div", class_="article-body")
    if article_body is None:
        return ""

    tags = article_body.find_all(['p', 'h2'])
    # get_text(strip=True) strips every text node before concatenating them,
    # so "<p>Buy <a>AAPL</a> now</p>" would come out as "BuyAAPLnow". Keep the
    # original spacing and let the regex below normalize the whitespace.
    flat_text = ' '.join(tag.get_text() for tag in tags)
    return re.sub(r'\s+', ' ', flat_text).strip()


Extractor for benzinga.com articles

In [None]:
def extract_benzinga_article_text(html: str) -> str:
    """Return the flattened body text of a benzinga.com article page.

    Finds the ``<div id="article-body">`` container and joins the text of its
    ``<p>``, ``<h2>`` and ``<li>`` descendants into one whitespace-normalized
    string.

    Args:
        html: HTML markup of the page.

    Returns:
        The article text on a single line, or ``""`` when ``html`` is empty or
        the container is not present.
    """
    if not html:
        return ""

    soup = BeautifulSoup(html, 'html.parser')
    article_body = soup.find("div", id="article-body")
    if article_body is None:
        return ""

    tags = article_body.find_all(['p', 'h2', 'li'])
    # get_text(strip=True) strips every text node before concatenating them,
    # so "<p>Buy <a>AAPL</a> now</p>" would come out as "BuyAAPLnow". Keep the
    # original spacing and let the regex below normalize the whitespace.
    flat_text = ' '.join(tag.get_text() for tag in tags)
    return re.sub(r'\s+', ' ', flat_text).strip()

Extractor for zacks.com articles

In [None]:
def extract_zacks_article_text(html: str) -> str:
    """Return the flattened body text of a zacks.com article page.

    Finds the ``<div id="comtext">`` container, joins the text of its ``<p>``,
    ``<h2>`` and ``<li>`` descendants, removes ad blocks (text enclosed between
    two runs of five or more dashes) and normalizes whitespace.

    Args:
        html: HTML markup of the page.

    Returns:
        The article text on a single line, or ``""`` when ``html`` is empty or
        the container is not present.
    """
    if not html:
        return ""

    soup = BeautifulSoup(html, 'html.parser')
    article_body = soup.find("div", id="comtext")
    if article_body is None:
        return ""

    tags = article_body.find_all(['p', 'h2', 'li'])
    # get_text(strip=True) strips every text node before concatenating them,
    # so "<p>Buy <a>AAPL</a> now</p>" would come out as "BuyAAPLnow". Keep the
    # original spacing and normalize the whitespace afterwards.
    flat_text = ' '.join(tag.get_text() for tag in tags)

    # Collapse whitespace before the ad-block pattern runs: "." does not match
    # a newline, so a block containing line breaks would otherwise survive.
    flat_text = re.sub(r'\s+', ' ', flat_text)
    flat_text = re.sub(r'-{5,}.*?-{5,}', '', flat_text)  # Remove ad block content

    # Removing a block can leave two spaces side by side; normalize once more.
    return re.sub(r'\s+', ' ', flat_text).strip()


Dispatcher: selects the site-specific extractor based on the article URL


In [None]:
def extract_article_text_by_source(url: str, html: str) -> str:
    """Run the site-specific extractor that matches ``url`` on ``html``.

    The URL is tested for the substrings ``fool.com``, ``benzinga.com`` and
    ``zacks.com``, in that order; the first hit decides which extractor runs.

    Args:
        url: Address the page was scraped from.
        html: HTML markup of the page.

    Returns:
        Whatever the selected extractor returns, or ``""`` when none of the
        known domains occurs in ``url``.
    """
    # Ordered route table. The lambdas defer the call so that only the
    # extractor of the first matching domain is ever invoked.
    routes = (
        ("fool.com", lambda: extract_fool_article_text(html)),
        ("benzinga.com", lambda: extract_benzinga_article_text(html)),
        ("zacks.com", lambda: extract_zacks_article_text(html)),
    )

    for domain, run_extractor in routes:
        if domain in url:
            return run_extractor()

    return ""


Full file processing: read the raw JSON object from the bucket, parse its HTML, and write the parsed JSON back

In [None]:
def process_json_file(s3_client, bucket: str, key: str) -> str:
    """Parse one raw scrape object and store the result under ``parsed/``.

    Downloads the JSON object at ``key``, reads its ``html``, ``url`` and
    ``timestamp`` fields, extracts the article text, and uploads a new JSON
    document (``url``, ``scrapingTimestamp``, ``parsingTimestamp``,
    ``articleText``) whose key has the ``raw/scrape_raw_`` prefix replaced by
    ``parsed/parsed_``.

    Args:
        s3_client: Client object exposing ``get_object`` and ``put_object``.
        bucket: Name of the bucket that holds both the raw and parsed objects.
        key: Key of the raw object; must start with ``raw/scrape_raw_``.

    Returns:
        A status line: ``"✅ <key> → <new_key>"`` on success, or
        ``"❌ Error processing <key>: <error>"`` when anything fails. Errors
        are reported through the return value, never raised.
    """
    from datetime import timezone  # local import: only needed for the timestamp

    raw_prefix = "raw/scrape_raw_"
    parsed_prefix = "parsed/parsed_"

    try:
        # Validate before any S3 traffic. A key without the raw prefix would
        # otherwise map onto itself, and put_object would then overwrite the
        # raw scrape with the parsed document.
        if not key.startswith(raw_prefix):
            raise ValueError(f"key does not start with '{raw_prefix}'")
        new_key = parsed_prefix + key[len(raw_prefix):]

        obj = s3_client.get_object(Bucket=bucket, Key=key)
        data = json.load(obj['Body'])

        # "or" also covers an explicit JSON null, which .get() would pass on
        # as None and break the extractors.
        html = data.get("html") or ""
        url = data.get("url") or ""
        scraping_timestamp = data.get("timestamp", "")

        article_text = extract_article_text_by_source(url, html)

        # datetime.utcnow() is deprecated; take an aware UTC time and drop the
        # tzinfo so the serialized form stays "<naive ISO timestamp>Z".
        parsing_timestamp = (
            datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z"
        )

        parsed_data = {
            "url": url,
            "scrapingTimestamp": scraping_timestamp,
            "parsingTimestamp": parsing_timestamp,
            "articleText": article_text
        }

        s3_client.put_object(
            Bucket=bucket,
            Key=new_key,
            Body=json.dumps(parsed_data, ensure_ascii=False, indent=2).encode("utf-8"),
            ContentType="application/json"
        )

        return f"✅ {key} → {new_key}"

    except Exception as e:
        # Deliberate best-effort: one bad file must not abort a batch run.
        return f"❌ Error processing {key}: {e}"


Initializing S3 Client & Batch processing

In [None]:
# S3 client pointed at the MinIO endpoint. Endpoint and credentials can be
# overridden through environment variables; the fallbacks reproduce the values
# this notebook used before.
# NOTE(review): the fallback credentials are still stored in the notebook --
# drop them once the environment variables are provisioned.
s3 = boto3.client(
    's3',
    endpoint_url=os.environ.get("MINIO_ENDPOINT", 'http://172.29.16.105:9000'),
    aws_access_key_id=os.environ.get("MINIO_ACCESS_KEY", 'bdenggroup3'),
    aws_secret_access_key=os.environ.get("MINIO_SECRET_KEY", 'bdenggroup3')
)

bucket_name = "bdenggroup3"

# Safety cap on how many raw files a single run may process.
MAX_FILES = 100000

paginator = s3.get_paginator('list_objects_v2')

counter = 0
limit_reached = False

for page in paginator.paginate(Bucket=bucket_name, Prefix='raw/'):
    # A page without matching objects has no 'Contents' entry.
    for obj in page.get('Contents', []):
        key = obj["Key"]
        if key.startswith("raw/scrape_raw_") and key.endswith(".json"):
            result = process_json_file(s3, bucket_name, key)
            print(f'File {counter}: {result}')
            counter += 1
        if counter >= MAX_FILES:
            print("too many files")
            limit_reached = True
            break
    # Leave the pagination loop as well once the cap has been hit.
    if limit_reached:
        break

In [None]:
# Stop the SparkSession held in `spark` now that the batch run is finished.
spark.stop()