In [3]:
import json
import matplotlib.pyplot as plt
from collections import Counter
import glob
import os
import shutil

from concurrent.futures import ThreadPoolExecutor, as_completed
import sys
from selenium.common import TimeoutException
import os
import json
import random
import time
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup, Comment
from confluent_kafka import Producer, KafkaError
from datetime import datetime
import json

import psutil

import boto3

### Extract URL list from AlphaVantage downloads

In [None]:
# Folder containing the AlphaVantage URL index files (one JSON file per ticker)
folder_path = "alphavantage_news_json_selection"

# Get all files matching the pattern *_2024.json
file_paths = glob.glob(os.path.join(folder_path, "*_2024.json"))


def extract_domain(url):
    """Return the third "/"-separated component of ``url`` (the host of an
    absolute URL), or ``None`` when it cannot be determined.

    ``None`` is returned for non-string input (e.g. an article without a
    "url" field) and for strings with fewer than three "/"-separated parts.
    """
    if not isinstance(url, str):
        return None
    parts = url.split("/")
    return parts[2] if len(parts) > 2 else None


def collect_relevance_scores(articles):
    """Collect the relevance scores of all ticker sentiments in ``articles``.

    Args:
        articles: iterable of article dicts; the keys read are "url" and
            "ticker_sentiment" (a list of dicts with "relevance_score").

    Returns:
        A ``(rounded_scores, pairs)`` tuple:
        * ``rounded_scores`` - every parseable score, rounded to 2 decimals.
        * ``pairs`` - ``(score, url, domain)`` triples with the unrounded
          score, only for articles whose domain could be extracted.

    Scores that are missing or cannot be converted to ``float`` are skipped.
    """
    rounded_scores = []
    pairs = []

    for article in articles:
        article_url = article.get("url")
        domain = extract_domain(article_url)

        # "or []" also covers an explicit null value in the JSON.
        for sentiment in article.get("ticker_sentiment") or []:
            score_str = sentiment.get("relevance_score")
            if score_str is None:
                continue
            try:
                score = float(score_str)
            except (TypeError, ValueError):
                continue  # Skip if score is not a valid float

            rounded_scores.append(round(score, 2))
            # The score is kept even when the URL is unusable; only the
            # (score, url, domain) triple is dropped in that case.
            if domain is not None:
                pairs.append((score, article_url, domain))

    return rounded_scores, pairs


all_relevance_scores = []
score_url_pairs = []  # (score, url, domain) tuples
domaincounter = Counter()

for file_path in file_paths:
    # Extract ticker from filename (e.g., AAPL from AAPL_2024.json)
    ticker = os.path.basename(file_path).split("_")[0]

    # Load the data
    with open(file_path, "r", encoding="utf-8") as f:
        articles = json.load(f)

    ticker_relevance_scores, ticker_pairs = collect_relevance_scores(articles)

    if not ticker_relevance_scores:
        print(f"No relevance scores found in {ticker}_2024.json")
        continue

    all_relevance_scores.extend(ticker_relevance_scores)
    score_url_pairs.extend(ticker_pairs)
    domaincounter.update(domain for _, _, domain in ticker_pairs)

# Count frequencies
total_counts = Counter(all_relevance_scores)

# Print domain stats sorted by article count (descending)
for domain, count in domaincounter.most_common():
    print(f"Domain: {domain} has {count} articles")

### Filter URLs to only include top 3 news outlets

In [None]:
# Keep every URL at most once, and only when
#   * its relevance score is strictly positive (threshold 0.0), and
#   * its domain contains "benzinga", "fool", or "zacks".
outlet_keywords = ("benzinga", "fool", "zacks")

filtered_pairs = []
seen_urls = set()

for relevance, link, host in score_url_pairs:
    if not relevance > 0.0:
        continue
    if link in seen_urls:
        continue
    if not any(keyword in host for keyword in outlet_keywords):
        continue
    seen_urls.add(link)
    filtered_pairs.append((relevance, link, host))

# Re-count the articles per domain, now restricted to the filtered selection.
domaincounter = Counter(host for _, _, host in filtered_pairs)

for domain, count in domaincounter.most_common():
    print(f"Domain: {domain} has {count} articles")

### Get and sort previously scraped URLs

In [None]:


# Folder containing the previously scraped article files
folder_path = "raw_unsorted"
destination_folder = "raw_sorted"

# Get all files matching the pattern *.json
file_paths = glob.glob(os.path.join(folder_path, "*.json"))


def collect_scraped_articles(paths, target_folder, keywords=("benzinga", "fool", "zacks")):
    """Copy one file per unique article URL into ``target_folder``.

    Each file in ``paths`` is expected to hold a JSON object with a "url"
    key. A file is copied when the domain of its URL contains one of
    ``keywords`` and the URL has not been seen before.

    Files that cannot be read or parsed, that do not hold a JSON object, or
    whose URL is missing or malformed are reported and skipped instead of
    aborting the whole pass.

    Args:
        paths: iterable of JSON file paths.
        target_folder: directory receiving the copies (created if missing).
        keywords: substrings identifying the wanted domains.

    Returns:
        The set of URLs whose files were copied.
    """
    os.makedirs(target_folder, exist_ok=True)
    urls = set()

    # Sorted so that the file kept for a duplicated URL is the same on every run.
    for path in sorted(paths):
        try:
            with open(path, "r", encoding="utf-8") as f:
                article = json.load(f)
        except (OSError, ValueError) as exc:  # JSONDecodeError is a ValueError
            print(f"Skipping unreadable file {path}: {exc}")
            continue

        article_url = article.get("url") if isinstance(article, dict) else None
        try:
            domain = article_url.split("/")[2]
        except (AttributeError, IndexError):
            print(f"Skipping {path}: missing or malformed URL")
            continue

        if article_url in urls or not any(keyword in domain for keyword in keywords):
            continue

        urls.add(article_url)
        shutil.copy(path, os.path.join(target_folder, os.path.basename(path)))

    return urls


scraped_urls = collect_scraped_articles(file_paths, destination_folder)

print(f'Anzahl bereits gescrapter Artikel: {len(scraped_urls)}')

### Generate set of outstanding URLs

In [None]:
# URLs that passed the outlet filter but were not found among the scraped files.
outstanding_urls = seen_urls - scraped_urls

print(f'Anzahl noch ausstehender Artikel: {len(outstanding_urls)}')

### Scrape outstanding URLs

In [None]:



# Kafka publishing is disabled; the former configuration is kept for reference:
#   conf = {'bootstrap.servers': '172.29.16.101:9092'}
#   producer = Producer(conf)
#   topic = 'g3-raw-html-test'

# Browser identities to pick from when a driver is configured.
USER_AGENTS = [
    # Windows: Chrome 122, Chrome 121
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
    # Windows: Firefox 122, Firefox 121
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:122.0) "
    "Gecko/20100101 Firefox/122.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) "
    "Gecko/20100101 Firefox/121.0",
    # Windows: Edge 122
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0",
    # macOS: Chrome 122
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_4) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
    # macOS: Safari 16.4
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_4) AppleWebKit/605.1.15 "
    "(KHTML, like Gecko) Version/16.4 Safari/605.1.15",
    # Linux: Firefox 122
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:122.0) "
    "Gecko/20100101 Firefox/122.0",
]

# Window sizes as "width,height" strings, largest first.
SCREEN_SIZES = [
    f"{width},{height}"
    for width, height in (
        (1920, 1080),
        (1680, 1050),
        (1600, 900),
        (1440, 900),
        (1366, 768),
        (1280, 1024),
        (1024, 768),
    )
]


# Directory that receives one JSON file per scraped article; created on demand.
OUTPUT_FOLDER = "outstanding_scraping"
os.makedirs(OUTPUT_FOLDER, exist_ok=True)


# Announce how many URLs are about to be scraped.
print(f"\nScraping outstanding URLs ({len(outstanding_urls)} found)")


def kill_orphan_chrome_processes():
    """Kill every process whose name is exactly 'chrome' or 'chromedriver'.

    Note that no check is made whether a process is actually orphaned: all
    matching processes reported by psutil are killed. Processes that have
    already exited or may not be killed are ignored.
    """
    target_names = ('chrome', 'chromedriver')
    for candidate in psutil.process_iter(['name']):
        if candidate.info['name'] not in target_names:
            continue
        try:
            candidate.kill()
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            # Already gone or not ours to kill - nothing to do.
            continue


def clean_html(raw_html):
    """Strip non-content markup from ``raw_html`` and return the rest as a string.

    The document is parsed with the "html.parser" backend. All script, style,
    noscript, iframe, svg, header, footer, nav and aside elements are removed
    together with their contents, as are all HTML comments.
    """
    unwanted_tags = ["script", "style", "noscript", "iframe", "svg", "header", "footer", "nav", "aside"]

    document = BeautifulSoup(raw_html, "html.parser")

    for element in document.find_all(unwanted_tags):
        element.decompose()

    html_comments = document.find_all(string=lambda text: isinstance(text, Comment))
    for html_comment in html_comments:
        html_comment.extract()

    return str(document)


def get_driver():
    """Create a headless Chrome driver with a randomly chosen identity.

    One entry of USER_AGENTS and one of SCREEN_SIZES is picked at random and
    passed to Chrome as the user agent and window size.
    """
    chrome_flags = [
        "--headless",
        "--disable-gpu",
        "--no-sandbox",
        f"user-agent={random.choice(USER_AGENTS)}",
        f"window-size={random.choice(SCREEN_SIZES)}",
    ]

    chrome_options = Options()
    for flag in chrome_flags:
        chrome_options.add_argument(flag)

    return webdriver.Chrome(options=chrome_options)


def scrape_url(i, url):
    """Scrape one page and store its cleaned HTML as a JSON file.

    The page is opened in a fresh driver from ``get_driver()``. A button
    containing the text "Accept" is clicked if one becomes clickable, then
    the page source is passed through ``clean_html`` and written to
    ``{OUTPUT_FOLDER}/{domain with dots replaced by underscores}_{i}.json``
    together with the URL and an ISO timestamp.

    Args:
        i: running number of the URL; used in log output and the file name.
        url: absolute URL of the page to scrape.

    Returns:
        None. Any error while scraping is printed, not raised.

    This function runs concurrently in several worker threads, so it only
    shuts down its own driver. It deliberately does not call
    ``kill_orphan_chrome_processes()``: that kills every chrome/chromedriver
    process and would take down the browsers of the other workers.
    """
    driver = None
    try:
        # Resolve the domain first so a malformed URL fails before a browser
        # is launched for it.
        domain = url.split("/")[2]
        driver = get_driver()
        print(f"\n🌐 ({i}) Opening: {url}")
        driver.get(url)
        wait = WebDriverWait(driver, 1)
        time.sleep(3)  # give the page time to render

        try:
            cookie_button = wait.until(EC.element_to_be_clickable((By.XPATH, "//button[contains(., 'Accept')]")))
            cookie_button.click()
            print("✅ Cookie banner accepted")
            time.sleep(3)
        except Exception:
            # Best effort only. "Exception" rather than a bare except so that
            # KeyboardInterrupt / SystemExit are not swallowed here.
            print("⚠️ No cookie banner appeared or could not be clicked")

        raw_html = driver.page_source
        pure_html = clean_html(raw_html)

        data = {
            'url': url,
            'html': pure_html,
            'timestamp': datetime.now().isoformat()
        }

        filename = f"{OUTPUT_FOLDER}/{domain.replace('.', '_')}_{i}.json"
        with open(filename, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        print(f"💾 Saved JSON: {filename}")

    except Exception as e:
        print(f"❌ Error scraping {url}: {e}")
    finally:
        if driver is not None:
            try:
                driver.quit()
            except Exception as e:
                # A failing shutdown must not turn a finished scrape into a failed task.
                print(f"⚠️ Could not quit driver for {url}: {e}")


# --- Run Multithreaded Scraper ---
MAX_WORKERS = 8  # Adjust depending on hardware resources

try:
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        # Sorted so that a URL receives the same running number (and thus the
        # same output file name) on every run; set order is arbitrary.
        futures = [
            executor.submit(scrape_url, i, url)
            for i, url in enumerate(sorted(outstanding_urls), start=1)
        ]

        try:
            for future in as_completed(futures):
                try:
                    future.result()  # re-raises anything scrape_url let escape
                except Exception as e:
                    print(f"Task failed: {e}")
        except KeyboardInterrupt:
            # Leaving the "with" block waits for all submitted tasks. Cancel
            # everything still queued so that only the tasks already running
            # have to finish before control returns.
            for pending in futures:
                pending.cancel()
            raise

except KeyboardInterrupt:
    print("\nExecution interrupted by user. Shutting down...")
    # Optionally: add clean-up logic here (e.g., write partially completed results to disk)
finally:
    # All workers are done at this point, so any chrome/chromedriver process
    # still alive is a leftover and can be removed without hitting a live task.
    kill_orphan_chrome_processes()

### Transfer raw scraping files to S3 server

In [None]:


# Connection settings for the S3-compatible object store. Each value can be
# overridden through an environment variable.
# NOTE(review): the fallback values below are credentials committed to the
# notebook. They are kept only so existing runs keep working - move them to
# the environment and rotate the key.
s3_endpoint = os.environ.get("S3_ENDPOINT_URL", "http://172.29.16.105:9000")
s3_access_key = os.environ.get("S3_ACCESS_KEY", "bdenggroup3")
s3_secret_key = os.environ.get("S3_SECRET_KEY", "bdenggroup3")
bucket_name = os.environ.get("S3_BUCKET", "bdenggroup3")

s3 = boto3.client("s3",
                  endpoint_url=s3_endpoint,
                  aws_access_key_id=s3_access_key,
                  aws_secret_access_key=s3_secret_key,
                  )

local_folder1 = "raw_sorted"
local_folder2 = "outstanding_scraping"

file_paths1 = glob.glob(os.path.join(local_folder1, "*.json"))
file_paths2 = glob.glob(os.path.join(local_folder2, "*.json"))

# glob returns files in arbitrary order; sorting makes the mapping from local
# file to remote object key the same on every run.
file_paths = sorted(file_paths1) + sorted(file_paths2)

# Objects are stored as raw/scrape_raw_<n>.json, numbered from 0 in upload order.
for i, local_file_path in enumerate(file_paths):
    remote_file_path = f'raw/scrape_raw_{i}.json'
    s3.upload_file(local_file_path, bucket_name, remote_file_path)
    uploaded = i + 1
    if uploaded % 100 == 0:
        print(f'Checkpoint: {uploaded} of {len(file_paths)} uploaded')



Checkpoint: 100 of 43526 uploaded
