In [None]:
import json
import matplotlib.pyplot as plt
from collections import Counter
import glob
import os

def parse_relevance_score(raw_score):
    """Convert a raw ``relevance_score`` value to ``float``.

    Args:
        raw_score: Value stored under ``"relevance_score"`` in a
            ``ticker_sentiment`` entry; may be a numeric string, a number,
            ``None`` or malformed data.

    Returns:
        The score as a float, or ``None`` when the value is missing or cannot
        be interpreted as a number.
    """
    if raw_score is None:
        return None
    try:
        return float(raw_score)
    except (TypeError, ValueError):
        # float() raises TypeError (not ValueError) for non-scalar JSON
        # values such as lists or dicts; treat both as "no usable score".
        return None


# Folder containing the files
folder_path = "alphavantage_news_json_byYear"

# Get all files matching the pattern *_2024.json.
# glob returns paths in arbitrary order, so sort them: the order of
# score_url_pairs must be stable between runs because the scraping cell
# resumes by list position.
# NOTE(review): resume positions recorded from an unsorted run may not line
# up with the sorted order.
file_paths = sorted(glob.glob(os.path.join(folder_path, "*_2024.json")))

all_relevance_scores = []  # rounded scores from every file (histogram input)
score_url_pairs = []  # (unrounded score, url) tuples

for file_path in file_paths:
    file_name = os.path.basename(file_path)
    # Extract ticker from filename (e.g., AAPL from AAPL_2024.json). Split on
    # the last underscore only, so a ticker containing "_" stays intact.
    ticker = file_name.rsplit("_", 1)[0]

    # Load the data
    with open(file_path, "r", encoding="utf-8") as f:
        articles = json.load(f)

    # Collect relevance scores
    ticker_relevance_scores = []

    for article in articles:
        article_url = article.get("url")
        # "or []" also covers an explicit null stored under ticker_sentiment.
        for sentiment in article.get("ticker_sentiment") or []:
            score_str = sentiment.get("relevance_score")
            score = parse_relevance_score(score_str)
            if score is None:
                continue
            score_rounded = round(score, 2)
            ticker_relevance_scores.append(score_rounded)
            all_relevance_scores.append(score_rounded)
            # One pair per sentiment entry, so a URL can appear several times.
            score_url_pairs.append((score, article_url))

    if not ticker_relevance_scores:
        print(f"No relevance scores found in {file_name}")

# Count how often each (rounded) relevance score occurs
total_counts = Counter(all_relevance_scores)

# Hand matplotlib plain lists in ascending score order instead of dict views.
scores = sorted(total_counts)
frequencies = [total_counts[score_value] for score_value in scores]

# Plot histogram: one bar per distinct score. The 0.01 bar width matches the
# two-decimal rounding applied when the scores were collected.
fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(scores, frequencies, width=0.01)
ax.set_xlabel("Relevance Score")
ax.set_ylabel("Frequency")
ax.set_title("Histogram of All Relevance Scores 2024")
ax.grid(True)
fig.tight_layout()

# Show or save the plot
plt.show()
# Or to save (call before plt.show()): fig.savefig("all_2024_relevance_hist.png")

# Release the figure so repeated runs of this cell do not accumulate figures.
plt.close(fig)

In [None]:
# Report the total number of (score, url) pairs gathered by the loading cell.
total_pairs = len(score_url_pairs)
print(f"\n🔗 URLs: ({total_pairs} found)")

In [None]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from selenium.common import TimeoutException
import os
import json
import random
import time
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup, Comment
import psutil

# Simulated input (replace with your real input)
#score_url_pairs = [(0.31, "https://www.example.com"), (0.42, "https://www.bbc.com")]

# Kafka config (if needed later)
# from confluent_kafka import Producer
# producer = Producer({'bootstrap.servers': '172.29.16.101:9092'})
# topic = 'g3-raw-html-test'

# Pool of desktop browser User-Agent strings. get_driver() draws one at random
# for every new browser session.
# NOTE(review): the pool contains Firefox and Safari strings although
# get_driver() always launches Chrome — confirm this mismatch is intended.
USER_AGENTS = [
    # Chrome on Windows
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",

    # Firefox on Windows
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:122.0) Gecko/20100101 Firefox/122.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0",

    # Edge on Windows
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0",

    # Chrome on macOS
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",

    # Safari on macOS
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15",

    # Firefox on Linux
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:122.0) Gecko/20100101 Firefox/122.0",
]


# Pool of browser window sizes. get_driver() draws one at random and passes it
# verbatim as the value of Chrome's "window-size=" argument.
# Presumably "width,height" in pixels — verify against the Chrome switch docs.
SCREEN_SIZES = [
    "1920,1080",
    "1680,1050",
    "1600,900",
    "1440,900",
    "1366,768",
    "1280,1024",
    "1024,768"
]


# Results go into a folder stamped with today's date (YYYYMMDD). Create it up
# front; an existing folder from an earlier run on the same day is reused.
OUTPUT_FOLDER = datetime.now().strftime('%Y%m%d') + "_scraping_AV_texts_multi"
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# Minimum relevance score (exclusive) a (score, url) pair needs to be scraped.
# Kept in one place so the filter and the log message cannot drift apart.
RELEVANCE_THRESHOLD = 0.2

# The comparison uses the unrounded score stored in score_url_pairs.
# NOTE(review): score_url_pairs holds one pair per ticker_sentiment entry, so
# the same URL can occur several times here (and may be None when an article
# had no "url" field). Entries are deliberately not dropped or de-duplicated,
# because the scraper numbers its output files by position in this list.
filtered_urls = [url for score, url in score_url_pairs if score > RELEVANCE_THRESHOLD]
print(f"\n🔗 URLs with relevance score > {RELEVANCE_THRESHOLD} ({len(filtered_urls)} found)")


def kill_orphan_chrome_processes():
    """Forcefully terminate every process named ``chrome`` or ``chromedriver``.

    Matching is by exact process name over everything ``psutil.process_iter``
    yields, so it is not limited to browsers started from this notebook.
    Processes that have already exited, or that may not be touched, are
    skipped silently.
    """
    target_names = ('chrome', 'chromedriver')
    for process in psutil.process_iter(['name']):
        if process.info['name'] not in target_names:
            continue
        try:
            process.kill()
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            # Already gone, or not ours to kill: nothing more to do.
            continue


def clean_html(raw_html):
    """Strip non-content markup from a page and return the remaining HTML.

    Args:
        raw_html: HTML source of the page as a string.

    Returns:
        The document serialised back to a string after removing script,
        style, noscript, iframe, svg, header, footer, nav and aside elements
        as well as all HTML comments.
    """
    unwanted_tags = ["script", "style", "noscript", "iframe", "svg", "header", "footer", "nav", "aside"]
    document = BeautifulSoup(raw_html, "html.parser")

    # Remove each unwanted element together with everything nested inside it.
    for element in document.find_all(unwanted_tags):
        element.decompose()

    def is_comment(node):
        return isinstance(node, Comment)

    # Comments are string nodes, so they are matched through the string filter.
    for comment_node in document.find_all(string=is_comment):
        comment_node.extract()

    return str(document)


def get_driver():
    """Start a headless Chrome session with a randomly chosen fingerprint.

    A user agent and a window size are drawn at random from USER_AGENTS and
    SCREEN_SIZES on every call.

    Returns:
        A new ``webdriver.Chrome`` instance. This function does not close it;
        the caller has to call ``quit()`` when done.
    """
    chosen_agent = random.choice(USER_AGENTS)
    chosen_size = random.choice(SCREEN_SIZES)

    chrome_args = [
        "--headless",
        "--disable-gpu",
        "--no-sandbox",
        f"user-agent={chosen_agent}",
        f"window-size={chosen_size}",
    ]

    chrome_options = Options()
    for chrome_arg in chrome_args:
        chrome_options.add_argument(chrome_arg)

    return webdriver.Chrome(options=chrome_options)


def scrape_url(i, url):
    """Fetch one page with headless Chrome and save its cleaned HTML as JSON.

    Args:
        i: Sequence number of the URL; used in log lines and in the output
            file name.
        url: Absolute URL of the page (``scheme://host/...``).

    Returns:
        Path of the JSON file that was written, or ``None`` when the URL is
        unusable or scraping failed. Exceptions are logged rather than
        raised, so one bad page does not abort the whole run.
    """
    # Validate before launching a browser: a missing or scheme-less URL would
    # otherwise cost a full Chrome start-up just to fail on the domain lookup.
    url_parts = url.split("/") if isinstance(url, str) else []
    if len(url_parts) < 3:
        print(f"❌ Error scraping {url}: not an absolute URL")
        return None
    domain = url_parts[2]

    driver = None
    try:
        driver = get_driver()
        print(f"\n🌐 ({i}) Opening: {url}")
        driver.get(url)
        wait = WebDriverWait(driver, 0.5)
        time.sleep(2)  # give the page time to render before reading it

        # Best effort: dismiss a cookie banner if one shows up quickly.
        try:
            cookie_button = wait.until(EC.element_to_be_clickable((By.XPATH, "//button[contains(., 'Accept')]")))
            cookie_button.click()
            print("✅ Cookie banner accepted")
            time.sleep(0.5)
        except Exception:
            # Exception (not a bare except) so Ctrl-C / SystemExit still work.
            print("⚠️ No cookie banner appeared or could not be clicked")

        raw_html = driver.page_source
        pure_html = clean_html(raw_html)

        data = {
            'url': url,
            'html': pure_html,
            'timestamp': datetime.now().isoformat()
        }

        filename = f"{OUTPUT_FOLDER}/{domain.replace('.', '_')}_{i}.json"
        with open(filename, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        print(f"💾 Saved JSON: {filename}")
        return filename

    except Exception as e:
        print(f"❌ Error scraping {url}: {e}")
        return None
    finally:
        # Only this worker's own browser is shut down here. A machine-wide
        # kill of chrome/chromedriver must not run at this point: this
        # function executes concurrently in several worker threads, and such
        # a kill would also terminate the browsers other workers still use.
        if driver is not None:
            try:
                driver.quit()
            except Exception as quit_error:
                print(f"⚠️ Could not close browser for {url}: {quit_error}")


# --- Run Multithreaded Scraper ---
MAX_WORKERS = 8  # Adjust depending on hardware resources

# Number of leading entries of filtered_urls to skip because an earlier run
# already handled them.
# NOTE(review): 53050 is specific to one past run; set it to 0 to scrape
# filtered_urls from the beginning.
DEFAULT_RESUME_OFFSET = 53050

# A kernel-level variable `i`, if one exists, overrides the default offset.
# NOTE(review): nothing in this cell assigns a module-level `i`, so this
# depends on state left behind by other cells — prefer editing
# DEFAULT_RESUME_OFFSET.
try:
    resume_offset = i
except NameError:
    resume_offset = DEFAULT_RESUME_OFFSET

pending_urls = filtered_urls[resume_offset:]
if not pending_urls:
    print(
        f"Nothing to scrape: resume offset {resume_offset} is at or past the "
        f"end of filtered_urls ({len(filtered_urls)} URLs)"
    )

with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    # Numbering is 1-based: the URL at list position k gets number k + 1.
    futures = [
        executor.submit(scrape_url, number, url)
        for number, url in enumerate(pending_urls, start=resume_offset + 1)
    ]
    for future in as_completed(futures):
        # Report anything that escaped scrape_url instead of dropping it.
        error = future.exception()
        if error is not None:
            print(f"❌ Worker raised an unexpected error: {error}")

# The pool has shut down, so no worker is using a browser any more; sweep up
# Chrome processes that were left behind.
kill_orphan_chrome_processes()
