In [None]:
# https://www.marketwatch.com/markets/us
# https://www.marketwatch.com/investing?mod=top_nav look at key industries 
# https://www.marketwatch.com/investing/stock/nvda you can look at single stocks to predict as well


In [1]:
# Yahoo Finance scraper
import requests
from bs4 import BeautifulSoup
import random
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from chromedriver_py import binary_path
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
import time
import os
from datetime import datetime, timezone, time as dt_time
import json
import threading
from urllib.parse import urlparse
import re
from google.cloud import storage
import uuid
import traceback
from google.cloud import storage
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi
import string

from dotenv import load_dotenv

# Load settings from a local .env file (if present) into the process environment.
load_dotenv()
# Point Google Cloud at the service-account key file next to the notebook.
# Note: this unconditionally overwrites any value already set in the environment.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "./svc_acc_key.json"
# Required settings: each lookup raises KeyError when the variable is missing.
MONGO_URI=os.environ["MONGO_URI"]
# NOTE(review): OPENAI_API_KEY is not referenced by any cell visible here — confirm it is still needed.
OPENAI_API_KEY=os.environ["OPENAI_API_KEY"]
STORAGE_BUCKET=os.environ["STORAGE_BUCKET"]

In [2]:

# Build the Cloud Storage client from the service-account key file, then fetch
# the bucket named by STORAGE_BUCKET; `bucket` is used as a module-level global
# by save_articles_to_storage.
storage_client = storage.Client.from_service_account_json(os.environ["GOOGLE_APPLICATION_CREDENTIALS"])
bucket = storage_client.get_bucket(STORAGE_BUCKET)


In [3]:
# Connect to MongoDB at MONGO_URI, requesting server API version '1'.
client = MongoClient(MONGO_URI, server_api=ServerApi('1'))
# get_database() is called without a name, so the database is presumably the
# default one encoded in MONGO_URI — TODO confirm the URI includes it.
db = client.get_database()

In [4]:
def get_db():
    """Return the module-level MongoDB database handle (``db``)."""
    handle = db
    return handle

In [5]:
def get_stocks_list(filename="stocks_list.txt"):
    """Read stock symbols from a text file, one symbol per line.

    Args:
        filename: Path of the symbols file. Defaults to "stocks_list.txt",
            the path this function previously hard-coded.

    Returns:
        list[str]: Symbols in file order, with surrounding whitespace removed
        and blank lines dropped.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(filename, 'r') as file:
        # A plain split("\n") produces "" for a trailing newline or a blank
        # line, which would later be scraped as an empty symbol. splitlines()
        # plus strip() also copes with "\r\n" endings and stray spaces.
        stripped_lines = (line.strip() for line in file.read().splitlines())
        return [symbol for symbol in stripped_lines if symbol]

In [6]:
def save_articles_to_storage(articles, stock_sym, timestamp, run_id):
    """Upload scraped articles to the storage bucket and index them in MongoDB.

    Each article is serialised as JSON and written to the module-level
    ``bucket`` under ``scrapes/{run_id}/{stock}/yahoo/{sanitized title}.txt``.
    One metadata document per successfully uploaded article is then inserted
    into the ``scrapes`` collection of the database returned by ``get_db()``.

    Args:
        articles: Sequence of article dicts. Each needs "title" and "link";
            "published_at" is optional and, when present, is expected in the
            form "%Y-%m-%dT%H:%M:%S.%fZ".
        stock_sym: Stock symbol; it is lower-cased for the bucket path and the
            stored document.
        timestamp: Value stored as ``scraped_at`` on every document.
        run_id: Identifier of the scraper run, used in the bucket path and
            stored on every document.

    Returns:
        None. Articles whose upload fails are logged and skipped.
    """
    if not articles:
        return

    directory = f"scrapes/{run_id}/{stock_sym.lower()}/yahoo"
    scrapes_collection = get_db()['scrapes']
    app_env = os.environ.get('APP_ENV', 'LOCAL')
    strip_punctuation = str.maketrans('', '', string.punctuation)
    scrapes = []

    for article in articles:
        sanitized_title = (
            re.sub(r'[\/:*?"<>|]', '', article['title'])
            .lower()
            .translate(strip_punctuation)
            .replace(" ", "_")
        )
        key = f"{directory}/{sanitized_title}.txt"
        new_blob = bucket.blob(key)

        try:
            new_blob.upload_from_string(json.dumps(article))
        except Exception as e:
            # This is a plain function, so there is no `self.logger`; the old
            # handler itself raised NameError instead of skipping the article.
            print(f"[scraper]: failed to save article to cloud bucket: {e}")
            continue

        scrape = {
            "stock": stock_sym.lower(),
            "scraped_at": timestamp,
            "bucket_key": key,
            "app_env": app_env,
            "source": "yahoo",
            "url": article["link"],
            "run_id": run_id,
        }

        # The scraper omits "published_at" when no date was found, so use
        # .get() rather than indexing.
        published_at = article.get('published_at')
        if published_at:
            try:
                scrape['published_at'] = datetime.strptime(published_at, "%Y-%m-%dT%H:%M:%S.%fZ")
            except ValueError:
                # Keep the record (without a publish date) instead of losing
                # the metadata for the whole batch after the uploads are done.
                print(f"[scraper]: unparseable published_at {published_at!r} for {key}")

        scrapes.append(scrape)

    # PyMongo rejects an empty document list, which happens when every upload failed.
    if scrapes:
        scrapes_collection.insert_many(scrapes)

In [None]:
def get_published_at(soup):
    """Extract the publication timestamp string from an article page.

    Looks first for a ``time`` tag with class ``byline-attr-meta-time``; if
    there is none, falls back to the ``time`` tag inside a ``div`` with class
    ``caas-attr-time-style``.

    Args:
        soup: Parsed page exposing BeautifulSoup's ``find`` interface.

    Returns:
        The value of the tag's ``datetime`` attribute, or None when no
        matching tag exists or the tag has no ``datetime`` attribute.
    """
    time_tag = soup.find('time', class_='byline-attr-meta-time')
    if not time_tag:
        time_wrapper = soup.find('div', class_='caas-attr-time-style')
        time_tag = time_wrapper.find('time') if time_wrapper else None

    if not time_tag:
        return None

    # .get() instead of ['datetime']: a time tag without the attribute now
    # yields None rather than raising KeyError and discarding the article.
    return time_tag.get('datetime')

In [None]:
def get_article_content(soup, link):
    """Return the article body as newline-joined paragraph text.

    Args:
        soup: Parsed article page exposing BeautifulSoup's ``find`` interface.
        link: URL of the article; only used in the "skipped" log line.

    Returns:
        The stripped text of every ``p`` tag inside the article body
        container, joined with newlines, or None when the container or its
        paragraphs are missing.
    """
    # The body lives in a div whose class is one of these two values.
    body = soup.find('div', class_=("caas-body", "body yf-5ef8bf"))
    if not body:
        print(f"skipped link: {link}")
        return None

    paragraphs = body.find_all('p')
    if not paragraphs:
        return None

    return '\n'.join(paragraph.get_text().strip() for paragraph in paragraphs)


In [None]:
def scrape_recent_news_for_sym(link, driver):
    """Load one article URL in the browser and extract its content.

    Args:
        link: URL of the article to open.
        driver: Selenium WebDriver used to fetch the page.

    Returns:
        A dict with "content", "title" and "link" keys, plus "published_at"
        when a publication timestamp was found; None when no article text
        could be extracted.
    """
    driver.get(link)
    # Fixed pause after navigation, presumably to let the page finish loading.
    time.sleep(3)

    page = BeautifulSoup(driver.page_source, 'html.parser')
    page_title = driver.title

    published_at = get_published_at(page)
    if not published_at:
        print(f"[scraper] No published at found for {page_title}")

    body_text = get_article_content(page, link)
    if not body_text:
        return None

    story = {"content": body_text, "title": page_title, "link": link}
    if published_at:
        story["published_at"] = published_at
    return story

In [None]:
def get_prev_close(soup):
    """Read the previous-close price from a parsed quote page.

    Args:
        soup: Parsed quote page exposing BeautifulSoup's ``find`` interface.

    Returns:
        The ``data-value`` attribute of the ``fin-streamer`` element whose
        data-field is ``regularMarketPreviousClose``, or None when that
        element is not on the page.
    """
    selector = {
        'class': 'yf-mrt107',
        'data-field': 'regularMarketPreviousClose',
    }
    streamer = soup.find('fin-streamer', selector)
    if not streamer:
        return None
    return streamer.get('data-value')

In [None]:
def get_pre_market_price_for_stock(soup):
    """Read the extended-hours price from a parsed quote page.

    The page shows a ``fin-streamer`` element with class ``price yf-1tejb6``
    whose data-field is ``preMarketPrice`` or ``postMarketPrice`` depending on
    when the scraper runs (morning vs. night), e.g. for XOM an element with
    data-field="preMarketPrice" and data-value="123.933". Both fields are
    tried, pre-market first; previously only ``postMarketPrice`` was looked
    up, so a run during pre-market hours never found a price.

    Args:
        soup: Parsed quote page exposing BeautifulSoup's ``find`` interface.

    Returns:
        The ``data-value`` attribute of the first matching element that has
        one, or None when neither field is present.
    """
    for field in ('preMarketPrice', 'postMarketPrice'):
        streamer = soup.find('fin-streamer', {
            'class': 'price yf-1tejb6',
            'data-field': field
        })
        if not streamer:
            continue

        data_value = streamer.get('data-value')
        print(f"{field} Data Value: {data_value}")
        if data_value:
            return data_value

    return None


In [None]:
def get_articles_for_stock(url, opts, svc):
    """Open a stock's quote page and collect its news links and prices.

    Args:
        url: Quote page URL to load.
        opts: Chrome options passed to ``webdriver.Chrome``.
        svc: Chrome service passed to ``webdriver.Chrome``.

    Returns:
        dict with "articles_for_stock" (list of unique article URLs in page
        order) and, when found on the page, "pre_market_price" and
        "prev_close".

    Raises:
        ValueError: If the stories container or its links are missing, or if
            any other error occurs while loading or parsing the page (the
            original exception is attached as the cause).
    """
    # Local import keeps this block self-contained; urljoin leaves absolute
    # hrefs untouched and resolves relative ones against the quote URL.
    from urllib.parse import urljoin

    try:
        with webdriver.Chrome(service=svc, options=opts) as driver:
            driver.get(url)
            print("Letting page load...")
            # Fixed pause after navigation before the HTML is read.
            time.sleep(5)

            soup = BeautifulSoup(driver.page_source, 'html.parser')

            filtered_stories = soup.find('div', class_=lambda x: x and 'filtered-stories' in x)
            if not filtered_stories:
                print(f"[scraper]: No filtered stories found for url {url}")
                raise ValueError("no stories found")

            atags = filtered_stories.find_all("a", class_=lambda x: x and 'subtle-link' in x)
            if not atags:
                print(f"[scraper] No atags found for url {url}")
                raise ValueError("no tags found")

            # Dict keys de-duplicate like a set but keep the order in which
            # the links appear on the page, so runs are deterministic.
            unique_links = {}
            for atag in atags:
                href = atag.get('href')
                if href:
                    unique_links[urljoin(url, href)] = None

            pre_market_price_for_stock = get_pre_market_price_for_stock(soup)
            prev_close_price_for_stock = get_prev_close(soup)

            res = {
                "articles_for_stock": list(unique_links),
            }

            if pre_market_price_for_stock:
                res["pre_market_price"] = pre_market_price_for_stock
                print("Found pre market price ", pre_market_price_for_stock)
            if prev_close_price_for_stock:
                res["prev_close"] = prev_close_price_for_stock
                print("Found prev_close ", prev_close_price_for_stock)

            return res
    except Exception as e:
        print(e)
        # Callers rely on ValueError; chain the original so its traceback is kept.
        raise ValueError(e) from e

In [None]:
def get_stories_for_stock(articles_for_stock, stock, opts, svc):
    """Scrape every article link for a stock using one shared browser.

    Args:
        articles_for_stock: Iterable of article URLs.
        stock: Stock symbol, used only in log output.
        opts: Chrome options passed to ``webdriver.Chrome``.
        svc: Chrome service passed to ``webdriver.Chrome``.

    Returns:
        List of story dicts returned by ``scrape_recent_news_for_sym``;
        links that fail or yield nothing are skipped. None when
        ``articles_for_stock`` is empty.
    """
    if not articles_for_stock:
        return None

    collected = []
    with webdriver.Chrome(service=svc, options=opts) as driver:
        for article_url in list(articles_for_stock):
            print(f"[scraper] scraping {article_url}, stock {stock}")
            try:
                story = scrape_recent_news_for_sym(article_url, driver)
            except Exception:
                # One bad article must not stop the rest of the batch.
                print("Failed to get story for ", article_url)
                continue
            if story:
                collected.append(story)

    return collected

In [None]:
def save_scraped_stock_data(res, stock, run_id):
    """Store a stock's scraped prices in the ``stock_prices`` collection.

    Nothing is written unless ``res`` contains both "pre_market_price" and
    "prev_close".

    Args:
        res: Result dict produced by the quote-page scrape.
        stock: Stock symbol stored on the document.
        run_id: Identifier of the scraper run stored on the document.

    Returns:
        None.
    """
    required_fields = ("pre_market_price", "prev_close")
    if not all(field in res for field in required_fields):
        return None

    prices_collection = get_db()['stock_prices']
    price_doc = {
        "pre_market_price": res["pre_market_price"],
        "prev_close": res["prev_close"],
        "created_at": datetime.now(timezone.utc),
        "stock": stock,
        "run_id": run_id,
    }
    prices_collection.insert_one(price_doc)

In [None]:
def run_scraper(stock, timestamp, run_id, opts, svc, worker_idx):
    """Scrape one stock end to end: quote page, articles, then persistence.

    Args:
        stock: Stock symbol appended to the Yahoo Finance quote URL.
        timestamp: Scrape timestamp forwarded to article storage.
        run_id: Identifier of the scraper run.
        opts: Chrome options forwarded to the browser helpers.
        svc: Chrome service forwarded to the browser helpers.
        worker_idx: Index of the worker, used only in log output.

    Returns:
        None. Nothing is saved when no article links or no stories are found.

    Raises:
        ValueError: Wrapping any exception raised by a step.
    """
    url = f"https://finance.yahoo.com/quote/{stock}"
    print(f"[scraper] getting articles for url {url}, idx {worker_idx}")

    try:
        quote_data = get_articles_for_stock(url, opts, svc)
        links = quote_data["articles_for_stock"]
        # Stories are only fetched when the quote page produced links.
        stories = get_stories_for_stock(links, stock, opts, svc) if links else None

        if not links:
            print(f"[scraper] no articles found for stock {stock}")
        elif not stories:
            print(f"No stories found for stock {stock}")
        else:
            print(f"[scraper] found {len(stories)} for stock {stock}")
            print(f"[scraper] Saving articles to storage for stock {stock}")
            save_articles_to_storage(stories, stock, timestamp, run_id)

            save_scraped_stock_data(quote_data, stock, run_id)
            print(f"[scraper] completed for stock {stock}, ts: {timestamp}")
    except Exception as e:
        print(e)
        raise ValueError(e)

In [None]:
def run_job(stock, timestamp, sema, run_id, worker_idx):
    """Thread entry point: scrape one stock while holding a semaphore slot.

    Args:
        stock: Stock symbol to scrape.
        timestamp: Scrape timestamp forwarded to ``run_scraper``.
        sema: Semaphore limiting how many jobs run at once.
        run_id: Identifier of the scraper run.
        worker_idx: Index of this worker, used only in log output.

    Returns:
        None. Failures are printed with a traceback and never propagate.
    """
    # Browser setup used to run after acquire() but outside try/finally, so a
    # failing driver install leaked the slot for good. `with` releases it on
    # every path, and setup now sits inside the try so it is reported as FAILED.
    with sema:
        try:
            opts = webdriver.ChromeOptions()

            # Optional flags, left disabled:
            # opts.add_argument("--headless")
            # opts.add_argument("--disable-gpu")
            # opts.add_argument("window-size=1920,1080")
            # opts.add_argument("--no-sandbox")
            # opts.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36")
            svc = ChromeService(ChromeDriverManager().install())

            run_scraper(stock, timestamp, run_id, opts, svc, worker_idx)
            print(f"[scraper] SUCCESS on stock {stock}")
        except Exception as e:
            print(e)
            print(traceback.format_exc())
            print(f"[scraper] FAILED on stock {stock}")

In [None]:
def start(run_id, stocks, max_threads=5, stagger_seconds=5, wait=False):
    """Launch one scraper thread per stock, limited by a semaphore.

    Args:
        run_id: Identifier of the scraper run, forwarded to every job.
        stocks: Iterable of stock symbols.
        max_threads: Number of jobs allowed to run at the same time
            (previously hard-coded to 5).
        stagger_seconds: Pause before each thread is started (previously
            hard-coded to 5).
        wait: When True, block until every thread has finished. The default
            keeps the old behaviour of returning once all threads are started.

    Returns:
        None.
    """
    sema = threading.Semaphore(value=max_threads)
    # A single timestamp is shared by all jobs of this run.
    utc_now = datetime.now(timezone.utc)

    threads = [
        threading.Thread(target=run_job, args=(stock, utc_now, sema, run_id, idx))
        for idx, stock in enumerate(stocks)
    ]

    for thread in threads:
        time.sleep(stagger_seconds)
        thread.start()

    if wait:
        for thread in threads:
            thread.join()
    

In [None]:
def _start(run_id, stocks, target_time=None):
    """Wait until a target UTC time, then launch the scraper via ``start``.

    Args:
        run_id: Identifier of the scraper run, forwarded to ``start``.
        stocks: Iterable of stock symbols, forwarded to ``start``.
        target_time: Timezone-aware datetime to wait for. Defaults to
            2024-10-08 12:00:00 UTC, the value previously hard-coded here.

    Returns:
        None.
    """
    thirty_min = 1800 # seconds

    if target_time is None:
        date_object = datetime.strptime("2024-10-08", "%Y-%m-%d").date()
        target_time = datetime.combine(date_object, dt_time(12, 0, 0), timezone.utc)

    # Get the current time in UTC
    current_time_utc = datetime.now(timezone.utc)
    print("Current time is ", current_time_utc)

    while current_time_utc < target_time:
        print("Current time is ", current_time_utc)
        # Never sleep past the target: a fixed 30-minute nap could start the
        # run up to 30 minutes late.
        remaining = (target_time - current_time_utc).total_seconds()
        time.sleep(min(thirty_min, remaining))
        current_time_utc = datetime.now(timezone.utc)

    start(run_id, stocks)
    print("Now it's after", current_time_utc)
    print(current_time_utc)

In [None]:
# stocks = get_stocks_list()
# Symbols to scrape. The hand-written list repeats BA, MRK, KO, CI and GS;
# dict.fromkeys drops the repeats while keeping first-seen order, so no stock
# is scraped and stored twice in the same run.
# NOTE(review): "BRK.B" may need to be "BRK-B" in Yahoo Finance quote URLs, and
# some entries (e.g. NYL, TIAA, USAA) may not be listed tickers — confirm.
stocks = list(dict.fromkeys([
    "WMT", "AMZN", "AAPL", "CVS", "UNH", "XOM", "BRK.B", "GOOGL", "MCK", "CVX", 
    "ABC", "COST", "MSFT", "CAH", "CI", "MPC", "PSX", "VLO", "F", "HD", "JPM", 
    "GM", "ANTM", "GE", "KR", "CMCSA", "T", "VZ", "DELL", "TGT", "SHEL", "BA", 
    "WBA", "BAC", "WFC", "C", "JCI", "JNJ", "IBM", "FMCC", "HUM", "PEP", "UPS", 
    "INTC", "PG", "ADM", "ACI", "MET", "GS", "SYY", "RTX", "HPQ", "BA", "CNC", 
    "LOW", "FDX", "MRK", "CAT", "DIS", "PFE", "LMT", "MS", "CSCO", "KO", "ABBV", 
    "ALL", "AIG", "DAL", "CHTR", "NYL", "AXP", "NFS", "BBY", "LMIC", "MRK", "TSN", 
    "UAL", "TJX", "PGR", "DE", "ABT", "GD", "KO", "NKE", "HCA", "JBL", "AAL", 
    "MDLZ", "TIAA", "CI", "PUSH", "COP", "GIS", "TMO", "BMY", "GS", "EPD", "USAA", 
    "PM", "DHR", "NWM", "RAD", "MMM", "SBUX", "QCOM", "NOC", "COF", "TRV", "ARW", 
    "HON", "DG", "DOW", "WHR", "ARMK", "PFGC", "CHSCP", "PBF"
]))

# A fixed run id is used here; uncomment the uuid line to start a fresh run.
# run_id = str(uuid.uuid4())
run_id = "c9eb6450-dafc-45c1-bce5-2cdfb81cc850"
print(f"[scraper] Starting Yahoo scraper on {len(stocks)} stocks, run id {run_id}")

In [None]:
# Launch the scraper threads immediately; the commented-out _start variant
# waits for its target time before calling start.
start(run_id, stocks)
# _start(run_id, stocks)