In [12]:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from io import StringIO
from concurrent.futures import ThreadPoolExecutor, as_completed
from lxml import etree
from sklearn.preprocessing import MinMaxScaler
import time
import numpy as np
import pandas as pd
import re
import requests

In [2]:
def scrape_table(table):
    """Collect the stripped cell text of every non-header row of a table.

    Parameters
    ----------
    table : object exposing ``find_all(tag)``
        Must return the table's ``tr`` rows; each row in turn must return its
        ``td`` cells, and each cell must expose a ``text`` attribute
        (presumably a BeautifulSoup ``<table>`` tag — confirm against callers).

    Returns
    -------
    list[list[str]]
        One inner list per row after the first, holding that row's cell text
        with surrounding whitespace removed. Rows without ``td`` cells yield
        an empty inner list.
    """
    # The first <tr> is treated as the header row and is dropped.
    body_rows = table.find_all("tr")[1:]
    return [
        [td.text.strip() for td in tr.find_all("td")]
        for tr in body_rows
    ]

In [3]:
### Code for test scraping with S&P500 ###
# Fetch the S&P 500 listing table from stockanalysis.com with headless Chrome
# and load it into the DataFrame `df`.
chrome_options = Options()
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
chrome_options.add_argument("--headless")
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option("useAutomationExtension", False)

driver = webdriver.Chrome(options=chrome_options)
try:
    driver.get("https://stockanalysis.com/list/sp-500-stocks/")
    # Capture the raw HTML so parsing can happen after the browser is closed.
    table = driver.find_element(By.ID, "main-table").get_attribute("outerHTML")
finally:
    # Always shut the browser down, even when the page load or the element
    # lookup fails; otherwise a failed run leaves a Chrome process behind.
    driver.quit()

# read_html returns a list of tables; the captured fragment holds exactly one.
# The "No." column is dropped without mutating the parsed frame in place.
df = pd.read_html(StringIO(table))[0].drop(columns="No.")

In [31]:
# Number of leading rows of the S&P 500 table to scrape.
# NOTE(review): "top k by market cap" assumes the source table is already
# sorted by market cap — confirm against the site.
k = 100
top_k_stocks_by_mkt_cap = df["Symbol"].iloc[:k].tolist()

## Serialized Scraping

In [53]:
# Scrape the rating fields for every selected ticker, one page at a time,
# reusing a single headless Chrome session.
chrome_options = Options()
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
chrome_options.add_argument("--headless")
chrome_options.add_argument(
    "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36")
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option("useAutomationExtension", False)

scrape_results_list = []
web_driver_wait_max_time = 2   # Duration for each scraping wait

# Result key -> absolute XPath of the element whose text is stored under it.
# Insertion order is the order in which the fields are read from the page.
field_xpaths = {
    "avg_analyst_rating": "/html/body/div[1]/div[1]/div/section/div[2]/div[1]/div[2]/div[3]/div[1]/div[1]/div/figcaption/p",
    "stock_target_advisor_analysis": "/html/body/div[1]/div[1]/div/section/div[2]/div[1]/div[2]/div[3]/div[1]/div[3]/div/figcaption/p",
    "avg_user_rating": "/html/body/div[1]/div[1]/div/section/div[2]/div[1]/div[2]/div[3]/div[1]/div[4]/div/figcaption/p",
    "avg_analyst_target": "/html/body/div[1]/div[1]/div/section/div[2]/div[2]/div[1]/div[1]/div[3]/div/div[1]/div[1]",
    "avg_upside_potential": "/html/body/div[1]/div[1]/div/section/div[2]/div[2]/div[1]/div[1]/div[3]/div/div[1]/div[2]/span",
}

driver = webdriver.Chrome(options=chrome_options)
try:
    for stock_ticker in top_k_stocks_by_mkt_cap:
        # The URL is built with '-' in place of '.' (e.g. BRK.B -> BRK-B).
        stock_ticker = stock_ticker.replace(".", "-")
        chunk_collector = {"stock_ticker": stock_ticker}
        try:
            # NOTE(review): scrape_stock in the parallel section requests
            # /NYE/ instead of /NYSE/ — confirm which segment the site expects.
            driver.get(f"https://www.stocktargetadvisor.com/stock/USA/NYSE/{stock_ticker}")
            for field_name, xpath in field_xpaths.items():
                chunk_collector[field_name] = WebDriverWait(driver, web_driver_wait_max_time).until(
                    EC.presence_of_element_located((By.XPATH, xpath))
                ).text
        except Exception as exc:
            # Catch Exception rather than using a bare `except:` so that
            # KeyboardInterrupt can still stop the loop; a ticker that fails
            # on any field is skipped entirely and the cause is reported.
            print(f"Error scraping for {stock_ticker}: {type(exc).__name__}")
        else:
            # Only tickers for which every field was found are recorded.
            scrape_results_list.append(chunk_collector)
finally:
    # Release the browser whether the loop finished, failed or was interrupted.
    driver.quit()

Error scraping for BRK-B


## Parallelized Scraping

In [32]:
def scrape_stock(stock_ticker):
    """Scrape the rating fields for a single ticker in its own Chrome session.

    A fresh headless Chrome driver is started for the call and is always quit
    before returning, so the function can be run from several threads without
    sharing a browser.

    Parameters
    ----------
    stock_ticker : str
        Ticker symbol; any '.' is replaced by '-' before the URL is built.

    Returns
    -------
    dict or None
        On success, a dict with the keys ``stock_ticker``,
        ``avg_analyst_rating``, ``stock_target_advisor_analysis``,
        ``avg_user_rating``, ``avg_analyst_target`` and
        ``avg_upside_potential``, holding the element text as scraped.
        ``None`` if loading the page or locating any field raised; the error
        is printed in that case.

    NOTE(review): this function requests ``/NYE/`` whereas the serialized
    scraping cell requests ``/NYSE/`` — confirm which segment the site expects.
    """
    # (result key, absolute XPath) pairs, listed in the order they are read.
    field_locations = (
        ("avg_analyst_rating",
         "/html/body/div[1]/div[1]/div/section/div[2]/div[1]/div[2]/div[3]/div[1]/div[1]/div/figcaption/p"),
        ("stock_target_advisor_analysis",
         "/html/body/div[1]/div[1]/div/section/div[2]/div[1]/div[2]/div[3]/div[1]/div[3]/div/figcaption/p"),
        ("avg_user_rating",
         "/html/body/div[1]/div[1]/div/section/div[2]/div[1]/div[2]/div[3]/div[1]/div[4]/div/figcaption/p"),
        ("avg_analyst_target",
         "/html/body/div[1]/div[1]/div/section/div[2]/div[2]/div[1]/div[1]/div[3]/div/div[1]/div[1]"),
        ("avg_upside_potential",
         "/html/body/div[1]/div[1]/div/section/div[2]/div[2]/div[1]/div[1]/div[3]/div/div[1]/div[2]/span"),
    )

    browser_options = Options()
    for argument in (
        "--disable-blink-features=AutomationControlled",
        "--headless",
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36",
    ):
        browser_options.add_argument(argument)
    browser_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    browser_options.add_experimental_option("useAutomationExtension", False)
    browser = webdriver.Chrome(options=browser_options)

    web_driver_wait_max_time = 5   # Duration for each scraping wait

    stock_ticker = stock_ticker.replace(".", "-")
    try:
        browser.get(f"https://www.stocktargetadvisor.com/stock/USA/NYE/{stock_ticker}")
        scraped_text = {}
        for key, xpath in field_locations:
            element = WebDriverWait(browser, web_driver_wait_max_time).until(
                EC.presence_of_element_located((By.XPATH, xpath))
            )
            scraped_text[key] = element.text
        # The record is assembled only after every field has been located.
        record = {"stock_ticker": stock_ticker}
        record.update(scraped_text)
        return record
    except Exception as e:
        print(f"Error scraping for {stock_ticker}: {e}")
        return None
    finally:
        browser.quit()

def min_max_normalize(series):
    """Rescale a pandas Series with scikit-learn's ``MinMaxScaler``.

    Parameters
    ----------
    series : pandas.Series
        Numeric values to rescale.

    Returns
    -------
    numpy.ndarray
        One-dimensional array of the scaled values, in the same order as
        ``series``. The scaler is used with its default settings.
    """
    # The scaler works on 2-D input, so present the values as a single column.
    column = series.values.reshape(-1, 1)
    scaled_column = MinMaxScaler().fit_transform(column)
    return scaled_column.flatten()

In [33]:
max_workers = 5 # Number of concurrent threads

# Scrape every selected ticker on a thread pool. Results are collected in
# completion order, and tickers whose scrape returned None are left out.
results = []
with ThreadPoolExecutor(max_workers=max_workers) as executor:
    futures = [executor.submit(scrape_stock, ticker) for ticker in top_k_stocks_by_mkt_cap]
    for finished in as_completed(futures):
        outcome = finished.result()
        if outcome is not None:
            results.append(outcome)

Error scraping for BRK-B: Message: 
Stacktrace:
	GetHandleVerifier [0x00007FF65A9ECF45+75717]
	GetHandleVerifier [0x00007FF65A9ECFA0+75808]
	(No symbol) [0x00007FF65A7B8F9A]
	(No symbol) [0x00007FF65A80F4C6]
	(No symbol) [0x00007FF65A80F77C]
	(No symbol) [0x00007FF65A862577]
	(No symbol) [0x00007FF65A8373BF]
	(No symbol) [0x00007FF65A85F39C]
	(No symbol) [0x00007FF65A837153]
	(No symbol) [0x00007FF65A800421]
	(No symbol) [0x00007FF65A8011B3]
	GetHandleVerifier [0x00007FF65ACED71D+3223453]
	GetHandleVerifier [0x00007FF65ACE7CC2+3200322]
	GetHandleVerifier [0x00007FF65AD05AF3+3322739]
	GetHandleVerifier [0x00007FF65AA06A1A+180890]
	GetHandleVerifier [0x00007FF65AA0E11F+211359]
	GetHandleVerifier [0x00007FF65A9F5294+109332]
	GetHandleVerifier [0x00007FF65A9F5442+109762]
	GetHandleVerifier [0x00007FF65A9DBA59+4825]
	BaseThreadInitThunk [0x00007FF973B2E8D7+23]
	RtlUserThreadStart [0x00007FF9754FC5DC+44]



In [34]:
# One row per successfully scraped ticker.
res = pd.DataFrame(results)

# Turn the scraped upside text into a fraction, e.g. "(+26.30%)" -> 0.263:
# strip '(', ')' and '%', parse the remainder as a float, divide by 100.
upside_text = res['avg_upside_potential'].str.replace(r'[\(\)%]', '', regex=True)
res['avg_upside_potential_decimal'] = upside_text.astype(float) / 100

# Rescale the fractions across the scraped tickers with MinMaxScaler.
res['upside_normalized'] = min_max_normalize(res['avg_upside_potential_decimal'])

In [44]:
# Points awarded for each average analyst rating label.
avg_analyst_rating_smart_score = {
    'Strong Buy': 5,
    'Buy': 3.3,
    'Hold': 2.5,
    'Sell': 1.7,
    'Under-perform': 1.7,
    'Strong Sell': 0.6,
}

# Points awarded for each Stock Target Advisor analysis label.
stock_target_advisor_smart_score = {
    'Very Bullish': 5,
    'Bullish': 4.2,
    'Slightly Bullish': 3.3,
    'Neutral': 2.5,
    'Slightly Bearish': 2,
    'Bearish': 1.3,
    'Very Bearish': 0.5,
}

# Average user ratings use the same labels and points as analyst ratings;
# an independent copy is kept so either table can be tuned on its own.
avg_user_rating_smart_score = dict(avg_analyst_rating_smart_score)

## Smart Score Parameter Tunings

In [45]:
# Relative weight of each component in the composite smart score.
# Tune these freely, but keep their sum at 1: the scoring cell multiplies the
# weighted sum of 0–1 components by 10, so any other total moves the score
# off its 0–10 scale.
avg_analyst_rating_smart_score_weightage = 0.25
stock_target_advisor_score_weightage = 0.25
avg_user_rating_score_weightage = 0.25
upside_normalized_weightage = 0.25

# Fail fast on a mistuned set of weights instead of silently producing
# scores on a different scale. A tolerance is used because of float rounding.
_smart_score_weight_total = (
    avg_analyst_rating_smart_score_weightage
    + stock_target_advisor_score_weightage
    + avg_user_rating_score_weightage
    + upside_normalized_weightage
)
assert abs(_smart_score_weight_total - 1.0) < 1e-9, (
    f"smart score weights must sum to 1, got {_smart_score_weight_total}"
)

In [46]:
# Translate each rating label into points and divide by the 5-point maximum,
# giving component scores of at most 1.
# NOTE(review): a label missing from a lookup table maps to NaN, which then
# makes that ticker's smart_score NaN — confirm the tables cover every label.
res['avg_analyst_rating_smart_score'] = res['avg_analyst_rating'].map(avg_analyst_rating_smart_score).div(5)
res['stock_target_advisor_score'] = res['stock_target_advisor_analysis'].map(stock_target_advisor_smart_score).div(5)
res['avg_user_rating_score'] = res['avg_user_rating'].map(avg_user_rating_smart_score).div(5)

# Weighted sum of the four components, scaled by 10 and kept to one decimal.
weighted_components = (
    res['avg_analyst_rating_smart_score'] * avg_analyst_rating_smart_score_weightage
    + res['stock_target_advisor_score'] * stock_target_advisor_score_weightage
    + res['avg_user_rating_score'] * avg_user_rating_score_weightage
    + res['upside_normalized'] * upside_normalized_weightage
)
res['smart_score'] = (weighted_components * 10).round(1)

In [47]:
# Display the 20 tickers with the highest smart score, best first.
ranked_by_smart_score = res.sort_values(by='smart_score', ascending=False)
ranked_by_smart_score.head(20)

Unnamed: 0,stock_ticker,avg_analyst_rating,stock_target_advisor_analysis,avg_user_rating,avg_analyst_target,avg_upside_potential,avg_upside_potential_decimal,upside_normalized,avg_analyst_rating_smart_score,stock_target_advisor_score,avg_user_rating_score,smart_score
55,QCOM,Strong Buy,Bullish,Strong Buy,USD 186.13,(+26.30%),0.263,0.538707,1.0,0.84,1.0,8.4
5,GOOGL,Strong Buy,Bullish,Strong Buy,USD 206.69,(+20.96%),0.2096,0.485366,1.0,0.84,1.0,8.3
52,ADBE,Strong Buy,Slightly Bullish,Strong Buy,USD 546.30,(+31.85%),0.3185,0.594146,1.0,0.66,1.0,8.1
90,ANET,Strong Buy,Slightly Bullish,Strong Buy,USD 116.15,(+25.48%),0.2548,0.530516,1.0,0.66,1.0,8.0
64,TMO,Strong Buy,Slightly Bullish,Buy,USD 635.46,(+58.67%),0.5867,0.862052,1.0,0.66,0.66,8.0
34,NOW,Strong Buy,Very Bullish,Buy,"USD 1,142.42",(+12.47%),0.1247,0.400559,1.0,1.0,0.66,7.7
46,ISRG,Strong Buy,Slightly Bullish,Strong Buy,USD 615.84,(+13.48%),0.1348,0.410648,1.0,0.66,1.0,7.7
28,UNH,Strong Buy,Slightly Bearish,Buy,USD 511.68,(+72.48%),0.7248,1.0,1.0,0.4,0.66,7.6
4,MSFT,Strong Buy,Slightly Bullish,Strong Buy,USD 504.28,(+10.86%),0.1086,0.384477,1.0,0.66,1.0,7.6
77,LOW,Strong Buy,Bullish,Buy,USD 280.76,(+25.48%),0.2548,0.530516,1.0,0.84,0.66,7.6
