In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import time
import os
from dotenv import load_dotenv

from transformers import pipeline
import google.generativeai as genai
import os
from dotenv import load_dotenv
import time


# Get list of S&P 500 tickers from Wikipedia
def get_sp500_tickers():
    url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
    table = pd.read_html(url)[0]
    return table['Symbol'].tolist()

tickers = get_sp500_tickers()
print(f"Loaded {len(tickers)} tickers.")


  from .autonotebook import tqdm as notebook_tqdm


Loaded 503 tickers.


In [2]:
## HERE WE CUT DOWN TO ONLY MAGNIFICENT 7, TUNE AS YOU SEE FIT TO TRACK SELECT STOCKS
tickers = ["GOOGL","AMZN","AAPL","META","MSFT","NVDA","TSLA"]


In [3]:
# Scrape news headlines from Finviz for a given ticker
def get_headlines(ticker):
    url = f'https://finviz.com/quote.ashx?t={ticker}'
    headers = {'User-Agent': 'Mozilla/5.0'}
    headlines = []
    
    try:
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.text, 'html.parser')
        table = soup.find('table', class_='fullview-news-outer')
        
        if not table:
            return headlines
        
        rows = table.find_all('tr')
        
        for row in rows:
            try:
                time_tag = row.td.text.strip()
                link_tag = row.find('a')
                if link_tag:
                    headline = link_tag.text.strip()
                    headlines.append({
                        'ticker': ticker,
                        'time': time_tag,
                        'headline': headline
                    })
            except:
                continue
                
    except Exception as e:
        print(f"[{ticker}] Error: {e}")
    
    return headlines


In [4]:
all_headlines = []

for i, ticker in enumerate(tickers):
    print(f"{i+1}/{len(tickers)}: Scraping {ticker}...")
    headlines = get_headlines(ticker)
    all_headlines.extend(headlines)
    time.sleep(1)  # Prevents rate limiting

print(f"Total headlines collected: {len(all_headlines)}")


1/7: Scraping GOOGL...
2/7: Scraping AMZN...
3/7: Scraping AAPL...
4/7: Scraping META...
5/7: Scraping MSFT...
6/7: Scraping NVDA...
7/7: Scraping TSLA...
Total headlines collected: 700


In [5]:
from transformers import pipeline

# Load sentiment analysis pipeline
sentiment_model = pipeline("sentiment-analysis")

# Run sentiment on headlines
for h in all_headlines:
    result = sentiment_model(h['headline'])[0]
    h['sentiment_pytorch'] = result['label']
    h['score_pytorch'] = result['score']


No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cpu


In [6]:
df = pd.DataFrame(all_headlines)

# Show counts
print(df['sentiment_pytorch'].value_counts())




sentiment_pytorch
NEGATIVE    453
POSITIVE    247
Name: count, dtype: int64


In [7]:
df.head()

Unnamed: 0,ticker,time,headline,sentiment_pytorch,score_pytorch
0,GOOGL,Today 01:39PM,"In Uncertain Times, This Is What I Am Doing Wi...",NEGATIVE,0.973199
1,GOOGL,12:57PM,"AppLovin's Results Are Stupendous, But I Don't...",NEGATIVE,0.981118
2,GOOGL,12:27PM,Magnificent Seven Stocks Waver,POSITIVE,0.998908
3,GOOGL,10:33AM,Should You Buy Unity Software Stock After a 36...,NEGATIVE,0.998939
4,GOOGL,10:27AM,"Trump Trade: Trump claims no inflation, Medica...",NEGATIVE,0.967766


In [None]:
# Load API Key
load_dotenv()
gemini_api_key = os.getenv('GEMINI_API_KEY')
genai.configure(api_key=gemini_api_key)

# Load Models
gemini_model = genai.GenerativeModel('models/gemini-1.5-flash-latest')

In [9]:
# Gemini Sentiment Function
def get_gemini_sentiment(text):
    prompt = f"""
Rate the sentiment of this stock-related news headline. YOU MUST RESPOND WITH POSITIVE OR NEGATIVE, NO UNKNOWNS Respond ONLY in this format:

Sentiment: POSITIVE or NEGATIVE  
Score: a number between 0 and 1  

Headline: "{text}"
"""
    try:
        response = gemini_model.generate_content(prompt)
        content = response.text.upper()

        sentiment = "UNKNOWN"
        score = 0.5

        if "POSITIVE" in content:
            sentiment = "POSITIVE"
        elif "NEGATIVE" in content:
            sentiment = "NEGATIVE"

        # Find number in text (e.g. "Score: 0.87")
        score_candidates = [float(s) for s in content.split() if s.replace('.', '', 1).isdigit()]
        if score_candidates:
            score = max(min(score_candidates[0], 1), 0)

        return sentiment, score
    
    except Exception as e:
        print(f"Gemini error: {e}")
        return "ERROR", 0.0

In [None]:
import time

start_time = time.time()

augmented_headlines = []
batch_size = 15
headlines = all_headlines

for i in range(0, len(headlines), batch_size):
    batch = headlines[i:i+batch_size]

    for h in batch:
        # PyTorch Transformers sentiment
        torch_result = sentiment_model(h['headline'])[0]
        h['sentiment_pytorch'] = torch_result['label']
        h['score_pytorch'] = torch_result['score']

        # Gemini sentiment
        gemini_sentiment, gemini_score = get_gemini_sentiment(h['headline'])
        h['sentiment_gemini'] = gemini_sentiment
        h['score_gemini'] = gemini_score

        print(f"[{h['ticker']}] {h['headline'][:60]}... | PT: {h['sentiment_pytorch']} | Gemini: {h['sentiment_gemini']}")
        
        augmented_headlines.append(h)

    print(f"Processed {i+batch_size} / {len(headlines)} headlines — sleeping for 60 seconds...")
    time.sleep(60)

# Total runtime
end_time = time.time()
runtime_minutes = (end_time - start_time) / 60
print(f"\n✅ All done! Total runtime: {runtime_minutes:.2f} minutes")


[GOOGL] In Uncertain Times, This Is What I Am Doing With Stocks I Ow... | PT: NEGATIVE | Gemini: NEGATIVE
[GOOGL] AppLovin's Results Are Stupendous, But I Don't Think Such St... | PT: NEGATIVE | Gemini: POSITIVE
[GOOGL] Magnificent Seven Stocks Waver... | PT: POSITIVE | Gemini: NEGATIVE
[GOOGL] Should You Buy Unity Software Stock After a 36% Dip in a Yea... | PT: NEGATIVE | Gemini: NEGATIVE
[GOOGL] Trump Trade: Trump claims no inflation, Medicare wont cover ... | PT: NEGATIVE | Gemini: NEGATIVE
[GOOGL] Waymo may use interior camera data to train generative AI mo... | PT: NEGATIVE | Gemini: POSITIVE
[GOOGL] Nvidia And Google Bet Big On Quantum AI With $150M Investmen... | PT: POSITIVE | Gemini: POSITIVE
[GOOGL] As Wall Street Whipsaws, What's An Investor To Do? Start Her... | PT: NEGATIVE | Gemini: NEGATIVE
[GOOGL] Google Cloud moves deeper into open source AI with Ai2 partn... | PT: POSITIVE | Gemini: POSITIVE
[GOOGL] Alphabet: Embrace Pain As Nobody Rings A Bell At The Bottom... | PT:

In [None]:
import pandas as pd

df_sentiment = pd.DataFrame(augmented_headlines)

# Optionally save it
df_sentiment.to_csv("sp500_headlines_with_sentiment.csv", index=False)

# Preview
df_sentiment.head()


