In [1]:
!pip install googlesearch-python
import yfinance as yf
from googlesearch import search
import re
from google.colab import userdata
# from transformers import AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import pandas as pd
import torch


def fetch_stock_data(ticker, start_date, end_date):
    """Fetch historical stock data from Yahoo Finance."""
    stock_data = yf.download(ticker, start=start_date, end=end_date)
    return stock_data


def load_model():
    """Load LLM model of choice."""
    # model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.3-70B-Instruct", token = userdata.get('HF_TOKEN1')

    # tokenizer = AutoTokenizer.from_pretrained("gpt2-medium")
    #model = AutoModelForCausalLM.from_pretrained("gpt2-medium", token1 = userdata.get('HF_TOKEN3'))
    # mylogin = userdata.get('HF_TOKEN3')
    #login(token=mylogin)
    # login(userdata.get('HF_TOKEN1'))
    tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small", use_auth_token = False)
    model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small", use_auth_token = False)

    return tokenizer, model


def predict_stock_price(tokenizer, model, stock_data, ticker):
    """Use LLM to predict the next opening price based on historical data and news."""
    recent_data = stock_data.tail(5)[['Open']].copy()
    formatted_data = recent_data.to_string(index=False)
    tomorrow = (pd.to_datetime('today').date() + pd.Timedelta(days=1)).isoformat()

    # Fetch recent news related to the ticker
    try:
        news_queries = [
            f"recent news about {ticker} stock market",
            f"news affecting {ticker} stock price"
        ]
        news_results_generator = search(news_queries)
        news_results = list(news_results_generator)  # Convert generator to a list
        news_text = ""
        if news_results and len(news_results) > 0 and news_results[0].results:
            news_text = " ".join([result.snippet for result in news_results[0].results if result.snippet])
            news_text = f"Recent news: {news_text[:500]}..."  # Limit news text length
        else:
            news_text = "No relevant news found."
    except Exception as e:
        news_text = f"Error fetching news: {e}"

    # Format the prompt including news
    input_text = (
        f"Based on the past 5 days of opening stock prices for {ticker}:\n\n"
        f"{formatted_data}\n\n"
        f"And considering recent news that may affect the stock: {news_text}\n\n"
        f"What is the predicted opening price for {ticker} on {tomorrow}?\n"
        f"Respond with only a single number that you think will be the opening price for {ticker}."
    )

    # Debug print
    print("Prompt:\n", input_text)

    # Tokenize and predict
    device = next(model.parameters()).device
    inputs = tokenizer(input_text, return_tensors="pt").to(device)

    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=60)  # Increased max_length to accommodate news

    prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Extract numerical value robustly
    match = re.search(r'\d+\.\d+|\d+', prediction)
    predicted_price = float(match.group()) if match else None

    # Display result
    print("Predicted Price:", predicted_price)

    if predicted_price is not None:
        return predicted_price
    else:
        return "No numerical prediction found."


def main():
    ticker = "AAPL"  # Change to any stock symbol, for the example here I used Apple as it is one of the most well known stocks.
    start_date = "2025-05-03"
    end_date = "2025-05-08"

    print(f"Fetching stock data for {ticker}...")
    stock_data = fetch_stock_data(ticker, start_date, end_date)

    print("Loading LLM model...")
    tokenizer, model = load_model()

    print("Predicting stock price...")
    prediction = predict_stock_price(tokenizer, model, stock_data, ticker)

    print(f"Predicted next opening price for {ticker}: {prediction}")


if __name__ == "__main__":
    main()

Collecting googlesearch-python
  Downloading googlesearch_python-1.3.0-py3-none-any.whl.metadata (3.4 kB)
Downloading googlesearch_python-1.3.0-py3-none-any.whl (5.6 kB)
Installing collected packages: googlesearch-python
Successfully installed googlesearch-python-1.3.0
Fetching stock data for AAPL...
YF.download() has changed argument auto_adjust default to True


[*********************100%***********************]  1 of 1 completed
ERROR:yfinance:
1 Failed download:
ERROR:yfinance:['AAPL']: YFRateLimitError('Too Many Requests. Rate limited. Try after a while.')


Loading LLM model...




tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Predicting stock price...
Prompt:
 Based on the past 5 days of opening stock prices for AAPL:

Empty DataFrame
Columns: [(Open, AAPL)]
Index: []

And considering recent news that may affect the stock: Error fetching news: 'str' object has no attribute 'results'

What is the predicted opening price for AAPL on 2025-05-09?
Respond with only a single number that you think will be the opening price for AAPL.
Predicted Price: None
Predicted next opening price for AAPL: No numerical prediction found.
