In [None]:
!pip install yfinance langchain faiss-cpu sentence-transformers openai langchain-openai langchain-community

Collecting yfinance
  Downloading yfinance-0.2.65-py2.py3-none-any.whl.metadata (5.8 kB)
Collecting langchain
  Downloading langchain-0.3.27-py3-none-any.whl.metadata (7.8 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-5.1.0-py3-none-any.whl.metadata (16 kB)
Collecting openai
  Downloading openai-1.101.0-py3-none-any.whl.metadata (29 kB)
Collecting langchain-openai
  Downloading langchain_openai-0.3.32-py3-none-any.whl.metadata (2.4 kB)
Collecting langchain-community
  Downloading langchain_community-0.3.27-py3-none-any.whl.metadata (2.9 kB)
Collecting multitasking>=0.0.7 (from yfinance)
  Downloading multitasking-0.0.12.tar.gz (19 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting frozendict>=2.3.4 

In [None]:
from google.colab import userdata
import os

# Export the Finnhub key into the process environment: every get_finnhub_*
# helper in this notebook reads it back with os.getenv('FINNHUB_API_KEY').
# NOTE(review): presumably userdata.get() reads a Colab notebook secret, so this
# cell only works inside Colab with that secret defined — confirm how a missing
# secret is reported before relying on the helpers' "key not set" fallbacks.
os.environ['FINNHUB_API_KEY'] = userdata.get('FINNHUB_API_KEY')

In [None]:
!pip install yfinance finnhub-python langchain langchain-google-genai sentence-transformers faiss-cpu

Collecting finnhub-python
  Downloading finnhub_python-2.4.24-py3-none-any.whl.metadata (9.2 kB)
Collecting langchain-google-genai
  Downloading langchain_google_genai-2.1.9-py3-none-any.whl.metadata (7.2 kB)
Collecting filetype<2.0.0,>=1.2.0 (from langchain-google-genai)
  Downloading filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting google-ai-generativelanguage<0.7.0,>=0.6.18 (from langchain-google-genai)
  Downloading google_ai_generativelanguage-0.6.18-py3-none-any.whl.metadata (9.8 kB)
Collecting grpcio-status<2.0.0,>=1.33.2 (from google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.10.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,!=2.9.*,<3.0.0,>=1.34.1->google-ai-generativelanguage<0.7.0,>=0.6.18->langchain-google-genai)
  Downloading grpcio_status-1.74.0-py3-none-any.whl.metadata (1.1 kB)
Downloading finnhub_python-2.4.24-py3-none-any.whl (11 kB)
Downloading langchain_google_genai-2.1.9-py3-none-any.whl (49 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import yfinance as yf
import finnhub
import os
import re
from datetime import datetime, timedelta
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.text_splitter import CharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.llms import HuggingFacePipeline
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
from langchain.docstore.document import Document

# Step 1: Static seed documents for RAG. Live news/earnings text is appended to
# the same vector store later by update_vectorstore_with_info().
static_documents = [
    "Apple Inc. (AAPL) is a tech giant known for iPhones and Macs.",
    "Tesla Inc. (TSLA) focuses on electric vehicles. Elon Musk is the CEO.",
    "Amazon.com Inc. (AMZN) dominates e-commerce and cloud (AWS).",
    "Reliance Industries Limited (RELIANCE.NS) is an Indian conglomerate in energy, retail, and telecom.",
    "Tata Consultancy Services (TCS.NS) is an Indian IT services and consulting company.",
    "HDFC Bank Limited (HDFCBANK.NS) is a leading private sector bank in India.",
    "Infosys Limited (INFY.NS) is an Indian multinational corporation providing business consulting and IT services.",
    # Add more base info if needed for other global/Indian stocks
]

# Step 2: Split static documents into chunks.
# CharacterTextSplitter only cuts at its separator, and the default is a blank
# line ("\n\n"). The dynamic text indexed later is joined with single newlines,
# so with the default it would always be stored as one oversized chunk. Splitting
# on "\n" lets chunk_size apply per line group; the one-line static documents
# above are unaffected.
text_splitter = CharacterTextSplitter(separator="\n", chunk_size=500, chunk_overlap=50)
static_docs = text_splitter.create_documents(static_documents)

# Step 3: Create embeddings and the initial vector store from the static chunks.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vectorstore = FAISS.from_documents(static_docs, embeddings)

# Step 4: Set up local LLM with Phi-2
model_name = "microsoft/phi-2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
text_gen_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=150,
    # Greedy decoding. temperature/top_p only apply when sampling, and passing
    # them with do_sample=False triggers the "generation flags are not valid"
    # warning on every call, so they are omitted.
    do_sample=False,
    eos_token_id=tokenizer.eos_token_id,
    # Prefer the tokenizer's own pad token; fall back to EOS only when none is
    # defined. `is not None` matters because a pad id of 0 is falsy.
    pad_token_id=(
        tokenizer.pad_token_id
        if tokenizer.pad_token_id is not None
        else tokenizer.eos_token_id
    ),
)
# Wrap the transformers pipeline so LangChain chains can call it as an LLM.
llm = HuggingFacePipeline(pipeline=text_gen_pipeline)

# Step 5: Prompt for the "stuff" chain. {context} receives the retrieved chunks
# and {question} the user query (which chat() prefixes with live data).
# chat() strips everything up to the final instruction line from the model
# output, so that line must stay in sync with the post-processing there.
prompt_template = (
    "Use the following context and any real-time data or latest news to answer the question about stocks. "
    "Be concise, factual, and base your answer on the provided information.\n"
    "Context: {context}\n"
    "Question: {question}\n"
    "Answer in 1-3 sentences only:"
)
PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

# Step 6: Retrieval-augmented QA chain: fetch the 3 nearest chunks from the
# vector store and stuff them into PROMPT for the local LLM.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever(search_kwargs=dict(k=3)),
    chain_type_kwargs=dict(prompt=PROMPT),
)

def _first_available(info, *keys):
    """Return the first non-None value among `keys` in `info`, else "N/A"."""
    for key in keys:
        value = info.get(key)
        if value is not None:
            return value
    return "N/A"


def get_stock_data(ticker):
    """Fetch a quote snapshot for `ticker` via yfinance.

    Args:
        ticker: Symbol string passed to yf.Ticker (e.g. "AAPL", "RELIANCE.NS").

    Returns:
        dict with keys "price", "change", "market_cap" and "pe_ratio"; any
        field that is missing or None in the yfinance info dict is reported as
        "N/A". On any exception, a dict with the single key "error" holding
        the message instead.
    """
    try:
        info = yf.Ticker(ticker).info or {}
        return {
            # Fall back to regularMarketPrice when currentPrice is absent.
            "price": _first_available(info, "currentPrice", "regularMarketPrice"),
            "change": _first_available(info, "regularMarketChangePercent"),
            "market_cap": _first_available(info, "marketCap"),
            "pe_ratio": _first_available(info, "trailingPE"),
        }
    except Exception as e:
        # Best-effort: callers check for the "error" key rather than catching.
        return {"error": f"Error fetching data: {str(e)}"}

def get_finnhub_news(ticker):
    """Return a text digest of recent Finnhub company news for `ticker`.

    Queries the last 30 days and formats at most the first five returned items
    as "- headline: summary (Source: ..., Date: YYYY-MM-DD)" lines under a
    "Latest news for <ticker>:" header.

    Returns a plain message instead when the FINNHUB_API_KEY environment
    variable is unset, when no news comes back, or when any exception occurs
    (the exception text is embedded in the message).

    NOTE(review): the previous comment claimed Indian tickers such as
    RELIANCE.NS are supported; that cannot be verified from this file —
    confirm against Finnhub's coverage.
    """
    try:
        token = os.getenv('FINNHUB_API_KEY')
        if not token:
            return "Finnhub API key not set."

        client = finnhub.Client(api_key=token)
        window_end = datetime.now().strftime('%Y-%m-%d')
        window_start = (datetime.now() - timedelta(days=30)).strftime('%Y-%m-%d')
        articles = client.company_news(ticker, _from=window_start, to=window_end)
        if not articles:
            return "No recent news available."

        header = "Latest news for {}:\n".format(ticker)
        bullet_lines = []
        for article in articles[:5]:
            headline = article['headline']
            summary = article['summary']
            source = article['source']
            # 'datetime' is passed to datetime.fromtimestamp, i.e. rendered in local time.
            published = datetime.fromtimestamp(article['datetime']).strftime('%Y-%m-%d')
            bullet_lines.append(f"- {headline}: {summary} (Source: {source}, Date: {published})")
        return header + "\n".join(bullet_lines)
    except Exception as exc:
        return f"Error fetching Finnhub news: {str(exc)}"

def _format_earnings_row(item):
    """Render one Finnhub earnings record as a single bullet line."""
    def shown(key):
        # Finnhub may send explicit nulls; show them as "N/A" rather than "None".
        value = item.get(key)
        return "N/A" if value is None else value

    # 'surprise' is the absolute EPS difference; the percentage is in
    # 'surprisePercent', so that is the field that belongs next to "%".
    surprise_pct = item.get('surprisePercent')
    surprise_text = "N/A" if surprise_pct is None else f"{surprise_pct}%"
    return (
        f"- Period: {item['period']}, Actual EPS: {shown('actual')}, "
        f"Estimate: {shown('estimate')}, Surprise: {surprise_text}"
    )


def get_finnhub_earnings(ticker):
    """Return a text summary of the last four quarterly earnings for `ticker`.

    Args:
        ticker: Symbol string passed to Finnhub's company_earnings call.

    Returns:
        "" when the FINNHUB_API_KEY environment variable is unset; a
        "Recent quarterly earnings for <ticker>:" header followed by one line
        per quarter when data is returned; "No recent earnings data available."
        when the response is empty; or an "Error fetching earnings: ..."
        message if any exception occurs.
    """
    try:
        api_key = os.getenv('FINNHUB_API_KEY')
        if not api_key:
            return ""
        finnhub_client = finnhub.Client(api_key=api_key)
        earnings = finnhub_client.company_earnings(ticker, limit=4)  # Last 4 quarters
        if not earnings:
            return "No recent earnings data available."
        header = "Recent quarterly earnings for {}:\n".format(ticker)
        return header + "\n".join(_format_earnings_row(item) for item in earnings)
    except Exception as e:
        # Best-effort: surface the failure as text so the chat loop keeps running.
        return f"Error fetching earnings: {str(e)}"

def get_finnhub_dividends(ticker):
    """Return a text summary of dividends Finnhub reports for `ticker`.

    Queries the last 365 days and lists at most the first five returned
    records (date, amount, currency) under a "Recent dividends for <ticker>:"
    header.

    Returns "" when the FINNHUB_API_KEY environment variable is unset,
    "No recent dividends available." on an empty response, and an
    "Error fetching dividends: ..." message if any exception occurs.

    NOTE(review): the captured run in this notebook received a 403
    ("You don't have access to this resource") from this call; the endpoint
    may need a different Finnhub plan — confirm.
    """
    try:
        token = os.getenv('FINNHUB_API_KEY')
        if not token:
            return ""

        client = finnhub.Client(api_key=token)
        window_end = datetime.now().strftime('%Y-%m-%d')
        window_start = (datetime.now() - timedelta(days=365)).strftime('%Y-%m-%d')
        payouts = client.stock_dividends(ticker, _from=window_start, to=window_end)
        if not payouts:
            return "No recent dividends available."

        header = "Recent dividends for {}:\n".format(ticker)
        rows = []
        for payout in payouts[:5]:
            rows.append(
                f"- Date: {payout['date']}, Amount: {payout['amount']}, Currency: {payout.get('currency', 'N/A')}"
            )
        return header + "\n".join(rows)
    except Exception as exc:
        return f"Error fetching dividends: {str(exc)}"

def get_finnhub_splits(ticker):
    """Return a text summary of stock splits Finnhub reports for `ticker`.

    Queries the last 1825 days (about five years) and lists every returned
    split (date, from-factor, to-factor) under a
    "Recent stock splits for <ticker>:" header.

    Args:
        ticker: Symbol string passed to the Finnhub client.

    Returns:
        "" when the FINNHUB_API_KEY environment variable is unset;
        "No recent stock splits available." on an empty response; or an
        "Error fetching splits: ..." message if any exception occurs.
    """
    try:
        api_key = os.getenv('FINNHUB_API_KEY')
        if not api_key:
            return ""
        finnhub_client = finnhub.Client(api_key=api_key)
        to_date = datetime.now().strftime('%Y-%m-%d')
        from_date = (datetime.now() - timedelta(days=1825)).strftime('%Y-%m-%d')  # Last 5 years
        # The client method is `stock_splits` (plural); the singular name does
        # not exist and raised AttributeError on every call.
        splits = finnhub_client.stock_splits(ticker, _from=from_date, to=to_date)
        if not splits:
            return "No recent stock splits available."
        header = "Recent stock splits for {}:\n".format(ticker)
        return header + "\n".join(
            f"- Date: {item['date']}, From: {item['fromFactor']}, To: {item['toFactor']}"
            for item in splits
        )
    except Exception as e:
        # Best-effort: surface the failure as text so the chat loop keeps running.
        return f"Error fetching splits: {str(e)}"

def update_vectorstore_with_info(info_str):
    """Chunk `info_str` with the shared text splitter and index it.

    The chunks are added to the module-level `vectorstore` so later retrievals
    can surface them. An empty or otherwise falsy `info_str` is ignored.
    Returns None.
    """
    if not info_str:
        return
    vectorstore.add_documents(text_splitter.create_documents([info_str]))

# Upper-case abbreviations that look like tickers but are ordinary finance or
# general vocabulary. Known tickers passed to _detect_ticker override this list.
_NON_TICKER_WORDS = frozenset({
    "PE", "EPS", "CEO", "CFO", "IPO", "ETF", "USD", "INR", "EUR", "GDP",
    "NSE", "BSE", "NYSE", "NASDAQ", "AI", "IT", "EV", "US", "UK", "USA",
    "AWS", "ROE", "ROI", "YOY", "QOQ", "FAQ", "OK",
})

# A 2-10 letter word with an optional two-letter exchange suffix (.NS, .BO, ...).
_TICKER_TOKEN = re.compile(r'\b[A-Za-z]{2,10}(?:\.[A-Za-z]{2})?\b')


def _detect_ticker(query, known_tickers):
    """Return the ticker mentioned in `query` (upper-cased), or None.

    The query is scanned in its original case. Upper-casing it first would
    make every ordinary word look like a ticker. A known ticker in any case
    wins; otherwise the first token the user typed in capitals that is not a
    common abbreviation is used.
    """
    tokens = _TICKER_TOKEN.findall(query)
    for token in tokens:
        if token.upper() in known_tickers:
            return token.upper()
    for token in tokens:
        if token == token.upper() and token not in _NON_TICKER_WORDS:
            return token
    return None


# Main chat loop
def chat():
    """Run the interactive stock chatbot until the user types 'exit'.

    For each query: detect a ticker, fetch live quote data plus Finnhub news,
    earnings, dividends and splits for it, index that text in the vector store,
    then ask the RAG chain the query prefixed with the fetched data and print a
    cleaned-up answer. Reads from input() and writes to stdout; returns None.
    """
    print("Stock Market Chatbot (type 'exit' to quit)")
    # Well-known symbols recognized regardless of how the user cases them.
    potential_tickers = ['AAPL', 'TSLA', 'AMZN', 'GOOG', 'MSFT', 'RELIANCE.NS', 'TCS.NS', 'HDFCBANK.NS', 'INFY.NS']
    while True:
        query = input("You: ")
        if query.strip().lower() == 'exit':
            break
        if not query.strip():
            # Nothing to answer; avoid running the model on an empty question.
            continue

        ticker = _detect_ticker(query, potential_tickers)

        # Fetch real-time data, news, earnings, dividends, splits if ticker detected
        stock_data_str = ""
        additional_info = ""
        if ticker:
            data = get_stock_data(ticker)
            if "error" not in data:
                stock_data_str = f"Current data for {ticker}: Price ${data['price']}, Change {data['change']}%, Market Cap ${data['market_cap']}, PE Ratio {data['pe_ratio']}.\n"
            news_str = get_finnhub_news(ticker)
            earnings_str = get_finnhub_earnings(ticker)
            dividends_str = get_finnhub_dividends(ticker)
            splits_str = get_finnhub_splits(ticker)
            additional_info = news_str + "\n" + earnings_str + "\n" + dividends_str + "\n" + splits_str
            stock_data_str += additional_info
            # Dynamically update vectorstore with latest info for RAG
            update_vectorstore_with_info(additional_info)
        else:
            stock_data_str = "No specific stock ticker detected. Please include one (e.g., AAPL or RELIANCE.NS) for real-time data."

        # Run RAG query with real-time data prepended. The newline keeps the
        # last data sentence from running straight into the user's question.
        full_query = stock_data_str + "\n" + query
        response = qa_chain.run(full_query)

        # Post-process response to ensure conciseness and remove artifacts:
        # drop the echoed prompt, keep the first few lines, and cut any
        # follow-up "Question:" the model invents.
        if 'Answer in 1-3 sentences only:' in response:
            response = response.split('Answer in 1-3 sentences only:')[-1].strip()
        lines = response.split('\n')
        clean_response = ' '.join(lines[:5]).strip()  # Limit to avoid repetition
        if 'Question:' in clean_response:
            clean_response = clean_response.split('Question:')[0].strip()

        print("Bot:", clean_response)

# Start the interactive loop only when this code is executed directly
# (__name__ is "__main__"), not when it is imported as a module.
if __name__ == "__main__":
    chat()

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Device set to use cpu
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Stock Market Chatbot (type 'exit' to quit)
You: Which has higher PE: TSLA or RELIANCE.NS?


ERROR:yfinance:HTTP Error 404: 
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Bot: RELIANCE.NS has a higher PE ratio than TSLA.
You: Any corporate actions like splits for INFY.NS?


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Bot: No corporate actions are scheduled for INFY.NS.
You: Quarterly results for RELIANCE.NS?


ERROR:yfinance:HTTP Error 404: 
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Bot: No recent news available. No recent earnings data available. Error fetching dividends: FinnhubAPIException(status_code: 403): You don't have access to this resource. Error fetching splits: 'Client' object has no attribute'stock_split'
