In [1]:
# Cell 1: Install Dependencies

# Install the core LangChain libraries
!pip install langchain langchain-community langchain-chroma pypdf requests ollama

# For checking GPU status
!nvidia-smi

Collecting langchain-community
  Downloading langchain_community-0.3.26-py3-none-any.whl.metadata (2.9 kB)
Collecting langchain-chroma
  Downloading langchain_chroma-0.2.4-py3-none-any.whl.metadata (1.1 kB)
Collecting ollama
  Downloading ollama-0.5.1-py3-none-any.whl.metadata (4.3 kB)
Collecting langchain-core<1.0.0,>=0.3.49 (from langchain)
  Downloading langchain_core-0.3.66-py3-none-any.whl.metadata (5.8 kB)
Collecting langchain
  Downloading langchain-0.3.26-py3-none-any.whl.metadata (7.8 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.10.1-py3-none-any.whl.metadata (3.4 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Collecting langchain-text-splitters<1.0.0,>=0.3.8 (from langchain)
  Downloading langchain_text_splitters-0.3.8-py3-none-any.whl.metadata (1.9 kB)
Collecting chromadb>=1.0.9 (from langchain-chroma)
  Downloading chromadb-1.0.1

In [7]:
# Cell 2: Set up Ollama with a robust readiness check

import subprocess
import time
import os
import requests # We need this to check the connection

# --- Step 1: Install Ollama ---
print("Installing Ollama...")
# Using a timeout for the curl command to avoid hanging
process = subprocess.run(["curl", "-fsSL", "https://ollama.com/install.sh"], capture_output=True, text=True, timeout=30)
if process.returncode != 0:
    print("Error installing Ollama:")
    print(process.stderr)
else:
    subprocess.run(process.stdout.splitlines(), shell=True, check=True)

# --- Step 2: Start the Ollama server as a background process ---
print("\nStarting Ollama server in the background...")
# Use subprocess.Popen for a non-blocking start
# We redirect stdout/stderr to files so we can check them later if needed
with open("ollama_server.log", "w") as log_file:
    server_process = subprocess.Popen(
        ["ollama", "serve"],
        stdout=log_file,
        stderr=subprocess.STDOUT
    )

# --- Step 3: Wait for the server to be ready ---
print("Waiting for Ollama server to become ready...")
max_retries = 20
retry_delay_seconds = 3

for i in range(max_retries):
    try:
        # Check if the server is up and running by making a simple request
        response = requests.get("http://localhost:11434", timeout=5)
        if response.status_code == 200:
            print("Ollama server is ready!")
            break
    except requests.exceptions.ConnectionError:
        print(f"Attempt {i+1}/{max_retries}: Connection refused. Retrying in {retry_delay_seconds} seconds...")
        time.sleep(retry_delay_seconds)
else:
    # This block executes if the loop completes without a 'break'
    print("Error: Could not connect to Ollama server after multiple retries.")
    print("Please check the server logs in 'ollama_server.log' for more details.")
    # Terminate the server process to clean up
    server_process.terminate()
    raise ConnectionError("Ollama server failed to start.")

# --- Step 4: Pull the DeepSeek-R1 model ---
print("\nOllama server is running. Now pulling the DeepSeek-R1 model...")
!ollama pull deepseek-r1:1.5b

# Check if the model is installed
print("\nVerifying installed models:")
!ollama list

print("\nCell 2 execution complete.")

Installing Ollama...

Starting Ollama server in the background...
Waiting for Ollama server to become ready...
Attempt 1/20: Connection refused. Retrying in 3 seconds...
Ollama server is ready!

Ollama server is running. Now pulling the DeepSeek-R1 model...
[?2026h[?25l[1Gpulling manifest ⠋ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠙ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠹ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest [K
pulling aabd4debf0c8: 100% ▕██████████████████▏ 1.1 GB                         [K
pulling c5ad996bda6e: 100% ▕██████████████████▏  556 B                         [K
pulling 6e4c38e1172f: 100% ▕██████████████████▏ 1.1 KB                         [K
pulling f4d24e9138dd: 100% ▕██████████████████▏  148 B                         [K
pulling a85fe2a2e58e: 100% ▕██████████████████▏  487 B                         [K
verifying sha256 digest [K
writing manifest [K
success [K[?25h[?2026l

Verifying installed models:
NAME                ID

In [8]:
# Cell 3: Load and Parse the PDF (Local File Version)

from langchain_community.document_loaders import PyPDFLoader
import os

def load_and_parse_pdf_simple(file_path):
    """
    Loads a PDF from a local file using PyPDFLoader and splits it by page.
    """
    if not os.path.exists(file_path):
        # This check is crucial to ensure the file is in the right place.
        print(f"Error: File not found at {file_path}. Please check the filename and path in the Kaggle file browser.")
        return []
        
    print(f"Loading and parsing local PDF with PyPDFLoader at: {file_path}...")
    loader = PyPDFLoader(file_path) # Using PyPDFLoader
    
    # load_and_split() returns a list of documents, one for each page.
    documents = loader.load_and_split()
    print(f"Parsed {len(documents)} pages from the PDF.")
    
    return documents

# --- Main execution for this cell ---
# IMPORTANT: Update this variable with the exact path to your uploaded PDF.
# Example: "/kaggle/working/data/your_uploaded_filename.pdf"
pdf_path = "/kaggle/input/apple10-k/appleee.pdf"

# Call the function to parse the local file.
parsed_documents = load_and_parse_pdf_simple(pdf_path)

# Print the first few parsed elements to inspect the output
if parsed_documents:
    print("\n--- Inspecting the first 5 parsed pages ---")
    for i, doc in enumerate(parsed_documents[:5]):
        print(f"--- Document Page {i+1} ---")
        print(f"Text: {doc.page_content[:200]}...") # Print first 200 chars
        print(f"Source: {doc.metadata.get('source')} (Page: {doc.metadata.get('page')})")
        print("-" * 20)

Loading and parsing local PDF with PyPDFLoader at: /kaggle/input/apple10-k/appleee.pdf...
Parsed 168 pages from the PDF.

--- Inspecting the first 5 parsed pages ---
--- Document Page 1 ---
Text: UNITED STATES
SECURITIES AND EXCHANGE COMMISSION
Washington, D.C. 20549
FORM 10-K
(Mark One)
☒    ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934
For the fiscal ye...
Source: /kaggle/input/apple10-k/appleee.pdf (Page: 0)
--------------------
--- Document Page 2 ---
Text: Indicate by check mark whether the Registrant (1) has filed all reports required to be filed by Section 13 or 15(d) of the Securities Exchange Act 
of 1934 during the preceding 12 months (or for such ...
Source: /kaggle/input/apple10-k/appleee.pdf (Page: 1)
--------------------
--- Document Page 3 ---
Text: Apple Inc.
Form 10-K
For the Fiscal Year Ended September 28, 2024
TABLE OF CONTENTS
Page
Part I
Item 1. Business 1
Item 1A. Risk Factors 5
Item 1B. Unresolved Staff Comments 17
Item 1C. 

In [10]:
# Cell 4: Smart Chunking and Metadata (Organizing the Library Shelves)

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document

def create_chunks_with_metadata(documents, company_name, report_type):
    """
    Creates chunks from the documents with useful metadata.
    """
    print("Creating chunks with metadata...")
    chunks = []
    
    # Recursive splitter for all text blocks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1024,
        chunk_overlap=200,
        separators=["\n\n", "\n", ".", "!", "?", ","]
    )
    
    for doc in documents:
        # Add custom metadata for better retrieval
        metadata = {
            "source": doc.metadata.get('source'),
            "page": doc.metadata.get('page'),
            "company": company_name, 
            "report_type": report_type
        }
        
        # Split the page content into smaller chunks
        split_texts = text_splitter.split_text(doc.page_content)
        for split_text in split_texts:
            chunks.append(
                Document(page_content=split_text, metadata=metadata)
            )
    
    print(f"Created {len(chunks)} chunks from the document.")
    return chunks

# --- Main execution for this cell ---
# Use the parsed documents from the previous cell
company_name = "Apple Inc."
report_type = "10-K Annual Report"
chunks = create_chunks_with_metadata(parsed_documents, company_name, report_type)

# Print a sample chunk to inspect the output
if chunks:
    print("\n--- Example of a chunk ---")
    print("Text:", chunks[0].page_content[:200] + "...")
    print("Metadata:", chunks[0].metadata)

Creating chunks with metadata...
Created 555 chunks from the document.

--- Example of a chunk ---
Text: UNITED STATES
SECURITIES AND EXCHANGE COMMISSION
Washington, D.C. 20549
FORM 10-K
(Mark One)
☒    ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934
For the fiscal ye...
Metadata: {'source': '/kaggle/input/apple10-k/appleee.pdf', 'page': 0, 'company': 'Apple Inc.', 'report_type': '10-K Annual Report'}


In [11]:
# Cell 5: Embedding and Vector Store (Creating the Index) with a Progress Bar

from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import Chroma
from tqdm.autonotebook import tqdm # Import tqdm for the progress bar

CHROMA_DB_PATH = "./chroma_db_finance"
OLLAMA_MODEL = "deepseek-r1:1.5b" # Use the model we pulled

def create_and_persist_vector_store(chunks):
    """
    Embeds chunks and stores them in a persistent ChromaDB with a progress bar.
    """
    print(f"Initializing embedding model: {OLLAMA_MODEL}")
    embeddings = OllamaEmbeddings(model=OLLAMA_MODEL)
    
    # Create the vector store and persist it to disk
    print(f"Creating and persisting vector store to {CHROMA_DB_PATH}...")
    
    # --- Progress Bar Added Here ---
    # We use tqdm to wrap the list of chunks, which will show a progress bar
    # as each chunk is processed by the embedding model.
    # Note: This step is done implicitly by Chroma.from_documents. A direct
    # loop with a progress bar is more visible for debugging.
    
    # A more visible way to track progress:
    # This might take a while, but it will show you it's working.
    print(f"Processing {len(chunks)} chunks. This will take some time...")
    
    vector_store = Chroma.from_documents(
        # wrap chunks in tqdm to see progress
        documents=tqdm(chunks, desc="Embedding Chunks"),
        embedding=embeddings,
        persist_directory=CHROMA_DB_PATH
    )
    
    print("Vector store created successfully!")
    return vector_store

# --- Main execution for this cell ---
# This part will be run only once to create the database.
vector_store = create_and_persist_vector_store(chunks)

Initializing embedding model: deepseek-r1:1.5b
Creating and persisting vector store to ./chroma_db_finance...
Processing 555 chunks. This will take some time...


Embedding Chunks:   0%|          | 0/555 [00:00<?, ?it/s]

Vector store created successfully!


In [15]:
# Cell 6: Building the RAG Chain and Interactive Chatbot Querying (Direct Output Version)

from langchain_community.llms import Ollama
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

def build_rag_chain(vector_store):
    """
    Builds the Retrieval-Augmented Generation chain.
    """
    print("Building RAG chain...")
    
    llm = Ollama(model="deepseek-r1:1.5b", temperature=0) 
    retriever = vector_store.as_retriever(search_kwargs={"k": 5})
    
    # --- PROMPT MODIFICATION FOR DIRECT OUTPUT ---
    # We are changing the prompt to tell the LLM to be very concise and direct.
    template = """
    You are a very concise financial chatbot. Answer the question **as briefly and directly as possible** based **only** on the provided document excerpts.
    **Do not include any preambles, intros, or sections like 'Analysis' or 'Conclusion'. Just provide the final answer as a single sentence or a brief paragraph.**
    If the information is not explicitly mentioned in the context, state that you cannot find the information in the document.

    Context:
    {context}
    
    Question: {question}
    
    Answer:
    """
    
    prompt = PromptTemplate(template=template, input_variables=["context", "question"])
    
    rag_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=True, # We still return sources for verification
        chain_type_kwargs={"prompt": prompt}
    )
    
    print("RAG chain built successfully.")
    return rag_chain

def query_rag_system(rag_chain, query):
    """
    Queries the RAG system and prints only the concise answer.
    """
    print(f"\n--- Processing Query: '{query}' ---")
    response = rag_chain.invoke({"query": query})
    
    # --- OUTPUT MODIFICATION ---
    # We now only print the result, without the extra headers or sources.
    # The 'return_source_documents' is still True in the chain for a proper RAG lookup.
    print(response['result'])
    
    return response

# --- Main execution for this cell ---
rag_chain = build_rag_chain(vector_store)

print("\n--- Interactive Chatbot Mode ---")
print("Enter your question about the financial document.")
print("Type 'exit' to quit the chatbot.")

while True:
    user_query = input("\nYour question: ")
    
    if user_query.lower() == 'exit':
        print("Exiting chatbot. Goodbye!")
        break
        
    query_rag_system(rag_chain, user_query)

Building RAG chain...
RAG chain built successfully.

--- Interactive Chatbot Mode ---
Enter your question about the financial document.
Type 'exit' to quit the chatbot.



Your question:  should i buy this stock ?



--- Processing Query: 'should i buy this stock ?' ---
<think>
Okay, so I need to figure out whether someone should buy a stock based on the provided context. Let me go through the information step by step.

First, looking at the Annual Report on Form 10-K, it's a financial document that includes forward-looking statements and management discussions. The context mentions several sections like "Business" and "Management’s Discussion and Analysis." These sections often discuss future risks and uncertainties, which can indicate potential risks associated with investing in a stock.

The question is whether someone should buy the stock. To answer this, I need to see if there's any explicit information in the provided excerpt that suggests the stock is undervalued or has positive fundamentals.

Looking through the context, I don't see any direct statements about the company's financial health, profitability, or specific risks associated with the stock. The focus seems more on broader market 


Your question:  exit 



--- Processing Query: 'exit ' ---
<think>
Okay, so I need to figure out the answer to this question about the financial chatbot's response based on the provided document excerpts. The user has given me a lot of context, including various sections like periods up to 150 days, intellectual property issues, specific notes with interest rates and maturities, and some trading plans involving insider trading arrangements.

The question is "exit." I'm not entirely sure what that refers to in the financial context. It could mean exiting a deal, exiting debt, or perhaps something else like exiting a company's operations. Looking through the document, I see mentions of notes with specific maturities and interest rates, which might relate to debt or investment returns.

I notice there are sections about 3.050% 2029 Notes and 2031 Notes, both with semi-annual and annual interest payments respectively. The 3.050% notes have a maturity in 2029, which is a few years away. Maybe the "exit" here refer


Your question:  exit


Exiting chatbot. Goodbye!


In [16]:
# List the files in Ollama's model storage directory
!ls -lh /root/.ollama/models/blobs/

total 1.1G
-rw-r--r-- 1 root root 1.1K Jun 29 13:30 sha256-6e4c38e1172f42fdbff13edf9a7a017679fb82b0fde415a3e8b3c31c6ed4a4e4
-rw-r--r-- 1 root root  487 Jun 29 13:30 sha256-a85fe2a2e58e2426116d3686dfdc1a6ea58640c1e684069976aa730be6c1fa01
-rw-r--r-- 1 root root 1.1G Jun 29 13:30 sha256-aabd4debf0c8f08881923f2c25fc0fdeed24435271c2b3e92c4af36704040dbc
-rw-r--r-- 1 root root  556 Jun 29 13:30 sha256-c5ad996bda6eed4df6e3b605a9869647624851ac248209d22fd5e2c0cc1121d3
-rw-r--r-- 1 root root  148 Jun 29 13:30 sha256-f4d24e9138dd4603380add165d2b0d970bef471fac194b436ebd50e6147c6588


In [19]:
# Copy the model file to the working directory
!cp /root/.ollama/models/blobs/sha256-f4d24e9138dd4603380add165d2b0d970bef471fac194b4.bin /kaggle/working/deepseek-r1-1.5b.bin

cp: cannot stat '/root/.ollama/models/blobs/sha256-f4d24e9138dd4603380add165d2b0d970bef471fac194b4.bin': No such file or directory
