In [1]:
!pip install rank_bm25 pypdf pdfplumber sentence-transformers faiss-cpu pandas transformers tabulate scikit-learn

Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Collecting pdfplumber
  Downloading pdfplumber-0.11.7-py3-none-any.whl.metadata (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0.post1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.0 kB)
Collecting pdfminer.six==20250506 (from pdfplumber)
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64

In [2]:
import re
import numpy as np
import pdfplumber
from sentence_transformers import SentenceTransformer
import faiss
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM

2025-07-28 03:40:18.229102: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753674018.390376      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753674018.439602      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
# -------- TEXT CLEANING & EXTRACTION ----------
def extract_clean_text(pdf_path):
    """
    Extracts and cleans text from a PDF financial report by:
    1. Removing boilerplate sections (disclaimers, contacts)
    2. Normalizing whitespace
    3. Preserving the core financial content
    
    Args:
        pdf_path (str): Path to the PDF file
        
    Returns:
        str: Cleaned text content
    """
    text = ""
    # Extract text page by page
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:
                text += page_text + "\n"
    
    # Remove common financial report boilerplate
    text = re.sub(r'Forward-Looking Statements.*', '', text, flags=re.DOTALL|re.IGNORECASE)
    text = re.sub(r'Contacts Investors:.*', '', text, flags=re.DOTALL|re.IGNORECASE)
    text = re.sub(r'About Meta.*', '', text, flags=re.DOTALL|re.IGNORECASE)
    
    # Normalize whitespace
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

In [4]:
# --------- SMART CHUNKING -----------
def smart_chunk_text(text, chunk_size=80, overlap=20):
    """
    Splits text into coherent chunks by:
    - Breaking at sentence boundaries
    - Maintaining context with overlap between chunks
    - Enforcing maximum token limits per chunk
    
    Args:
        text (str): Input text to chunk
        chunk_size (int): Max words per chunk
        overlap (int): Words to overlap between chunks
        
    Returns:
        list: List of text chunks
    """
    # Split into sentences using punctuation
    sents = re.split(r'(?<=[.!?]) +', text)
    chunks = []
    current = []
    tokens = 0
    
    for s in sents:
        s_tok = len(s.split())
        
        # Create new chunk if adding this sentence would exceed limit
        if tokens + s_tok > chunk_size and current:
            chunks.append(' '.join(current))
            # Keep overlap between chunks
            current = current[-overlap:] if overlap else []
            tokens = sum(len(x.split()) for x in current)
            
        current.append(s)
        tokens += s_tok
    
    # Add final chunk
    if current:
        chunks.append(' '.join(current))
        
    return chunks

# --------- EMBEDDING/INDEX ----------
def get_embeddings(chunks, model_name="all-MiniLM-L6-v2"):
    """
    Generates vector embeddings for text chunks using SentenceTransformer
    
    Args:
        chunks (list): Text chunks to embed
        model_name (str): Embedding model to use
        
    Returns:
        tuple: (embeddings numpy array, embedding model)
    """
    embedder = SentenceTransformer(model_name)
    embs = embedder.encode(chunks, show_progress_bar=True).astype(np.float32)
    return embs, embedder

def build_faiss_index(embeddings):
    """
    Creates a FAISS index for efficient similarity search
    
    Args:
        embeddings (numpy.array): Array of vector embeddings
        
    Returns:
        faiss.Index: Searchable index
    """
    # Create a flat L2 (Euclidean) distance index
    idx = faiss.IndexFlatL2(embeddings.shape[1])
    idx.add(embeddings)
    return idx

def retrieve(query, embedder, faiss_index, chunks, top_k=3):
    """
    Retrieves most relevant text chunks for a query using semantic search
    
    Args:
        query (str): User question
        embedder: SentenceTransformer model
        faiss_index: FAISS index
        chunks (list): Original text chunks
        top_k (int): Number of chunks to return
        
    Returns:
        list: Top matching chunks
    """
    # Encode query and search index
    q_emb = embedder.encode([query]).astype(np.float32)
    _, idxs = faiss_index.search(q_emb, top_k)
    
    # Return actual text chunks
    return [chunks[i] for i in idxs[0]]

In [5]:
# --------- LLM GENERATION ----------
def gen_answer(prompt):
    """
    Generates answers using TinyLlama 1.1B model with:
    - Meta financial analyst persona
    - Concise response style
    - Context-aware generation
    
    Args:
        prompt (str): Full prompt including context and question
        
    Returns:
        str: Generated answer
    """
    model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
    
    # Initialize model and tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
    
    # Create text generation pipeline
    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
    
    # Generate and clean output
    output = pipe(prompt, max_new_tokens=200, do_sample=False)[0]['generated_text']
    return output[len(prompt):].strip()

In [6]:
# --------- MAIN PIPELINE -----------
def basic_rag_pipeline(pdf_path, queries):
    """
    Complete RAG pipeline execution:
    1. PDF text extraction
    2. Chunking
    3. Embedding and indexing
    4. Question answering loop
    
    Args:
        pdf_path (str): Path to financial report PDF
        queries (list): List of questions to answer
    """
    print("Extracting and cleaning PDF...")
    text = extract_clean_text(pdf_path)
    
    print("Creating text chunks...")
    chunks = smart_chunk_text(text)
    print(f"Total chunks: {len(chunks)}")
    
    print("Generating embeddings...")
    embeddings, embedder = get_embeddings(chunks)
    
    print("Building search index...")
    idx = build_faiss_index(embeddings)
    
    # Process each question
    for q in queries:
        print(f"\nQ: {q}")
        
        # Retrieve relevant context
        context = '\n'.join(retrieve(q, embedder, idx, chunks, top_k=3))
        
        # Create LLM prompt
        prompt = f"""Based on the following financial report context, answer concisely as a Meta financial analyst.
        Context:
        {context}
        Question: {q}
        Answer:"""
        
        # Generate and display answer
        answer = gen_answer(prompt)
        print(f"Answer:\n{answer}\n")

In [7]:
if __name__ == "__main__":
    # Sample financial report and questions
    PDF = "/kaggle/input/financial-reporta/Metas Q1 2024 Financial Report.pdf"
    
    questions = [
        "When did Meta report its first quarter 2024 results?",
        "What was Meta's total revenue for Q1 2024?",
        "How much did Meta's revenue increase compared to Q1 2023?",
        "What were the total costs and expenses for Meta in Q1 2024?",
        "How much income from operations did Meta report in Q1 2024?",
        "What was Meta's operating margin in Q1 2024?",
        "How much provision for income taxes did Meta have in Q1 2024?",
        "What was Meta's effective tax rate in Q1 2024?",
        "How much net income did Meta earn in Q1 2024?",
        "What was Meta's diluted earnings per share (EPS) for Q1 2024?",
        "What was the amount paid as dividends in Q1 2024?",
        "What revenue range does Meta expect for Q2 2024?",
        "What is the expected effect of foreign currency on Meta's revenue growth?",
    ]
    
    # Run the full pipeline
    basic_rag_pipeline(PDF, questions)

Extracting and cleaning PDF...
Creating text chunks...
Total chunks: 29
Generating embeddings...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Building search index...

Q: When did Meta report its first quarter 2024 results?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Answer:
Meta reported its first quarter 2024 results on April 24, 2024.


Q: What was Meta's total revenue for Q1 2024?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Answer:
Meta's total revenue for Q1 2024 was $12.36 billion.


Q: How much did Meta's revenue increase compared to Q1 2023?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Answer:
Meta's revenue increased by 27% compared to Q1 2023.


Q: What were the total costs and expenses for Meta in Q1 2024?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Answer:
Meta's total costs and expenses for Q1 2024 were $36,455 million, an increase of 27% year-over-year.


Q: How much income from operations did Meta report in Q1 2024?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Answer:
Meta reported $12,369 million in net income for Q1 2024.


Q: What was Meta's operating margin in Q1 2024?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Answer:
Meta's operating margin in Q1 2024 was 38%.


Q: How much provision for income taxes did Meta have in Q1 2024?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Answer:
Meta had a provision for income taxes of $1.814 million in Q1 2024.


Q: What was Meta's effective tax rate in Q1 2024?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Answer:
Meta's effective tax rate in Q1 2024 was 13%.


Q: How much net income did Meta earn in Q1 2024?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Answer:
Meta earned $12.369 billion in Q1 2024, an increase of 7% year-over-year.


Q: What was Meta's diluted earnings per share (EPS) for Q1 2024?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Answer:
Meta's diluted earnings per share (EPS) for Q1 2024 was $4.71.


Q: What was the amount paid as dividends in Q1 2024?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Device set to use cuda:0
Token indices sequence length is longer than the specified maximum sequence length for this model (2161 > 2048). Running this sequence through the model will result in indexing errors
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (2048). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.


Answer:
$1.27 billion.


Q: What revenue range does Meta expect for Q2 2024?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Answer:
Meta expects revenue in the range of $1.25 billion to $1.3 billion for Q2 2024.


Q: What is the expected effect of foreign currency on Meta's revenue growth?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Answer:
The expected effect of foreign currency on Meta's revenue growth is a 1% headwind to year-over-year total revenue growth, based on current exchange rates.

