In [1]:
!pip install rank_bm25 pypdf pdfplumber sentence-transformers faiss-cpu pandas transformers tabulate scikit-learn

Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Collecting pdfplumber
  Downloading pdfplumber-0.11.7-py3-none-any.whl.metadata (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0.post1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.0 kB)
Collecting pdfminer.six==20250506 (from pdfplumber)
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64

In [2]:
import re
import numpy as np
import pandas as pd
import pdfplumber
from sentence_transformers import SentenceTransformer
import faiss
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM

2025-07-28 03:48:14.077297: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753674494.285660      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753674494.344751      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
# --- TEXT PROCESSING & EMBEDDINGS ---

def extract_clean_text(pdf_path):
    """
    Extracts text from PDF while removing non-content sections.
    
    Process:
    1. Extracts page text sequentially
    2. Removes standard financial boilerplate
    3. Normalizes whitespace
    
    Args:
        pdf_path (str): Path to PDF file
        
    Returns:
        str: Cleaned text content
    """
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            ptxt = page.extract_text()
            if ptxt:
                text += ptxt + "\n"
    
    # Remove standard financial boilerplate sections
    patterns = [
        r'Forward-Looking Statements.*',  # Legal disclaimers
        r'Contacts Investors:.*',         # Contact info
        r'About Meta.*'                   # Company description
    ]
    for pattern in patterns:
        text = re.sub(pattern, '', text, flags=re.DOTALL|re.IGNORECASE)
    
    text = re.sub(r'\s+', ' ', text)  # Normalize whitespace
    return text.strip()

def smart_chunk_text(text, chunk_size=80, overlap=20):
    """
    Splits text into coherent chunks preserving:
    - Sentence boundaries
    - Context via overlap
    - Token count limits
    
    Args:
        text (str): Input text
        chunk_size (int): Target tokens per chunk
        overlap (int): Tokens to overlap between chunks
        
    Returns:
        list: Chunks of text maintaining context
    """
    sents = re.split(r'(?<=[.!?]) +', text)  # Split at sentence boundaries
    chunks, cur, tokens = [], [], 0
    
    for s in sents:
        s_tok = len(s.split())
        
        # Create new chunk if limit exceeded
        if tokens + s_tok > chunk_size and cur:
            chunks.append(' '.join(cur))
            cur = cur[-overlap:] if overlap else []  # Keep context from previous chunk
            tokens = sum(len(x.split()) for x in cur)
            
        cur.append(s)
        tokens += s_tok
    
    # Add final partial chunk
    if cur:
        chunks.append(' '.join(cur))
        
    return chunks

def get_embeddings(chunks, model_name="all-MiniLM-L6-v2"):
    """
    Generates vector embeddings for semantic search.
    
    Args:
        chunks (list): Text segments to embed
        model_name (str): SentenceTransformer model to use
        
    Returns:
        tuple: (embeddings array, embedding model)
    """
    embedder = SentenceTransformer(model_name)
    embs = embedder.encode(chunks, show_progress_bar=True).astype(np.float32)
    return embs, embedder

def build_faiss_index(embeddings):
    """
    Creates FAISS index for fast similarity search.
    
    Args:
        embeddings (np.array): Array of text embeddings
        
    Returns:
        faiss.Index: Searchable vector index
    """
    idx = faiss.IndexFlatL2(embeddings.shape[1])  # Euclidean distance index
    idx.add(embeddings)
    return idx

def retrieve(query, embedder, faiss_index, chunks, top_k=3):
    """
    Retrieves most relevant text chunks for a query.
    
    Args:
        query (str): User question
        embedder: SentenceTransformer model
        faiss_index: Prebuilt FAISS index
        chunks (list): Original text chunks
        top_k (int): Number of results to return
        
    Returns:
        list: Top matching text segments
    """
    q_emb = embedder.encode([query]).astype(np.float32)
    _, idxs = faiss_index.search(q_emb, top_k)
    return [chunks[i] for i in idxs[0]]

In [4]:
# --- TABULAR DATA PROCESSING ---

def extract_tables(pdf_path):
    """
    Extracts and validates tables from PDF pages.
    
    Process:
    1. Extracts all table structures
    2. Converts to pandas DataFrames
    3. Filters meaningful tables (>1 column)
    
    Args:
        pdf_path (str): Path to PDF file
        
    Returns:
        list: Cleaned DataFrames of valid tables
    """
    tables = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            for table in page.extract_tables():
                try:
                    df = pd.DataFrame(table[1:], columns=table[0])
                    # Only keep tables with meaningful structure
                    if df.shape[1] > 1 and df.shape[0] > 0:
                        tables.append(df)
                except Exception as e:
                    continue  # Skip malformed tables
    return tables

def match_tables(tables, query, top_k=2):
    """
    Scores tables by relevance to query terms.
    
    Args:
        tables (list): DataFrames to search
        query (str): User question
        top_k (int): Max tables to return
        
    Returns:
        list: Most relevant DataFrames
    """
    tokens = set(w.lower() for w in re.findall(r"\w+", query))
    scored_tables = []
    
    # Score each table by token matches
    for i, df in enumerate(tables):
        flat = df.to_string().lower()
        score = sum(flat.count(tok) for tok in tokens)
        scored_tables.append((score, i))
    
    # Return top tables with positive scores
    scored_tables.sort(reverse=True)
    return [tables[i] for score, i in scored_tables if score > 0][:top_k]

def format_tables_as_text(tables, maxlen=600):
    """
    Converts tables to readable string format with length limit.
    
    Args:
        tables (list): DataFrames to convert
        maxlen (int): Max characters per table
        
    Returns:
        str: Concatenated table string
    """
    return "\n\n".join([df.head().to_string()[:maxlen] for df in tables])

def query_needs_table(query):
    """
    Heuristic detector for table-relevant questions.
    
    Args:
        query (str): User question
        
    Returns:
        bool: Whether to include tables in context
    """
    keywords = [
        "compare", "table", "net income", "expenses",
        "trend", "margin", "difference", "summary",
        "balance sheet", "cash flow"
    ]
    return any(kw in query.lower() for kw in keywords)

In [5]:
# --- ANSWER GENERATION ---

def gen_answer(prompt):
    """
    Generates answers using TinyLlama model.
    
    Features:
    - 1.1B parameter open-source model
    - Deterministic output (no sampling)
    - Financial response formatting
    
    Args:
        prompt (str): Full instruction context
        
    Returns:
        str: Generated answer
    """
    model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
    output = pipe(prompt, max_new_tokens=220, do_sample=False)[0]['generated_text']
    return output[len(prompt):].strip()

In [6]:
# --- MAIN EXECUTION ---

def rag_structured_pipeline(pdf_path, queries):
    """
    End-to-end RAG pipeline with table support:
    1. Extracts text and tables from PDF
    2. Processes text into searchable chunks
    3. Answers questions using both text and tabular context
    
    Args:
        pdf_path (str): PDF file location
        queries (list): Questions to answer
    """
    print("Extracting text and tables...")
    text = extract_clean_text(pdf_path)
    chunks = smart_chunk_text(text)
    tables = extract_tables(pdf_path)
    print(f"Text Chunks: {len(chunks)}, Tables: {len(tables)}")
    
    # Setup semantic search
    embs, embedder = get_embeddings(chunks)
    idx = build_faiss_index(embs)
    
    # Process each question
    for q in queries:
        print(f"\nQ: {q}")
        
        # Retrieve text context
        context = '\n'.join(retrieve(q, embedder, idx, chunks, top_k=3))
        
        # Handle table context when needed
        table_context = ""
        if query_needs_table(q):
            matched = match_tables(tables, q)
            if matched:
                table_context = format_tables_as_text(matched, 800)
        
        # Generate answer with combined context
        prompt = (
            f"Text context: {context}\n"
            f"Structured data (from financial tables):\n{table_context}\n"
            f"Answer the query: {q}\n"
        )
        answer = gen_answer(prompt)
        print(f"Answer:\n{answer}\n")

In [7]:
if __name__ == "__main__":
    # Example execution
    PDF = "/kaggle/input/financial-reporta/Metas Q1 2024 Financial Report.pdf"
    queries = [
        "When did Meta report its first quarter 2024 results?",
        "What was Meta's total revenue for Q1 2024?",
        "How much did Meta's revenue increase compared to Q1 2023?",
        "What were the total costs and expenses for Meta in Q1 2024?",
        "How much income from operations did Meta report in Q1 2024?",
        "What was Meta's operating margin in Q1 2024?",
        "How much provision for income taxes did Meta have in Q1 2024?",
        "What was Meta's effective tax rate in Q1 2024?",
        "How much net income did Meta earn in Q1 2024?",
        "What was Meta's diluted earnings per share (EPS) for Q1 2024?",
        "What was the amount paid as dividends in Q1 2024?",
        "What revenue range does Meta expect for Q2 2024?",
        "What is the expected effect of foreign currency on Meta's revenue growth?",
    ]
    rag_structured_pipeline(PDF, queries)

Extracting text and tables...
Text Chunks: 29, Tables: 3


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Q: When did Meta report its first quarter 2024 results?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Answer:
Answer: Meta reported its first quarter 2024 results on April 24, 2024.


Q: What was Meta's total revenue for Q1 2024?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Answer:
Answer: Meta's total revenue for Q1 2024 was $12.36 billion, an increase of 7% year-over-year.


Q: How much did Meta's revenue increase compared to Q1 2023?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Answer:
Answer: Meta's revenue increased by 27% year-over-year.


Q: What were the total costs and expenses for Meta in Q1 2024?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Answer:
Answer: The total costs and expenses for Meta in Q1 2024 were:

Costs and expenses:

1. Research and development: $9,978
2. Marketing and sales: $2,564
3. General and administrative: $3,455

Average price per ad:

1. Average price per ad: $2.20

Total revenue:

1. Revenue: $36,455

Total costs and expenses:

1. Costs and expenses: $36,455

1. Average price per ad: $2.20

1. Total revenue: $36,455

1. Total costs and expenses: $36,455

1. Average price per ad: $2.20

1. Total revenue: $36,455

1. Total


Q: How much income from operations did Meta report in Q1 2024?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Answer:
Answer: Meta reported $12,369 million in income from operations in Q1 2024.


Q: What was Meta's operating margin in Q1 2024?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Answer:



Q: How much provision for income taxes did Meta have in Q1 2024?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Answer:
Answer: Meta had a provision for income taxes of $1.814 million in Q1 2024.


Q: What was Meta's effective tax rate in Q1 2024?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Answer:
Answer: 22%


Q: How much net income did Meta earn in Q1 2024?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Answer:
Answer: Meta earned $12,369 million in Q1 2024.


Q: What was Meta's diluted earnings per share (EPS) for Q1 2024?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Answer:
Answer: 4.71


Q: What was the amount paid as dividends in Q1 2024?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Device set to use cuda:0
Token indices sequence length is longer than the specified maximum sequence length for this model (2147 > 2048). Running this sequence through the model will result in indexing errors
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (2048). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.


Answer:
Answer: $1.27 billion


Q: What revenue range does Meta expect for Q2 2024?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Answer:
Answer: Meta expects revenue to range between $1.25 billion and $1.3 billion for Q2 2024.


Q: What is the expected effect of foreign currency on Meta's revenue growth?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Answer:
Answer: The expected effect of foreign currency on Meta's revenue growth is a 1% headwind to year-over-year total revenue growth.

