In [12]:
#!pip install camelot-py pypdf nltk sentence_transformers faiss-cpu rank_bm25 accelerate

In [2]:
from pypdf import PdfReader
import pandas as pd

In [3]:
reader = PdfReader("TCS_2024-25.pdf")

In [4]:
# Initialize dictionary to collect text
annual_result = {
    "financial_results": "",
    "segment_information": "",
    "balance_sheet": "",
    "cash_flows": ""
}

# Extract and classify text
for page in reader.pages:
    text = page.extract_text()
    if text:
        text_lower = text.lower()
        
        if "consolidated" in text_lower and "financial results" in text_lower:
            annual_result["financial_results"] += text + "\n"
        
        if "consolidated" in text_lower and "segment information" in text_lower:
            annual_result["segment_information"] += text + "\n"
        
        if "consolidated" in text_lower and "balance sheet" in text_lower:
            annual_result["balance_sheet"] += text + "\n"
        
        if "consolidated" in text_lower and "cash flows" in text_lower:
            annual_result["cash_flows"] += text + "\n"

# Convert dictionary to DataFrame
df_annual_result = pd.DataFrame({
    "Segment": list(annual_result.keys()),
    "Data": list(annual_result.values())
})

print(df_annual_result)


               Segment                                               Data
0    financial_results    \nMarch 31, December 31, March 31, March 31,...
1  segment_information  March 31, December 31, March 31, March 31, Mar...
2        balance_sheet  As at As at\nMarch 31, 2025 March 31, 2024\nAS...
3           cash_flows   \nSelect explanatory notes to the Statement o...


In [5]:
df_annual_result = df_annual_result.assign(
    Data=df_annual_result["Data"].str.split("\n")
).explode("Data").reset_index(drop=True)

In [6]:
df_annual_result["Data"] = (
    df_annual_result["Data"]
    .str.replace(r"[.,()]", "", regex=True)  # remove . , ( )
    .str.split()                             # split on any whitespace
    .str.join(" ")                           # rejoin with single spaces
)


In [7]:
def split_data(row):
    parts = row["Data"].split()
    n_extra = 5  # total col1–col5

    if row["Segment"] in ["financial_results", "segment_information"]:
        # Last 5 parts go into col1–col5
        last_parts = parts[-5:]
        first_part = " ".join(parts[:-5])
        return pd.Series([first_part] + last_parts)

    elif row["Segment"] in ["balance_sheet", "cash_flows"]:
        # Last 2 parts go into col4 and col5
        last_parts = parts[-2:]
        first_part = " ".join(parts[:-2])
        return pd.Series([first_part] + [None, None, None] + last_parts)

    else:
        return pd.Series([row["Data"]] + [None] * n_extra)



# Apply with different column counts
df_split = df_annual_result.apply(split_data, axis=1)

# Rename columns dynamically for clarity
df_split.columns = ["attribute", "col1", "col2", "col3", "2025", "2024"]

# Combine with original DataFrame if needed
df_result = pd.concat([df_annual_result, df_split], axis=1)



In [8]:
def is_int(val):
    try:
        return float(val).is_integer()
    except (ValueError, TypeError):
        return False

# Keep only rows where both are integers
df_result= df_result[df_result["2025"].apply(is_int)]
df_result= df_result[df_result["2024"].apply(is_int)]
df_result.drop(columns=["col1","col2","col3","Data"], inplace=True)

In [9]:
df_result['text'] = (
    "company's " + df_result['attribute'] +
    " in 2024 it is " + df_result["2024"].astype(str) +
    " and for 2025 it is " + df_result["2025"].astype(str)
)

In [10]:
combined_text = ' '.join(df_result['text'].astype(str))

In [11]:
import re
import uuid
from typing import List, Dict
from nltk.tokenize import word_tokenize

import nltk
nltk.download('punkt_tab')

def chunk_text(text: str, chunk_size: int, overlap: int = 0) -> List[Dict]:
    """Split text into chunks with metadata and unique IDs."""
    tokens = word_tokenize(text)
    chunks = []
    start = 0

    while start < len(tokens):
        end = min(start + chunk_size, len(tokens))
        chunk_tokens = tokens[start:end]
        chunk_text_str = " ".join(chunk_tokens)

        chunks.append({
            "id": str(uuid.uuid4()),
            "text": chunk_text_str,
            "metadata": {
                "chunk_size": chunk_size,
                "start_token": start,
                "end_token": end
            }
        })

        start += chunk_size - overlap

    return chunks

# Example
chunks_100 = chunk_text(combined_text, chunk_size=100)
chunks_400 = chunk_text(combined_text, chunk_size=400)

all_chunks = chunks_100 + chunks_400
print(f"Created {len(all_chunks)} chunks")


[nltk_data] Downloading package punkt_tab to /home/gaurav/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Created 35 chunks


In [12]:
from sentence_transformers import SentenceTransformer
import faiss
from rank_bm25 import BM25Okapi
import numpy as np

# Initialize embedding model
embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Dense embeddings
dense_embeddings = embedding_model.encode([chunk["text"] for chunk in all_chunks], convert_to_numpy=True)

# Build FAISS index
dimension = dense_embeddings.shape[1]
faiss_index = faiss.IndexFlatL2(dimension)
faiss_index.add(dense_embeddings)

# Sparse index (BM25)
tokenized_corpus = [word_tokenize(chunk["text"].lower()) for chunk in all_chunks]
bm25 = BM25Okapi(tokenized_corpus)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [13]:
def hybrid_search(query: str, top_n: int = 5, alpha: float = 0.5):
    """
    Hybrid retrieval combining FAISS (dense) and BM25 (sparse).
    alpha: weight for dense retrieval (0 to 1).
    """
    # Preprocess query
    query_clean = re.sub(r"[^a-zA-Z0-9\s]", "", query.lower())
    
    # Dense retrieval
    query_embedding = embedding_model.encode([query_clean], convert_to_numpy=True)
    dense_scores, dense_indices = faiss_index.search(query_embedding, top_n)
    dense_results = [(idx, 1 / (1 + score)) for idx, score in zip(dense_indices[0], dense_scores[0])]
    
    # Sparse retrieval
    sparse_scores = bm25.get_scores(word_tokenize(query_clean))
    sparse_indices = np.argsort(sparse_scores)[::-1][:top_n]
    sparse_results = [(idx, sparse_scores[idx]) for idx in sparse_indices]
    
    # Score fusion
    scores_dict = {}
    for idx, score in dense_results:
        scores_dict[idx] = scores_dict.get(idx, 0) + alpha * score
    for idx, score in sparse_results:
        scores_dict[idx] = scores_dict.get(idx, 0) + (1 - alpha) * score
    
    # Sort by combined score
    final_results = sorted(scores_dict.items(), key=lambda x: x[1], reverse=True)[:top_n]
    
    return [(all_chunks[idx]["text"], score) for idx, score in final_results]

# Example search
#results = hybrid_search("what is total tax expense for year 2025", top_n=5, alpha=0.6)
#for text, score in results:
#    print(f"Score: {score:.4f} | Chunk: {text}")


In [25]:
from llama_cpp import Llama

llm = Llama(
    model_path="/media/gaurav/ubuntudata/SemThree/models/llama2-7b-chat/llama-2-7b-chat.Q4_K_M.gguf",
    n_ctx=4096,
    n_threads=8,
    chat_format="llama-2"
)


llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from /media/gaurav/ubuntudata/SemThree/models/llama2-7b-chat/llama-2-7b-chat.Q4_K_M.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = LLaMA v2
llama_model_loader: - kv   2:                       llama.context_length u32              = 4096
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 11008
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:       

In [14]:
question="What were the total expenses for FY 2025?"

In [15]:
results = hybrid_search(question, top_n=5, alpha=0.6)
context=""
for text, score in results:
    context+=text.lower()
print(context)

it is 778 and for 2025 it is 796 company 's depreciation and amortisation expense in 2024 it is 4985 and for 2025 it is 5242 company 's other expenses in 2024 it is 32764 and for 2025 it is 30481 company 's total expenses in 2024 it is 182360 and for 2025 it is 193955 company 's profit before exceptional item and tax in 2024 it is 62955 and for 2025 it is 65331 company 's profit before tax in 2024 it is 61997 and for 2025 it is 65331 company 's current tax in 2024 it is 15864company 's in 2024 it is 2024 and for 2025 it is 2025 company 's revenue from operations in 2024 it is 240893 and for 2025 it is 255324 company 's other income in 2024 it is 4422 and for 2025 it is 3962 company 's total income in 2024 it is 245315 and for 2025 it is 259286 company 's employee benefit expenses in 2024 it is 140131 and for 2025 it is 145788 company 's cost of equipment and software licences in 2024 it is 3702 and for 2025 it is 11648 company 's finance costs in 2024company 's in 2024 it is 2024 and f

In [16]:
#pip install --upgrade llama-cpp-python

In [17]:
#pip install --upgrade llama-cpp-python
#pip install --upgrade huggingface_hub

#huggingface-cli download TheBloke/Llama-2-7B-Chat-GGUF llama-2-7b-chat.Q4_K_M.gguf --local-dir ./models/llama2-7b-chat

In [18]:
def count_tokens(text):
    return len(llm.tokenize(text.encode("utf-8")))

# Build prompt safely within context window
def build_messages(context, question, max_ctx=4096, reserve=512):
    """
    max_ctx: model's context size
    reserve: keep space for system prompt, question, and answer
    """
    system_msg = (
        "Answer STRICTLY and CONCISELY using ONLY the provided context.Provide only answer  "
        "If the answer is not present, reply exactly: Not found in context."
    )
    system_tokens = count_tokens(system_msg)
    question_tokens = count_tokens(question)

    # budget for context
    budget = max_ctx - (system_tokens + question_tokens + reserve)
    context_tokens = llm.tokenize(context.encode("utf-8"))

    if len(context_tokens) > budget:
        context_tokens = context_tokens[:budget]  # truncate
        context = llm.detokenize(context_tokens).decode("utf-8", errors="ignore")

    messages = [
        {"role": "system", "content": system_msg},
        {"role": "user", "content": f"Context:\n{context}\n\nQuestion: {question}\nAnswer:"}
    ]
    return messages

# Example usage


In [19]:
def validate_query(query: str) -> bool:
    """
    Returns False if query is irrelevant or harmful.
    """
    harmful_keywords = ["suicide", "kill", "drugs", "weapon", "terrorism"]
    irrelevant_keywords = ["joke", "story", "sing", "poem"]  # customize to your use-case

    q_lower = query.lower()
    if any(k in q_lower for k in harmful_keywords):
        print("⚠️ Blocked harmful input.")
        return False
    if any(k in q_lower for k in irrelevant_keywords):
        print("⚠️ Irrelevant query, skipping.")
        return False

    return True

# Usage
if not validate_query(question):
    exit()  # or return a safe response


In [20]:
def validate_output(response: str, context: str) -> str:
    """
    Ensures response is grounded in the provided context.
    If response contains info not in context, replace with guardrail message.
    """
    # Simple heuristic: all words in response should appear in context
    # (you can replace with embedding similarity check for better accuracy)
    context_lower = context.lower()
    for word in response.split():
        if word.lower()  in context_lower:
            return response

    return "Not found in context."

# Usage
#raw_output = resp["choices"][0]["message"]["content"]
#safe_output = validate_output(raw_output, context)
#print(safe_output)


In [None]:
def model_inferance(question):
    results = hybrid_search(question, top_n=5, alpha=0.6)
    context=""
    for text, score in results:
        context+=text.lower()
    messages = build_messages(context, question)
    resp = llm.create_chat_completion(
        messages=messages,
        max_tokens=200,
        temperature=0.0
    )
    return resp["choices"][0]["message"]["content"]

print(model_inferance("What was TCS’s consolidated net profit in FY 2025?"))
    

In [34]:
import time
test_questions = [
    "What was the total consolidated income for FY 2025?",
    "What were employee benefit expenses in FY 2025?",
    "How much did TCS spend on equipment and software licences in FY 2025?",
    "What were the total expenses for FY 2025?",
    "What was profit before tax (PBT) in FY 2025?",
    "How much was the total tax expense in FY 2025?",
    "How much other income did TCS earn in FY 2024?",
    "What was the total consolidated income for FY 2024?",
    "What were employee benefit expenses in FY 2024?",
    "How much did TCS spend on equipment and software licences in FY 2024?",
    "What were the total expenses for FY 2024?"
]

expected_answers = [
   "259286",
    "145788",
    "11648",
    "193955",
    "65331",
    "16534",
    "237",
    "245315",
    "140131",
    "3702",
    "182360"
]
results = []

for q, expected in zip(test_questions, expected_answers):
    start_time = time.time()
    output_text=model_inferance(q)
    end_time = time.time()
    inference_time = round((end_time - start_time) , 2)
    accuracy = int(expected.lower() in output_text.lower())
    
    results.append({
        "Question": q,
        "Expected": expected,
        "Model Output": output_text,
        "Accuracy": accuracy,
        "Inference Time (ms)": inference_time
    })
    #print(results)

df = pd.DataFrame(results)
#print(df)

Llama.generate: 464 prefix-match hit, remaining 1894 prompt tokens to eval
llama_perf_context_print:        load time =  202118.79 ms
llama_perf_context_print: prompt eval time =  158107.03 ms /  1894 tokens (   83.48 ms per token,    11.98 tokens per second)
llama_perf_context_print:        eval time =    9683.83 ms /    31 runs   (  312.38 ms per token,     3.20 tokens per second)
llama_perf_context_print:       total time =  167808.35 ms /  1925 tokens
llama_perf_context_print:    graphs reused =         29
Llama.generate: 56 prefix-match hit, remaining 1722 prompt tokens to eval
llama_perf_context_print:        load time =  202118.79 ms
llama_perf_context_print: prompt eval time =  133945.74 ms /  1722 tokens (   77.78 ms per token,    12.86 tokens per second)
llama_perf_context_print:        eval time =    8594.12 ms /    28 runs   (  306.93 ms per token,     3.26 tokens per second)
llama_perf_context_print:       total time =  142555.69 ms /  1750 tokens
llama_perf_context_print:

In [35]:
df.to_csv('outputRAG.csv', index=False)