### Installs

In [None]:
!pip install -q PyPDF2 sentence-transformers faiss-cpu transformers

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m255.2/255.2 kB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m39.3 MB/s[0m eta [36m0:00:00[0m
[?25h

### Hugging Face Token

In [None]:
import os

# Placeholder for the Hugging Face access token: it is left blank here, so fill in
# your own token before running.
# NOTE(review): presumably read by the Hugging Face libraries when downloading
# models — confirm; avoid committing a real token together with the notebook.
os.environ["HF_TOKEN"] = ""

### Imports

In [None]:
import PyPDF2
import faiss
from sentence_transformers import SentenceTransformer
from transformers import pipeline, T5Tokenizer, T5ForConditionalGeneration
import torch
import numpy as np

### Code while using PDF

In [None]:
# Step 1: Extracting text from PDF
def extract_text_from_pdf(pdf_path):
    '''
    Purpose: Extract the text of every page of a PDF file.

    pdf_path: path of the PDF file to read

    Returns: list of strings, one entry per page, in page order. A page for
             which the reader yields no text is represented by an empty string.
    '''
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # Text is stored per page. `or ""` normalises a falsy result (e.g. None)
        # so every entry can safely be treated as a string by later steps.
        return [page.extract_text() or "" for page in reader.pages]

# Step 2: Chunking text by sequence length (500 whitespace-separated words per chunk by default)
def chunk_text_by_length(text_list, chunk_size=500):
    '''
    Purpose: Split page texts into chunks of at most `chunk_size` words.

    text_list: list of page texts (one string per page); None or empty
               entries are skipped
    chunk_size: maximum number of words per chunk, 500 (default); must be
                a positive integer

    Returns: list of chunk strings. A chunk never spans two pages, and the
             words inside a chunk are re-joined with single spaces.
    Raises: ValueError if chunk_size is not positive.

    Note: "tokens" here are whitespace-separated words, not model tokens.
    '''
    if chunk_size <= 0:
        # A negative step would silently produce no chunks at all.
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    chunks = []
    for page_text in text_list:
        if not page_text:
            continue  # Nothing to chunk on a page without text
        tokens = page_text.split()  # Tokenizing by whitespace
        # Note on granularity:
        #   - Line-wise chunking would possibly give sub-optimal answers when the
        #     answer spans multiple lines. Concatenating the top-k lines could
        #     help, but that would be too tedious.
        #   - Page-wise chunking would cause CUDA out-of-memory errors on
        #     compute-limited environments such as Colab.
        chunks.extend(
            " ".join(tokens[start:start + chunk_size])
            for start in range(0, len(tokens), chunk_size)
        )
    return chunks

# Step 3: Embedding the text chunks
def embed_chunks(chunks, model):
    '''
    Purpose: Turn the text chunks into embedding vectors.

    chunks: text chunks to encode
    model: embedding model; it must provide an `encode` method

    Returns: whatever `model.encode` produces when NumPy output is requested.
    '''
    # The chunks are encoded in batches of 8 and the result is requested as NumPy.
    return model.encode(chunks, convert_to_numpy=True, batch_size=8)

# Step 4: Create FAISS index
def create_faiss_index(embeddings):
    '''
    Purpose: Build a flat L2-distance FAISS index over the embeddings.

    embeddings: 2-D array-like of shape (number of chunks, embedding dimension)

    Returns: faiss.IndexFlatL2 holding one vector per row of `embeddings`.
    Raises: ValueError if `embeddings` is not 2-D or has a zero-sized
            embedding dimension (e.g. when no chunks were produced).
    '''
    # FAISS operates on C-contiguous float32 data; converting up front makes the
    # function accept other float dtypes and non-contiguous views as well.
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    if vectors.ndim != 2 or vectors.shape[1] == 0:
        # Without this check an empty input fails later with an opaque IndexError.
        raise ValueError(
            "embeddings must be a 2-D array of shape (n_chunks, dimension), "
            f"got shape {vectors.shape}; was any text extracted from the document?"
        )
    dimension = vectors.shape[1]  # Dimension of each embedding vector
    index = faiss.IndexFlatL2(dimension)  # L2 distance index
    index.add(vectors)  # Adding embeddings to the index
    return index

# Step 5: Performing vector search
def vector_search(query, model, faiss_index, chunks, top_k=5):
    '''
    Purpose: Retrieve the chunks whose embeddings are closest to the query.

    query: query text
    model: embedding model; it must provide an `encode` method
    faiss_index: FAISS index built over the embeddings of `chunks`
    chunks: text chunks, in the same order as the vectors in `faiss_index`
    top_k: maximum number of chunks to return from the vector search

    Returns: list of at most `top_k` chunks in the order returned by the
             index search. Empty when there are no chunks.
    '''
    if not chunks:
        return []

    # Never ask for more neighbours than there are chunks to choose from.
    k = min(top_k, len(chunks))
    query_embedding = model.encode([query], convert_to_numpy=True)  # Encoding the query
    _, indices = faiss_index.search(query_embedding, k)  # Distances are not needed here

    # FAISS pads missing results with -1. A negative index would make Python
    # silently return the last chunk, so only valid positions are kept.
    return [chunks[idx] for idx in indices[0] if 0 <= idx < len(chunks)]

# Step 6: Generating an answer using the relevant chunks
def generate_answer(query, context, model, tokenizer):
    '''
    Purpose: Ask the language model to answer the query from the given context.

    query: the question to answer
    context: text the answer should be based on
    model: generation model; its `device` attribute and `generate` method are used
    tokenizer: tokenizer used to encode the prompt and to decode the output

    Returns: the first generated sequence, decoded with special tokens skipped.

    Note: the tokenizer is called without any truncation argument, so the
          prompt is encoded in full whatever the length of the context.
    '''
    prompt = "Context: {}\n\nQuestion: {}\n\nAnswer in brief:".format(context, query)

    # Encode the prompt as PyTorch tensors and move the ids to the model's device.
    encoded = tokenizer(prompt, return_tensors="pt")
    prompt_ids = encoded.input_ids.to(model.device)

    # Beam search with 3 beams, capped at a length of 200.
    generation_settings = {"max_length": 200, "num_beams": 3, "early_stopping": True}
    generated = model.generate(prompt_ids, **generation_settings)

    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

# Main function to perform RAG
def run_rag(pdf_path, query, embedding_model, faiss_index, chunks, llm_model, tokenizer, top_k=5):
    '''
    Purpose: Main function to perform RAG (retrieve relevant chunks, then generate an answer)

    pdf_path: path of the source PDF; not used by this function, kept so that
              existing callers keep working
    query: question to answer
    embedding_model: model used to embed the query for the vector search
    faiss_index: FAISS index built over the embeddings of `chunks`
    chunks: text chunks, in the same order as the vectors in `faiss_index`
    llm_model: generation model used to produce the answer
    tokenizer: tokenizer belonging to `llm_model`
    top_k: number of chunks to retrieve as context, 5 (default)

    Returns: the generated answer with leading/trailing whitespace removed.
    '''
    relevant_chunks = vector_search(query, embedding_model, faiss_index, chunks, top_k=top_k)
    context = " ".join(relevant_chunks)  # Combining relevant chunks as context
    answer = generate_answer(query, context, llm_model, tokenizer)
    return answer.strip()

# Step 7: Running the process
def main(pdf_path='G S T Smart Guide.pdf', query="Who are eligible to take Input Tax Credit?"):
    '''
    Purpose: Run the whole pipeline once: load the models, index the PDF and
             print the answer to a single question.

    pdf_path: PDF file path; defaults to the guide used in this notebook
    query: question to ask about the PDF
    '''
    # Use the GPU when one is available, otherwise fall back to the CPU instead
    # of failing on machines without CUDA.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Loading models
    embedding_model = SentenceTransformer('all-MiniLM-L6-v2')  # Lightweight embedding model
    llm_model = T5ForConditionalGeneration.from_pretrained('google/flan-t5-large').to(device)
    tokenizer = T5Tokenizer.from_pretrained('google/flan-t5-large')

    # Loading PDF and extracting text
    text_list = extract_text_from_pdf(pdf_path)

    # Chunking text based on sequence length
    chunks = chunk_text_by_length(text_list, chunk_size=500)

    # Embedding chunks
    embeddings = embed_chunks(chunks, embedding_model)

    # Creating FAISS index
    faiss_index = create_faiss_index(embeddings)

    # Asking a question and retrieving an answer
    answer = run_rag(pdf_path, query, embedding_model, faiss_index, chunks, llm_model, tokenizer)

    print(f"Answer: {answer}")

if __name__ == "__main__":
    main()


Token indices sequence length is longer than the specified maximum sequence length for this model (3168 > 512). Running this sequence through the model will result in indexing errors


Answer: every registered person shall, subject to such conditions and restrictions prescribed under Section 49, be entitled to take credit of input tax charged on any supply of goods or services or both which are used or intended to be used in the course or furtherance of his business. 8. Who are not eligible to take Input Tax Credit? A registered person working under composition scheme even when received goods or services are used in furtherance of his business. A non-resident taxable person on receipt of goods and services except on goods imported by him. 9. Input Tax Credit not available on the goods and services 10. Other restrictions of Input Tax Credit 113 11. Clarification on various issues pertaining to GST as under: 115 12. Eligibility & conditions for taking Input Tax Credit 115 13. ITC appears in restricted category, the same cannot be availed by the registered person 116 14. Manner of distribution of credit by Input Service Distributor 119 15. Documents’
