### Installs

In [None]:
!pip install -q python-docx sentence-transformers faiss-cpu transformers

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/244.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m235.5/244.3 kB[0m [31m59.2 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.3/244.3 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m255.2/255.2 kB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m49.8 MB/s[0m eta [36m0:00:00[0m
[?25h

### Huggingface Token

In [None]:
import os

# Sets the HF_TOKEN environment variable to an empty string for this process,
# replacing any value that was already set.
# NOTE(review): presumably a placeholder for a real Hugging Face access token - confirm before running.
os.environ["HF_TOKEN"] = ""

### RAG Implementation for a Word Document

In [None]:
import docx
import faiss
from sentence_transformers import SentenceTransformer
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
import numpy as np

"""### Code using DOCX"""

# Step 1: Extracting text from DOCX
def extract_text_from_docx(docx_path):
    '''
    Purpose: Read a docx file and collect the text of its paragraphs

    docx_path: path to the docx file

    Returns: list of strings, one entry per item of the document's
             `paragraphs`, in the order they are iterated
    '''
    document = docx.Document(docx_path)
    # One list entry per paragraph, so later steps can handle paragraphs individually
    return [para.text for para in document.paragraphs]

# Step 2: Chunking text by length (chunk_size whitespace-separated words per chunk, 2500 by default)
def chunk_text_by_length(text_list, chunk_size=2500):
    '''
    Purpose: Split each text into chunks of at most chunk_size whitespace-separated words

    text_list: list of text strings; each entry is chunked on its own, so a
               chunk never spans two entries
    chunk_size: maximum number of words per chunk (default 2500); must be positive

    Returns: list of chunk strings; entries that are empty or whitespace-only
             contribute no chunks
    Raises: ValueError if chunk_size is zero or negative
    '''
    if chunk_size <= 0:
        # A zero step makes range() fail with an unrelated message, and a
        # negative step would silently produce no chunks at all.
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    chunks = []
    for text in text_list:
        words = text.split()  # Tokenizing by whitespace
        chunks.extend(
            " ".join(words[start:start + chunk_size])
            for start in range(0, len(words), chunk_size)
        )
    return chunks

# Step 3: Embedding the text chunks
def embed_chunks(chunks, model):
    '''
    Purpose: Encode the text chunks with the embedding model

    chunks: list of chunks
    model: embedding model; its encode(...) method is called once with all chunks

    Returns: the value returned by model.encode (NumPy output is requested)
    '''
    encode_options = {"batch_size": 8, "convert_to_numpy": True}  # Encode 8 chunks per batch
    return model.encode(chunks, **encode_options)

# Step 4: Create FAISS index
def create_faiss_index(embeddings):
    '''
    Purpose: Creating a flat L2-distance FAISS index over the embeddings

    embeddings: 2-D array-like of shape (number of vectors, dimension)

    Returns: faiss.IndexFlatL2 containing every row of embeddings
    Raises: ValueError if embeddings is not 2-D (for example the result of
            encoding an empty list of chunks)
    '''
    # FAISS works on C-contiguous float32 matrices; coerce here so float64 or
    # non-contiguous input does not depend on the installed FAISS version.
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    if vectors.ndim != 2:
        raise ValueError(
            "embeddings must be a 2-D array of shape (n_vectors, dimension); "
            f"got shape {vectors.shape}"
        )

    dimension = vectors.shape[1]  # Getting the dimension of the embeddings
    index = faiss.IndexFlatL2(dimension)  # L2 distance index
    index.add(vectors)  # Adding embeddings to the index
    return index

# Step 5: Performing vector search
def vector_search(query, model, faiss_index, chunks, top_k=5):
    '''
    Purpose: Performing vector search

    query: search query
    model: embedding model (used to encode the query)
    faiss_index: faiss index
    chunks: list of chunks; result positions from the index are looked up in this list
    top_k: maximum number of elements to retrieve

    Returns: list of at most top_k chunks, in the order the index search returned them
    '''
    query_embedding = model.encode([query], convert_to_numpy=True)  # Encoding the query
    _, indices = faiss_index.search(query_embedding, top_k)  # Distances are not needed here
    # FAISS pads the result with -1 when the index holds fewer than top_k
    # vectors; chunks[-1] would silently repeat the last chunk, so skip those.
    return [chunks[idx] for idx in indices[0] if idx >= 0]

# Step 6: Generating an answer using the relevant chunks
def generate_answer(query, context, model, tokenizer):
    '''
    Purpose: Build a prompt from the context and the query and let the
             language model generate an answer

    query: the user query
    context: the context retrieved from vector search
    model: the language model (provides .device and .generate)
    tokenizer: the tokenizer for the model (callable, provides .decode)

    Returns: the first generated sequence, decoded with special tokens skipped
    '''
    prompt = f"Context: {context}\n\nQuestion: {query}\n\nAnswer in brief:"

    encoded = tokenizer(prompt, return_tensors="pt")
    input_ids = encoded.input_ids.to(model.device)  # Same device as the model

    generation_options = {"max_length": 200, "num_beams": 3, "early_stopping": True}
    generated = model.generate(input_ids, **generation_options)

    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

# Main function to perform RAG
def run_rag(docx_path, query, embedding_model, faiss_index, chunks, llm_model, tokenizer, top_k=5):
    '''
    Purpose: Main function to perform RAG (retrieve relevant chunks, then generate an answer)

    docx_path: path to the docx file (not used by this function; kept so
               existing callers keep working)
    query: user query
    embedding_model: embedding model
    faiss_index: faiss index
    chunks: list of text chunks
    llm_model: language model
    tokenizer: tokenizer for the language model
    top_k: number of chunks to retrieve as context (default 5, the value
           that used to be hard-coded)

    Returns: the generated answer with leading/trailing whitespace removed
    '''
    relevant_chunks = vector_search(query, embedding_model, faiss_index, chunks, top_k=top_k)
    context = " ".join(relevant_chunks)  # Combining relevant chunks as context
    answer = generate_answer(query, context, llm_model, tokenizer)
    return answer.strip()

# Step 7: Running the process
def main():
    '''
    Purpose: Run the whole pipeline once: load the models, read and chunk the
             docx file, build the FAISS index, answer one query and print it

    Raises: ValueError if no text chunks could be produced from the docx file
    '''
    # Loading models
    # Use the GPU when one is available; fall back to CPU instead of crashing.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    embedding_model = SentenceTransformer('all-MiniLM-L6-v2')  # Lightweight embedding model
    llm_model = T5ForConditionalGeneration.from_pretrained('google/flan-t5-large').to(device)
    tokenizer = T5Tokenizer.from_pretrained('google/flan-t5-large')

    # Loading DOCX and extracting text
    docx_path = '66. GST Smart Guide (1).docx'  # DOCX file path
    text_list = extract_text_from_docx(docx_path)

    # Chunking text based on sequence length
    chunks = chunk_text_by_length(text_list, chunk_size=500)
    if not chunks:
        # Without chunks there is nothing to embed or index; fail with a clear message.
        raise ValueError(f"No text could be extracted from {docx_path!r}")

    # Embedding chunks
    embeddings = embed_chunks(chunks, embedding_model)

    # Creating FAISS index
    faiss_index = create_faiss_index(embeddings)

    # Asking a question and retrieving an answer
    query = "Taxpayer is not allowed to use ITC in excess of what percentage of output tax liability?"
    answer = run_rag(docx_path, query, embedding_model, faiss_index, chunks, llm_model, tokenizer)

    print(f"Answer: {answer}")

# Run the pipeline only when this code is executed directly (not when imported as a module).
if __name__ == "__main__":
    main()


Answer: 99%
