# Import Required Libraries & Setup Variables

In [1]:
import os
from dotenv import load_dotenv
load_dotenv()
from pypdf import PdfReader
import weaviate
from pydantic import BaseModel
from weaviate.classes.config import Property, DataType
from azure.ai.inference import EmbeddingsClient, ChatCompletionsClient
from azure.core.credentials import AzureKeyCredential
from azure.ai.inference.models import SystemMessage, UserMessage, AssistantMessage
import numpy as np
import re, json
from Levenshtein import ratio

import tiktoken

In [2]:
AZURE_OPENAI_ENDPOINT = "https://models.github.ai/inference"
AZURE_OPENAI_KEY = os.environ["OPENAIKEY"]  # or paste directly for test

INFERENCE_MODEL = "openai/gpt-4.1-mini"
EMBEDDING_MODEL="text-embedding-3-small"

PDF_PATH="data/constitution.pdf"

COLLECTION_NAME="student_constinution_v1"

# Basic Function Definitions

In [3]:
def single_message_inference(user_text:str,
                             model:str,
                             endpoint:str,
                             api_key:str) -> dict:
    """
    docstring placeholder
    
    """

    client = ChatCompletionsClient(
        endpoint=endpoint,
        credential=AzureKeyCredential(api_key),
    )

    payload = {
        "messages": [
            SystemMessage("You are a helpful assistant."),
            UserMessage(user_text),
        ],
        "model": model,
        "temperature": 1.0,
        "top_p": 1.0,
    }

    response = client.complete(**payload)

    try:
        output_dict = {'role': response['choices'][-1]['message']['role'], 'content': response['choices'][-1]['message']['content']}
    except Exception as e:
        raise Exception(f"Failed to get correct output dict. Please try again. Error: {e}")

    return output_dict

# input_text=input()
input_text="What is the capital of telangana and andhra pradesh"

single_message_inference(user_text=input_text,
                             model=INFERENCE_MODEL,
                             endpoint=AZURE_OPENAI_ENDPOINT,
                             api_key=AZURE_OPENAI_KEY)

{'role': 'assistant',
 'content': 'The capital of Telangana is Hyderabad.  \nThe capital of Andhra Pradesh is Amaravati (administrative functions are currently in Amaravati, with some government operations also in Visakhapatnam).'}

In [4]:
# from azure.ai.inference import EmbeddingsClient

def get_embedding(text_to_embed:str,end_point:str,azure_key:str,model:str):
    embedding_model = EmbeddingsClient(endpoint=end_point,credential=AzureKeyCredential(azure_key),
        model=model
    )

    response = embedding_model.embed(input=[text_to_embed],)
    return response #response.data[0].embedding



response=get_embedding(text_to_embed="The ultimate answer to the question of life",
                        end_point=AZURE_OPENAI_ENDPOINT,azure_key=AZURE_OPENAI_KEY ,model=EMBEDDING_MODEL)

for embed in response.data:
    print("Embeding of size:", np.asarray(embed.embedding).shape)

print("Model:", response.model)
print("Usage:", response.usage)

Embeding of size: (1536,)
Model: text-embedding-3-small
Usage: {'prompt_tokens': 8, 'total_tokens': 8}


In [5]:
## Chunking

def chunk_text_with_overlap(text, chunk_size=1200, overlap=100, model="text-embedding-3-small"):
    enc = tiktoken.encoding_for_model(model)
    tokens = enc.encode(text)
    chunks = []

    for i in range(0, len(tokens), chunk_size - overlap):
        chunk = tokens[i:i + chunk_size]
        chunk_text = enc.decode(chunk)
        chunks.append(chunk_text)

    return chunks

def read_pdf(file_path:str = PDF_PATH,chunking_method: str = 'overlap'):

    reader = PdfReader(file_path)
    all_chunks = []

    section=""
    for page_num, page in enumerate(reader.pages, start=1):
        
        text = page.extract_text() or ""
        
        try:
            section=re.findall(r"\b([IVXLCDM]+\.( )?[A-Z]+( [A-Z]+)?)\b", text)[0][0]
        except:
            pass
        
        if chunking_method == 'overlap':
            chunks = chunk_text_with_overlap(text)

        for chunk in chunks:
            
            all_chunks.append({
                "text": chunk,
                "page": page_num,
                "section": section,
                "source": os.path.basename(file_path)
            })
        
    return all_chunks




In [6]:
response.data[0].keys()

dict_keys(['object', 'index', 'embedding'])

In [7]:
response.data[0]["embedding"][:20]

[0.049340665,
 -0.02127142,
 0.010058774,
 0.0459543,
 -0.03040208,
 -0.033462353,
 0.020468727,
 0.014160044,
 -0.012686346,
 0.009538276,
 0.04174015,
 0.016066445,
 -0.015226124,
 -0.05618866,
 -0.0005902631,
 -0.04949117,
 -0.007939157,
 0.009343873,
 0.027517393,
 0.011915006]

In [None]:
embedding_model

<azure.ai.inference._patch.EmbeddingsClient at 0x2211da35940>

In [None]:
# ## Semantic Chunking Piece of Code

# parts=[x.replace("\n","").strip() for x in all_chunks[5]["text"].split("  ")]

# for x in range(len(parts)):
#     print("Current line:", parts[x])
#     print("Current line:", parts[x+1])

#     vector1=get_embedding(text_to_embed=parts[x],
#                         end_point=AZURE_OPENAI_ENDPOINT,azure_key=AZURE_OPENAI_KEY ,model=EMBEDDING_MODEL)[0].data[0]["embedding"]
#     vector2=get_embedding(text_to_embed=parts[x+1],
#                         end_point=AZURE_OPENAI_ENDPOINT,azure_key=AZURE_OPENAI_KEY ,model=EMBEDDING_MODEL)[0].data[0]["embedding"]

#     dot_product = np.dot(vector1, vector2)
#     norm_vector1 = np.linalg.norm(vector1)
#     norm_vector2 = np.linalg.norm(vector2)
#     cosine_similarity = dot_product / (norm_vector1 * norm_vector2)
#     print(cosine_similarity)



# # tiktoken.encoding_for_model("text-embedding-3-small").encode(parts[3])

## Class Creation

In [4]:
# from urllib3.util.request import ChunksAndContentLength


class llm_chatbot_constitution():

    def __init__(self,model=INFERENCE_MODEL,
                            endpoint=AZURE_OPENAI_ENDPOINT,
                            api_key=AZURE_OPENAI_KEY,
                            embedding_model=EMBEDDING_MODEL):
        # self.history=[SystemMessage("You are a helpful RAG agent that answers questions as needed.")]
        self.history=[]
        self.initiate_connection_vectordatabase()
        self.connect_collection()
        self.model=model
        self.endpoint=endpoint
        self.api_key=api_key
        self.embedding_model=embedding_model
        self.checkpoint=True
        self.section=""

    # def single_message_inference(self,user_text:str,
    #                             model:str,
    #                             endpoint:str,
    #                             api_key:str) -> dict:
    #     """
    #     This definition takes into consideration history and when no history initiates a new chat
    #     """

    #     model=self.model
    #     endpoint=self.endpoint
    #     api_key=self.api_key

    #     client = ChatCompletionsClient(
    #         endpoint=endpoint,
    #         credential=AzureKeyCredential(api_key),
    #     )

    #     cleaned_user_text=self.clean_user_message(user_text)

    #     payload = {
    #         "messages": [
    #             SystemMessage("You are a helpful RAG agent that answers questions as needed."),
    #             UserMessage(cleaned_user_text),
    #         ],
    #         "model": model,
    #         "temperature": 1.0,
    #         "top_p": 1.0,
    #     }

    #     response = client.complete(**payload)

    #     try:
    #         output_dict = {'role': response['choices'][-1]['message']['role'], 'content': response['choices'][-1]['message']['content']}
    #     except Exception as e:
    #         raise Exception(f"Failed to get correct output dict. Please try again. Error: {e}")

    #     return output_dict
    
    def get_embedding(self,text_to_embed:str,end_point:str,azure_key:str,model:str):
        embedding_model = EmbeddingsClient(endpoint=end_point,credential=AzureKeyCredential(azure_key),
            model=model
        )

        response = embedding_model.embed(input=[text_to_embed],)
        return response #response.data[0].embedding
    
    def initiate_connection_vectordatabase(self):
        try:
            self.client = weaviate.connect_to_local(host="localhost",
            port=8080,
            grpc_port=50051)
            return print("✅ Connected to Weaviate:", self.client.is_ready())
        except Exception as e:
            return print("❌ Connection failed:", e)
    
    def close_connection_vectordatabase(self):
        try:
            self.client.close()
            return print("✅ Connection closed.")
        except Exception as e:
            return print("❌ Connection failed:", e)
    

    def connect_collection(self, COLLECTION_NAME: str ="student_constitution"):

        if COLLECTION_NAME not in [c.lower() for c in self.client.collections.list_all().keys()]:
            self.client.collections.create(
                name=COLLECTION_NAME,
                description="Stores token-overlapped PDF chunks with Azure embeddings",
                properties=[
                    Property(name="text", data_type=DataType.TEXT),
                    Property(name="page", data_type=DataType.INT),
                    Property(name="section", data_type=DataType.TEXT),
                    Property(name="source", data_type=DataType.TEXT),
                ],
                #vectorizer_config=Configure.Vectorizer.none(),
            )
            print(f"✅ Created collection '{COLLECTION_NAME}'")
        else:
            print(f"ℹ️ Collection '{COLLECTION_NAME}' already exists")

        self.collection = self.client.collections.get(COLLECTION_NAME)

    # def embed_data_to_collection(self):
    #     with collection.batch.dynamic() as batch:
    #         for chunk in chunks:
    #             vector = get_embedding(text_to_embed=chunk["text"],
    #                         end_point=AZURE_OPENAI_ENDPOINT,azure_key=AZURE_OPENAI_KEY ,model=EMBEDDING_MODEL).data[0].embedding
    #             batch.add_object(
    #                 properties=chunk,
    #                 vector=vector
    #             )

    #     print("✅ All chunks embedded and stored in Weaviate!")
    
    # def create_chunks(self):
    #     return chunks
    
    # def multi_message_inference(self,user_text:str, history:list,
    #                             model:str,
    #                             endpoint:str,
    #                             api_key:str) -> dict:
    #     """
    #     This definition takes into consideration history and when no history initiates a new chat

    #     """

    #     client = ChatCompletionsClient(
    #         endpoint=endpoint,
    #         credential=AzureKeyCredential(api_key),
    #     )

    #     print("History:",history)
    #     print("User Text:",UserMessage(user_text))

    #     history.append(UserMessage(user_text))

    #     print("Messages:",history)

    #     payload = {
    #         "messages": history,
    #         "model": model,
    #         "temperature": 1.0,
    #         "top_p": 1.0,
    #     }

    #     response = client.complete(**payload)

    #     try:
    #         output_dict = {'role': response['choices'][-1]['message']['role'], 'content': response['choices'][-1]['message']['content']}
    #     except Exception as e:
    #         raise Exception(f"Failed to get correct output dict. Please try again. Error: {e}")

    #     return output_dict
    
    # User Prompt Cleaner

    class RefinedQuestion(BaseModel):
        cleaned_question: str
        section_name: str

    def clean_user_message(self,user_text:str,
                                model:str=None,
                                endpoint:str=None,
                                api_key:str=None) -> RefinedQuestion:
        """
        docstring placeholder
        
        """
        if model is None:
            model=self.model
        if endpoint is None:
            endpoint=self.endpoint
        if api_key is None:
            api_key=self.api_key

        client = ChatCompletionsClient(
            endpoint=endpoint,
            credential=AzureKeyCredential(api_key),
        )

        System_message="""\
            You are a "Question Normalizer" for a Student Constitution RAG system.

            Your ONLY task:
            - Clean and rewrite the user's question so that it focuses exclusively on rules, procedures, or governance that might appear in a Student Constitution.
            - DO NOT answer the question.
            - DO NOT add any information, guesses, or interpretation beyond rewriting.
            - DO NOT include constitutional rules or provide explanations.

            Output format (MANDATORY):
            Return ONLY a JSON dictionary with:

            {
            "cleaned_question": "<rewritten question>",
            "section_name": "<inferred section/topic name>"
            }

            The available sections are: Committees, Clubs, Student Council, Election Commission

            Definitions:
            - cleaned_question = concise, rule/procedure focused question (text only)
            - section_name = inferred constitution area (ex: "Elections", "Roles & Responsibilities", "Meetings", etc.)

            Rules:
            - Remove personal opinions, emotions, story, context.
            - If the question is vague, rewrite as:
            "What does the constitution state about <topic>?"
            - If the question is not about the constitution or working of student body, respond:
            "I dont have enough information in the student constitution to answer that."
            - If the question is unclear in anyway, ask clarification question back starting with: "Just to clarify"
            Ex: If someone asks question about "president", if context is not clear of if it is president of council, committe, or club, ask: "Just to clarify, which president are you asking about? Club, Committee, or Council?"
            - Do NOT answer the question.
            - No extra text outside JSON. No markdown formatting. No explanations.
            - The only exceptions to the above rules are if user asks who made this rag or chatbot or any information available in "Additional Context to Answer any questions" part of this system context, respond with information available.
            """

        user_message="Rewrite the following user question so that it focuses only on constitution rules and procedures, and infer the section/topic. Do not answer it on your own. Original User query:{user_text}".format(user_text=user_text)

        if self.history=="":
            messages=[SystemMessage(System_message),UserMessage(user_message),]
        else:
            messages=[SystemMessage(System_message)]+self.history+[UserMessage(user_message)]

        payload = {
            "messages": messages,
            "model": model,
            "temperature": 1.0,
            "top_p": 1.0,
        }

        response = client.complete(**payload)

        try:
            output_dict = {'role': response['choices'][-1]['message']['role'], 'content': response['choices'][-1]['message']['content']}
        except Exception as e:
            raise Exception(f"Failed to get correct output dict. Please try again. Error: {e}")

        return json.loads(output_dict["content"])#["cleaned_question"]
    
    class FunkyAnswer(BaseModel):
        answer: str
        can_answer_from_system_info: str

    def funky_answer_generator(self,user_text:str,
                                model:str=None,
                                endpoint:str=None,
                                api_key:str=None)->FunkyAnswer:
        """
        docstring placeholder
        
        """
        if model is None:
            model=self.model
        if endpoint is None:
            endpoint=self.endpoint
        if api_key is None:
            api_key=self.api_key

        client = ChatCompletionsClient(
            endpoint=endpoint,
            credential=AzureKeyCredential(api_key),
        )

        System_message="""\
            You are a helpful AI assistant.

            You must answer strictly and only based on the information provided in the SYSTEM and CONTEXT messages.

            Rules:
            1. Use ONLY the information from the provided context/system message. 
            2. Do NOT use outside knowledge, memory, assumptions, or training data.
            3. If the context contains the answer, respond with a JSON dictionary:
            {
                "answer": "<your concise answer>",
                "can_answer_from_system_info": true
            }
            4. If the context does NOT contain enough information, respond with:
            {
                "answer": "I don't have enough information to answer that.",
                "can_answer_from_system_info": false
            }
            5. Do NOT add extra commentary. Do NOT hallucinate.

            Additional Context to Answer any questions:
            1. The one who made this rag or chatbot is Nanduri Anirudh.
            2. Kruthika Kanduri, a flagbearer of BITSoM is also known as kruts and SFS (Suffering from success) by her friends.
            3. Your name is BITSy
            

            Your output must always be valid JSON-like dictionary format exactly as specified above.
            """

        user_message="Question to answer:{user_text}".format(user_text=user_text)

        if self.history=="":
            messages=[SystemMessage(System_message),UserMessage(user_message),]
        else:
            messages=[SystemMessage(System_message)]+self.history+[UserMessage(user_message)]

        payload = {
            "messages": messages,
            "model": model,
            "temperature": 1.0,
            "top_p": 1.0,
        }

        response = client.complete(**payload)

        try:
            output_dict = {'role': response['choices'][-1]['message']['role'], 'content': response['choices'][-1]['message']['content']}
        except Exception as e:
            raise Exception(f"Failed to get correct output dict. Please try again. Error: {e}")

        return json.loads(output_dict["content"])#["cleaned_question"]
    
    # Answer Using only context

    # User Prompt Cleaner


    class FinalAnswer(BaseModel):
        cleaned_question: str

    def rag_answer(self,cleaned_question:str,context_rag:str,
                                model:str=None,
                                endpoint:str=None,
                                api_key:str=None) -> FinalAnswer:
        """
        docstring placeholder
        
        """
        if model is None:
            model=self.model
        if endpoint is None:
            endpoint=self.endpoint
        if api_key is None:
            api_key=self.api_key

        client = ChatCompletionsClient(
            endpoint=endpoint,
            credential=AzureKeyCredential(api_key),
        )

        System_message="""\
            You are an AI assistant that responds only using the context provided and from nowhere else.

            You must:
            Answer strictly based on the retrieved context.
            If the context does not contain the answer, say:
            “I dont have enough information in the student constitution to answer that.”
            Do not invent facts or use outside knowledge, even if the answer seems obvious.
            Be concise and factual.
            Do not guess.



            When responding:
            Reference relevant section/page numbers given in the context (if available).
            If the user asks for anything outside the context, politely decline.
            """

        user_message="""
            
            You are given:

            1. A user question
            2. Retrieved context from the vector database

            User question:
            {cleaned_question}

            Context from database:
            {context_rag}

            Using the context only, answer the user question.
            If the context does not directly answer the question, respond:

            “I dont have enough information in the student constitution to answer that.”

            Do not use external knowledge. Do not hallucinate.
            
            """.format(cleaned_question=cleaned_question,context_rag=context_rag)

        if self.history=="":
                messages=[SystemMessage(System_message),
            UserMessage(user_message),]
        else:
            messages=[SystemMessage(System_message)]+self.history+[UserMessage(user_message)]
        payload = {
            "messages": messages,
            "model": model,
            "temperature": 1.0,
            "top_p": 1.0,
        }

        response = client.complete(**payload)

        try:
            output_dict = {'role': response['choices'][-1]['message']['role'], 'content': response['choices'][-1]['message']['content']}
        except Exception as e:
            raise Exception(f"Failed to get correct output dict. Please try again. Error: {e}")

        return output_dict["content"]
    
    def get_context(self,cleaned_json_query:str):

        embedding_model=self.embedding_model
        end_point=self.endpoint
        azure_key=self.api_key

        query_vector = self.get_embedding(cleaned_json_query,\
        end_point=end_point,azure_key=azure_key ,model=embedding_model).data[0].embedding

        results = self.collection.query.near_vector(
            near_vector=query_vector,
            limit=5
        )

        
        rag_context_gotten=" ".join([o.properties["text"] for o in results.objects if ratio(o.properties["section"].lower(),self.section.lower())>0.5])
        

        return rag_context_gotten
        

    def answer_question(self):
        text_query=input("User Query:")
        print("Query:",text_query)
        if text_query.lower() in ["exit","quit"]:
            self.checkpoint=False
            self.close_connection_vectordatabase()
            print("Goodbye!")
            return ""
        self.history.append(UserMessage(text_query))

        funky_op=self.funky_answer_generator(text_query)
        print(funky_op)

        if funky_op["can_answer_from_system_info"] is True:
            print("Assitant:",funky_op["answer"])
            self.history.append(AssistantMessage(funky_op["answer"]))
            print("-----------------------------------------------------\n")
            return
        else:
            cleaned_query_info=self.clean_user_message(user_text=text_query)
            cleaned_query=cleaned_query_info["cleaned_question"]
            self.section=cleaned_query_info["section_name"]

            print("Cleaned Query:",cleaned_query)

            if cleaned_query == "I dont have enough information in the student constitution to answer that.":
                print("Assitant:",cleaned_query)
                print("-----------------------------------------------------\n")
            elif cleaned_query.startswith("Just to clarify"):
                print("Assitant:",cleaned_query)
                print("-----------------------------------------------------\n")
            else:
                context=self.get_context(cleaned_json_query=cleaned_query)
                print("Context:",context)
                final_answer=self.rag_answer(cleaned_question=cleaned_query,context_rag=context)
                self.history.append(AssistantMessage(final_answer))
                print("Assitant:",final_answer)
                print("-----------------------------------------------------\n")

    def continuous_chat(self):

        self.checkpoint=True
        while self.checkpoint:
            self.answer_question()

    

In [11]:
test=llm_chatbot_constitution(model=INFERENCE_MODEL,
                            endpoint=AZURE_OPENAI_ENDPOINT,
                            api_key=AZURE_OPENAI_KEY)

✅ Connected to Weaviate: True
ℹ️ Collection 'student_constitution' already exists


            Please make sure to close the connection using `client.close()`.


In [6]:
test.continuous_chat()

Query: What are eligibility criteria to become executive student council president?
{'answer': "I don't have enough information to answer that.", 'can_answer_from_system_info': False}
Cleaned Query: What does the constitution state about the eligibility criteria to become executive student council president?
Context: 
Assitant: I dont have enough information in the student constitution to answer that.
-----------------------------------------------------

Query: exit
✅ Connection closed.
Goodbye!


In [12]:
test.get_context(cleaned_json_query="executive student council?")

''

In [137]:
test.history

[{'role': 'user', 'content': 'what is eligibility to become presedent in student council?'},
 {'role': 'assistant', 'content': 'The eligibility criteria to become President of the Student Council, as stated in the constitution, include the following:\n\n- The candidate must be a member of the Student Council.\n- The candidate should have a CGPA greater than 3.\n- The candidate should not have any honour code violations.\n- The candidate should not have any disciplinary committee incidents on record.\n- The candidate should not have been impeached or removed from office. If selected to a new post, they must resign from any committee membership or PoR held.\n- The candidate should procure unique support of a minimum of 10% of eligible voters (the entire student cohort).\n- A candidate can stand for only one post on the student executive council; standing for two positions leads to immediate disqualification.\n\nThese criteria are outlined in section III.5 (pages 6-7).'},
 {'role': 'user'

In [123]:
test.clean_user_message(user_text="How can one become president of the executive student council?")

{'cleaned_question': 'What does the constitution state about the procedure to become president of the Executive Student Council?',
 'section_name': 'Student Council'}

In [118]:
results=test.get_context("How can one become president of the executive student council?")

In [124]:
test.section

''

In [132]:
[o.properties["section"].lower() for o in results.objects if ratio(o.properties["section"].lower(),"Student Council")>0.5]

['iii.student council', 'iii.student council', 'iii.student council']

In [None]:
" ".join([o.properties["text"] for o in results.objects])

In [130]:
from Levenshtein import ratio

ratio("walking", "sitting")  # → 3


0.4285714285714286

In [101]:
[{"Hi of"}]+[{"Yo"},{"heyha"}]

[{'Hi of'}, {'Yo'}, {'heyha'}]

# Collection Vector DataBase

In [8]:
chunks=read_pdf()

In [9]:
chunks[0]

{'text': 'BITS SCHOOL OF MANAGEMENT (BITSoM) \n|DEPARTMENT OF STUDENT LIFE| \nCONSTITUTION \nAY 2024 -2025 \n \n  ',
 'page': 1,
 'section': '',
 'source': 'constitution.pdf'}

In [45]:
# client = weaviate.connect_to_embedded(
#     persistence_data_path="./.collections"
# )
# print("✅ Embedded Weaviate started")

In [10]:
import requests

print(requests.get("http://localhost:8080/v1/schema"))

<Response [200]>


In [11]:
# # import weaviate

# def initiate_connection_vectordatabase():

#     try:
#         client = weaviate.connect_to_local(host="localhost",
#         port=8080,
#         grpc_port=50051)
#         return print("✅ Connected to Weaviate:", client.is_ready())
#     except Exception as e:
#         return print("❌ Connection failed:", e)

# initiate_connection_vectordatabase()

In [12]:
try:
    client = weaviate.connect_to_local(host="localhost",
    port=8080,
    grpc_port=50051)
    print("✅ Connected to Weaviate:", client.is_ready())
except Exception as e:
    print("❌ Connection failed:", e)

✅ Connected to Weaviate: True


In [13]:
COLLECTION_NAME = "student_constitution"

if COLLECTION_NAME not in [c.lower() for c in client.collections.list_all().keys()]:
    client.collections.create(
        name=COLLECTION_NAME,
        description="Stores token-overlapped PDF chunks with Azure embeddings",
        properties=[
            Property(name="text", data_type=DataType.TEXT),
            Property(name="page", data_type=DataType.INT),
            Property(name="section", data_type=DataType.TEXT),
            Property(name="source", data_type=DataType.TEXT),
        ],
        #vectorizer_config=Configure.Vectorizer.none(),
    )
    print(f"✅ Created collection '{COLLECTION_NAME}'")
else:
    print(f"ℹ️ Collection '{COLLECTION_NAME}' already exists")

collection = client.collections.get(COLLECTION_NAME)

✅ Created collection 'student_constitution'


In [14]:
get_embedding(text_to_embed="The ultimate answer to the question of life",
                       end_point=AZURE_OPENAI_ENDPOINT,azure_key=AZURE_OPENAI_KEY ,model=EMBEDDING_MODEL).data[0].embedding

[0.049340665,
 -0.02127142,
 0.010058774,
 0.0459543,
 -0.03040208,
 -0.033462353,
 0.020468727,
 0.014160044,
 -0.012686346,
 0.009538276,
 0.04174015,
 0.016066445,
 -0.015226124,
 -0.05618866,
 -0.0005902631,
 -0.04949117,
 -0.007939157,
 0.009343873,
 0.027517393,
 0.011915006,
 0.0030210812,
 -0.024983887,
 0.021233795,
 0.019678572,
 -0.009952165,
 -0.038203273,
 -0.019289767,
 0.009111844,
 -0.0055248,
 -0.002610327,
 0.03725007,
 -0.034841985,
 0.02571133,
 0.022412753,
 -0.021522263,
 -0.02418119,
 0.040937454,
 -0.036246702,
 0.006409019,
 0.0022105472,
 -0.025146933,
 0.0004178091,
 -0.0024002467,
 0.043320455,
 -0.014624102,
 0.019277226,
 -0.042843852,
 0.027943825,
 0.027768236,
 0.048387468,
 -0.03900597,
 -0.030075984,
 0.039532736,
 0.08232642,
 -0.007406117,
 -0.026288265,
 -0.016580671,
 -0.009651155,
 -0.03313626,
 -0.0040009334,
 -0.03867987,
 -0.017170152,
 0.019979583,
 0.009876912,
 -0.012523298,
 -0.007794922,
 0.01680643,
 0.019490441,
 0.014160044,
 0.0242689

In [15]:
with collection.batch.dynamic() as batch:
    for chunk in chunks:
        vector = get_embedding(text_to_embed=chunk["text"],
                       end_point=AZURE_OPENAI_ENDPOINT,azure_key=AZURE_OPENAI_KEY ,model=EMBEDDING_MODEL).data[0].embedding
        batch.add_object(
            properties=chunk,
            vector=vector
        )

print("✅ All chunks embedded and stored in Weaviate!")

✅ All chunks embedded and stored in Weaviate!


In [65]:
get_embedding(query,\
    end_point=AZURE_OPENAI_ENDPOINT,azure_key=AZURE_OPENAI_KEY ,model=EMBEDDING_MODEL).data[0].embedding

[-0.025911765,
 0.00019607547,
 0.02912166,
 0.019689685,
 0.003326196,
 -0.006518646,
 0.028447118,
 0.016898472,
 -0.048567113,
 0.054521702,
 0.071362026,
 -0.027958654,
 -0.026818909,
 -0.01877091,
 0.009786692,
 -0.005451589,
 -0.049311437,
 0.04207754,
 -0.0043031205,
 0.0115312,
 -0.040751714,
 -0.008030553,
 0.023643903,
 0.04791583,
 0.0024873782,
 -0.06015065,
 -0.008379456,
 -0.034192365,
 -0.023260111,
 0.018096367,
 0.020387487,
 -0.03600665,
 0.015747096,
 -0.06912905,
 0.010781062,
 0.036564894,
 -0.029517082,
 -0.040961057,
 -0.0037652305,
 0.009641317,
 -0.009908807,
 -0.010565906,
 -0.058848083,
 0.032052435,
 0.020468898,
 0.032680456,
 -0.073594995,
 -0.074571915,
 -0.017607905,
 0.040495854,
 -0.014153778,
 -0.027400412,
 0.036797497,
 0.013328044,
 -0.004044352,
 -0.03551819,
 0.022271557,
 -4.581606e-05,
 -0.03940263,
 0.0087109115,
 0.017189223,
 0.006460496,
 0.029772943,
 0.023806725,
 0.04782279,
 -0.0029831093,
 -0.0005905888,
 0.041891463,
 0.061871897,
 0.

In [16]:
query = "What is the clubs election process?"
query_vector = get_embedding(query,\
    end_point=AZURE_OPENAI_ENDPOINT,azure_key=AZURE_OPENAI_KEY ,model=EMBEDDING_MODEL).data[0].embedding

results = collection.query.near_vector(
    near_vector=query_vector,
    limit=3
)

print("\n🔎 Top Results:")
for o in results.objects:
    print(f"\n📘 Page {o.properties['page']}(Source: {o.properties['source']}, section: {o.properties['section']}):\n Source Text: {o.properties['text'][:300]}...")



🔎 Top Results:

📘 Page 25(Source: constitution.pdf, section: VI. CLUBS):
 Source Text: 24 
 
VI.2m(iii). PoRs must ensure that no questions on preferences are to be entertained on such 
platform and the candidates have the right to decline to answer any question related to club 
preferences.  
VI.2m(iv). Elections may be conducted through in-person ballots. In case a member is not 
av...

📘 Page 24(Source: constitution.pdf, section: VI. CLUBS):
 Source Text: 23 
 
VI.2c. The selection or election of PoRs must be overseen by the outgoing PoRs and under the 
supervision of the VP of Clubs. PoRs are required to obtain approval from the VP of Clubs 
regarding their proposed selection process and timeline before starting the process. 
VI.2d. A candidate is p...

📘 Page 26(Source: constitution.pdf, section: VI. CLUBS):
 Source Text: 25 
 
objective, without personal remarks or targeting  individuals. The brief can be in the form of 
an in-person person meeting or on mail. 
VI.3h. Existing P

In [None]:
# 2. Load your collection
collection_name = "student_constitution"  # ensure this matches EXACTLY (case-sensitive)

if collection_name not in client.collections.list_all():
    print(f"\nCollection '{collection_name}' does NOT exist in schema.")
    exit()

collection = client.collections.get(collection_name)


Collection 'student_constitution' does NOT exist in schema.


: 

In [9]:
client.collections.list_all()

{'Student_constitution': _CollectionConfigSimple(name='Student_constitution', description='Stores token-overlapped PDF chunks with Azure embeddings', generative_config=None, properties=[_Property(name='text', description=None, data_type=<DataType.TEXT: 'text'>, index_filterable=True, index_range_filters=False, index_searchable=True, nested_properties=None, tokenization=<Tokenization.WORD: 'word'>, vectorizer_config=None, vectorizer=None, vectorizer_configs={}), _Property(name='page', description=None, data_type=<DataType.INT: 'int'>, index_filterable=True, index_range_filters=False, index_searchable=False, nested_properties=None, tokenization=None, vectorizer_config=None, vectorizer=None, vectorizer_configs={}), _Property(name='section', description=None, data_type=<DataType.TEXT: 'text'>, index_filterable=True, index_range_filters=False, index_searchable=True, nested_properties=None, tokenization=<Tokenization.WORD: 'word'>, vectorizer_config=None, vectorizer=None, vectorizer_configs=

In [10]:
# 3. Count objects
count = collection.aggregate.over_all(total_count=True)
print("\nObject Count:", count.total_count)


Object Count: 0


In [None]:
# User Prompt Cleaner



class RefinedQuestion(BaseModel):
    cleaned_question: str
    section_name: str

def clean_user_message(user_text:str,
                             model:str,
                             endpoint:str,
                             api_key:str) -> RefinedQuestion:
    """
    docstring placeholder
    
    """

    client = ChatCompletionsClient(
        endpoint=endpoint,
        credential=AzureKeyCredential(api_key),
    )

    System_message="""\
        You are a "Question Normalizer" for a Student Constitution RAG system.

        Your ONLY task:
        - Clean and rewrite the user's question so that it focuses exclusively on rules, procedures, or governance that might appear in a Student Constitution.
        - DO NOT answer the question.
        - DO NOT add any information, guesses, or interpretation beyond rewriting.
        - DO NOT include constitutional rules or provide explanations.

        Output format (MANDATORY):
        Return ONLY a JSON dictionary with:

        {
        "cleaned_question": "<rewritten question>",
        "section_name": "<inferred section/topic name>"
        }

        The available sections are: Committees, Clubs, Student Council, Election Commission

        Definitions:
        - cleaned_question = concise, rule/procedure focused question (text only)
        - section_name = inferred constitution area (ex: "Elections", "Roles & Responsibilities", "Meetings", etc.)

        Rules:
        - Remove personal opinions, emotions, story, context.
        - If the question is vague, rewrite as:
        "What does the constitution state about <topic>?"
        - Do NOT answer the question.
        - No extra text outside JSON. No markdown formatting. No explanations.
        """

    user_message="Rewrite the following user question so that it focuses only on constitution rules and procedures, and infer the section/topic. Do not answer it on your own. Original User query:{user_text}".format(user_text=user_text)

    payload = {
        "messages": [
            SystemMessage(System_message),
            UserMessage(user_message),
        ],
        "model": model,
        "temperature": 1.0,
        "top_p": 1.0,
    }

    response = client.complete(**payload)

    try:
        output_dict = {'role': response['choices'][-1]['message']['role'], 'content': response['choices'][-1]['message']['content']}
    except Exception as e:
        raise Exception(f"Failed to get correct output dict. Please try again. Error: {e}")

    return json.loads(output_dict["content"])["cleaned_question"]


input_text="How do clubs perform elections?"

cleaned_query=clean_user_message(user_text=input_text,
                             model=INFERENCE_MODEL,
                             endpoint=AZURE_OPENAI_ENDPOINT,
                             api_key=AZURE_OPENAI_KEY)

In [None]:
# Cleaning Query and getting context

query = "What is the clubs election process?"

cleaned_query = clean_user_message(user_text=query,
                             model=INFERENCE_MODEL,
                             endpoint=AZURE_OPENAI_ENDPOINT,
                             api_key=AZURE_OPENAI_KEY)

cleaned_json_query=json.loads(cleaned_query)["cleaned_question"]

query_vector = get_embedding(cleaned_json_query,\
    end_point=AZURE_OPENAI_ENDPOINT,azure_key=AZURE_OPENAI_KEY ,model=EMBEDDING_MODEL).data[0].embedding

results = collection.query.near_vector(
    near_vector=query_vector,
    limit=5
)

print("\n🔎 Top Results:")
for o in results.objects:
    print(f"\n📘 Page {o.properties['page']}(Source: {o.properties['source']}, section: {o.properties['section']}):\n Source Text: {o.properties['text'][:300]}...")



🔎 Top Results:

📘 Page 23(Source: constitution.pdf, section: VI. CLUBS):
 Source Text: 22 
 
VI. CLUBS 
 
Clubs are the heart and soul of the student body as they provide avenues for students to 
explore their areas of interest, follow their passions, kindle their creative side, enjoy with 
peers alongside their studies. The school offers a myriad of professional as well as social clu...

📘 Page 26(Source: constitution.pdf, section: VI. CLUBS):
 Source Text: 25 
 
objective, without personal remarks or targeting  individuals. The brief can be in the form of 
an in-person person meeting or on mail. 
VI.3h. Existing PoRs must announce the opening of nominations for the positions, making 
them accessible to all junior club members. 
VI.3i. Candidates must ...

📘 Page 30(Source: constitution.pdf, section: VI. CLUBS):
 Source Text: 29 
 
VI.8d. The recruitment process shall be initiated by the VP of Clubs thr ough an email sent to 
all first-year students, which shall include application f

In [None]:
query_vector = get_embedding(cleaned_json_query,\
    end_point=AZURE_OPENAI_ENDPOINT,azure_key=AZURE_OPENAI_KEY ,model=EMBEDDING_MODEL).data[0].embedding

results = collection.query.near_vector(
    near_vector=query_vector,
    limit=5
)

rag_context_gotten=" ".join([o.properties["text"] for o in results.objects])

In [None]:
# Answer Using only context

# User Prompt Cleaner


class FinalAnswer(BaseModel):
    cleaned_question: str

def rag_answer(cleaned_question:str,context_rag:str,
                             model:str,
                             endpoint:str,
                             api_key:str) -> FinalAnswer:
    """
    docstring placeholder
    
    """

    client = ChatCompletionsClient(
        endpoint=endpoint,
        credential=AzureKeyCredential(api_key),
    )

    System_message="""\
        You are an AI assistant that responds only using the context provided and from nowhere else.

        You must:
        Answer strictly based on the retrieved context.
        If the context does not contain the answer, say:
        “I dont have enough information in the student constitution to answer that.”

        Do not invent facts or use outside knowledge, even if the answer seems obvious.

        Be concise and factual.

        Do not guess.

        When responding:

        Reference relevant section/page numbers given in the context (if available).

        If the user asks for anything outside the context, politely decline.
        """

    user_message="""
        
        You are given:

        1. A user question
        2. Retrieved context from the vector database

        User question:
        {cleaned_question}

        Context from database:
        {context_rag}

        Using the context only, answer the user question.
        If the context does not directly answer the question, respond:

        “I dont have enough information in the student constitution to answer that.”

        Do not use external knowledge. Do not hallucinate.
        
        """.format(cleaned_question=cleaned_question,context_rag=context_rag)

    payload = {
        "messages": [
            SystemMessage(System_message),
            UserMessage(user_message),
        ],
        "model": model,
        "temperature": 1.0,
        "top_p": 1.0,
    }

    response = client.complete(**payload)

    try:
        output_dict = {'role': response['choices'][-1]['message']['role'], 'content': response['choices'][-1]['message']['content']}
    except Exception as e:
        raise Exception(f"Failed to get correct output dict. Please try again. Error: {e}")

    return output_dict



rag_answer(cleaned_question=cleaned_json_query,context_rag=rag_context_gotten,
                             model=INFERENCE_MODEL,
                             endpoint=AZURE_OPENAI_ENDPOINT,
                             api_key=AZURE_OPENAI_KEY)["content"]

In [108]:
client.close()

In [107]:
print(cleaned_query)

The constitution outlines the election process for clubs as follows:

- The President and General Secretary are elected from the senior cohort, and the Joint Secretary is elected from the first-year cohort.
- The selection of the President and General Secretary must be conducted after the Executive Student Council Election and within the timeframe set by the VP of Clubs (VI.2a, VI.2b).
- Nominations are opened by existing PoRs and accessible to all junior club members (VI.3h).
- Candidates must submit an application with a manifesto/SOP, which is reviewed and verified by current PoRs before being shared with club members (VI.3i).
- Elections are conducted only through in-person ballots or via Microsoft Form if a member cannot vote in person, ensuring transparency and fairness (VI.3j, VI.3k(iv)).
- Candidates present their candidacy in a meeting where members can ask questions, excluding questions on club preferences; candidates may decline to answer such questions (VI.3k(ii), VI.3k(iii

In [89]:
import json
cleaned_query=json.loads(cleaned_query)

In [91]:
cleaned_query["cleaned_question"]

'What are the procedures for conducting elections in clubs according to the constitution?'

In [92]:
query = "What is the clubs election process?"

cleaned_query = clean_user_message(user_text=query,
                             model=INFERENCE_MODEL,
                             endpoint=AZURE_OPENAI_ENDPOINT,
                             api_key=AZURE_OPENAI_KEY)["content"]

cleaned_json_query=json.loads(cleaned_query)["cleaned_question"]

query_vector = get_embedding(cleaned_json_query,\
    end_point=AZURE_OPENAI_ENDPOINT,azure_key=AZURE_OPENAI_KEY ,model=EMBEDDING_MODEL).data[0].embedding

results = collection.query.near_vector(
    near_vector=query_vector,
    limit=5
)

print("\n🔎 Top Results:")
for o in results.objects:
    print(f"\n📘 Page {o.properties['page']}(Source: {o.properties['source']}, section: {o.properties['section']}):\n Source Text: {o.properties['text'][:300]}...")



🔎 Top Results:

📘 Page 23(Source: constitution.pdf, section: VI. CLUBS):
 Source Text: 22 
 
VI. CLUBS 
 
Clubs are the heart and soul of the student body as they provide avenues for students to 
explore their areas of interest, follow their passions, kindle their creative side, enjoy with 
peers alongside their studies. The school offers a myriad of professional as well as social clu...

📘 Page 26(Source: constitution.pdf, section: VI. CLUBS):
 Source Text: 25 
 
objective, without personal remarks or targeting  individuals. The brief can be in the form of 
an in-person person meeting or on mail. 
VI.3h. Existing PoRs must announce the opening of nominations for the positions, making 
them accessible to all junior club members. 
VI.3i. Candidates must ...

📘 Page 25(Source: constitution.pdf, section: VI. CLUBS):
 Source Text: 24 
 
VI.2m(iii). PoRs must ensure that no questions on preferences are to be entertained on such 
platform and the candidates have the right to decline to answ

In [None]:
all_chunks=read_pdf()
all_chunks

[{'text': 'BITS SCHOOL OF MANAGEMENT (BITSoM) \n|DEPARTMENT OF STUDENT LIFE| \nCONSTITUTION \nAY 2024 -2025 \n \n  ',
  'page': 1,
  'section': '',
  'source': 'constitution.pdf'},
 {'text': '1 \n \nCONTENTS  \n \nSr. No.  Particulars Page No.  \n1 Preamble  2 \n2 Constitution 3 \n3 Student Council 4 \n4 Election Commission 8 \n5 Committees 10 \n6 Clubs 22 \n7 Annexure: Oath for Elected Members 40 \n \n  ',
  'page': 2,
  'section': '',
  'source': 'constitution.pdf'},
 {'text': '2 \n \nI.PREAMBLE \n \n“The students of BITS School of Management join in unison to nominate & elect the student \nrepresentative body dedicated to representing collective interests, spearheading initiatives & \nenhancing experiences on campus. This association is dedicated to voice student perspectives, \naddress concerns, and foster an environment that promotes active participation in various \nstudent-led initiatives. It will facilitate communication and cooperati on among students, \nfaculty, administratio

Current line: 5  levels. All elected members must read, understand and sign the Code of Conduct within 48 hours of assuming office and must adhere to it and the constitution at all times.
Current line: III.3. DUTIES
0.3335255062466194
Current line: III.3. DUTIES
Current line: III.3a. To safeguard the interests of the students and to ensure that the students' view is expressed in the BITSoM community, without compromising on the reputation, standards, values, morals and long-term interests of the Institute
0.4428357290930296
Current line: III.3a. To safeguard the interests of the students and to ensure that the students' view is expressed in the BITSoM community, without compromising on the reputation, standards, values, morals and long-term interests of the Institute
Current line: III.3b. To act as the mediato r as and when required between the students and the administration in the matters pertaining to the interests of any individual or the batch as a whole  III.3c. To monitor, aid a

IndexError: list index out of range

In [102]:
get_embedding(text_to_embed=parts[x],
                        end_point=AZURE_OPENAI_ENDPOINT,azure_key=AZURE_OPENAI_KEY ,model=EMBEDDING_MODEL)[0]

{'object': 'list', 'data': [{'object': 'embedding', 'index': 0, 'embedding': [-0.011682971, 0.0670343, 0.018965358, 0.03390658, 0.040812515, 0.048860785, 0.012364478, -0.00937234, -0.018108606, 0.0646977, 0.053585898, -0.051197376, 0.006539219, 0.0022587078, 0.070513226, 0.009554074, -0.009229547, 0.027260266, 0.00109609, 0.09496958, 0.01971826, 0.020899538, 0.012001008, 0.08686938, -0.044343367, -0.002636782, -0.021301951, -0.014318131, 0.017836003, -0.025287144, 0.036788378, -0.007918458, -0.046576113, -0.018407172, -0.020886557, 0.012805835, 0.0031819872, -0.034425825, 0.029259354, -0.058726404, -0.019199017, -0.020354332, -0.0060297116, -0.05363782, -0.023365943, -0.03460756, -0.02946705, 0.026741024, -0.044680875, 0.041201945, -0.012663043, -0.0005606204, 0.008223514, 0.01575903, -0.0058025424, 0.022080816, 0.0041863983, -0.016771555, -0.030219954, 0.02271689, 0.03304983, 0.02067886, -0.06142647, 0.042136583, 0.013746963, -0.06859203, 0.0014271075, 0.010164185, 0.022405343, 0.0314

In [45]:
reader = PdfReader(PDF_PATH)

In [56]:
print(len(reader.pages[35].extract_text()))

2461


# Total Code Steps

In [3]:
# Example: GitHub-hosted Azure inference endpoint

embedding_client = EmbeddingsClient(
    endpoint=AZURE_OPENAI_ENDPOINT,
    credential=AzureKeyCredential(AZURE_OPENAI_KEY)
)
print("✅ Connected to Azure OpenAI Inference API")


✅ Connected to Azure OpenAI Inference API


In [None]:
client = weaviate.connect_to_embedded(
    persistence_data_path="./.collections"
)
print("✅ Embedded Weaviate started")

In [None]:
COLLECTION_NAME = "PdfChunks"

if COLLECTION_NAME not in [c.name for c in client.collections.list_all()]:
    client.collections.create(
        name=COLLECTION_NAME,
        description="Stores token-overlapped PDF chunks with Azure embeddings",
        properties=[
            Property(name="text", data_type=DataType.TEXT),
            Property(name="page", data_type=DataType.INT),
            Property(name="source", data_type=DataType.TEXT),
        ],
    )
    print(f"✅ Created collection '{COLLECTION_NAME}'")
else:
    print(f"ℹ️ Collection '{COLLECTION_NAME}' already exists")

collection = client.collections.get(COLLECTION_NAME)


In [None]:
def chunk_text_with_overlap(text, chunk_size=400, overlap=60, model="text-embedding-3-small"):
    enc = tiktoken.encoding_for_model(model)
    tokens = enc.encode(text)
    chunks = []

    for i in range(0, len(tokens), chunk_size - overlap):
        chunk = tokens[i:i + chunk_size]
        chunk_text = enc.decode(chunk)
        chunks.append(chunk_text)

    return chunks


In [None]:
def read_pdf_chunks(pdf_path, chunk_size=400, overlap=60):
    reader = PdfReader(pdf_path)
    all_chunks = []

    for page_num, page in enumerate(reader.pages, start=1):
        text = page.extract_text() or ""
        page_chunks = chunk_text_with_overlap(text, chunk_size, overlap)
        for chunk in page_chunks:
            all_chunks.append({
                "text": chunk,
                "page": page_num,
                "source": os.path.basename(pdf_path)
            })
    return all_chunks


In [None]:
def get_embedding_azure(text):
    response = embedding_client.embeddings.create(
        model="text-embedding-3-small",
        input=[text]
    )
    return response.data[0].embedding


In [None]:
pdf_path = "your_file.pdf"  # Replace with your PDF
chunks = read_pdf_chunks(pdf_path)
print(f"📄 Extracted {len(chunks)} chunks from {pdf_path}")

with collection.batch.dynamic() as batch:
    for chunk in chunks:
        vector = get_embedding_azure(chunk["text"])
        batch.add_object(
            properties=chunk,
            vector=vector
        )

print("✅ All chunks embedded and stored in Weaviate!")


In [None]:
query = "What are the main challenges discussed in the document?"
query_vector = get_embedding_azure(query)

results = collection.query.near_vector(
    near_vector=query_vector,
    limit=3
)

print("\n🔎 Top Results:")
for o in results.objects:
    print(f"\n📘 Page {o.properties['page']} ({o.properties['source']}):\n{o.properties['text'][:300]}...")
