In [2]:
# imports
import os
import glob
from dotenv import load_dotenv
import gradio as gr


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
os.environ["CHROMA_TELEMETRY_ENABLED"] = "FALSE"
from langchain_chroma import Chroma
import shutil
import numpy as np
import plotly.graph_objects as go
from sklearn.manifold import TSNE
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_groq import ChatGroq



In [4]:
MODEL = "meta-llama/llama-4-scout-17b-16e-instruct"
db_name = "vector_db"

In [5]:
load_dotenv()
llm = ChatGroq(
    GROQ_API_KEY=os.getenv("GROQ_API_KEY"),
    model_name="meta-llama/llama-4-scout-17b-16e-instruct"
)

                    GROQ_API_KEY was transferred to model_kwargs.
                    Please confirm that GROQ_API_KEY is what you intended.


In [6]:
folders = glob.glob("knowledge-base/*")

documents = []
for folder in folders:
    doc_type = os.path.basename(folder)
    loader = DirectoryLoader(folder,glob="**/*.md",loader_cls=lambda path: TextLoader(path, encoding='utf-8'))
    folder_docs = loader.load()
    for doc in folder_docs:
        doc.metadata['doc_type'] = doc_type
        documents.append(doc)

In [7]:
text_splitter = CharacterTextSplitter(
    chunk_size=700,
    chunk_overlap=120,
    separator="\n\n"
)

chunks = text_splitter.split_documents(documents)

In [8]:
len(chunks)

33

In [9]:
doc_types = set(chunk.metadata['doc_type'] for chunk in chunks)
print(', '.join(doc_types))

05_contact, 04_infrastructure, 00_institute_overview, 01_academics, 02_admissions, 03_student_life


In [10]:
#vector embedding model
embeddings = HuggingFaceEmbeddings(
    model_name="nomic-ai/nomic-embed-text-v1",
    model_kwargs={'trust_remote_code': True}
)

  embeddings = HuggingFaceEmbeddings(
<All keys matched successfully>


In [15]:
# Create chroma vectorstore
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory=db_name
)

Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given


In [16]:
vectorstore._collection.count()

99

In [17]:
#dimensions of a vector in vectorstore vector_db
collection = vectorstore._collection
sample_embedding = collection.get(limit=1, include=['embeddings'])['embeddings'][0]
dimensions = len(sample_embedding)
print(dimensions)

Failed to send telemetry event CollectionGetEvent: capture() takes 1 positional argument but 3 were given


768


In [18]:
result = collection.get(include=['embeddings', 'documents', 'metadatas'])
vectors = np.array(result['embeddings'])
documents = result['documents']
doc_types = [metadata['doc_type'] for metadata in result['metadatas']]
colors = [['blue', 'green', 'red', 'orange', 'yellow', 'pink'][['00_institute_overview', '01_academics', '02_admissions', '03_student_life', '04_infrastructure', '05_contact'].index(t)] for t in doc_types]

tsne = TSNE(n_components=3, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

Failed to send telemetry event CollectionGetEvent: capture() takes 1 positional argument but 3 were given


In [19]:
hover_text = [
    f"Type: {doc.metadata['doc_type']}<br>Text: {doc.page_content[:100]}..."
    for doc in chunks
]
doc_types = [doc.metadata['doc_type'] for doc in chunks]

# Create 2D scatter plot
fig = go.Figure(data=[
    go.Scatter3d(
        x=reduced_vectors[:, 0],
        y=reduced_vectors[:, 1],
        z=reduced_vectors[:, 2],
        mode='markers',
        marker=dict(
            size=6,
            color=colors,  # color should match number of docs
            opacity=0.8,
            line=dict(width=0.5, color='white')
        ),
        text=hover_text,
        hoverinfo='text'
    )
])

fig.update_layout(
    title='2D Chroma Vector Store Visualization',
    xaxis_title='x',
    yaxis_title='y',
    width=800,
    height=600,
    margin=dict(r=20, b=10, l=10, t=40)
)

fig.show()

In [20]:
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain

In [None]:
llm = ChatGroq(
    model_name=MODEL,
    groq_api_key=os.getenv("GROQ_API_KEY")
)

# Memory setup
memory = ConversationBufferMemory(
    memory_key='chat_history',
    return_messages=True
)

# Retriever from Chroma
# retriever = vectorstore.as_retriever(search_kwargs={"k":10}) # no: of input chunks
retriever = vectorstore.as_retriever()

# Conversation chain
conversation_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory
)


Please see the migration guide at: https://python.langchain.com/docs/versions/migrating_memory/



In [22]:
query = "Can a student choose AI as a degree"
result = conversation_chain.invoke({"question": query})
print(result["answer"])

Failed to send telemetry event CollectionQueryEvent: capture() takes 1 positional argument but 3 were given


Yes, a student can choose Artificial Intelligence (AI) as a degree. According to the provided information, SSIE offers a four-year **Bachelor of Technology** degree in Artificial Intelligence as one of its seven specializations.


In [23]:
def chat(message, history):
    result = conversation_chain.invoke({'question': message})
    return result['answer']

In [24]:
view = gr.ChatInterface(chat).launch()


The 'tuples' format for chatbot messages is deprecated and will be removed in a future version of Gradio. Please set type='messages' instead, which uses openai-style 'role' and 'content' keys.



* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.


In [None]:
from langchain_core.callbacks import StdOutCallbackHandler  
#check the amount of context being sent

llm = ChatGroq(
    model_name=MODEL,
    groq_api_key=os.getenv("GROQ_API_KEY")
)

# Memory setup
memory = ConversationBufferMemory(
    memory_key='chat_history',
    return_messages=True
)

# Retriever from Chroma
retriever = vectorstore.as_retriever()

# Conversation chain
conversation_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
    callbacks=[StdOutCallbackHandler()]
)

query = "what is the library's name"
result = conversation_chain.invoke({"question": query})
print(result["answer"])



[1m> Entering new ConversationalRetrievalChain chain...[0m


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the user's question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
// File: campus_facilities.md
# Infrastructure

## Academic Facilities
Vigyan Bhavan central library houses 85,000+ volumes with IEEE access. Agastya supercomputing lab features 20 PFLOPS capacity while Takshashila innovation complex offers 3D printing and IoT sandbox. 

## Wellness & Recreation
Arogya medical center provides telemedicine facilities. The campus features Olympic-size swimming pool, cricket academy, and Rasayan food court serving regional cuisines. Dedicated meditation gardens and Vipassana cells support mental wellness.

// File: campus_facilities.md
# Infrastructure

## Academic Facilities
Vig



[1m> Entering new ConversationalRetrievalChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mGiven the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.

Chat History:

Human: what is the library's name
Assistant: The library's name is Vigyan Bhavan central library.
Follow Up Input: what is the library's name
Standalone question:[0m

[1m> Finished chain.[0m


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the user's question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
// File: campus_facilities.md
# Infrastructure

## Academic Facilities
Vigyan Bhavan central library houses 85,000+ volumes with IEEE access. Agastya supercomputing lab features 20