In [35]:
# imports
import os
import glob
from dotenv import load_dotenv
import gradio as gr


In [36]:
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
os.environ["CHROMA_TELEMETRY_ENABLED"] = "FALSE"
from langchain_chroma import Chroma
import shutil
import numpy as np
import plotly.graph_objects as go
from sklearn.manifold import TSNE
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_groq import ChatGroq



In [37]:
MODEL = "meta-llama/llama-4-scout-17b-16e-instruct"
db_name = "vector_db"

In [38]:
load_dotenv()
llm = ChatGroq(
    GROQ_API_KEY=os.getenv("GROQ_API_KEY"),
    model_name="meta-llama/llama-4-scout-17b-16e-instruct"
)


                    GROQ_API_KEY was transferred to model_kwargs.
                    Please confirm that GROQ_API_KEY is what you intended.



In [39]:
folders = glob.glob("knowledge-base/*")

documents = []
for folder in folders:
    doc_type = os.path.basename(folder)
    loader = DirectoryLoader(folder,glob="**/*.md",loader_cls=lambda path: TextLoader(path, encoding='utf-8'))
    folder_docs = loader.load()
    for doc in folder_docs:
        doc.metadata['doc_type'] = doc_type
        documents.append(doc)

In [40]:
text_splitter = CharacterTextSplitter(
    chunk_size=700,
    chunk_overlap=120,
    separator="\n\n"
)

chunks = text_splitter.split_documents(documents)

In [41]:
len(chunks)

33

In [42]:
doc_types = set(chunk.metadata['doc_type'] for chunk in chunks)
print(', '.join(doc_types))

05_contact, 03_student_life, 00_institute_overview, 02_admissions, 01_academics, 04_infrastructure


In [43]:
#vector embedding model
embeddings = HuggingFaceEmbeddings(
    model_name="nomic-ai/nomic-embed-text-v1",
    model_kwargs={'trust_remote_code': True}
)

<All keys matched successfully>


In [44]:
# Create chroma vectorstore
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory=db_name
)

Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given


In [45]:
vectorstore._collection.count()

66

In [46]:
#dimensions of a vector in vectorstore vector_db
collection = vectorstore._collection
sample_embedding = collection.get(limit=1, include=['embeddings'])['embeddings'][0]
dimensions = len(sample_embedding)
print(dimensions)

768


In [47]:
result = collection.get(include=['embeddings', 'documents', 'metadatas'])
vectors = np.array(result['embeddings'])
documents = result['documents']
doc_types = [metadata['doc_type'] for metadata in result['metadatas']]
colors = [['blue', 'green', 'red', 'orange', 'yellow', 'pink'][['00_institute_overview', '01_academics', '02_admissions', '03_student_life', '04_infrastructure', '05_contact'].index(t)] for t in doc_types]

tsne = TSNE(n_components=3, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

In [48]:
hover_text = [
    f"Type: {doc.metadata['doc_type']}<br>Text: {doc.page_content[:100]}..."
    for doc in chunks
]
doc_types = [doc.metadata['doc_type'] for doc in chunks]

# Create 2D scatter plot
fig = go.Figure(data=[
    go.Scatter3d(
        x=reduced_vectors[:, 0],
        y=reduced_vectors[:, 1],
        z=reduced_vectors[:, 2],
        mode='markers',
        marker=dict(
            size=6,
            color=colors,  # color should match number of docs
            opacity=0.8,
            line=dict(width=0.5, color='white')
        ),
        text=hover_text,
        hoverinfo='text'
    )
])

fig.update_layout(
    title='2D Chroma Vector Store Visualization',
    xaxis_title='x',
    yaxis_title='y',
    width=800,
    height=600,
    margin=dict(r=20, b=10, l=10, t=40)
)

fig.show()

In [49]:
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain

In [52]:
llm = ChatGroq(
    model_name=MODEL,
    groq_api_key=os.getenv("GROQ_API_KEY")
)

# Memory setup
memory = ConversationBufferMemory(
    memory_key='chat_history',
    return_messages=True
)

# Retriever from Chroma
retriever = vectorstore.as_retriever()

# Conversation chain
conversation_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory
)

In [55]:
query = "What are the programs provided by SSIE"
result = conversation_chain.invoke({"question": query})
print(result["answer"])

The Silver Stone Institute of Engineering (SSIE) offers the following academic programs:

**Bachelor of Technology (4-year) degrees** in seven specializations:
1. Computer Engineering
2. Electronics & Telecommunications
3. Mechanical Engineering
4. Civil Engineering
5. Artificial Intelligence
6. Biotechnology
7. Renewable Energy Systems

**M.Tech programs (2-year)** in:
1. Data Science
2. Robotics
3. Smart Infrastructure

Each program includes industry-aligned coursework with mandatory internships through the **TechConnect** industry immersion initiative.
