In [11]:
# imports
import os
import glob
from dotenv import load_dotenv
import gradio as gr


In [12]:
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
os.environ["CHROMA_TELEMETRY_ENABLED"] = "FALSE"
# from langchain_chroma import Chroma
from langchain.vectorstores import FAISS
import shutil
import numpy as np
import plotly.graph_objects as go
from sklearn.manifold import TSNE
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_groq import ChatGroq



In [13]:
MODEL = "meta-llama/llama-4-scout-17b-16e-instruct"
db_name = "vector_db"

In [14]:
load_dotenv()
llm = ChatGroq(
    GROQ_API_KEY=os.getenv("GROQ_API_KEY"),
    model_name="meta-llama/llama-4-scout-17b-16e-instruct"
)

                    GROQ_API_KEY was transferred to model_kwargs.
                    Please confirm that GROQ_API_KEY is what you intended.


In [15]:
folders = glob.glob("knowledge-base/*")

documents = []
for folder in folders:
    doc_type = os.path.basename(folder)
    loader = DirectoryLoader(folder,glob="**/*.md",loader_cls=lambda path: TextLoader(path, encoding='utf-8'))
    folder_docs = loader.load()
    for doc in folder_docs:
        doc.metadata['doc_type'] = doc_type
        documents.append(doc)

In [16]:
text_splitter = CharacterTextSplitter(
    chunk_size=700,
    chunk_overlap=120,
    separator="\n\n"
)

chunks = text_splitter.split_documents(documents)

In [17]:
len(chunks)

33

In [18]:
doc_types = set(chunk.metadata['doc_type'] for chunk in chunks)
print(', '.join(doc_types))

05_contact, 00_institute_overview, 01_academics, 04_infrastructure, 03_student_life, 02_admissions


In [19]:
#vector embedding model
embeddings = HuggingFaceEmbeddings(
    model_name="nomic-ai/nomic-embed-text-v1",
    model_kwargs={'trust_remote_code': True}
)

<All keys matched successfully>


In [None]:
# Create chroma vectorstore
vectorstore = FAISS.from_documents(chunks, embedding = embeddings)

total_vectors = vectorstore.index.ntotal
dimensions = vectorstore.index.d

In [21]:
print(total_vectors, dimensions)

33 768


In [26]:
# Prework
vectors = []
documents = []
doc_types = []
colors = []
color_map = {'00_institute_overview': 'blue', '01_academics': 'green', '02_admissions': 'red', '03_student_life': 'orange', '04_infrastructure': 'yellow', '05_contact': 'pink'}

for i in range(total_vectors):
    vectors.append(vectorstore.index.reconstruct(i))
    doc_id = vectorstore.index_to_docstore_id[i]
    document = vectorstore.docstore.search(doc_id)
    documents.append(document.page_content)
    doc_type = document.metadata['doc_type']
    doc_types.append(doc_type)
    colors.append(color_map[doc_type])

vectors = np.array(vectors)

tsne = TSNE(n_components=3, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



In [27]:
hover_text = [
    f"Type: {doc.metadata['doc_type']}<br>Text: {doc.page_content[:100]}..."
    for doc in chunks
]
doc_types = [doc.metadata['doc_type'] for doc in chunks]

# Create 2D scatter plot
fig = go.Figure(data=[
    go.Scatter3d(
        x=reduced_vectors[:, 0],
        y=reduced_vectors[:, 1],
        z=reduced_vectors[:, 2],
        mode='markers',
        marker=dict(
            size=6,
            color=colors,  # color should match number of docs
            opacity=0.8,
            line=dict(width=0.5, color='white')
        ),
        text=hover_text,
        hoverinfo='text'
    )
])

fig.update_layout(
    title='2D Chroma Vector Store Visualization',
    xaxis_title='x',
    yaxis_title='y',
    width=800,
    height=600,
    margin=dict(r=20, b=10, l=10, t=40)
)

fig.show()

In [28]:
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain

In [29]:
llm = ChatGroq(
    model_name=MODEL,
    groq_api_key=os.getenv("GROQ_API_KEY")
)

# Memory setup
memory = ConversationBufferMemory(
    memory_key='chat_history',
    return_messages=True
)

# Retriever from Chroma
retriever = vectorstore.as_retriever()

# Conversation chain
conversation_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory
)


Please see the migration guide at: https://python.langchain.com/docs/versions/migrating_memory/



In [30]:
query = "Can a student choose AI as a degree"
result = conversation_chain.invoke({"question": query})
print(result["answer"])

According to the provided information, yes, a student can choose Artificial Intelligence (AI) as a degree. SSIE offers a four-year **Bachelor of Technology** degree in Artificial Intelligence as one of its seven specializations.


In [31]:
def chat(message, history):
    result = conversation_chain.invoke({'question': message})
    return result['answer']

In [32]:
view = gr.ChatInterface(chat).launch()


The 'tuples' format for chatbot messages is deprecated and will be removed in a future version of Gradio. Please set type='messages' instead, which uses openai-style 'role' and 'content' keys.



* Running on local URL:  http://127.0.0.1:7861
* To create a public link, set `share=True` in `launch()`.
