In [4]:
import os
import glob
from dotenv import load_dotenv
import gradio as gr

In [6]:
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
import numpy as np
from sklearn.manifold import TSNE
import plotly.graph_objects as go
import plotly.express as px
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.3 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/var/data/python/lib/python3.12/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/var/data/python/lib/python3.12/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/var/data/python/lib/python3.12/site-packages/ipykernel/kernelapp.py", line 739, in start
    self.io_loop.start()
  File "/var/data/python/lib/python3.

ImportError: 
A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.3 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.



ImportError: numpy.core.multiarray failed to import

In [3]:
MODEL='gpt-4o-mini'
db_name='vector_db'

In [4]:
load_dotenv()
os.environ['OPENAI_API_KEY']=os.getenv('OPENAI_API_KEY')

In [5]:
folders=glob.glob("knowledge/*")
documents=[]
for folder in folders:
    doc_type=os.path.basename(folder)
    loader=DirectoryLoader(folder, glob="**/*.md", loader_cls=TextLoader)
    folder_docs=loader.load()
    for doc in folder_docs:
        doc.metadata["doc_type"]=doc_type
        documents.append(doc)

In [6]:
text_splitter=CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks=text_splitter.split_documents(documents)

Created a chunk of size 1058, which is longer than the specified 1000
Created a chunk of size 1610, which is longer than the specified 1000
Created a chunk of size 2204, which is longer than the specified 1000
Created a chunk of size 2372, which is longer than the specified 1000
Created a chunk of size 1269, which is longer than the specified 1000
Created a chunk of size 1111, which is longer than the specified 1000
Created a chunk of size 1117, which is longer than the specified 1000
Created a chunk of size 1099, which is longer than the specified 1000
Created a chunk of size 1514, which is longer than the specified 1000
Created a chunk of size 1449, which is longer than the specified 1000
Created a chunk of size 1518, which is longer than the specified 1000
Created a chunk of size 16572, which is longer than the specified 1000
Created a chunk of size 13130, which is longer than the specified 1000
Created a chunk of size 11606, which is longer than the specified 1000
Created a chunk o

In [7]:
len(chunks)

354

In [8]:
embeddings=OpenAIEmbeddings()

In [9]:
if os.path.exists(db_name):
    Chroma(persist_directory=db_name, embedding_function=embeddings).delete_collection()

In [10]:
vectorstore=Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=db_name)
print(f"Vectorstore created with {vectorstore._collection.count()} documents")

Vectorstore created with 354 documents


In [15]:
collection=vectorstore._collection
sample_embedding= collection.get(limit=1, include=['embeddings']) ['embeddings'][0]
dimensions=len(sample_embedding)
print(f"Dimension: {dimensions}")

Dimension: 1536


In [14]:
sample_embedding

array([ 7.35489375e-05,  1.28459996e-02,  2.41141897e-02, ...,
       -1.05055822e-02, -2.95839291e-02, -1.54362367e-02])

In [13]:
all_embeddings = collection.get(include=['embeddings'])['embeddings']

# Convert to numpy array
embeddings_array = np.array(all_embeddings)

# Reduce dimensions to 2D using t-SNE
tsne = TSNE(n_components=2, random_state=42)
embeddings_2d = tsne.fit_transform(embeddings_array)

# Create a DataFrame for Plotly
import pandas as pd
df = pd.DataFrame(embeddings_2d, columns=['Component 1', 'Component 2'])

# Plot using Plotly
fig = px.scatter(df, x='Component 1', y='Component 2', title='2D Visualization of Embeddings')
fig.show()

In [19]:
# Fetch all embeddings from the collection
all_embeddings = collection.get(include=['embeddings'])['embeddings']

# Convert to numpy array
embeddings_array = np.array(all_embeddings)

# Reduce dimensions to 3D using t-SNE
tsne = TSNE(n_components=3, random_state=42)
embeddings_3d = tsne.fit_transform(embeddings_array)

# Create a DataFrame for Plotly
import pandas as pd
df = pd.DataFrame(embeddings_3d, columns=['Component 1', 'Component 2', 'Component 3'])

# Plot using Plotly
fig = px.scatter_3d(df, x='Component 1', y='Component 2', z='Component 3', title='3D Visualization of Embeddings')
fig.show()

In [21]:
llm=ChatOpenAI(temperature=0.7, model_name=MODEL)
memory=ConversationBufferMemory(memory_key='chat_history', return_messages=True)

retriever=vectorstore.as_retriever()
conversation_chain=ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)


Please see the migration guide at: https://python.langchain.com/docs/versions/migrating_memory/



In [22]:
query="Cost of HS Fabric floor mat (DCT)?"
result=conversation_chain.invoke({'question': query})
print(result['answer'])


The cost of the HS Fabric floor mat (DCT) is £73.84.


In [23]:
def chat(message, history):
    result=conversation_chain.invoke({'question': query})
    return result['answer']

In [25]:
view=gr.ChatInterface(chat).launch()


The 'tuples' format for chatbot messages is deprecated and will be removed in a future version of Gradio. Please set type='messages' instead, which uses openai-style 'role' and 'content' keys.



* Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.
