In [None]:
!pip install -q python-dotenv
!pip install -q gradio
!pip install -q -U langchain-community
!pip install -q unstructured
!pip install -q openpyxl
!pip install -q tiktoken
!pip install -q chromadb

In [1]:
import os
import glob
import chromadb
import gradio as gr
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from dotenv import load_dotenv
from openai import OpenAI
#from google.colab import userdata
from chromadb import chromadb
from sklearn.manifold import TSNE


In [2]:
# imports for langchain

from langchain.document_loaders import DirectoryLoader, TextLoader, UnstructuredExcelLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.schema import Document
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain

In [3]:
# Read variables from a .env file, then make sure OPENAI_API_KEY is set:
# an existing value is kept, otherwise the placeholder below is used.

load_dotenv()
os.environ.setdefault('OPENAI_API_KEY', 'your-key-if-not-using-env')

In [None]:
# Alternative for Google Colab: read the API key from Colab's `userdata` (uncomment these lines and the `google.colab` import)
# openai = userdata.get('OPENAI_API_KEY')
# os.environ["OPENAI_API_KEY"] = userdata.get('OPENAI_API_KEY')

In [4]:
# Price is a factor for our company, so we're going to use a low-cost model.
# MODEL is handed to ChatOpenAI as `model_name` when the LLM is created.
MODEL = "gpt-4o-mini"
# Name for the vector database.
# NOTE(review): db_name is not referenced by any other visible cell - confirm its intended use.
db_name = "vector_db"

In [5]:
# Knowledge base is in /RAG_ImEx/Data.
# The value is a glob pattern (note the trailing *), given as a relative path;
# it is expanded with glob.glob() when the Excel files are loaded.
knowledge_base_path = "Data/*"  

In [None]:
# 1. Load every Excel workbook matched by knowledge_base_path and combine them
#    into a single DataFrame called combined_df.
all_dfs = []
# sorted() makes the load order (and therefore the row order of combined_df)
# reproducible; glob.glob() alone returns files in arbitrary order.
excel_files = sorted(glob.glob(knowledge_base_path))

# Always (re)bind combined_df. Without this, a run that loads nothing would
# leave later cells with a NameError or, worse, a stale frame from an earlier run.
combined_df = pd.DataFrame()

if not excel_files:
    print(f"No Excel files found in '{knowledge_base_path}'.")
else:
    for filepath in excel_files:
        # A single unreadable file is reported and skipped so that the
        # remaining workbooks can still be loaded.
        try:
            df = pd.read_excel(filepath, header=0)
        except Exception as e:
            print(f"Error reading file '{filepath}': {e}")
        else:
            all_dfs.append(df)

    if all_dfs:
        # Concatenate all dataframes into a single dataframe with a fresh 0..n-1 index
        combined_df = pd.concat(all_dfs, ignore_index=True)
        # Report the number of files that actually loaded, not the number globbed
        print(f"Successfully combined data from {len(all_dfs)} Excel files.")
    else:
        print("No dataframes could be loaded.")

In [None]:

# 2. Create documents: one Document per row of combined_df, with the source
#    file name and spreadsheet row number stored as metadata.

# combined_df carries no per-row provenance, so rebuild it here. When every
# globbed file loaded successfully, all_dfs lines up one-to-one with
# excel_files and each row can be traced back to its workbook.
if len(all_dfs) == len(excel_files):
    row_sources = [
        (os.path.basename(path), offset + 2)  # +2: 1-based rows plus the header row
        for path, frame in zip(excel_files, all_dfs)
        for offset in range(len(frame))
    ]
else:
    # Some files failed to load, so the file-to-frame mapping is unknown.
    # Fall back to the glob pattern and the position in the combined frame.
    row_sources = [
        (knowledge_base_path, position + 2)
        for position in range(len(combined_df))
    ]

documents = []
for (_, row), (filename, row_number) in zip(combined_df.iterrows(), row_sources):
    # The page content is every cell of the row, converted to text and space-joined
    text = ' '.join(row.astype(str))
    metadata = {'row': row_number, 'filename': filename}
    documents.append(Document(page_content=text, metadata=metadata))


# 3. Create vector database

# Stop with a clear message when there is nothing to index, rather than
# handing an empty batch to the embedding model and the vector store.
if not documents:
    raise ValueError("No documents to index - check that the Excel files were loaded.")

# Embed the documents with OpenAI embeddings and index them in Chroma.
# NOTE(review): no persist_directory is passed, so db_name is not used here -
# confirm whether the store is meant to be persisted.
embeddings = OpenAIEmbeddings()
vectorstore = Chroma.from_documents(documents, embeddings)


In [None]:
# Inspect the vector store: how many vectors it holds and how long each one is

collection = vectorstore._collection  # underlying Chroma collection (private attribute)
count = collection.count()

# Fetch a single record, including its embedding, to measure the vector length
first_record = collection.get(limit=1, include=["embeddings"])
sample_embedding = first_record["embeddings"][0]
dimensions = len(sample_embedding)

print(f"There are {count:,} vectors with {dimensions:,} dimensions in the vector store")

In [None]:
# Chat model: an OpenAI chat client for the configured MODEL
llm = ChatOpenAI(model_name=MODEL, temperature=0.7)

# Conversation memory for the chat, stored under the 'chat_history' key
memory = ConversationBufferMemory(return_messages=True, memory_key='chat_history')

# Retriever: an abstraction over the vector store used during RAG
# (k=25 is presumably the number of documents fetched per query)
retriever = vectorstore.as_retriever(search_kwargs=dict(k=25))

# Conversation chain: ties together the GPT 4o-mini LLM, the retriever and the memory
conversation_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
)

In [None]:
# Smoke test: ask the chain one question and print the answer text
query = "which company import cold rolled stainless steel?"
result = conversation_chain.invoke({"question": query})
answer = result["answer"]
print(answer)

In [None]:
# Start over with an empty conversation memory for the chat
memory = ConversationBufferMemory(return_messages=True, memory_key='chat_history')

# Rebuild the conversation chain around the same LLM and retriever, using the fresh memory
conversation_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
)

In [None]:
# Chat callback for the UI. `history` is not used, because the conversation
# memory lives inside conversation_chain.

def chat(message, history):
    """Send one user message to the conversation chain and return its answer.

    Args:
        message: The text of the user's latest message.
        history: Prior chat turns supplied by the caller; accepted but not read.

    Returns:
        The value stored under the "answer" key of the chain's result.
    """
    return conversation_chain.invoke({"question": message})["answer"]

In [None]:
# And in Gradio: build a chat interface around `chat`, then launch it with inbrowser=True

chat_ui = gr.ChatInterface(chat, type="messages")
view = chat_ui.launch(inbrowser=True)