In [None]:
!pip install -q python-dotenv
!pip install -q gradio
!pip install -q unstructured
!pip install -q openpyxl
!pip install -q tiktoken
!pip install -q chromadb
!pip install -q sentence-transformers
!pip install -q --upgrade langchain
!pip install -q -U langchain-chroma
!pip install -q -U langchain-google-vertexai
!pip install -q -U langchain-community
!pip install -q -U langchain langchain-huggingface

In [1]:
import os
import glob
import json
import chromadb
import gradio as gr
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import google.generativeai as genai
from dotenv import load_dotenv
from openai import OpenAI
from chromadb import chromadb
from sklearn.manifold import TSNE
from datetime import datetime
#from google.colab import userdata

In [2]:
# imports for langchain

from langchain.document_loaders import DirectoryLoader, TextLoader, UnstructuredExcelLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from langchain.embeddings import OpenAIEmbeddings, SentenceTransformerEmbeddings, HuggingFaceEmbeddings 
from langchain.vectorstores import Chroma
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.prompts import PromptTemplate
from langchain_chroma import Chroma
from langchain_google_vertexai import ChatVertexAI #Import ChatVertexAI from langchain_google_vertexai
from pydantic import BaseModel

from chromadb import Client, Settings 

In [3]:
# Read the variables defined in a local .env file, then make sure both API
# keys exist in os.environ (a placeholder string is stored when one is unset).

load_dotenv()
for key_name in ('GOOGLE_API_KEY', 'OPENAI_API_KEY'):
    os.environ[key_name] = os.getenv(key_name, 'your-key-if-not-using-env')


In [4]:
# Gemini model name, defined once and reused for the native client below.
MODEL = "gemini-1.5-flash"

genai.configure(api_key=os.environ['GOOGLE_API_KEY'])
Gemini_model = genai.GenerativeModel(MODEL)

# Optional smoke test for the Gemini client:
#response = Gemini_model.generate_content("Explain how AI works")
#print(response.text)

In [5]:
from vertexai import init

# Google Cloud settings used to initialise Vertex AI.
# Replace both values with your own project ID and location.
PROJECT_ID = "gen-lang-client-0840327518"
LOCATION = "asia-southeast1"

init(location=LOCATION, project=PROJECT_ID)

# Empty pydantic model used as a base language-model placeholder.
# NOTE(review): nothing else in the visible notebook references this class —
# confirm whether it is still needed.
class _LanguageModel(BaseModel):
    pass


In [6]:
# Load environment variables in google Colab
# openai = userdata.get('OPENAI_API_KEY')
# os.environ["OPENAI_API_KEY"] = userdata.get('OPENAI_API_KEY')

In [7]:
# Directory in which the Chroma vector store is persisted (passed as persist_directory).
db_name = "vector_db"

In [8]:
# Glob pattern selecting the knowledge-base files/folders to load.
# Alternative pattern for the Data/ folder: knowledge_base_path = "Data/*"
# The active pattern points at the Data_test/ folder, relative to the working directory.
knowledge_base_path = "Data_test/*"  

In [9]:
# *** Timestamp Handling ***
def convert_timestamps(cell):
    """Convert an Excel date cell to an ISO 8601 string.

    Args:
        cell: A single cell value. ``datetime`` instances are formatted with
            ``isoformat()``. ``int``/``float`` values are interpreted as Excel
            serial day numbers, i.e. days counted from 1899-12-30.

    Returns:
        The ISO 8601 string for date-like input. Booleans, NaN, numbers that
        cannot be represented as a timestamp, and every other type are
        returned unchanged.
    """
    if isinstance(cell, datetime):
        return cell.isoformat()

    # bool is a subclass of int; True/False are flags, not serial dates.
    if isinstance(cell, bool) or not isinstance(cell, (int, float)):
        return cell

    # A missing numeric cell carries no date information.
    if pd.isna(cell):
        return cell

    try:
        # Excel serial dates start at 1899-12-30, not at the Unix epoch.
        return pd.to_datetime(cell, unit='D', origin='1899-12-30').isoformat()
    except (ValueError, OverflowError):
        # Out of the representable timestamp range: keep the raw number.
        return cell

                

In [None]:
# Name of the column that receives the merged address parts.
_MERGED_ADDRESS_COLUMN = 'dia chi don vi doi tac'


def _find_excel_files(knowledge_base_path):
    """Expand a glob pattern into the list of Excel file paths it covers."""
    excel_files = []
    for path in glob.glob(knowledge_base_path):
        if os.path.isdir(path):
            # Matched directories contribute their direct *.xlsx children only.
            excel_files.extend(glob.glob(os.path.join(path, "*.xlsx")))
        elif os.path.isfile(path) and path.lower().endswith(('.xls', '.xlsx')):
            excel_files.append(path)
        else:
            print(f"Skipping: {path} (Not a directory or an Excel file)")
    return excel_files


def _prepare_frame(df, columns_to_remove, address_cols, header_translations, file_path):
    """Normalise one sheet and return its rows as a list of dictionaries."""
    df = df.copy()

    # Headers are compared in lowercase; str() also copes with non-text headers.
    df.columns = [str(col).lower() for col in df.columns]

    for col in df.columns:
        series = df[col]
        if pd.api.types.is_datetime64_any_dtype(series):
            # Must happen before the text conversion below, otherwise the
            # datetimes would already be plain strings in a non-ISO layout.
            df[col] = pd.Series(
                [value.isoformat() if pd.notna(value) else None for value in series],
                index=df.index,
                dtype=object,
            )
        elif pd.api.types.is_numeric_dtype(series) or not all(
            isinstance(value, str) or pd.isna(value) for value in series
        ):
            # Convert values to text but keep missing cells missing, so they
            # are not turned into the literal string 'nan'.
            df[col] = pd.Series(
                [value if pd.isna(value) else str(value) for value in series],
                index=df.index,
                dtype=object,
            )

    # Merge the address parts into one column when every part is present.
    if address_cols:
        if all(col in df.columns for col in address_cols):
            df[_MERGED_ADDRESS_COLUMN] = [
                ' '.join(str(part) for part in parts if not pd.isna(part))
                for parts in df[address_cols].itertuples(index=False, name=None)
            ]
            df = df.drop(columns=address_cols)
        else:
            print(f"Warning: Not all address columns found in {file_path}. Skipping address merge.")

    # Drop unwanted columns, warning about the ones this file does not have.
    for col in columns_to_remove:
        if col in df.columns:
            df = df.drop(columns=col)
        else:
            print(f"Warning: Column '{col}' not found in {file_path}. Skipping.")

    # rename() silently ignores translation keys that match no column.
    if header_translations:
        df = df.rename(columns=header_translations)

    return df.to_dict(orient='records')


def combine_excel_files(knowledge_base_path, columns_to_remove=None, address_cols=None, header_translations=None):
    """
    Load several Excel files and return their rows as plain Python records.

    Every file is read with pandas, its headers are lowercased, datetime
    columns become ISO 8601 strings, other non-text values become strings
    (missing cells stay missing), address columns are optionally merged,
    unwanted columns are dropped and headers are optionally renamed.

    Args:
        knowledge_base_path: A glob pattern (e.g., "path/to/files/*"). Matched
            .xls/.xlsx files are loaded directly; matched directories
            contribute the *.xlsx files they contain.
        columns_to_remove (optional): Column names to drop (case-insensitive).
        address_cols (optional): Address column names (case-insensitive) that
            are joined with spaces into 'dia chi don vi doi tac' and then
            dropped. The merge is skipped for a file lacking any of them.
        header_translations (optional): Mapping from original column name
            (case-insensitive) to the new column name.
    Returns:
        A list with one entry per Excel file; each entry is a list of row
        dictionaries. Returns None when no Excel file is found or when a file
        cannot be processed. Progress, warnings and errors are printed.
    """
    excel_files = _find_excel_files(knowledge_base_path)
    if not excel_files:
        print(f"No Excel files found matching pattern: {knowledge_base_path}")
        return None

    # Normalise the arguments once, without touching the caller's objects.
    # dict.fromkeys removes duplicate names while keeping their order.
    remove_cols = list(dict.fromkeys(col.lower() for col in (columns_to_remove or [])))
    merge_cols = [col.lower() for col in (address_cols or [])]
    translations = {key.lower(): value for key, value in (header_translations or {}).items()}

    all_data = []
    for file_path in excel_files:
        try:
            print(f"Processing file: {file_path}")
            df = pd.read_excel(file_path)
            all_data.append(
                _prepare_frame(df, remove_cols, merge_cols, translations, file_path)
            )
        except Exception as e:
            # A single unreadable file invalidates the combined result.
            print(f"Error processing file {file_path}: {e}")
            return None

    return all_data

In [None]:
# Columns dropped from every sheet. Each name is listed once, so a file that
# lacks a column produces a single warning for it.
columns_to_remove = [
    'MA HQ', 'MA HAI QUAN', 'MA CHI CUC',
    'TEN HAI QUAN', 'SO DT', 'DIA DIEM CHO THONG QUAN',
    'DIA DIEM DO HANG', 'DIA DIEM XEP HANG', 'P.TIEN V.CHUYEN', 'Tri GIA'
]

# Address parts that are merged into a single address column.
address_cols = ['dia chi 1', 'dia chi 2', 'dia chi 3', 'dia chi 4']

# Original header -> header used in the knowledge base.
header_translations = {
    "MA LH": "Import_Code",
    "NHA NHAP KHAU": "Cty nhập khẩu",
    "DIA CHI": "Địa chỉ cty nhập ",
    "DON VI DOI TAC": "Cty bán hàng",
    "DIA CHI DON VI DOI TAC": "Địa chỉ cty bán hàng",
    "DK TT": "Hình Thức Thanh Toán",
    "DK GH": "Incoterm",
    "TI GIA": "Exchange_Rate",
    "NGAY": "Date",
    "MA HANG": "HSCODE",
    "TEN HANG": "Product_Info",
    "LUONG": "Quantity",
    "DVT": "Unit",
    "DON GIA": "Unit_Price",
    "MA NT": "Payment_Currency",
    "THUE XNK": "Import_Tax",
    "XUAT XU": "Origin",
    "MA DOANH NGHIEP": "Tax ID",
}

# NOTE: despite its name, combined_df is a list holding one list of row
# dictionaries per Excel file (or None when loading failed).
combined_df = combine_excel_files(knowledge_base_path, columns_to_remove, address_cols, header_translations)

if combined_df is not None:
    # Print a short summary and one sample row instead of the whole data set.
    total_rows = sum(len(file_rows) for file_rows in combined_df)
    print(f"Combined {total_rows} rows from {len(combined_df)} Excel file(s).")
    first_row = next((file_rows[0] for file_rows in combined_df if file_rows), None)
    if first_row is not None:
        print("Sample row:")
        print(first_row)
else:
    print("No data was loaded; see the messages above.")



In [None]:
# Convert every spreadsheet row into one LangChain Document.
# combined_df holds one list of row dictionaries per Excel file; it is None
# when loading failed, in which case no documents are created.
langchain_documents = []
for file_data in combined_df or []:
    for doc in file_data:  # each dict is one row
        # All rows share the same source label.
        metadata = {'source_file': 'DataBase'}
        # One "column: value" line per cell.
        page_content = "".join(f"{key}: {value}\n" for key, value in doc.items())
        langchain_documents.append(Document(page_content=page_content, metadata=metadata))

print(f"Created {len(langchain_documents)} LangChain documents.")

In [None]:
# Spot-check one converted document (index 1, i.e. the second one).
langchain_documents[1]

In [None]:
# Split the documents into chunks of at most 1000 characters with 200 characters of overlap.
# NOTE(review): CharacterTextSplitter presumably splits on its default separator ("\n\n");
# documents whose lines are joined by single newlines may therefore pass through unsplit — confirm.
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = text_splitter.split_documents(langchain_documents)

In [None]:
# Number of chunks produced by the splitter.
len(chunks)

In [None]:
# Print one chunk (index 1) to check its content and metadata.
print(chunks[1])

In [None]:
# Sentence-Transformers embedding model; 'all-mpnet-base-v2' is a general-purpose choice.
embeddings = SentenceTransformerEmbeddings(model_name="all-mpnet-base-v2")

# Start from a clean slate: drop the collection of a previously persisted store.
if os.path.exists(db_name):
    previous_store = Chroma(persist_directory=db_name, embedding_function=embeddings)
    previous_store.delete_collection()

In [None]:
# Embed the chunks (LangChain Documents) and persist them in the Chroma store.
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory=db_name,
)

In [None]:
# Inspect the stored vectors: how many there are and how long each one is.

collection = vectorstore._collection
count = collection.count()

# Fetch a single record and measure the length of its embedding.
first_record = collection.get(limit=1, include=["embeddings"])
sample_embedding = first_record["embeddings"][0]
dimensions = len(sample_embedding)

print(f"There are {count:,} vectors with {dimensions:,} dimensions in the vector store")

In [None]:
# Distinct 'source_file' labels found in the chunk metadata.
doc_types = {chunk.metadata['source_file'] for chunk in chunks}
type_listing = ', '.join(doc_types)
print(f"Document types found: {type_listing}")

In [None]:
# Variant that also fetches the documents and metadata:
# results = collection.get(include=['embeddings', 'documents', 'metadatas'])
# print(results)

# Fetch the stored records with their embeddings and print them.
results = collection.get(include=['embeddings'])
print(results)

In [None]:
# Prework for the plot: pull vectors, texts and labels out of the collection.
result = collection.get(include=['embeddings', 'documents', 'metadatas'])

documents = result['documents']
vectors = np.array(result['embeddings'])
doc_types = [meta['source_file'] for meta in result['metadatas']]

# Parallel lists: the label at position i is drawn in the colour at position i.
known_types = ['DataBase']
palette = ['blue']
colors = [palette[known_types.index(label)] for label in doc_types]

In [None]:
# Reduce the embeddings to three dimensions with t-SNE and plot them.
# scikit-learn requires the perplexity to be smaller than the number of
# samples, so the default of 30 is capped for small collections.
perplexity = min(30.0, float(max(len(vectors) - 1, 1)))
tsne = TSNE(n_components=3, random_state=42, perplexity=perplexity)
reduced_vectors = tsne.fit_transform(vectors)

# Create the 3D scatter plot; hovering a point shows its type and the first
# 100 characters of its text.
fig = go.Figure(data=[go.Scatter3d(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    z=reduced_vectors[:, 2],
    mode='markers',
    marker=dict(size=5, color=colors, opacity=0.8),
    text=[f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)],
    hoverinfo='text'
)])

fig.update_layout(
    title='3D Chroma Vector Store Visualization',
    scene=dict(xaxis_title='x', yaxis_title='y', zaxis_title='z'),
    width=900,
    height=700,
    margin=dict(r=20, b=10, l=10, t=40)
)

fig.show()

In [None]:
# Chat model: OpenAI gpt-4o-mini.
# Gemini alternative: llm = ChatVertexAI(temperature=0.7, model_name=MODEL)
llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0.7)

# Rebuild the ChatVertexAI pydantic model.
# NOTE(review): the active llm is ChatOpenAI — confirm this call is still required.
ChatVertexAI.model_rebuild()

# Conversation memory holding the chat history as message objects.
memory = ConversationBufferMemory(return_messages=True, memory_key='chat_history')

# Retriever over the vector store; each query asks for 25 chunks.
retriever = vectorstore.as_retriever(search_kwargs={"k": 25})

# Wire the LLM, the retriever and the memory into one conversational RAG chain.
chain_parts = {"llm": llm, "retriever": retriever, "memory": memory}
conversation_chain = ConversationalRetrievalChain.from_llm(**chain_parts)

In [None]:
# Ask the chain a first question and print the answer it returns.
query = "what Samsung C&T importing?"
result = conversation_chain.invoke({"question":query})
print(result["answer"])

In [None]:
# Fresh memory, so the chat UI starts without the history of the test query above.
memory = ConversationBufferMemory(return_messages=True, memory_key='chat_history')

# Rebuild the chain from the same LLM and retriever with the new memory.
conversation_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
)

In [None]:
# Wrapping in a function - note that history isn't used, as the memory is in the conversation_chain

def chat(message, history):
    """Answer one chat message through the module-level conversation chain.

    Args:
        message: The user's question.
        history: Chat history supplied by the UI; ignored, because the chain
            keeps its own memory.

    Returns:
        The value stored under "answer" in the chain's result.
    """
    return conversation_chain.invoke({"question": message})["answer"]

In [None]:
# Serve the chat function through a Gradio chat UI.
# `view` receives whatever launch() returns, not the interface object itself.

chat_ui = gr.ChatInterface(chat, type="messages")
view = chat_ui.launch(inbrowser=False, share=True)

In [None]:
from langchain_core.callbacks import StdOutCallbackHandler

# Rebuild the whole chain with a stdout callback so its activity is printed.
llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0.7)
memory = ConversationBufferMemory(return_messages=True, memory_key='chat_history')
retriever = vectorstore.as_retriever(search_kwargs={"k": 25})
conversation_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
    callbacks=[StdOutCallbackHandler()],
)

# Run one question through the verbose chain and show the answer.
query = "what VSSC import?"
result = conversation_chain.invoke({"question": query})
answer = result["answer"]
print("\nAnswer:", answer)