In [None]:
!pip install -q python-dotenv
!pip install -q gradio
!pip install -q -U langchain-community
!pip install -q unstructured
!pip install -q openpyxl
!pip install -q tiktoken
!pip install -q chromadb
!pip install -q sentence-transformers
!pip install -q --upgrade langchain
!pip install -q -U langchain-chroma
!pip install -q -U langchain-google-vertexai

In [None]:
!pip install -q -U langchain-google-vertexai

In [2]:
import os
import glob
import json
import chromadb
import gradio as gr
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import google.generativeai as genai
from dotenv import load_dotenv
from openai import OpenAI
from chromadb import chromadb
from sklearn.manifold import TSNE
from datetime import datetime
#from google.colab import userdata

In [3]:
# imports for langchain

from langchain.document_loaders import DirectoryLoader, TextLoader, UnstructuredExcelLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from langchain.embeddings import OpenAIEmbeddings, SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.prompts import PromptTemplate
from langchain_chroma import Chroma
from langchain_google_vertexai import ChatVertexAI #Import ChatVertexAI from langchain_google_vertexai
from pydantic import BaseModel


In [4]:
# Load environment variables in a file called .env

load_dotenv()
os.environ['GOOGLE_API_KEY'] = os.getenv('GOOGLE_API_KEY', 'your-key-if-not-using-env')

In [5]:
genai.configure(api_key=os.environ['GOOGLE_API_KEY'])
Gemini_model = genai.GenerativeModel("gemini-1.5-flash")
MODEL = "gemini-1.5-flash" 
#Testing Gemini AI
#response = model.generate_content("Explain how AI works")
#print(response.text)

In [6]:
# Initialize Vertex AI with your Project ID and location
PROJECT_ID = "gen-lang-client-0840327518"  # Replace with your actual Project ID
LOCATION = "asia-southeast1" #Replace with your location
from vertexai import init
init(project=PROJECT_ID, location=LOCATION)

# Define a base language model (if you haven't already)
class _LanguageModel(BaseModel):
    pass


In [7]:
# Load environment variables in google Colab
# openai = userdata.get('OPENAI_API_KEY')
# os.environ["OPENAI_API_KEY"] = userdata.get('OPENAI_API_KEY')

In [8]:
# price is a factor for our company, so we're going to use a low cost model
db_name = "vector_db"

In [9]:
# Knowledge base is in /RAG_ImEx/Data
#knowledge_base_path = "Data/*"  
# Knowledge base is in /RAG_ImEx/Data
knowledge_base_path = "Data_test/*"  

In [10]:
# *** Timestamp Handling ***
def convert_timestamps(cell):
    """Converts Excel timestamps (float or datetime) to ISO 8601 strings."""
    if isinstance(cell, (float, int)):  # Check for numeric timestamps
        try:
            return pd.Timestamp(cell, unit='D').isoformat()  # Convert to datetime and then ISO string
        except ValueError:
            return cell #If the cell is not a valid timestamp, return the original value
    elif isinstance(cell, datetime): #Check if the cell is already in datetime format
        return cell.isoformat() #Return the ISO format of the datetime
    return cell  # Return other cell types unchanged

                

In [99]:
def combine_excel_files(knowledge_base_path, columns_to_remove=None, address_cols=None,  merge_address=True):
    """
    Combines multiple Excel files into a single Pandas DataFrame, handling 
    variations in column positions and optional column removal/address merging.

    Args:
        knowledge_base_path: A glob pattern (e.g., "path/to/files/*.xlsx").
        columns_to_remove (optional): A list of column names to remove.
        address_cols (optional): A list of address column names to merge.
        merge_address (optional): A boolean value, whether to merge address columns or not.

    Returns:
        A JSON strings containing the combined data, or None if an error occurs.
        Prints error messages to the console.
    """
    # Initialize as an empty list
    excel_files = []  
    all_data = [] 

    # Handle initial glob pattern:
    initial_paths = glob.glob(knowledge_base_path)

    for path in initial_paths:
        if os.path.isdir(path):  # If it's a directory, glob inside it
            excel_files.extend(glob.glob(os.path.join(path, "*.xlsx")))  # Add Excel files from subdirectory
        elif os.path.isfile(path) and path.lower().endswith(('.xls', '.xlsx')): #If it is a file and ends with .xls or .xlsx
            excel_files.append(path) #Append the file path
        else:
            print(f"Skipping: {path} (Not a directory or an Excel file)")
            
    if not excel_files:
        print(f"No Excel files found matching pattern: {knowledge_base_path} or in subdirectories.")
        return None

    for file_path in excel_files:
        try:
            if os.path.isdir(file_path): #Check if the path is a directory
                print(f"Skipping directory: {file_path}")
                continue #Skip to the next path
   
            df = pd.read_excel(file_path)
            print(f"Processing file: {file_path}")  # Indicate which file is being processed
            
            
             # 0. Lowercase Headers:  Do this *immediately* after reading the file
            df.columns = df.columns.str.lower()  # Convert column names to lowercase
            columns_to_remove = [col.lower() for col in columns_to_remove]
            address_cols = [col.lower() for col in address_cols]
            
            # 1. Merge Address Columns (if specified and all columns are present)
            if merge_address and address_cols and all(col.lower() in df.columns.str.lower() for col in address_cols): #Check lowercase of the columns
                df['dia chi don vi doi tac'] = df[address_cols].apply(lambda row: ' '.join(row.dropna()), axis=1) #Lowercase the new column name
                df = df.drop(address_cols, axis=1)
            elif merge_address and address_cols and not all(col.lower() in df.columns.str.lower() for col in address_cols): #Check lowercase of the columns
                print(f"Warning: Not all address columns found in {file_path}. Skipping address merge for this file.")

             # 2. Remove Specified Columns (if specified)
             if columns_to_remove:
                for col in columns_to_remove:
                    try:
                        df = df.drop(col, axis=1)  # Try to drop the column
                    except KeyError:  # If it's not found, catch the error and continue
                        print(f"Warning: Column '{col}' not found in {file_path}. Skipping.")  # Informative message
                    except Exception as e: # Catch other potential exceptions
                        print(f"An error occurred during column removal in {file_path}: {e}")
                    return None    

            # 3. Apply to the entire DataFrame
            #df = df.applymap(convert_timestamps) 
            #OR if you only want to convert timestamps in certain columns
             for col in df.columns:
                if df[col].dtype == 'datetime64[ns]':
                    df[col] = df[col].apply(lambda x: x.isoformat() if pd.notna(x) else None)



             # 4. Convert DataFrame to list of dictionaries ***
             data = df.to_dict(orient='records')# This is the crucial step
            
             # 5. Make JSON-ready dict
             document_meta = {"doc_path": file_path, "doc_type": "DataBase", "text": df.to_dict(orient='records')}
             all_data.append(document_meta)     
        except Exception as e:
        print(f"Error processing file {file_path}: {e}")
        return None
            
        return json.dumps(all_data, ensure_ascii=False, indent=4)  
    else:
        return None


IndentationError: unindent does not match any outer indentation level (<tokenize>, line 82)

In [102]:
columns_to_remove = [
    'MA HQ', 'MA HAI QUAN', 'TEN HAI QUAN', 'SO DT', 'DIA DIEM CHO THONG QUAN',
    'DIA DIEM DO HANG', 'DIA DIEM XEP HANG', 'P.TIEN V.CHUYEN', 'Tri GIA'
]
address_cols = ['dia chi 1', 'dia chi 2', 'dia chi 3', 'dia chi 4']

combined_df = combine_excel_files(knowledge_base_path, columns_to_remove, address_cols)

if combined_df is not None:
    print("Combined DataFrame:")
    print(combined_df)
    #combined_df.to_excel("combined_file.xlsx", index=False)  # Save to a new Excel file
    #Save to a new Excel file (optional):
    #processed_df.to_excel("processed_file.xlsx", index=False)



Processing file: Data_test\Raw Data\72NK.T1.2024test.xlsx
Combined DataFrame:
[
    {
        "doc_path": "Data_test\\Raw Data\\72NK.T1.2024test.xlsx",
        "doc_type": "DataBase",
        "text": [
            {
                "ma lh": "A12",
                "ma doanh nghiep": "278759-05-30T00:00:00",
                "nha nhap khau": "CôNG TY TNHH HANWA SMC STEEL SERVICE Hà NộI",
                "dia chi": "Lô số 47, Khu công nghiệp Quang Minh, thị trấn Quang Minh, huyện Mê Linh, TP Hà Nội, Việt Nam",
                "don vi doi tac": "GUANGZHOU HANWA TRADING CO., LTD.",
                "dk tt": "KC",
                "dk gh": "CFR",
                "ma nt": "USD",
                "ti gia": "2035-11-16T00:00:00",
                "ngay": "2024-01-03T00:00:00",
                "ma hang": "199808-12-11T00:00:00",
                "ten hang": "Thép hợp kim (Bo > 0.0008%), dạng cuộn, cán phẳng, được mạ kẽm bằng phương pháp điện phân, mác SECC-MD1 (tương đương SECC JIS G3313), mới 100%. K

  df = df.applymap(convert_timestamps)


In [None]:
#Testing
'''
file_path="Data_test\\72NK.T1.2024test.xlsx"
df = pd.read_excel(file_path)
print(f"Columns in {file_path}: {df.columns}")  # Print the columns!
'''

In [None]:
def combine_df_translate(combined_df, header_translations=None):
    """
    Translates headers of a combined DataFrame.

    Args:
        combined_df: The combined Pandas DataFrame.
        header_translations (optional): A dictionary of header translations.

    Returns:
        A Pandas DataFrame with translated headers, or the original DataFrame 
        if no translations are provided or if an error occurs.  Prints error 
        messages to the console.
    """
    if combined_df is None:
        print("Error: combined_df is None. Cannot translate headers.")
        return None
     # 0. Lowercase Headers:  Do this *immediately* after reading the file
    header_translations = {k.lower(): v for k, v in header_translations.items()}
        
    if header_translations:
        try:
            translated_df = combined_df.rename(columns=header_translations)
            return translated_df
        except KeyError as e:  # Handle cases where translation key is not found
            print(f"Error translating headers: Key '{e}' not found in DataFrame.")
            return combined_df # Return the dataframe without translation if there is error.
        except Exception as e: # Catch other potential exceptions
            print(f"An error occurred during header translation: {e}")
            return None
    else:
        return combined_df  # Return the original DataFrame if no translations

In [None]:
# Example usage (assuming you have a combined_df from your file processing):
# ... (your code to combine Excel files into combined_df) ...
header_translations = {
    "MA LH": "Import_Code", 
    "NHA NHAP KHAU" : "Cty nhập khẩu",
    "DIA CHI" : "Địa chỉ cty nhập ",
    "DON VI DOI TAC":"Cty bán hàng", 
    "DIA CHI DON VI DOI TAC" : "Địa chỉ cty bán hàng",
    "DK TT": "Hình Thức Thanh Toán",
    "DK GH": "Incoterm",
    "MA NT": "Payment_Currency",
    "TI GIA": "Exchange_Rate",
    "NGAY" : "Date",
    "MA HANG" : "HSCODE",
    "TEN HANG": "Product_Info", 
    "LUONG": "Quantity",
    "DVT" : "Unit",
    "DON GIA" : "Unit_Price",
    "THUE XNK" : "Import_Tax",
    "XUAT XU" : "Origin", 
    "MA DOANH NGHIEP" : "Tax ID",
   
}

translated_df = combine_df_translate(combined_df, header_translations)
if translated_df is not None:
    print("DataFrame with translated headers:")
    print(translated_df.head())
    # ... (continue processing or save the translated_df)import pandas as pd

In [None]:
def create_vector_documents_from_df(translated_df, selected_cols=None, output_file="vector_documents.txt"):
    # ... (your code for handling missing input)
    """
    Creates a text file of vector documents from a translated DataFrame.

    Args:
        translated_df: The Pandas DataFrame with translated headers.
        output_file: The name of the output text file.
    """
    documents = []  # List to store dictionaries (documents and metadata)
    
    if selected_cols is None:
        print("Error: selected_cols is None. Cannot select headers.")
        return None
        
    if translated_df is None:
        print("Error: translated_df is None. Cannot create vector documents.")
        return
  

    with open(output_file, "w", encoding="utf-8") as outfile:
        try:
            for _, row in translated_df.iterrows():
                document_parts = []
                for col in selected_cols:
                    value = row.get(col)
                    if value is not None:
                        document_parts.append(col +" "+ str(value))
                document = " ".join(document_parts)
                document_data = {"text": document, "doc_type": "DataBase"}
                documents.append(document_data)
                
                outfile.write(document + "\n")
        except Exception as e:
            print(f"An error occurred during document creation: {e}")
            return
            
    return documents
    print(f"Vector documents written to {output_file}")


In [None]:
# Example usage (assuming you have a translated_df):

# ... (your code to combine and translate Excel files into translated_df) ...
selected_cols = [
        "Import_Code", "Seller", "Incorm_Term", "Payment_Currency",
        "HSCODE", "Product_Info", "Origin", "Buyer", "Buyer_address",
        "Seller_address", "doc_type"
    ]

selected_cols_2 = [
    "Import_Code", 
    "Importer",
    "Importer_address",
    "Seller", 
    "Seller_address",
    "Payment_Term",
    "Incoterm",
    "Payment_Currency",
    "Exchange_Rate",
    "Date",
    "HSCODE",
    "Product_Info", 
    "Quantity",
    "Unit",
    "Unit_Price",
    "Import_Tax",
    "Origin", 
    "Tax ID",
    ]

documents = create_vector_documents_from_df(translated_df,selected_cols_2)  # Use the translated DataFrame

# Or specify the output file name:
# create_vector_documents_from_df(translated_df, "my_vectors.txt")

In [None]:
print(documents)

In [None]:
# Convert to LangChain Documents(this is the crucial step)
langchain_documents = []
for doc in documents:
    metadata = {"doc_type": doc.get("doc_type", "")}  # Create a dictionary for metadata
    langchain_doc = Document(page_content=doc['text'], metadata=metadata)
    langchain_documents.append(langchain_doc)

In [None]:
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = text_splitter.split_documents(langchain_documents)

In [None]:
len(chunks)

In [None]:
doc_types = set(chunk.metadata['doc_type'] for chunk in chunks)
print(f"Document types found: {', '.join(doc_types)}")

In [None]:
# Choose a suitable model.  'all-mpnet-base-v2' is a good general-purpose option.
embeddings = SentenceTransformerEmbeddings(model_name="all-mpnet-base-v2")  # Or another Sentence Transformer model
# Delete if already exists
if os.path.exists(db_name):
    Chroma(persist_directory=db_name, embedding_function=embeddings).delete_collection()

In [None]:
vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=db_name) # Use the chunks (which are LangChain Documents)

In [None]:
# Let's investigate the vectors

collection = vectorstore._collection
count = collection.count()

sample_embedding = collection.get(limit=1, include=["embeddings"])["embeddings"][0]
dimensions = len(sample_embedding)
print(f"There are {count:,} vectors with {dimensions:,} dimensions in the vector store")

In [None]:
# Prework

result = collection.get(include=['embeddings', 'documents', 'metadatas'])
vectors = np.array(result['embeddings'])
documents = result['documents']
doc_types = [metadata['doc_type'] for metadata in result['metadatas']]
colors = [['blue'][['DataBase'].index(t)] for t in doc_types]

In [None]:
# Let's try 3D!
tsne = TSNE(n_components=3, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

# Create the 3D scatter plot
fig = go.Figure(data=[go.Scatter3d(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    z=reduced_vectors[:, 2],
    mode='markers',
    marker=dict(size=5, color=colors, opacity=0.8),
    text=[f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)],
    hoverinfo='text'
)])

fig.update_layout(
    title='3D Chroma Vector Store Visualization',
    scene=dict(xaxis_title='x', yaxis_title='y', zaxis_title='z'),
    width=900,
    height=700,
    margin=dict(r=20, b=10, l=10, t=40)
)

fig.show()

In [None]:
# create a new Chat with Gemini
#llm = ChatVertexAI(temperature=0.7, model_name=MODEL)
# create a new Chat with OpenAI
llm = ChatOpenAI(temperature=0.7, model_name="gpt-4o-mini")

# Rebuild the model (this is the crucial missing step)
ChatVertexAI.model_rebuild()

# set up the conversation memory for the chat
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

# the retriever is an abstraction over the VectorStore that will be used during RAG
retriever = vectorstore.as_retriever(search_kwargs={"k": 25})

# Putting it together: set up the conversation chain with Gemini, 
# the vector store, and memory
conversation_chain = ConversationalRetrievalChain.from_llm(
    llm=llm, 
    retriever=retriever, 
    memory=memory
)

In [None]:
query = "what Samsung C&T importing?"
result = conversation_chain.invoke({"question":query})
print(result["answer"])

In [None]:
# set up a new conversation memory for the chat
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

# putting it together: set up the conversation chain with the GPT 4o-mini LLM, the vector store and memory
conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)

In [None]:
# Wrapping in a function - note that history isn't used, as the memory is in the conversation_chain

def chat(message, history):
    result = conversation_chain.invoke({"question": message})
    return result["answer"]

In [None]:
# And in Gradio:

view = gr.ChatInterface(chat, type="messages").launch(inbrowser=False, share=True)

In [None]:
from langchain_core.callbacks import StdOutCallbackHandler
llm = ChatOpenAI(temperature=0.7, model_name="gpt-4o-mini")
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
retriever = vectorstore.as_retriever(search_kwargs={"k": 25})
conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory, callbacks=[StdOutCallbackHandler()])

query = "what VSSC import?"
result = conversation_chain.invoke({"question": query})
answer = result["answer"]
print("\nAnswer:", answer)