In [None]:
import base64
import json
import os
import tempfile
import xml.etree.ElementTree as ET
from pprint import pprint

import cv2
import ffmpeg
import numpy as np
import pandas as pd
import speech_recognition as sr
from docx import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader, PyPDFLoader, UnstructuredPowerPointLoader

class DataLoader:
    """Turn every supported file under a folder into a flat list of record dicts.

    Text-bearing records have the keys ``chunk_no``, ``text``, ``path``,
    ``file_type`` and ``media_type``; image records carry a base64-encoded
    ``image`` instead of ``text``. Every ``process_*`` handler prints the
    reason and returns ``None`` when a file cannot be processed, so a single
    bad file does not abort ``load_data``.
    """

    # Extension groups routed to a shared handler by process_file().
    _AUDIO_EXTENSIONS = ('.wav', '.mp3')  # .mp3 is transcoded to WAV before recognition
    _IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg', '.bmp', '.gif')
    _VIDEO_EXTENSIONS = ('.mp4', '.avi', '.mov', '.wmv', '.flv', '.mkv')

    def __init__(self, folder_path, chunk_size=100, chunk_overlap=10):
        """Remember the folder to walk and the text-splitting parameters.

        Args:
            folder_path: Root directory scanned recursively by ``load_data``.
            chunk_size: Maximum chunk length handed to the text splitter (and
                the slice width used for audio transcripts).
            chunk_overlap: Overlap handed to the text splitter.
        """
        self.folder_path = folder_path
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    # ------------------------------------------------------------------
    # Shared helpers
    # ------------------------------------------------------------------
    def _build_splitter(self, **splitter_options):
        """Create a text splitter configured with this loader's chunk settings."""
        return RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            **splitter_options
        )

    @staticmethod
    def _text_records(texts, file_path, file_type):
        """Wrap an iterable of text chunks into zero-based numbered record dicts."""
        return [
            {
                "chunk_no": int(idx),
                "text": text,
                "path": file_path,
                "file_type": file_type,
                "media_type": "text"
            }
            for idx, text in enumerate(texts)
        ]

    @staticmethod
    def _whole_file_record(text, file_path, file_type):
        """Wrap an unsplit document into the single-record list used for csv/xml/json."""
        return [{
            "chunk_no": 1,
            "text": text,
            "path": file_path,
            "file_type": file_type,
            "media_type": "text"
        }]

    def _load_and_split(self, loader, file_path, file_type):
        """Load documents through a LangChain loader and split them into chunk records."""
        documents = loader.load()
        chunks = self._build_splitter().split_documents(documents)
        return self._text_records(
            (chunk.page_content for chunk in chunks), file_path, file_type
        )

    @staticmethod
    def _new_temp_wav_path():
        """Reserve a unique temporary ``.wav`` path (the empty file is created on disk)."""
        fd, wav_path = tempfile.mkstemp(suffix=".wav")
        os.close(fd)
        return wav_path

    @staticmethod
    def _transcode_to_wav(source_path, wav_path):
        """Write the audio of ``source_path`` to ``wav_path`` as 16-bit PCM, 16 kHz WAV."""
        (
            ffmpeg
            .input(source_path)
            .output(wav_path, acodec='pcm_s16le', ar='16000')  # PCM 16-bit WAV, 16kHz sample rate
            .run(overwrite_output=True, capture_stdout=True, capture_stderr=True)
        )

    # ------------------------------------------------------------------
    # Document handlers
    # ------------------------------------------------------------------
    def process_text_file(self, file_path):
        """Split a UTF-8 text file into chunk records; ``None`` on failure."""
        try:
            loader = TextLoader(file_path, encoding="utf-8")
            return self._load_and_split(loader, file_path, "text")
        except Exception as e:
            print(f"Error processing text file {file_path}: {e}")
            return None

    def process_pdf_file(self, file_path):
        """Split a PDF into chunk records; ``None`` on failure."""
        try:
            return self._load_and_split(PyPDFLoader(file_path), file_path, "pdf")
        except Exception as e:
            print(f"Error processing PDF file {file_path}: {e}")
            return None

    def process_pptx_file(self, file_path):
        """Split a PowerPoint deck into chunk records; ``None`` on failure."""
        try:
            return self._load_and_split(
                UnstructuredPowerPointLoader(file_path), file_path, "pptx"
            )
        except Exception as e:
            print(f"Error processing PPTX file {file_path}: {e}")
            return None

    def process_xlsx_file(self, file_path):
        """Return one record per worksheet, numbered from 1; ``None`` on failure.

        The sheet is rendered with ``DataFrame.to_string()`` so that ``text``
        is a string, like every other text record (``process_csv_file`` uses
        the same rendering).
        """
        try:
            chunked_documents = []
            # The context manager closes the workbook handle; parsing from the
            # open ExcelFile avoids re-reading the file once per sheet.
            with pd.ExcelFile(file_path) as xls:
                for sheet_no, sheet_name in enumerate(xls.sheet_names, start=1):
                    df = xls.parse(sheet_name)
                    chunked_documents.append({
                        "chunk_no": sheet_no,
                        "text": df.to_string(),
                        "path": file_path,
                        "file_type": "table",
                        "media_type": "text"
                    })
            return chunked_documents
        except Exception as e:
            print(f"Error processing XLSX file {file_path}: {e}")
            return None

    def process_docx_file(self, file_path):
        """Split the paragraphs of a DOCX file into chunk records; ``None`` on failure."""
        # NOTE: This requires python-docx installed
        try:
            doc = Document(file_path)
            text = '\n'.join(para.text for para in doc.paragraphs)
            chunks = self._build_splitter(separators=[" "]).split_text(text)
            return self._text_records(chunks, file_path, "docx")
        except Exception as e:
            print(f"Error processing DOCX file {file_path}: {e}")
            return None

    def process_csv_file(self, file_path):
        """Return the whole CSV rendered as one text record; ``None`` on failure."""
        try:
            csv_data = pd.read_csv(file_path).to_string()
            return self._whole_file_record(csv_data, file_path, "csv")
        except Exception as e:
            print(f"Error processing CSV file {file_path}: {e}")
            return None

    def process_xml_file(self, file_path):
        """Return the serialized XML root element as one text record; ``None`` on failure."""
        try:
            root = ET.parse(file_path).getroot()
            xml_data = ET.tostring(root, encoding='unicode')
            return self._whole_file_record(xml_data, file_path, "xml")
        except Exception as e:
            print(f"Error processing XML file {file_path}: {e}")
            return None

    def process_json_file(self, file_path):
        """Return the JSON document re-serialized with indent=4 as one text record."""
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                json_data = json.dumps(json.load(file), indent=4)
            return self._whole_file_record(json_data, file_path, "json")
        except Exception as e:
            print(f"Error processing JSON file {file_path}: {e}")
            return None

    # ------------------------------------------------------------------
    # Media handlers
    # ------------------------------------------------------------------
    def process_audio_file(self, file_path, source_path=None):
        """Transcribe an audio file and slice the transcript into chunk records.

        Args:
            file_path: Audio file to transcribe. ``.mp3`` input is first
                transcoded to a temporary WAV with ffmpeg; anything else is
                handed to SpeechRecognition as-is.
            source_path: Optional path to store in each record's ``path``
                instead of ``file_path`` (used when ``file_path`` is a
                temporary extraction of another file).

        Returns:
            A list of records with ``file_type`` ``"audio"``, or ``None`` when
            conversion or recognition fails.
        """
        record_path = file_path if source_path is None else source_path
        temp_wav_path = None
        try:
            if os.path.splitext(file_path)[1].lower() == '.mp3':
                temp_wav_path = self._new_temp_wav_path()
                self._transcode_to_wav(file_path, temp_wav_path)
                actual_file_path = temp_wav_path
            else:
                actual_file_path = file_path

            recognizer = sr.Recognizer()
            with sr.AudioFile(actual_file_path) as source:
                audio_data = recognizer.record(source)
                text = recognizer.recognize_google(audio_data)  # Using Google Web Speech API for recognition
            chunks = [text[i:i + self.chunk_size] for i in range(0, len(text), self.chunk_size)]
            return self._text_records(chunks, record_path, "audio")
        except Exception as e:
            print(f"Error processing audio file {file_path}: {e}")
            return None
        finally:
            # Clean up the temporary WAV if one was created for an MP3.
            if temp_wav_path and os.path.exists(temp_wav_path):
                os.remove(temp_wav_path)

    def process_image_file(self, file_path):
        """Return a single image record holding the file's base64-encoded bytes."""
        try:
            with open(file_path, 'rb') as image_file:
                image_base64 = base64.b64encode(image_file.read()).decode('utf-8')
            return [{
                "path": file_path,
                "file_type": "image",
                "image": image_base64,
                "media_type": "image"
            }]
        except Exception as e:
            print(f"Error processing image file {file_path}: {e}")
            return None

    def _video_audio_records(self, file_path):
        """Extract and transcribe a video's audio track; returns [] when that fails."""
        audio_path = None
        try:
            # A unique temp file avoids rewriting the video path by string
            # replacement and cannot overwrite a real file next to the video.
            audio_path = self._new_temp_wav_path()
            print(f"Extracting audio from {file_path} to {audio_path}...")
            self._transcode_to_wav(file_path, audio_path)
            print("Audio extraction complete. Processing audio...")
            # Records point at the video, not at the temp WAV that is deleted below.
            audio_extracted_data = self.process_audio_file(audio_path, source_path=file_path)
            if audio_extracted_data:
                print(f"Added {len(audio_extracted_data)} audio chunks.")
                return audio_extracted_data
            print("No audio data extracted or processed.")
        except ffmpeg.Error as e:
            print(f"FFmpeg error extracting audio from {file_path}: {e.stderr.decode('utf8')}")
        except Exception as e:
            print(f"An unexpected error occurred during audio extraction: {e}")
        finally:
            if audio_path and os.path.exists(audio_path):
                os.remove(audio_path)  # Clean up temporary audio file
                print(f"Cleaned up temporary audio file: {audio_path}")
        return []

    def _video_frame_records(self, file_path):
        """Sample roughly one frame per second and return them as base64 JPEG records."""
        frames = []
        cap = None
        try:
            print(f"Extracting frames from {file_path} using OpenCV...")
            cap = cv2.VideoCapture(file_path)
            if not cap.isOpened():
                raise IOError(f"Error: Could not open video file {file_path}")

            fps = cap.get(cv2.CAP_PROP_FPS)
            if not fps > 0:  # also true for NaN, which int() below could not convert
                print("Warning: FPS is 0, cannot extract frames by interval. Extracting every Nth frame instead.")
                frames_to_skip = 30  # Default to every 30 frames if FPS is unknown
            else:
                frame_interval = 1  # seconds between sampled frames
                # Never let the stride reach 0 (fps < 1), which would break the modulo below.
                frames_to_skip = max(1, int(fps * frame_interval))

            encode_param = [int(cv2.IMWRITE_JPEG_QUALITY), 90]
            current_frame_idx = 0
            while True:
                ret, frame = cap.read()
                if not ret:
                    break  # End of video

                if current_frame_idx % frames_to_skip == 0:
                    # Convert frame (NumPy array) to JPEG bytes
                    ret_encode, buffer = cv2.imencode('.jpg', frame, encode_param)
                    if ret_encode:
                        frames.append({
                            "chunk_no": len(frames),  # counts extracted frames, not raw frame index
                            "image": base64.b64encode(buffer).decode('utf-8'),
                            "path": file_path,
                            "file_type": "video_frame",
                            "media_type": "image"
                        })
                current_frame_idx += 1

            print(f"Extracted {len(frames)} video frames.")
        except Exception as e:
            print(f"Error processing video frames from {file_path}: {e}")
        finally:
            if cap is not None:
                cap.release()  # Ensure release on success and on error
        return frames

    def process_video_file(self, file_path):
        """Return transcript chunks of the audio track followed by sampled frame records.

        Either part may be missing if its extraction fails; failures are
        printed and the remaining part is still returned.
        """
        video_data = []
        video_data.extend(self._video_audio_records(file_path))
        video_data.extend(self._video_frame_records(file_path))
        return video_data

    # ------------------------------------------------------------------
    # Dispatch
    # ------------------------------------------------------------------
    def process_file(self, file_path):
        """Route a file to its handler by (case-insensitive) extension.

        Returns the handler's record list, or ``None`` for unsupported types.
        Legacy ``.doc`` files are reported as unsupported: the handler for
        them is not implemented in this class.
        """
        extension = os.path.splitext(file_path)[1].lower()
        handlers = {
            '.txt': self.process_text_file,
            '.pdf': self.process_pdf_file,
            '.pptx': self.process_pptx_file,
            '.xlsx': self.process_xlsx_file,
            '.docx': self.process_docx_file,
            '.csv': self.process_csv_file,
            '.xml': self.process_xml_file,
            '.json': self.process_json_file,
        }
        handlers.update(dict.fromkeys(self._AUDIO_EXTENSIONS, self.process_audio_file))
        handlers.update(dict.fromkeys(self._IMAGE_EXTENSIONS, self.process_image_file))
        handlers.update(dict.fromkeys(self._VIDEO_EXTENSIONS, self.process_video_file))

        handler = handlers.get(extension)
        if handler is None:
            print(f"Unsupported file type: {extension} for file {file_path}")
            return None
        return handler(file_path)

    def load_data(self):
        """Walk ``folder_path`` recursively and collect the records of every file."""
        all_data = []
        for root, _dirs, files in os.walk(self.folder_path):
            for file in files:
                file_path = os.path.join(root, file)
                print(f"Processing file: {file_path}")
                processed_data = self.process_file(file_path)
                if processed_data is not None:
                    all_data.extend(processed_data)
                else:
                    print(f"Skipping file due to processing error or unsupported type: {file_path}")
        return all_data

Info: win32com is not available on non-Windows platforms. DOC file processing will be skipped.


In [3]:


import weaviate
from weaviate.classes.config import Configure, Multi2VecField
from weaviate.classes.query import Filter
import base64
import os
import re
from hashlib import md5

class DatabaseClient:
    """Weaviate-backed store for the records produced from one folder.

    On construction the client connects to a local Weaviate instance, derives
    a collection name from the folder path and, if that collection does not
    exist yet, creates it and ingests every record ``DataLoader`` produces for
    the folder. An existing collection is reused without re-ingestion.
    """

    def __init__(self, folder_path):
        """Connect, ensure the collection exists, and ingest on first creation.

        Raises:
            Exception: Whatever collection creation or ingestion raised. The
                connection is closed before re-raising, and a collection that
                was created by this call is deleted so the next run ingests
                again instead of reusing an empty or partial collection.
        """
        self.__folder_path = folder_path
        self.__create_client()
        try:
            if self.__generate_collection():
                try:
                    self.loader = DataLoader(folder_path, chunk_size=300, chunk_overlap=50)
                    self.__data_ingestion(folder_path)
                except Exception:
                    self.client.collections.delete(self.__get_hashed_path())
                    raise
        except Exception:
            self.client.close()
            raise

    def __create_client(self):
        """Open the connection to the local Weaviate instance on port 8081."""
        self.client = weaviate.connect_to_local(port=8081)

    def __get_hashed_path(self):
        """Build the collection name: sanitized folder name + 6 hex chars of the path's MD5.

        Names that do not start with an uppercase letter (including an empty
        name) are prefixed with ``Doc`` and capitalized.
        """
        folder_name = os.path.basename(self.__folder_path)
        sanitized_name = re.sub(r'[^A-Za-z0-9_]', '', folder_name)
        # ''.isupper() is False, so an empty name takes the prefix branch too.
        if not sanitized_name[:1].isupper():
            sanitized_name = "Doc" + sanitized_name.capitalize()
        hash_suffix = md5(self.__folder_path.encode()).hexdigest()[:6]
        return f"{sanitized_name}_{hash_suffix}"

    def __generate_collection(self):
        """Bind ``self.collection``; return True only when it was newly created."""
        collection_name = self.__get_hashed_path()
        if self.client.collections.exists(collection_name):
            self.collection = self.client.collections.get(collection_name)
            return False

        self.collection = self.client.collections.create(
            name=collection_name,
            vectorizer_config=Configure.Vectorizer.multi2vec_clip(
                image_fields=[
                    Multi2VecField(name="image")
                ],
                text_fields=[
                    Multi2VecField(name="text"),
                    Multi2VecField(name="path")
                ]
            )
        )
        return True

    def __data_ingestion(self, folder_path):
        """Load every record for the folder and batch-insert it into the collection.

        ``folder_path`` is accepted for symmetry with the constructor; the
        records come from ``self.loader``. Objects rejected by the batch are
        counted and reported rather than silently dropped.
        """
        records = self.loader.load_data()
        with self.collection.batch.dynamic() as batch:
            for record in records:
                batch.add_object(properties=record)
        failed_objects = self.collection.batch.failed_objects
        if failed_objects:
            print(f"{len(failed_objects)} of {len(records)} objects failed to import into the collection.")

    @staticmethod
    def __media_filter(search_for):
        """Return a ``media_type`` filter, or None when searching across all media."""
        if search_for == "all":
            return None
        return Filter.by_property("media_type").equal(search_for)

    def search_with_text(self, query : str, search_for="all", limit=5):
        """Return the properties of the ``limit`` objects nearest to a text query.

        ``search_for`` is ``"all"`` or a ``media_type`` value to restrict to.
        """
        response = self.collection.query.near_text(
            query=query,
            filters=self.__media_filter(search_for),
            limit=limit
        )
        return [hit.properties for hit in response.objects]

    def __to_base64(self, path):
        """Read a file and return its contents as a base64 string."""
        with open(path, 'rb') as file:
            return base64.b64encode(file.read()).decode('utf-8')

    def search_with_image(self, image_path, search_for = 'all', limit=5):
        """Return the properties of the ``limit`` objects nearest to an image file.

        ``search_for`` is ``"all"`` or a ``media_type`` value to restrict to.
        """
        response = self.collection.query.near_image(
            near_image=self.__to_base64(image_path),
            filters=self.__media_filter(search_for),
            limit=limit
        )
        return [hit.properties for hit in response.objects]

    def list_collections(self):
        """Return all collections known to the connected Weaviate instance."""
        return self.client.collections.list_all()

    def delete_collections(self, name):
        """Delete the collection called ``name``."""
        return self.client.collections.delete(name)

    def close_connection(self):
        """Close the Weaviate connection."""
        self.client.close()

    def __repr__(self):
        return f"""Folder: {self.__folder_path}"""


In [None]:
class RetrieverClient:
    """Thin search facade over ``DatabaseClient`` that groups hits by media type."""

    def __init__(self, folder_path):
        """Open (and, on first use, populate) the database for ``folder_path``."""
        self.__database = DatabaseClient(folder_path)

    def __retrieve(self, text=None, image_path=None, search_for="all", limit=5):
        """Collect raw hits: image-query results first, then text-query results.

        Returns an empty list when neither a text nor an image query is given.
        """
        hits = []
        if image_path is not None:
            hits = hits + self.__database.search_with_image(
                image_path, search_for=search_for, limit=limit
            )
        if text is not None:
            hits = hits + self.__database.search_with_text(
                text, search_for=search_for, limit=limit
            )
        return hits

    def __organize_by_media_type(self, properties):
        """Bucket hits under 'text' / 'image', dropping exact duplicates per bucket."""
        grouped = {'text': [], 'image': []}
        for hit in properties:
            for media_type in ('text', 'image'):
                bucket = grouped[media_type]
                if hit["media_type"] == media_type and hit not in bucket:
                    bucket.append(hit)
        return grouped

    def search(self, text=None, image_path=None, search_for="all", limit=5):
        """
            Inputs: text and/or an image path
            Options: search_for [all, text, image (named "media" in the frontend)],
                    limit (applied to each query separately)
            Output: {"text": [],
                    "image": []}
                a dictionary whose "text" and "image" fields are lists of hit properties
        """
        raw_hits = self.__retrieve(text, image_path, search_for=search_for, limit=limit)
        return self.__organize_by_media_type(raw_hits)

    def close_database_connection(self):
        """Close the underlying database connection."""
        self.__database.close_connection()
        

In [None]:
import ollama
import speech_recognition as sr

class Chat:
    """Multi-turn conversation with an Ollama model, keeping the message history.

    User turns may combine text, image paths and an audio file; the audio is
    transcribed and injected into the prompt as context. Assistant replies are
    NOT added to the history automatically: call ``append_assistant_message``
    with the reply content to keep the conversation state complete.
    """

    def __init__(self, model="gemma3:latest"):
        """Start an empty conversation bound to the given Ollama model name."""
        self.__messages = []
        self.__model = model

    def __process_audio_file(self, file_path):
        """Transcribe an audio file; returns None (after printing) on failure."""
        recognizer = sr.Recognizer()
        try:
            with sr.AudioFile(file_path) as source:
                audio_data = recognizer.record(source)
                return recognizer.recognize_google(audio_data)

        except Exception as e:
            print(f"Error processing audio file {file_path}: {e}")
            return None

    def __append_user_message(self, user_query=None, user_image_paths=None, user_audio_path=None):
        """Append one user turn to the history.

        Args:
            user_query: Question text; defaults to ``"Describe."`` when None.
            user_image_paths: A single image path or a list/tuple/set of paths.
            user_audio_path: Optional audio file whose transcript becomes the
                prompt's "Given Context".
        """
        if user_query is None:
            user_query = "Describe."

        # Accept either one path or a collection of paths.
        if not user_image_paths:
            image_paths = []
        elif isinstance(user_image_paths, (list, tuple, set)):
            image_paths = list(user_image_paths)
        else:
            image_paths = [user_image_paths]

        audio_content = ""
        if user_audio_path:
            # A failed transcription returns None; keep the context empty
            # rather than sending the literal text "None" to the model.
            audio_content = self.__process_audio_file(user_audio_path) or ""

        query_with_context = f"""Given Context: {audio_content}
                                 Query: {user_query}"""
        self.__messages.append({"role": "user", "content": query_with_context, "images": image_paths})

    def append_assistant_message(self, content):
        """Record an assistant reply in the history."""
        self.__messages.append({"role": "assistant", "content" : content})

    def get_assistant_response(self, user_text=None, user_image_path=None, user_audio_path=None):
        """Append the user turn and return the model's response for the full history.

        The returned response is not appended to the history by this method.
        """
        self.__append_user_message(user_text, user_image_path, user_audio_path)
        return ollama.chat(self.__model, self.__messages, options={"temperature":0})

    def get_history(self):
        """Return the list of message dicts exchanged so far."""
        return self.__messages


In [30]:
# Folder to index. Set the LUMAR_DOCUMENTS_DIR environment variable to point the
# notebook at another location; the fallback is the path this notebook was
# originally run with.
DOCUMENTS_DIR = os.environ.get('LUMAR_DOCUMENTS_DIR', '/Users/anushverma/Desktop/lumar/_archive/documents')
retriever = RetrieverClient(DOCUMENTS_DIR)
chat = Chat()

In [None]:
# Example turn: the user attaches only an image (no text, no audio), so the
# prompt falls back to the default "Describe." query.
response = chat.get_assistant_response(user_image_path='/Users/anushverma/Desktop/lumar/_archive/documents/cat3.jpg')

In [33]:
# Convert the response to a plain dict so the notebook displays all of its fields.
dict(response)

{'model': 'gemma3:latest',
 'created_at': '2025-05-23T11:56:12.747742Z',
 'done': True,
 'done_reason': 'stop',
 'total_duration': 27605313166,
 'load_duration': 60969708,
 'prompt_eval_count': 291,
 'prompt_eval_duration': 24229854167,
 'eval_count': 126,
 'eval_duration': 3278491333,
 'message': Message(role='assistant', content="Here's a description of the image:\n\nThe image shows a beautiful, fluffy, grey British Shorthair cat. It's a classic British Shorthair with a dense, plush coat in shades of grey. The cat has striking amber-colored eyes and a dignified expression. \n\nIt's wearing a miniature, ornate golden crown adorned with glittering jewels. A red and black feathered ribbon is tied around its neck. The background is a dark, solid black, which makes the cat and its crown stand out dramatically. The overall effect is regal and humorous, portraying the cat as a miniature king or queen.", images=None, tool_calls=None)}