In [29]:
import base64
import re
import warnings
from hashlib import md5

import weaviate
from weaviate.classes.config import Configure, Multi2VecField
from weaviate.classes.query import Filter

from loader import DataLoader

class DatabaseClient:
    """Weaviate-backed search index for the contents of one local folder.

    Constructing the client connects to a local Weaviate instance, prepares a
    collection whose name is derived from ``folder_path`` and, when the
    collection was (re)created, fills it with the objects returned by
    ``DataLoader.load_data()``.

    The instance can be used as a context manager so the connection is always
    closed::

        with DatabaseClient(folder) as database:
            database.search_with_text("...")
    """

    # Values accepted by the ``search_for`` argument of the search methods.
    _SEARCH_SCOPES = ("all", "image", "text")

    def __init__(self, folder_path, recreate=True):
        """Connect to Weaviate and prepare the collection for ``folder_path``.

        Args:
            folder_path: Folder whose contents are indexed; also the source of
                the collection name.
            recreate: When True (the default, matching the previous behaviour)
                an existing collection for this folder is deleted, recreated
                and re-ingested. When False an existing collection is reused
                as-is and nothing is ingested.
        """
        self.__folder_path = folder_path
        self.__create_client()
        try:
            if self.__generate_collection(recreate):
                self.loader = DataLoader(folder_path, chunk_size=300, chunk_overlap=50)
                self.__data_ingestion(folder_path)
        except BaseException:
            # Setup failed half-way: do not leak the open connection.
            self.client.close()
            raise

    def __enter__(self):
        """Return the client itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the connection on leaving the ``with`` block; never suppresses errors."""
        self.close_connection()

    def __create_client(self):
        """Open the connection to the local Weaviate instance."""
        self.client = weaviate.connect_to_local()

    def __get_hashed_path(self):
        """Build the collection name: folder name + letters of the path's MD5.

        The folder name is the last component of ``folder_path`` (both ``\\``
        and ``/`` are treated as separators, trailing separators are ignored).
        Characters other than ASCII letters, digits and ``_`` are replaced by
        ``_`` and a ``Folder_`` prefix is added when the name would not start
        with a letter, so the result is usable as a Weaviate collection name.
        For plain alphanumeric folder names on backslash-separated paths the
        result is identical to what earlier versions produced.
        """
        raw_path = self.__folder_path
        folder_name = re.split(r"[\\/]", raw_path.rstrip("\\/"))[-1]
        folder_name = re.sub(r"[^0-9A-Za-z_]", "_", folder_name)
        if not folder_name[:1].isalpha():
            folder_name = "Folder_" + folder_name
        # The digest is taken over the untouched path string so names of
        # already-existing collections stay stable.
        digest = md5(raw_path.encode()).hexdigest()
        return folder_name + "".join(filter(str.isalpha, digest))

    def __generate_collection(self, recreate=True):
        """Create (or reuse) the collection for this folder.

        Returns:
            True when a fresh, empty collection was created and must be
            populated; False when an existing collection is being reused.
        """
        collection_name = self.__get_hashed_path()
        if self.client.collections.exists(collection_name):
            if not recreate:
                self.collection = self.client.collections.get(collection_name)
                return False
            self.client.collections.delete(collection_name)

        self.collection = self.client.collections.create(
            name=collection_name,
            vectorizer_config=Configure.Vectorizer.text2vec_ollama(
                api_endpoint="http://host.docker.internal:11434",
                model='nomic-embed-text'
            )
        )
        return True

    def __data_ingestion(self, folder_path):
        """Batch-insert every object produced by the loader into the collection.

        ``folder_path`` is accepted for signature compatibility only; the
        loader was already constructed with the folder. Objects rejected by
        the batch import are reported through a ``RuntimeWarning`` instead of
        being dropped silently.
        """
        records = self.loader.load_data()
        with self.collection.batch.dynamic() as batch:
            for record in records:
                batch.add_object(properties=record)

        failed = self.collection.batch.failed_objects
        if failed:
            warnings.warn(
                f"{len(failed)} object(s) could not be imported into Weaviate",
                RuntimeWarning,
                stacklevel=2,
            )

    def __scope_filter(self, search_for):
        """Translate ``search_for`` into a ``media_type`` filter (None for "all").

        Raises:
            ValueError: if ``search_for`` is not one of ``_SEARCH_SCOPES``.
        """
        if search_for not in self._SEARCH_SCOPES:
            raise ValueError(
                f"search_for must be one of {self._SEARCH_SCOPES}, got {search_for!r}"
            )
        if search_for == "all":
            return None
        return Filter.by_property("media_type").equal(search_for)

    def search_with_text(self, query : str, search_for="all", limit=5):
        """Return the properties of the objects closest to a text query.

        Args:
            query: Text to search with.
            search_for: "all", "image" or "text"; the latter two restrict the
                results to objects whose ``media_type`` property equals that value.
            limit: Maximum number of results (default 5).

        Returns:
            A list with the ``properties`` of every matching object.

        Raises:
            ValueError: if ``search_for`` is not a supported value.
        """
        # Validate before touching the collection so bad input fails fast.
        scope_filter = self.__scope_filter(search_for)
        response = self.collection.query.near_text(
            query=query,
            filters=scope_filter,
            limit=limit
        )
        return [item.properties for item in response.objects]

    def search_with_image(self, image_path, search_for = 'all', limit=5):
        """Return the properties of the objects closest to an image.

        The file at ``image_path`` is read and sent base64-encoded.

        NOTE(review): the collection is created with the ``text2vec_ollama``
        vectorizer; ``near_image`` queries normally need an image-capable
        vectorizer module -- confirm this works against the deployed instance.

        Args:
            image_path: Path of the image file to search with.
            search_for: "all", "image" or "text" (see ``search_with_text``).
            limit: Maximum number of results (default 5).

        Returns:
            A list with the ``properties`` of every matching object.

        Raises:
            ValueError: if ``search_for`` is not a supported value.
        """
        scope_filter = self.__scope_filter(search_for)
        with open(image_path, 'rb') as image_file:
            encoded_image = base64.b64encode(image_file.read()).decode('utf-8')
        response = self.collection.query.near_image(
            near_image=encoded_image,
            filters=scope_filter,
            limit=limit
        )
        return [item.properties for item in response.objects]

    def list_collections(self):
        """Return what ``client.collections.list_all()`` reports for the instance."""
        return self.client.collections.list_all()

    def delete_collections(self, name):
        """Delete the collection called ``name``."""
        return self.client.collections.delete(name)

    def close_connection(self):
        """Close the connection to Weaviate."""
        self.client.close()

    def __repr__(self):
        return f"""Folder: {self.__folder_path}"""


In [30]:
# Build the client for the sample documents folder. The constructor connects to the
# local Weaviate instance, drops any existing collection for this folder, recreates
# it and ingests the folder's contents.
# NOTE(review): absolute, machine-specific path -- consider a configurable base directory.
database = DatabaseClient(folder_path=r"C:\Users\Anush\Desktop\Christ\Specialization Project\Localinsight\text-only-test\documents")

In [32]:
# Show every collection known to the connected Weaviate instance (name -> config).
database.list_collections()

{'Documentseebedcaceeab': _CollectionConfigSimple(name='Documentseebedcaceeab', description=None, generative_config=None, properties=[_Property(name='path', description="This property was generated by Weaviate's auto-schema feature on Thu Aug 22 05:49:04 2024", data_type=<DataType.TEXT: 'text'>, index_filterable=True, index_range_filters=False, index_searchable=True, nested_properties=None, tokenization=<Tokenization.WORD: 'word'>, vectorizer_config=_PropertyVectorizerConfig(skip=False, vectorize_property_name=False), vectorizer='text2vec-ollama'), _Property(name='media_type', description="This property was generated by Weaviate's auto-schema feature on Thu Aug 22 05:49:04 2024", data_type=<DataType.TEXT: 'text'>, index_filterable=True, index_range_filters=False, index_searchable=True, nested_properties=None, tokenization=<Tokenization.WORD: 'word'>, vectorizer_config=_PropertyVectorizerConfig(skip=False, vectorize_property_name=False), vectorizer='text2vec-ollama'), _Property(name='te

In [33]:
# Natural-language question used for the text search in the next cell.
query = "Why did Alice decide to follow the White Rabbit down the rabbit-hole?"

In [34]:
# Nearest stored objects for the question, with no media_type filter
# (search_for defaults to "all"); at most 5 results are returned.
database.search_with_text(query)

[{'text': 'Alice was not a bit hurt, and she jumped up on to her feet in a moment: she looked up, but it was all dark overhead; before her was another long passage, and the White Rabbit was still in sight, hurrying down it. There was not a moment to be lost: away went Alice like the wind, and was just in time',
  'chunk_no': 23.0,
  'path': 'C:\\Users\\Anush\\Desktop\\Christ\\Specialization Project\\Localinsight\\text-only-test\\documents\\alice.txt',
  'media_type': 'text'},
 {'text': 'So she was considering in her own mind (as well as she could, for the hot day made her feel very sleepy and stupid), whether the pleasure of making a daisy-chain would be worth the trouble of getting up and picking the daisies, when suddenly a White Rabbit with pink eyes ran close by her.',
  'chunk_no': 3.0,
  'path': 'C:\\Users\\Anush\\Desktop\\Christ\\Specialization Project\\Localinsight\\text-only-test\\documents\\alice.txt',
  'media_type': 'text'},
 {'text': 'she had never before seen a rabbit wit