In [1]:
import os
import tempfile
from time import time
from typing import Any, Dict, List, Tuple

from dotenv import load_dotenv
from huggingface_hub import snapshot_download
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import torch

import embedding_utils as eu

##### Parameters and environment variables

In [2]:
# Load the connection settings from the local env file into the process environment.
load_dotenv('minio.env')

# MinIO (object storage) connection settings. A missing variable raises KeyError.
MINIO_URL = os.environ['MINIO_URL']
MINIO_ACCESS_KEY = os.environ['MINIO_ACCESS_KEY']
MINIO_SECRET_KEY = os.environ['MINIO_SECRET_KEY']
# Parse the flag case- and whitespace-insensitively so that values such as
# 'True' or 'TRUE' are not silently interpreted as "insecure".
MINIO_SECURE = os.environ['MINIO_SECURE'].strip().lower() == 'true'

# pgvector (vector database) connection settings. Values are kept as strings,
# exactly as read from the environment.
PGVECTOR_HOST = os.environ['PGVECTOR_HOST']
PGVECTOR_DATABASE = os.environ['PGVECTOR_DATABASE']
PGVECTOR_USER = os.environ['PGVECTOR_USER']
PGVECTOR_PASSWORD = os.environ['PGVECTOR_PASSWORD']
PGVECTOR_PORT = os.environ['PGVECTOR_PORT']

MODELS_BUCKET = 'hf-models'
EMBEDDING_MODEL = 'intfloat/multilingual-e5-small'  # Embedding model used to convert text chunks to vector embeddings.
EMBEDDING_MODEL_REVISION = 'ffdcc22a9a5c973ef0470385cef91e1ecb461d9f'  # Pinned model revision.

BATCH_SIZE = 1                      # Batch size passed to the encoder (100 was noted as an alternative value).
CHUNK_SIZE = 1000                   # Size of the text chunks which will be converted to vector embeddings.
CHUNK_OVERLAP = 10                  # Overlap between consecutive text chunks.
DIMENSION = 384                     # Embedding size.
BUCKET_NAME = 'custom-corpus'       # Bucket name for batch creation of embeddings.

##### The system's temp directory

In [3]:
# Display the directory that the tempfile module uses for temporary files.
# The paths logged by the model download/upload cells in this notebook sit under this directory.
tempfile.gettempdir()

'/var/folders/_5/jt7lb09d49n9qscq4l2m3sph0000gn/T'

##### Download the model from Hugging Face and upload it to MinIO

In [4]:
# Fetch the pinned revision of the embedding model from Hugging Face and upload it
# to the MinIO models bucket (the helper logs a download step followed by an upload step).
# NOTE(review): the helper lives in embedding_utils; its exact behaviour is not visible here.
eu.upload_model_to_minio(MODELS_BUCKET, EMBEDDING_MODEL, EMBEDDING_MODEL_REVISION)

Starting download from HF to /var/folders/_5/jt7lb09d49n9qscq4l2m3sph0000gn/T/hf-models/models--intfloat--multilingual-e5-small/snapshots/ffdcc22a9a5c973ef0470385cef91e1ecb461d9f.


Fetching 18 files:   0%|          | 0/18 [00:00<?, ?it/s]

Uploading to MinIO.


##### Download the model from MinIO

In [5]:
# Retrieve the same model revision from MinIO. The returned value is later passed to
# SentenceTransformer as the model location.
# NOTE(review): presumably a local snapshot directory under the temp dir (see the printed path) - confirm in embedding_utils.
model_path = eu.download_model_from_minio(MODELS_BUCKET, EMBEDDING_MODEL, EMBEDDING_MODEL_REVISION)

/var/folders/_5/jt7lb09d49n9qscq4l2m3sph0000gn/T/hf-models/models--intfloat--multilingual-e5-small/snapshots/ffdcc22a9a5c973ef0470385cef91e1ecb461d9f


##### Simple script to create embeddings from a document in MinIO

In [6]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
embedding_model = SentenceTransformer(model_path, device=device)
# Chunk lengths are measured with len(), i.e. in characters.
chunker = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP, length_function=len)

# Fetch the document from MinIO and read it fully. The context manager guarantees
# the file handle is closed even if reading fails.
# NOTE(review): no encoding is passed, so the platform default is used - confirm it matches the corpus.
temp_file = eu.get_document_from_minio(BUCKET_NAME, 'The Strange Case of Dr Jekyll and Mr Hyde.txt')
with open(temp_file, 'r') as file:
    data = file.read()

# Split the text into chunks.
chunks = chunker.split_text(data)
print('Number of chunks:', len(chunks))
print('Length of the first chunk:', len(chunks[0]))

# Encode every chunk and convert the result to plain Python lists.
embeddings = embedding_model.encode(chunks, batch_size=BATCH_SIZE).tolist()
print('Number of embeddings:', len(embeddings))
print('Length of the first embedding:', len(embeddings[0]))

Number of chunks: 213
Length of the first chunk: 993
Number of embeddings: 213
Length of the first embedding: 384


##### Save the embeddings to the vector database

In [7]:
# Persist the text chunks together with their embeddings via the project helper.
# NOTE(review): presumably writes to the pgvector database configured by the PGVECTOR_* variables - confirm in embedding_utils.
eu.save_embeddings_to_vectordb(chunks, embeddings)