In [None]:
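#%pip install transformers torch
# NOTE(review): the cells visible here import google-cloud-aiplatform (which provides vertexai) and google-cloud-storage, not transformers/torch - confirm the package list.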

In [None]:
# Importar as bibliotecas necessárias
from google.cloud.aiplatform.matching_engine import MatchingEngineIndex
from vertexai.language_models import TextEmbeddingModel
from google.cloud import aiplatform
from google.cloud import storage
from datetime import datetime
import json
import uuid
import re
import os

# Google Cloud project and region; both are passed to aiplatform.init() below
# and again when the index is created.
PROJECT_ID = 'estudo-ia-449223'
LOCATION = 'us-central1'
# Short run suffix: the first 5 characters of a random UUID4 string. It is
# appended to the resource and file names below, and save_embeddings also uses
# it for the record `id` field.
uid = str(uuid.uuid4())[:5]

# Cloud Storage bucket that receives the generated JSON files.
PROCESSING_BUCKET_NAME = 'demo-bucket-rag'
# Display name given to the index.
INDEX = f'demo-rag-product-embs_{uid}'
# Display name given to the index endpoint.
INDEX_ENDPOINT = f'demo-rag-product-embs-endpoint_{uid}'
# Value passed as `deployed_index_id` when the index is deployed.
INDEX_ENDPOINT_DEPLOYED = f'demo_rag_product_embs_deployed_{uid}'
# Local output files written by save_embeddings (relative to the working directory).
embed_file_path = f'demo-embeddings_{uid}.json'
sentence_file_path = f'demo-sentences_{uid}.json'

# Set the SDK-wide default project/location, then create the storage client
# used by upload_file (no explicit project or credentials are passed here).
aiplatform.init(project=PROJECT_ID, location=LOCATION)
storage_client = storage.Client()

In [None]:
def clean_text(text):
    """Return `text` without bullet characters and with whitespace normalized.

    Bullet characters (U+2022) are dropped, every run of whitespace becomes a
    single space, and leading/trailing whitespace is trimmed.
    """
    # Drop bullet points first so the whitespace around them collapses too.
    without_bullets = re.sub(r"\u2022", "", text)
    # Squeeze any whitespace run (spaces, tabs, newlines) into one space.
    single_spaced = re.sub(r"\s+", " ", without_bullets)
    return single_spaced.strip()

def generate_txt_embeddings(sentences, batch_size=250):
    """Embed `sentences` with the 'text-embedding-004' model.

    The inputs are sent in chunks of at most `batch_size` texts instead of one
    single request, so long inputs do not exceed the per-request input limit of
    the embeddings API, and an empty input returns immediately without calling
    the service.

    Args:
        sentences: Iterable of strings to embed.
        batch_size: Maximum number of texts per `get_embeddings` request.
            NOTE(review): 250 is the documented per-request limit for
            us-central1; other regions may allow fewer - confirm before
            changing LOCATION.

    Returns:
        A list with one embedding (the `.values` of each result) per input
        sentence, in input order.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    texts = list(sentences)
    if not texts:
        # Nothing to embed: skip loading the model and the remote call.
        return []

    model = TextEmbeddingModel.from_pretrained('text-embedding-004')
    vectors = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        vectors.extend(embedding.values for embedding in model.get_embeddings(batch))
    return vectors

def upload_file(bucket_name, file_path):
    """Upload a local file to the bucket `bucket_name`.

    The object is named after the file's base name, so any directory part of
    `file_path` is not reproduced in the bucket.
    """
    target_bucket = storage_client.bucket(bucket_name)
    object_name = os.path.basename(file_path)
    target_bucket.blob(object_name).upload_from_filename(file_path)

def save_embeddings(sentences, embeddings):
    """Write the embeddings and their sentences to two JSON-lines files.

    One JSON object per line is written to `embed_file_path`
    (`{"id", "embedding"}`) and to `sentence_file_path` (`{"id", "sentence"}`,
    with the sentence passed through `clean_text`). Row N of both files shares
    the same id, so a result can be mapped back to its sentence.

    Each row gets its own id, `"<uid>_<position>"`. Previously every row was
    written with the same run-level `uid`, so all records shared one id and
    could not be told apart.

    Args:
        sentences: Iterable of the original texts.
        embeddings: Iterable of embedding vectors, aligned with `sentences`.

    Raises:
        ValueError: If `sentences` and `embeddings` differ in length (pairing
            them would otherwise silently drop the extra items).
    """
    sentences = list(sentences)
    embeddings = list(embeddings)
    if len(sentences) != len(embeddings):
        raise ValueError(
            f"sentences ({len(sentences)}) and embeddings ({len(embeddings)}) "
            "must have the same length"
        )

    with open(embed_file_path, "w") as embed_file, open(sentence_file_path, "w") as sentence_file:
        for position, (sentence, embedding) in enumerate(zip(sentences, embeddings)):
            # Unique per row, and still traceable to this run through the uid prefix.
            record_id = f"{uid}_{position}"
            json.dump({"id": record_id, "embedding": embedding}, embed_file)
            embed_file.write("\n")
            json.dump({"id": record_id, "sentence": clean_text(sentence)}, sentence_file)
            sentence_file.write("\n")
    print(f"Arquivos {embed_file_path} e {sentence_file_path} salvos.")

In [None]:
# Sample sentences to embed.
sentences = [
    'o itau gosta de openmetadata',
    'o bradesco gosta de astrix',
    'o unibanco gosta de goiaba',
    'o nubank gosta de abobora',
]
embeddings = generate_txt_embeddings(sentences)

# Write both files locally, then upload the embeddings file followed by the
# sentences file to the processing bucket.
save_embeddings(sentences, embeddings)

for local_path in (embed_file_path, sentence_file_path):
    upload_file(PROCESSING_BUCKET_NAME, local_path)

In [None]:
# cria o index
# Create the index.
# `contents_delta_uri` is documented as a Cloud Storage *directory* path, but
# the original passed the URI of a single object. The bucket root cannot be
# used either, because it also holds the sentences file, which is not in the
# embeddings format. The uploaded embeddings object is therefore copied into a
# dedicated prefix that contains nothing else, and the index is built from it.
index_source_prefix = f"index-data_{uid}"
embed_object_name = os.path.basename(embed_file_path)  # name used by upload_file
processing_bucket = storage_client.bucket(PROCESSING_BUCKET_NAME)
processing_bucket.copy_blob(
    processing_bucket.blob(embed_object_name),
    processing_bucket,
    new_name=f"{index_source_prefix}/{embed_object_name}",
)

my_index = aiplatform.MatchingEngineIndex.create_tree_ah_index(
    display_name=INDEX,
    contents_delta_uri=f"gs://{PROCESSING_BUCKET_NAME}/{index_source_prefix}",
    # presumably the vector size produced by the embedding model used above - confirm
    dimensions=768,
    approximate_neighbors_count=10,
    project=PROJECT_ID,
    location=LOCATION,
)

In [None]:
## cria o endpoint
# Create the index endpoint, with the public endpoint enabled.
my_index_endpoint = aiplatform.MatchingEngineIndexEndpoint.create(
    display_name=INDEX_ENDPOINT,
    public_endpoint_enabled=True,
)

In [None]:
# Deploy the index to the endpoint and report how long the call took.
inicio = datetime.now()

my_index_endpoint.deploy_index(
    index=my_index,
    deployed_index_id=INDEX_ENDPOINT_DEPLOYED,
)

# Elapsed wall-clock time in minutes, rounded to two decimals.
elapsed_minutes = round((datetime.now() - inicio).total_seconds() / 60, 2)
print(f"{elapsed_minutes} minutos.")