In [1]:
! pip install tqdm
! pip install chromadb




[notice] A new release of pip is available: 24.1.2 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


^C


In [2]:
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
import chromadb

In [None]:
# Carregar o dataset
file_path = 'drive/MyDrive/datasets/PLN/Dados-abertos-COFOG-GC-2022.csv'
data = pd.read_csv(file_path, encoding='latin1', delimiter=';')

# Função de pooling para obter embeddings
def average_pool(last_hidden_states, attention_mask):
    last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
    return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]

# Inicializar o modelo e tokenizer
tokenizer = AutoTokenizer.from_pretrained('intfloat/multilingual-e5-large')
model = AutoModel.from_pretrained('intfloat/multilingual-e5-large').to('cuda')

# Função para gerar embeddings para uma coluna em lotes
def get_embeddings(texts, batch_size=1024):
    embeddings = []
    for i in tqdm(range(0, len(texts), batch_size), desc="Gerando embeddings"):
        batch_texts = texts[i:i+batch_size]
        batch_dict = tokenizer(batch_texts, max_length=512, padding=True, truncation=True, return_tensors='pt').to('cuda')
        with torch.no_grad():
            outputs = model(**batch_dict)
            batch_embeddings = average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
        embeddings.append(F.normalize(batch_embeddings, p=2, dim=1).cpu().numpy())
    return np.vstack(embeddings)

# Diretório para salvar os embeddings
output_dir = 'output_embeddings'
os.makedirs(output_dir, exist_ok=True)

# Gerar e salvar embeddings para cada coluna
embeddings_dict = {}
for column in tqdm(data.columns, desc="Processando colunas"):
    texts = data[column].astype(str).tolist()
    embeddings = get_embeddings(texts)
    embeddings_dict[column] = embeddings
    np.save(os.path.join(output_dir, f'{column}_embeddings.npy'), embeddings)

print("Embeddings gerados e salvos com sucesso.")


In [3]:
import numpy as np
import os
import chromadb
from tqdm import tqdm

# Inicializar o cliente ChromaDB
client = chromadb.Client()

# Função para limpar e validar nomes de coleções
def clean_column_name(name):
    clean_name = name.strip().replace(' ', '_').replace('.', '_').replace('-', '_')
    clean_name = ''.join(c for c in clean_name if c.isalnum() or c in ['_', '-'])
    return clean_name[:63]

# Função para verificar se uma coleção já existe
def collection_exists(client, name):
    try:
        client.get_collection(name=name)
        return True
    except ValueError:
        return False

# Diretório onde os embeddings estão salvos
output_dir = 'drive/MyDrive/datasets/PLN/output_embeddings/'

# Colunas que você quer carregar e testar
columns_to_test = ['NATUREZA DESPESA DETALHADA DESCRICAO', 'ACAO GOVERNO DESCRICAO']

# Carregar embeddings de arquivos .npy para ChromaDB
collections = {}
embeddings_dict = {}

try:
    for column in columns_to_test:
        clean_name = clean_column_name(column)
        file_path = os.path.join(output_dir, f"{clean_name}_embeddings.npy")

        # Carregar os embeddings do arquivo .npy
        embeddings = np.load(file_path)
        embeddings_dict[clean_name] = embeddings

        # Certifique-se de que a coleção existe, caso contrário, crie-a
        if not collection_exists(client, clean_name):
            collection = client.create_collection(name=clean_name)
        else:
            collection = client.get_collection(name=clean_name)

        # Armazenar embeddings no ChromaDB
        for i, embedding in enumerate(tqdm(embeddings, desc=f"Armazenando embeddings para {column}")):
            try:
                collection.add(ids=[f"{clean_name}_{i}"], embeddings=[embedding.tolist()], metadatas=[{"index": i}])
            except Exception as e:
                print(f"Erro ao adicionar embedding {i} para {column}: {e}")

        collections[clean_name] = collection
except Exception as e:
    print(f"Ocorreu um erro: {e}")



Armazenando embeddings para NATUREZA DESPESA DETALHADA DESCRICAO:   0%|          | 0/75991 [00:00<?, ?it/s]

: 

In [None]:
# Testar a similaridade usando um dos embeddings carregados
query_vector = embeddings_dict['NATUREZA_DESPESA_DETALHADA_DESCRICAO'][0]
results = collections['NATUREZA_DESPESA_DETALHADA_DESCRICAO'].query(vector=query_vector.tolist(), top_k=5)

for result in results['ids']:
    print(result)