# Dependencies

In [None]:
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings, HuggingFaceEmbeddings, GooglePalmEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.retrievers.document_compressors import DocumentCompressorPipeline
from langchain.retrievers import ContextualCompressionRetriever
from langchain_community.document_transformers import EmbeddingsRedundantFilter, LongContextReorder
from langchain.retrievers.document_compressors import EmbeddingsFilter
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pickle
import os
from pathlib import Path

# Vectorstore storage directory

In [None]:
# Base directory (relative to the notebook's working directory) under which
# create_vectorstore() places one sub-directory per named vectorstore.
LOCAL_VECTOR_STORE_DIR = Path("../data/vectorstore")

# Function to create a vectorstore

In [None]:
def create_vectorstore(embeddings, documents, vectorstore_name):
    """Build a Chroma vectorstore from ``documents`` and persist it to disk.

    The store is written to the sub-directory of ``LOCAL_VECTOR_STORE_DIR``
    named ``vectorstore_name``.

    Args:
        embeddings: Embedding model handed to Chroma as ``embedding``.
        documents: Documents to index.
        vectorstore_name: Name of the sub-directory that holds the store.

    Returns:
        The Chroma vectorstore, after ``persist()`` has been called on it.
    """
    target_dir = "/".join((LOCAL_VECTOR_STORE_DIR.as_posix(), vectorstore_name))
    store = Chroma.from_documents(
        documents=documents,
        embedding=embeddings,
        persist_directory=target_dir,
    )
    store.persist()
    return store

# Function to select embeddings provider

In [None]:
def select_embeddings(provider):
    """Construct the embeddings model for the requested provider.

    Args:
        provider: One of ``"openai"``, ``"huggingface"`` or ``"google"``.

    Returns:
        A newly constructed embeddings object for that provider.

    Raises:
        ValueError: If ``provider`` is not one of the supported names.
    """
    if provider == "openai":
        return OpenAIEmbeddings()

    if provider == "huggingface":
        # Alternative model name noted in the original cell: "all-MiniLM-L6-v2".
        return HuggingFaceEmbeddings(model_name="thenlper/gte-large")

    if provider == "google":
        # The API key is read from the GOOGLE_API_KEY environment variable.
        return GooglePalmEmbeddings(google_api_key=os.getenv("GOOGLE_API_KEY"))

    raise ValueError("Invalid provider.")

# Function to create a vectorstore-based retriever

In [None]:
def create_retriever(vectorstore, search_type="similarity", k=10, score_threshold=None):
    """Wrap a vectorstore in a retriever.

    Args:
        vectorstore: Object exposing ``as_retriever``.
        search_type: Forwarded unchanged as ``search_type``.
        k: Forwarded in ``search_kwargs`` under ``"k"``; left out when ``None``.
        score_threshold: Forwarded in ``search_kwargs`` under
            ``"score_threshold"``; left out when ``None``.

    Returns:
        Whatever ``vectorstore.as_retriever`` returns.
    """
    candidate_options = (("k", k), ("score_threshold", score_threshold))
    # Only forward the options the caller actually set.
    search_kwargs = {
        option: value for option, value in candidate_options if value is not None
    }
    return vectorstore.as_retriever(search_type=search_type, search_kwargs=search_kwargs)

# Function to create a contextual compression retriever

In [None]:
def create_compression_retriever(embeddings, base_retriever, chunk_size=3000, k=10,
                                 similarity_threshold=0.8, chunk_overlap=100):
    """Create a ContextualCompressionRetriever around ``base_retriever``.

    Documents returned by ``base_retriever`` are passed through a
    ``DocumentCompressorPipeline`` whose stages run in this order:
    ``CharacterTextSplitter`` -> ``EmbeddingsRedundantFilter`` ->
    ``EmbeddingsFilter`` -> ``LongContextReorder``.

    Args:
        embeddings: Embedding model shared by both embedding-based filters.
        base_retriever: Retriever that supplies the documents to compress.
        chunk_size: ``chunk_size`` given to the text splitter.
        k: ``k`` given to ``EmbeddingsFilter``.
        similarity_threshold: ``similarity_threshold`` given to
            ``EmbeddingsFilter``.
        chunk_overlap: ``chunk_overlap`` given to the text splitter. Defaults
            to 100, the value that was previously hard-coded, so existing
            callers are unaffected; lower it when using a small ``chunk_size``.

    Returns:
        The configured ``ContextualCompressionRetriever``.
    """
    splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    redundant_filter = EmbeddingsRedundantFilter(embeddings=embeddings)
    relevant_filter = EmbeddingsFilter(
        embeddings=embeddings,
        k=k,
        similarity_threshold=similarity_threshold,
    )
    reordering = LongContextReorder()

    # Stage order matters: split first so the filters operate on chunks.
    pipeline_compressor = DocumentCompressorPipeline(
        transformers=[splitter, redundant_filter, relevant_filter, reordering]
    )
    return ContextualCompressionRetriever(
        base_compressor=pipeline_compressor,
        base_retriever=base_retriever,
    )

# Remove duplicates

In [None]:
def remove_duplicates(results):
    """Drop entries whose first element was already seen.

    Args:
        results: Iterable of indexable items; ``item[0]`` (the fragment
            content) is the deduplication key and must be hashable.

    Returns:
        A list with the first occurrence of each key, in original order.
    """
    first_by_content = {}
    for item in results:
        # setdefault stores the item only when this content is new, and dicts
        # preserve insertion order, so the first occurrences keep their order.
        first_by_content.setdefault(item[0], item)
    return list(first_by_content.values())


# Retrieve documents along with similarity scores and sort them by relevance.

In [None]:
def retrieve_with_scores(retriever, query, embeddings, k=4):
    """Retrieve documents for ``query`` and rank them by cosine similarity.

    Args:
        retriever: Object exposing ``get_relevant_documents(query)``; each
            returned document must have ``page_content`` and ``metadata``.
        query: The search text.
        embeddings: Object exposing ``embed_query(text)``; used for both the
            query and each document's content.
        k: Maximum number of results to return.

    Returns:
        A list of at most ``k`` ``(page_content, metadata, similarity)``
        tuples, unique by ``page_content`` and sorted by descending similarity.
    """
    documents = retriever.get_relevant_documents(query)

    # Deduplicate by content *before* embedding so identical fragments are
    # embedded once rather than embedded repeatedly and then thrown away.
    # The first occurrence (and its metadata) is kept, which matches what the
    # stable sort selected previously when equal content yields equal scores.
    candidates = remove_duplicates(
        [(doc.page_content, doc.metadata) for doc in documents]
    )
    if not candidates:
        return []

    # The query vector does not change between documents: build it once.
    query_vector = np.array(embeddings.embed_query(query)).reshape(1, -1)

    scored = []
    for content, metadata in candidates:
        doc_vector = np.array(embeddings.embed_query(content)).reshape(1, -1)
        # cosine_similarity returns a 1x1 matrix here; pull out the scalar.
        similarity = cosine_similarity(query_vector, doc_vector)[0][0]
        scored.append((content, metadata, similarity))

    # Most similar first; list.sort is stable, so ties keep retrieval order.
    scored.sort(key=lambda item: item[2], reverse=True)
    return scored[:k]


# Load document fragments

In [None]:
# Load the document fragments stored in the pickle file.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# this file must come from a trusted source.
with open("../data/fragments.pkl", "rb") as f:
    fragments = pickle.load(f)

print(f"Fragments loaded: {len(fragments)}")

# Configure embeddings provider

In [None]:
# Choose the embeddings backend; select_embeddings raises ValueError for any
# name other than "openai", "huggingface" or "google".
provider = "huggingface"  # Change to "openai" or "google"
embeddings = select_embeddings(provider)

# Create vectorstore

In [None]:
# Index the loaded fragments; the store is persisted in the
# "my_vectorstore" sub-directory of LOCAL_VECTOR_STORE_DIR.
vectorstore_name = "my_vectorstore"
vectorstore = create_vectorstore(embeddings, fragments, vectorstore_name)

# Create a basic retriever

In [None]:
# Plain similarity retriever; k=4 overrides create_retriever's default of 10.
retriever = create_retriever(vectorstore, search_type="similarity", k=4)

# Create a contextual compression retriever

In [None]:
# Wrap the basic retriever in the compression pipeline, using smaller chunks
# (500) and a larger k (16) than create_compression_retriever's defaults.
compression_retriever = create_compression_retriever(
    embeddings=embeddings, 
    base_retriever=retriever, 
    chunk_size=500, 
    k=16, 
    similarity_threshold=0.8
)


# Test retrieval with the basic retriever


In [None]:
# Ask the basic retriever a question and preview each hit (first 100 chars).
query = "Which course is best for beginners?"
results = retriever.get_relevant_documents(query)

print("Search results (Basic Retriever):")
for result in results:
    print(
        f"- Text: {result.page_content[:100]}...",
        f"  Metadata: {result.metadata}",
        sep="\n",
    )


In [None]:
# Second probe of the basic retriever: a price question about one course.
query = "How much does the Blockchain Fundamentals course cost?"
results = retriever.get_relevant_documents(query)

print("Search results (Basic Retriever):")
for result in results:
    print(
        f"- Text: {result.page_content[:100]}...",
        f"  Metadata: {result.metadata}",
        sep="\n",
    )

In [None]:
# Test retrieval with manually computed similarity scores (default k).
query = "What are the discounted prices for the courses?"
sorted_results = retrieve_with_scores(retriever, query, embeddings)

# Show each result with its score, most similar first.
print("Sorted results with calculated similarity:")
for text, metadata, score in sorted_results:
    print(
        f"- Text: {text[:100]}...",
        f"  Metadata: {metadata}",
        f"  Similarity Score: {score:.4f}",
        sep="\n",
    )


In [None]:
# Show up to k=10 scored results with a longer (200-char) preview.
# NOTE(review): the retriever above was created with k=4, so presumably at
# most four documents can come back here regardless of k=10 - confirm.
query = "What are the discounted prices for the courses?"
results = retrieve_with_scores(retriever, query, embeddings, k=10)

print("All sorted results:")
for i, (text, metadata, score) in enumerate(results):
    print(
        f"Result {i+1}:",
        f"- Text: {text[:200]}...",
        f"  Metadata: {metadata}",
        f"  Similarity Score: {score:.4f}\n",
        sep="\n",
    )


In [None]:
import chromadb
from chromadb.utils import embedding_functions

# Configure ChromaDB.
# NOTE(review): PersistentClient appears to treat `path` as a directory, so
# "./chroma.sqlite3" would become a folder of that name - confirm this is
# intended before renaming.
DB_PATH = "./chroma.sqlite3"
COLLECTION_NAME = "blockchain_courses"

# Initialise the client and the embedding function.
client = chromadb.PersistentClient(path=DB_PATH)
embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="thenlper/gte-large")

# Fetch the collection, creating it on first run.
collection = client.get_or_create_collection(COLLECTION_NAME, embedding_function=embedding_function)

# Available courses.
courses = [
    {
        "course_name": "Blockchain Fundamentals",
        "description": "Learn the basics of blockchain, its history, and how it is transforming industries.",
        "price": 50.00,
        "discounted_price": 45.00,  # For payments in XRP
        "duration": "5 weeks"
    },
    {
        "course_name": "Blockchain in Practice with XRPL",
        "description": "Hands-on course on using the XRP Ledger for practical applications.",
        "price": 80.00,
        "discounted_price": 72.00,
        "duration": "4 weeks"
    },
    {
        "course_name": "Cryptocurrency Trading",
        "description": "Learn trading strategies and risk management in the cryptocurrency market.",
        "price": 100.00,
        "discounted_price": 90.00,
        "duration": "6 weeks"
    },
    {
        "course_name": "Trading on Blockchain with XRPL",
        "description": "Develop trading strategies on blockchain using decentralized tools on the XRPL.",
        "price": 120.00,
        "discounted_price": 108.00,
        "duration": "5 weeks"
    }
]

# Data to store: the embedded text is "<name>: <description>"; every other
# course field travels as metadata.
documents = [f"{course['course_name']}: {course['description']}" for course in courses]
metadatas = [
    {field: value for field, value in course.items() if field != "description"}
    for course in courses
]
ids = [f"course_{index}" for index, _ in enumerate(courses)]

# Write the data to the collection. The ids are fixed and the collection is
# persistent, so use upsert rather than add: re-running this cell then
# overwrites the same four records instead of colliding with existing ids.
collection.upsert(documents=documents, metadatas=metadatas, ids=ids)
print("Cursos adicionados com sucesso!")

# Validate what is stored in the collection.
print("Conteúdo armazenado:")
print(collection.peek())
