# Dependencies

In [1]:
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings, HuggingFaceEmbeddings, GooglePalmEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.retrievers.document_compressors import DocumentCompressorPipeline
from langchain.retrievers import ContextualCompressionRetriever
from langchain_community.document_transformers import EmbeddingsRedundantFilter, LongContextReorder
from langchain.retrievers.document_compressors import EmbeddingsFilter
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pickle
import os
from pathlib import Path

# Vectorstore

In [2]:
# Base directory for persisted Chroma vectorstores; create_vectorstore appends
# the store name to this path to build each store's persist directory.
LOCAL_VECTOR_STORE_DIR = Path("../data/vectorstore")

# Function to create a vectorstore

In [3]:
def create_vectorstore(embeddings, documents, vectorstore_name):
    """Build a Chroma vectorstore from ``documents`` and persist it.

    Args:
        embeddings: Embedding model handed to Chroma as ``embedding``.
        documents: Documents to index.
        vectorstore_name: Name appended (after a ``/``) to
            ``LOCAL_VECTOR_STORE_DIR`` to form the persist directory.

    Returns:
        The store returned by ``Chroma.from_documents``, after ``persist()``
        has been called on it.
    """
    # Plain string join (not Path arithmetic) so the resulting directory string
    # is exactly "<base>/<name>".
    target_dir = "/".join((LOCAL_VECTOR_STORE_DIR.as_posix(), vectorstore_name))
    store = Chroma.from_documents(
        documents=documents,
        embedding=embeddings,
        persist_directory=target_dir,
    )
    store.persist()
    return store

# Function to select embeddings provider

In [4]:
def select_embeddings(provider):
    """Instantiate the embeddings backend named by ``provider``.

    Args:
        provider: ``"openai"``, ``"huggingface"`` or ``"google"``. String
            values are matched ignoring surrounding whitespace and letter case.

    Returns:
        A new ``OpenAIEmbeddings``, ``HuggingFaceEmbeddings`` or
        ``GooglePalmEmbeddings`` instance.

    Raises:
        ValueError: If ``provider`` is not one of the supported names.
    """
    # Normalise only real strings; anything else falls through to the error.
    normalized = provider.strip().lower() if isinstance(provider, str) else provider

    if normalized == "openai":
        return OpenAIEmbeddings()
    if normalized == "huggingface":
        # Alternative model noted by the author: "all-MiniLM-L6-v2".
        return HuggingFaceEmbeddings(model_name="thenlper/gte-large")
    if normalized == "google":
        # os.getenv yields None when GOOGLE_API_KEY is not set.
        return GooglePalmEmbeddings(google_api_key=os.getenv("GOOGLE_API_KEY"))

    # Keep the original "Invalid provider." prefix, but say what was rejected
    # and which values are accepted.
    raise ValueError(
        f"Invalid provider. Got {provider!r}; "
        "expected one of: 'openai', 'huggingface', 'google'."
    )

# Function to create a vectorstore-based retriever

In [5]:
def create_retriever(vectorstore, search_type="similarity", k=10, score_threshold=None):
    """Build a retriever on top of ``vectorstore``.

    Args:
        vectorstore: Object exposing ``as_retriever(search_type=..., search_kwargs=...)``.
        search_type: Forwarded unchanged to ``as_retriever``.
        k: Forwarded as ``search_kwargs['k']`` unless it is ``None``.
        score_threshold: Forwarded as ``search_kwargs['score_threshold']``
            unless it is ``None``.
            NOTE(review): LangChain appears to honour this only when
            ``search_type="similarity_score_threshold"`` - confirm.

    Returns:
        Whatever ``vectorstore.as_retriever`` returns.
    """
    optional_settings = {'k': k, 'score_threshold': score_threshold}
    # Only forward the settings the caller actually provided.
    search_kwargs = {
        name: value
        for name, value in optional_settings.items()
        if value is not None
    }
    return vectorstore.as_retriever(search_type=search_type, search_kwargs=search_kwargs)

# Function to create a contextual compression retriever

In [6]:
def create_compression_retriever(embeddings, base_retriever, chunk_size=3000, k=10, similarity_threshold=0.8, chunk_overlap=100):
    """Create a ContextualCompressionRetriever.

    Documents from ``base_retriever`` pass through these steps, in order:
    a character splitter, an embeddings-based redundancy filter, an
    embeddings-based relevance filter, and a long-context reordering step.

    Args:
        embeddings: Embedding model used by both embedding filters.
        base_retriever: Retriever whose results are compressed.
        chunk_size: ``chunk_size`` passed to ``CharacterTextSplitter``.
        k: ``k`` passed to ``EmbeddingsFilter``.
        similarity_threshold: ``similarity_threshold`` passed to ``EmbeddingsFilter``.
        chunk_overlap: ``chunk_overlap`` passed to ``CharacterTextSplitter``.
            Defaults to 100, the value that was previously hard-coded; lower
            it when using a ``chunk_size`` smaller than the overlap.

    Returns:
        The configured ``ContextualCompressionRetriever``.
    """
    splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    redundant_filter = EmbeddingsRedundantFilter(embeddings=embeddings)
    relevant_filter = EmbeddingsFilter(
        embeddings=embeddings,
        k=k,
        similarity_threshold=similarity_threshold,
    )
    reordering = LongContextReorder()

    # Stage order is significant: split first, then filter, then reorder.
    pipeline_compressor = DocumentCompressorPipeline(
        transformers=[splitter, redundant_filter, relevant_filter, reordering]
    )
    return ContextualCompressionRetriever(
        base_compressor=pipeline_compressor,
        base_retriever=base_retriever,
    )

# Remove duplicates

In [7]:
def remove_duplicates(results):
    """Drop entries whose first element repeats an earlier entry's.

    Args:
        results: Iterable of indexable items; ``item[0]`` (the fragment
            content) is the de-duplication key and must be hashable.

    Returns:
        A list holding the first item seen for each distinct ``item[0]``,
        in the order those first occurrences appeared.
    """
    first_by_content = {}
    for item in results:
        # setdefault keeps the earliest item for a key and ignores later ones;
        # dict insertion order preserves the original ordering.
        first_by_content.setdefault(item[0], item)
    return list(first_by_content.values())


# Retrieve documents along with similarity scores and sort them by relevance.

In [8]:
def retrieve_with_scores(retriever, query, embeddings, k=4):
    """Retrieve documents for ``query`` and rank them by cosine similarity.

    Args:
        retriever: Object exposing ``get_relevant_documents(query)``; each
            returned document must have ``page_content`` and ``metadata``.
        query: Query text.
        embeddings: Embedding model exposing ``embed_query`` and
            ``embed_documents``.
        k: Maximum number of results to return.

    Returns:
        Up to ``k`` tuples ``(page_content, metadata, similarity)``, sorted by
        descending similarity, keeping only the first tuple for each distinct
        ``page_content``. An empty list when the retriever returns nothing.
    """
    results = retriever.get_relevant_documents(query)
    if not results:
        # Nothing to score; also avoids embedding/similarity calls on empty input.
        return []

    query_vector = np.asarray(embeddings.embed_query(query)).reshape(1, -1)

    # Embed all retrieved texts in a single batch via the document-side API
    # instead of one embed_query call per document.
    doc_matrix = np.asarray(
        embeddings.embed_documents([doc.page_content for doc in results])
    )

    # One (1, n_docs) similarity row: query against every document.
    similarities = cosine_similarity(query_vector, doc_matrix)[0]

    results_with_scores = [
        (doc.page_content, doc.metadata, similarity)
        for doc, similarity in zip(results, similarities)
    ]

    # Sort results by similarity score, best first
    sorted_results = sorted(results_with_scores, key=lambda item: item[2], reverse=True)

    # De-duplicate after sorting so the highest-scoring copy is the one kept
    unique_results = remove_duplicates(sorted_results)
    return unique_results[:k]


# Load document fragments

In [9]:
# Load the pickled document fragments from disk.
# NOTE(review): pickle.load can execute arbitrary code while deserialising -
# only load fragment files that come from a trusted source.
with open("../data/fragments.pkl", "rb") as f:
    fragments = pickle.load(f)

# Report how many fragments were loaded.
print(f"Fragments loaded: {len(fragments)}")

Fragments loaded: 9


# Configure embeddings provider

In [10]:
# Pick the embeddings backend; select_embeddings raises ValueError for any
# name other than "openai", "huggingface" or "google".
provider = "huggingface"  # Change to "openai" or "google"
embeddings = select_embeddings(provider)

# Create vectorstore

In [11]:
# Index the loaded fragments; create_vectorstore uses the name as the
# sub-path under LOCAL_VECTOR_STORE_DIR where the store is persisted.
vectorstore_name = "my_vectorstore"
vectorstore = create_vectorstore(embeddings, fragments, vectorstore_name)

# Create a basic retriever

In [12]:
# Basic similarity retriever; k=4 is forwarded as search_kwargs['k'].
retriever = create_retriever(vectorstore, search_type="similarity", k=4)

# Create a contextual compression retriever

In [13]:
# Wrap the basic retriever in the split -> redundancy filter -> relevance
# filter -> reorder pipeline built by create_compression_retriever.
compression_retriever = create_compression_retriever(
    embeddings=embeddings, 
    base_retriever=retriever, 
    chunk_size=500, 
    k=16, 
    similarity_threshold=0.8
)


# Test retrieval with the basic retriever


In [14]:
# Query the basic retriever and preview each hit.
query = "Which course is best for beginners?"
results = retriever.get_relevant_documents(query)

print("Search results (Basic Retriever):")
for result in results:
    # Show only the first 100 characters of each fragment.
    print(f"- Text: {result.page_content[:100]}...")
    print(f"  Metadata: {result.metadata}")


Search results (Basic Retriever):
- Text: Course 1: Blockchain Fundamentals What You'll Learn ● Understand the basic concepts of blockchain, i...
  Metadata: {'page': 0, 'source': '..\\data\\docs\\Informações dos cursos.pdf'}
- Text: not require any prior knowledge. It provides a strong foundation to understand blockchain and its us...
  Metadata: {'page': 2, 'source': '..\\data\\docs\\Informações dos cursos.pdf'}
- Text: ● Use XRPL for fast, low-cost transactions. ● Explore specific tools for decentralized trading. ● De...
  Metadata: {'page': 2, 'source': '..\\data\\docs\\Informações dos cursos.pdf'}
- Text: Course 1: Blockchain Fundamentals ○ Original Price: $50.00 ○ Discounted Price: $45.00 2. Course 2: B...
  Metadata: {'page': 4, 'source': '..\\data\\docs\\Informações dos cursos.pdf'}


In [15]:
# Same preview as the previous cell, with a pricing question.
query = "How much does the Blockchain Fundamentals course cost?"
results = retriever.get_relevant_documents(query)

print("Search results (Basic Retriever):")
for result in results:
    # Show only the first 100 characters of each fragment.
    print(f"- Text: {result.page_content[:100]}...")
    print(f"  Metadata: {result.metadata}")

Search results (Basic Retriever):
- Text: Course 1: Blockchain Fundamentals ○ Original Price: $50.00 ○ Discounted Price: $45.00 2. Course 2: B...
  Metadata: {'page': 4, 'source': '..\\data\\docs\\Informações dos cursos.pdf'}
- Text: Course 1: Blockchain Fundamentals What You'll Learn ● Understand the basic concepts of blockchain, i...
  Metadata: {'page': 0, 'source': '..\\data\\docs\\Informações dos cursos.pdf'}
- Text: not require any prior knowledge. It provides a strong foundation to understand blockchain and its us...
  Metadata: {'page': 2, 'source': '..\\data\\docs\\Informações dos cursos.pdf'}
- Text: ● Use XRPL for fast, low-cost transactions. ● Explore specific tools for decentralized trading. ● De...
  Metadata: {'page': 2, 'source': '..\\data\\docs\\Informações dos cursos.pdf'}


In [16]:
# Test retrieval with manually computed similarity
query = "What are the discounted prices for the courses?"
sorted_results = retrieve_with_scores(retriever, query, embeddings)

# Display results
print("Sorted results with calculated similarity:")
for text, metadata, score in sorted_results:
    print(f"- Text: {text[:100]}...")
    print(f"  Metadata: {metadata}")
    print(f"  Similarity Score: {score:.4f}")


Sorted results with calculated similarity:
- Text: Course 1: Blockchain Fundamentals ○ Original Price: $50.00 ○ Discounted Price: $45.00 2. Course 2: B...
  Metadata: {'page': 4, 'source': '..\\data\\docs\\Informações dos cursos.pdf'}
  Similarity Score: 0.8672
- Text: Included Materials ● Detailed video tutorials. ● Pre-configured contract templates for XRPL. ● Acces...
  Metadata: {'page': 1, 'source': '..\\data\\docs\\Informações dos cursos.pdf'}
  Similarity Score: 0.8081
- Text: to take the courses? ○ Answer: No, a high-performance computer is not required. Here are the minimum...
  Metadata: {'page': 3, 'source': '..\\data\\docs\\Informações dos cursos.pdf'}
  Similarity Score: 0.8009
- Text: ● Use XRPL for fast, low-cost transactions. ● Explore specific tools for decentralized trading. ● De...
  Metadata: {'page': 2, 'source': '..\\data\\docs\\Informações dos cursos.pdf'}
  Similarity Score: 0.7986


In [17]:
# Display all results without limiting the number.
# NOTE: `retriever` was created with k=4, so asking for k=10 here cannot
# return more documents than the base retriever itself yields.
query = "What are the discounted prices for the courses?"
results = retrieve_with_scores(retriever, query, embeddings, k=10)

print("All sorted results:")
for i, (text, metadata, score) in enumerate(results):
    print(f"Result {i+1}:")
    print(f"- Text: {text[:200]}...")
    print(f"  Metadata: {metadata}")
    print(f"  Similarity Score: {score:.4f}\n")


All sorted results:
Result 1:
- Text: Course 1: Blockchain Fundamentals ○ Original Price: $50.00 ○ Discounted Price: $45.00 2. Course 2: Blockchain in Practice with XRPL ○ Original Price: $80.00 ○ Discounted Price: $72.00 3. Course 3: Int...
  Metadata: {'page': 4, 'source': '..\\data\\docs\\Informações dos cursos.pdf'}
  Similarity Score: 0.8672

Result 2:
- Text: Included Materials ● Detailed video tutorials. ● Pre-configured contract templates for XRPL. ● Access to simulation tools. Certification ● Certificate awarded after completing a practical project, suc...
  Metadata: {'page': 1, 'source': '..\\data\\docs\\Informações dos cursos.pdf'}
  Similarity Score: 0.8081

Result 3:
- Text: to take the courses? ○ Answer: No, a high-performance computer is not required. Here are the minimum and recommended specifications: ● Minimum Requirements ○ Operating System: Windows 10, macOS Mojave...
  Metadata: {'page': 3, 'source': '..\\data\\docs\\Informações dos cursos.pdf'}
  Similarity Score