In [1]:
%%capture
!pip install numpy openai tiktoken

In [3]:
import getpass
import os
os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your open ai key here: ")

Enter your open ai key here:  ········


In [4]:
from openai import OpenAI
import numpy as np
import tiktoken
import os
from typing import List, Tuple, Dict, Any

# Contextual Embedding Class

In [16]:
class ContextualEmbedder:
    def __init__(self, model: str = "text-embedding-3-small"):
        self.client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
        self.model = model
        self.tokenizer = tiktoken.encoding_for_model(model)
        
    def chunk_by_sentences(self, input_text: str) -> Tuple[List[str], List[Tuple[int, int]]]:
        """
        Split the input text into sentences and track token spans
        """
        tokens = self.tokenizer.encode(input_text)
        
        chunk_positions = []
        current_pos = 0
        
        for i in range(len(tokens)):
            decoded_token = self.tokenizer.decode([tokens[i]])
            if decoded_token == "." and i < len(tokens) - 1:
                next_token = self.tokenizer.decode([tokens[i + 1]])
                if next_token.startswith(" "):
                    chunk_positions.append((current_pos, i + 1))
                    current_pos = i + 1
        
        if current_pos < len(tokens):
            chunk_positions.append((current_pos, len(tokens)))
        
        chunks = []
        for start, end in chunk_positions:
            chunk_tokens = tokens[start:end]
            chunk_text = self.tokenizer.decode(chunk_tokens)
            chunks.append(chunk_text)
            
        return chunks, chunk_positions

    def embed_text(self, text: str) -> List[float]:
        """
        Generate embeddings for a single piece of text
        """
        try:
            response = self.client.embeddings.create(
                model=self.model,
                input=text
            )
            return response.data[0].embedding
        except Exception as e:
            print(f"Error generating embedding: {e}")
            return []

    def traditional_chunking(self, chunks: List[str]) -> List[List[float]]:
        """
        Generate embeddings for each chunk independently
        """
        return [self.embed_text(chunk) for chunk in chunks]

    def contextual_chunking(self, input_text: str) -> Tuple[List[str], List[List[float]], List[List[float]]]:
        """
        Perform both traditional and context-sensitive chunking
        """
        chunks, span_annotations = self.chunk_by_sentences(input_text)
        
        traditional_embeddings = self.traditional_chunking(chunks)
        
        full_text_embedding = self.embed_text(input_text)
        
        contextual_embeddings = []
        embedding_dim = len(full_text_embedding)
        
        for start, end in span_annotations:
            chunk_length = end - start
            weights = np.ones(embedding_dim) * (1.0 / chunk_length)
            chunk_embedding = np.multiply(full_text_embedding, weights)
            contextual_embeddings.append(chunk_embedding.tolist())
        
        return chunks, contextual_embeddings, traditional_embeddings

    @staticmethod
    def cosine_similarity(vec1: List[float], vec2: List[float]) -> float:
        """
        Calculate cosine similarity between two vectors
        """
        vec1 = np.array(vec1)
        vec2 = np.array(vec2)
        return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

# Embedder Calling

In [17]:
embedder = ContextualEmbedder()

input_text = """Berlin is the capital and largest city of Germany, both by area and by population. 
Its more than 3.85 million inhabitants make it the European Union's most populous city, as measured by population within city limits. 
The city is also one of the states of Germany, and is the third smallest state in the country in terms of area."""

# FINAL RUN

In [18]:
chunks, contextual_emb, traditional_emb = embedder.contextual_chunking(input_text)

berlin_embedding = embedder.embed_text("Berlin")

for i, chunk in enumerate(chunks):
    context_sim = embedder.cosine_similarity(berlin_embedding, contextual_emb[i])
    trad_sim = embedder.cosine_similarity(berlin_embedding, traditional_emb[i])
    
    print(f'Chunk: "{chunk}"')
    print(f'Contextual similarity: {context_sim:.4f}')
    print(f'Traditional similarity: {trad_sim:.4f}\n')

Chunk: "Berlin is the capital and largest city of Germany, both by area and by population."
Contextual similarity: 0.4382
Traditional similarity: 0.4268

Chunk: " 
Its more than 3.85 million inhabitants make it the European Union's most populous city, as measured by population within city limits."
Contextual similarity: 0.4382
Traditional similarity: 0.2534

Chunk: " 
The city is also one of the states of Germany, and is the third smallest state in the country in terms of area."
Contextual similarity: 0.4382
Traditional similarity: 0.3479



# METHOD 2 (Zero Entropy Repo -- The pdf text took 5 minutes to make chunks -- Not recommended)

In [28]:
import openai
import numpy as np
import tiktoken
import pandas as pd
from typing import List, Tuple, Dict, Any
from tqdm import tqdm
from dataclasses import dataclass
import os
import re

In [44]:
@dataclass
class ChunkingConfig:
    model: str = "gpt-4o"
    split_threshold: float = -2.0  # More strict threshold like Llama taken from the zero-entropy repository
    min_chunk_size: int = 100
    max_chunk_size: int = 1000

In [45]:
class ImprovedChunker:
    def __init__(self, config: ChunkingConfig):
        self.config = config
        self.client = openai.OpenAI()
        self.tokenizer = tiktoken.encoding_for_model(config.model)
        
    def get_split_decisions(self, text: str) -> List[Tuple[str, float]]:
        """Get split decisions and logprobs using OpenAI's API"""
        system_message = "You are a document chunking assistant. Insert '段' at natural break points in the text."
        prompt = f"Add the '段' character at natural semantic break points in this text:\n\n{text}"
        
        try:
            response = self.client.chat.completions.create(
                model=self.config.model,
                messages=[
                    {"role": "system", "content": system_message},
                    {"role": "user", "content": prompt}
                ],
                logprobs=True,
                top_logprobs=5,
                temperature=0
            )
            
            token_decisions = []
            for token_info in response.choices[0].logprobs.content:
                token = token_info.token
                split_prob = float('-inf')
                
                for logprob in token_info.top_logprobs:
                    if '段' in logprob.token:
                        split_prob = logprob.logprob
                        break
                        
                token_decisions.append((token, split_prob))
            
            return token_decisions
            
        except Exception as e:
            print(f"Error getting logprobs: {e}")
            return []

    def chunk_text(self, text: str) -> List[str]:
        """Main chunking method working with tokens"""
        print("Getting split decisions...")
        token_decisions = self.get_split_decisions(text)
        
        print("Creating chunks based on split decisions...")
        chunks = []
        current_chunk = []
        current_text = ""
        
        for token, split_prob in token_decisions:
            current_chunk.append(token)
            current_text += token
            
            should_split = (
                split_prob > self.config.split_threshold and 
                len(current_text) >= self.config.min_chunk_size
            ) or len(current_text) >= self.config.max_chunk_size
            
            if should_split:
                chunk_text = ''.join(current_chunk).strip()
                if chunk_text:
                    chunks.append(chunk_text)
                current_chunk = []
                current_text = ""
        
        if current_chunk:
            chunk_text = ''.join(current_chunk).strip()
            if chunk_text:
                chunks.append(chunk_text)
            
        return chunks

    def analyze_chunks(self, chunks: List[str]) -> Dict[str, Any]:
        """Analyze the chunking results"""
        lengths = [len(chunk) for chunk in chunks]
        return {
            "num_chunks": len(chunks),
            "avg_chunk_length": np.mean(lengths),
            "min_chunk_length": min(lengths),
            "max_chunk_length": max(lengths),
            "total_characters": sum(lengths)
        }

In [46]:
text = """Section 1.
All legislative Powers herein granted shall be vested in a Congress of the United States, which shall consist of a Senate and House of Representatives.

Section 2.
The House of Representatives shall be composed of Members chosen every second Year by the People of the several States, and the Electors in each State shall have the Qualifications requisite for Electors of the most numerous Branch of the State Legislature.

No Person shall be a Representative who shall not have attained to the Age of twenty five Years, and been seven Years a Citizen of the United States, and who shall not, when elected, be an Inhabitant of that State in which he shall be chosen."""

config = ChunkingConfig()
chunker = ImprovedChunker(config)
chunks = chunker.chunk_text(text)

print("\nChunked Text:")
for i, chunk in enumerate(chunks, 1):
    print(f"\nChunk {i}:")
    print(chunk)
    print("-" * 80)

analysis = chunker.analyze_chunks(chunks)
print("\nChunking Analysis:")
for key, value in analysis.items():
    if isinstance(value, float):
        print(f"{key}: {value:.2f}")
    else:
        print(f"{key}: {value}")

Getting split decisions...
Creating chunks based on split decisions...

Chunked Text:

Chunk 1:
Section 1.  
All legislative Powers herein granted shall be vested in a Congress of the United States, which shall consist of a Senate and House of Representatives. \xe6
--------------------------------------------------------------------------------

Chunk 2:
\xae\xb5

Section 2.  
The House of Representatives shall be composed of Members chosen every second Year by the People of the several States, and the Electors in each State shall have the Qualifications requisite for Electors of the most numerous Branch of the State Legislature. \xe6\xae\xb5

No Person shall be a Representative who shall not have attained to the Age of twenty five Years, and been seven Years a Citizen of the United States, and who shall not, when elected, be an Inhabitant of that State in which he shall be chosen.
--------------------------------------------------------------------------------

Chunking Analysis:
num_

# METHOD 3

In [5]:
import os
import numpy as np
import tiktoken
from openai import OpenAI
from typing import List, Tuple, Dict, Any

In [6]:
class OpenAILateChunker:
    def __init__(self, model: str = "text-embedding-3-small"):
        """
        Initialize the chunker with OpenAI client and tokenizer
        """
        self.client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
        self.model = model
        self.tokenizer = tiktoken.encoding_for_model(model)
        
    def chunk_by_tokens(self, input_text: str, chunk_size: int = 512) -> Tuple[List[str], List[Tuple[int, int]]]:
        """
        Split input text into chunks while keeping track of token spans
        
        Args:
            input_text: Text to be chunked
            chunk_size: Maximum number of tokens per chunk
            
        Returns:
            Tuple containing list of text chunks and their token span annotations
        """
        token_ids = self.tokenizer.encode(input_text)
        
        chunks = []
        span_annotations = []
        
        for i in range(0, len(token_ids), chunk_size):
            chunk_end = min(i + chunk_size, len(token_ids))
            if chunk_end - i > 0:
                chunk_tokens = token_ids[i:chunk_end]
                chunk_text = self.tokenizer.decode(chunk_tokens)
                chunks.append(chunk_text)
                span_annotations.append((i, chunk_end))
        
        return chunks, span_annotations
    
    def get_embeddings(self, text: str) -> List[float]:
        """
        Get embeddings for the entire text using OpenAI's API
        """
        response = self.client.embeddings.create(
            model=self.model,
            input=text
        )
        return response.data[0].embedding
    
    def late_chunking(self, text_embeddings: List[float], 
                     span_annotations: List[Tuple[int, int]], 
                     max_length: int = None) -> List[List[float]]:
        """
        Perform late chunking by pooling embeddings based on span annotations
        
        Args:
            text_embeddings: Embeddings for the entire text
            span_annotations: List of (start, end) token positions for each chunk
            max_length: Maximum sequence length to consider
            
        Returns:
            List of chunk embeddings
        """
        embeddings = np.array(text_embeddings)
        chunk_embeddings = []
        
        for start, end in span_annotations:
            if max_length is not None:
                end = min(end, max_length - 1)
                if start >= (max_length - 1):
                    continue
                    
            chunk_size = end - start
            if chunk_size >= 1:
                chunk_embedding = embeddings[start:end].mean(axis=0) if len(embeddings.shape) > 1 else embeddings
                chunk_embeddings.append(chunk_embedding)
                
        return chunk_embeddings

    @staticmethod
    def cosine_similarity(vec1: List[float], vec2: List[float]) -> float:
        """Calculate cosine similarity between two vectors"""
        vec1, vec2 = np.array(vec1), np.array(vec2)
        return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

In [7]:
chunker = OpenAILateChunker()

text = """As Weaviate celebrates its fifth anniversary, we've had the privilege of collaborating with tens of thousands of developers, gaining invaluable insights into the evolving landscape of AI projects and strategies. Our users constantly push the boundaries of what’s possible. As they continue to scale their applications in production, they guide the evolution of our product and the market itself. The need for optionality One of the main reasons developers choose Weaviate is the optionality it offers in terms of machine learning models, frameworks, and deployment. With new AI models and tools emerging daily, it's crucial to build systems that allow flexibility for tech stacks to evolve. This optionality, combined with ease of use, helps teams scale AI prototypes into production faster. Flexibility is also vital when it comes to architecture. Different use cases have different requirements. For example, we work with many software companies and those operating in regulated industries. They often require multi-tenancy to isolate data and maintain compliance. When building a Retrieval Augmented Generation (RAG) application, using account or user-specific data to contextualize results, data must remain within a dedicated tenant for its user group. Weaviate’s native, multi-tenant architecture shines for customers who need to prioritize data privacy while maintaining fast retrieval and accuracy. On the other hand, we support some very large scale single-tenant use cases that orient toward real-time data access. Many of these are in e-commerce and industries that compete on speed and customer experience. Even the slightest latency can send their users looking elsewhere. These use cases leverage our HNSW index on hot storage and vector compression to ensure low latency. The point is, there is no one-size-fits-all solution so optionality is key. I’m very proud that through learning from our customers and community, we’re building a solution that supports diverse use cases and the evolving needs of developers. Introducing hot, warm, and cold storage tiers It’s amazing to see our customers' products gain popularity, attracting more users, and in many cases, tenants. However, as multi-tenant use cases scale, infrastructure costs can quickly become prohibitive. Since multi-tenancy is a core tenet of our architecture, the next logical step for us was to build a way to help customers drive more efficient resource consumption. We’re pleased to offer tenant offloading and hot, warm, and cold storage tiers as part of our latest release. Weaviate users (Open Source and Enterprise Cloud) can now deactivate or offload tenants to less-expensive warm or cold storage and reactivate them dynamically, based on the unique patterns of their use case. Here’s what it might look like in practice: One of our customers develops an email platform with tens of thousands of users. 80% of their users are only active during a 12-hour window (US business hours). With our new storage tiers, they can offload tenants to cold storage to save on infrastructure costs when users are inactive. When a user comes online, they can quickly warm up the tenant. This way they reduce storage costs while still offering performance that meets the needs of their customers. alt The Weaviate AI Unit To adapt to this product change and the evolving AI stack, we’ve introduced a new pricing unit to our Enterprise Cloud offering. An AI Unit (AIU) is a Weaviate-specific unit that can be applied to hot, warm, and cold storage tiers and compute costs. AIUs enable customers to better monitor usage and improve budgeting. In addition to resource costs, AIUs will apply to new AI-native Apps as they are released (more on that next). Apps and tools to fuel AI-native development As we continue to listen to our community, it’s clear that developers need an AI-native framework offering not just flexibility, but also modular GUI tools to interact with their data and accelerate their use cases. We’re excited about a new line of AI-native apps and tools that will help developers and business users accelerate common use cases. Recommender App Our first app is a Recommender service, now in private beta. The Recommender is a fully managed, low-code way to build scalable recommendation systems. It offers configurable endpoints for item-to-item, item-to-user, and user-to-user recommendation scenarios across multimodal data. Sign up for the private beta here, and stay tuned for more Apps updates coming soon. alt Weaviate Cloud Tools Lastly, new Weaviate Cloud Tools give developers and non-technical users an easier way to manage, explore, and interact with their data within Weaviate Cloud. The Query and Collections tools are available now in the Weaviate Cloud Console. It’s been an exciting few months, and I’m ecstatic to continue learning from our community and empowering developers to build the future of AI-native possibilities. To dive deeper into our latest product updates, join our upcoming webinar. """

chunks, span_annotations = chunker.chunk_by_tokens(text, chunk_size=128)

print("Chunks created:")
for i, chunk in enumerate(chunks):
    print(f"\nChunk {i}:")
    print(chunk)

Chunks created:

Chunk 0:
As Weaviate celebrates its fifth anniversary, we've had the privilege of collaborating with tens of thousands of developers, gaining invaluable insights into the evolving landscape of AI projects and strategies. Our users constantly push the boundaries of what’s possible. As they continue to scale their applications in production, they guide the evolution of our product and the market itself. The need for optionality One of the main reasons developers choose Weaviate is the optionality it offers in terms of machine learning models, frameworks, and deployment. With new AI models and tools emerging daily, it's crucial to build systems that allow flexibility for tech stacks to evolve. This option

Chunk 1:
ality, combined with ease of use, helps teams scale AI prototypes into production faster. Flexibility is also vital when it comes to architecture. Different use cases have different requirements. For example, we work with many software companies and those opera

In [8]:
text_embeddings = chunker.get_embeddings(text)

chunk_embeddings = chunker.late_chunking(text_embeddings, span_annotations)

print(f"Number of chunks: {len(chunks)}")
print(f"Number of chunk embeddings: {len(chunk_embeddings)}")

Number of chunks: 8
Number of chunk embeddings: 8


In [9]:
query_text = "what do customers need to prioritize?"
query_embedding = chunker.get_embeddings(query_text)

results = []
for i, chunk_embedding in enumerate(chunk_embeddings):
    similarity = chunker.cosine_similarity(query_embedding, chunk_embedding)
    results.append((similarity, i))

results.sort(reverse=True)

print(f"Query: {query_text}\n")
print("Top 2 most relevant chunks:")
for similarity, chunk_idx in results[:2]:
    print(f"\nChunk {chunk_idx}: Similarity = {similarity:.4f}")
    print(f"Text: {chunks[chunk_idx]}")

Query: what do customers need to prioritize?

Top 2 most relevant chunks:

Chunk 7: Similarity = 0.2628
Text:  non-technical users an easier way to manage, explore, and interact with their data within Weaviate Cloud. The Query and Collections tools are available now in the Weaviate Cloud Console. It’s been an exciting few months, and I’m ecstatic to continue learning from our community and empowering developers to build the future of AI-native possibilities. To dive deeper into our latest product updates, join our upcoming webinar. 

Chunk 6: Similarity = 0.2628
Text:  interact with their data and accelerate their use cases. We’re excited about a new line of AI-native apps and tools that will help developers and business users accelerate common use cases. Recommender App Our first app is a Recommender service, now in private beta. The Recommender is a fully managed, low-code way to build scalable recommendation systems. It offers configurable endpoints for item-to-item, item-to-user, a

# METHOD 4 

In [10]:
%%capture
!pip install semchunk

In [23]:
import os
import tiktoken
import semchunk
from openai import OpenAI
import numpy as np

In [24]:
def get_embeddings(text: str) -> list[float]:
    """Get embeddings for a text using OpenAI's embedding model."""
    client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
    response = client.embeddings.create(
        model="text-embedding-3-small",
        input=text
    )
    return response.data[0].embedding

def cosine_similarity(vec1: list[float], vec2: list[float]) -> float:
    """Calculate cosine similarity between two vectors."""
    vec1, vec2 = np.array(vec1), np.array(vec2)
    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

In [25]:
# Example text - we can replace this as we test
# text = """As Weaviate celebrates its fifth anniversary, we've had the privilege of collaborating with tens of thousands of developers, gaining invaluable insights into the evolving landscape of AI projects and strategies. Our users constantly push the boundaries of what’s possible. As they continue to scale their applications in production, they guide the evolution of our product and the market itself. The need for optionality One of the main reasons developers choose Weaviate is the optionality it offers in terms of machine learning models, frameworks, and deployment. With new AI models and tools emerging daily, it's crucial to build systems that allow flexibility for tech stacks to evolve. This optionality, combined with ease of use, helps teams scale AI prototypes into production faster. Flexibility is also vital when it comes to architecture. Different use cases have different requirements. For example, we work with many software companies and those operating in regulated industries. They often require multi-tenancy to isolate data and maintain compliance. When building a Retrieval Augmented Generation (RAG) application, using account or user-specific data to contextualize results, data must remain within a dedicated tenant for its user group. Weaviate’s native, multi-tenant architecture shines for customers who need to prioritize data privacy while maintaining fast retrieval and accuracy. On the other hand, we support some very large scale single-tenant use cases that orient toward real-time data access. Many of these are in e-commerce and industries that compete on speed and customer experience. Even the slightest latency can send their users looking elsewhere. These use cases leverage our HNSW index on hot storage and vector compression to ensure low latency. The point is, there is no one-size-fits-all solution so optionality is key. I’m very proud that through learning from our customers and community, we’re building a solution that supports diverse use cases and the evolving needs of developers. Introducing hot, warm, and cold storage tiers It’s amazing to see our customers' products gain popularity, attracting more users, and in many cases, tenants. However, as multi-tenant use cases scale, infrastructure costs can quickly become prohibitive. Since multi-tenancy is a core tenet of our architecture, the next logical step for us was to build a way to help customers drive more efficient resource consumption. We’re pleased to offer tenant offloading and hot, warm, and cold storage tiers as part of our latest release. Weaviate users (Open Source and Enterprise Cloud) can now deactivate or offload tenants to less-expensive warm or cold storage and reactivate them dynamically, based on the unique patterns of their use case. Here’s what it might look like in practice: One of our customers develops an email platform with tens of thousands of users. 80% of their users are only active during a 12-hour window (US business hours). With our new storage tiers, they can offload tenants to cold storage to save on infrastructure costs when users are inactive. When a user comes online, they can quickly warm up the tenant. This way they reduce storage costs while still offering performance that meets the needs of their customers. alt The Weaviate AI Unit To adapt to this product change and the evolving AI stack, we’ve introduced a new pricing unit to our Enterprise Cloud offering. An AI Unit (AIU) is a Weaviate-specific unit that can be applied to hot, warm, and cold storage tiers and compute costs. AIUs enable customers to better monitor usage and improve budgeting. In addition to resource costs, AIUs will apply to new AI-native Apps as they are released (more on that next). Apps and tools to fuel AI-native development As we continue to listen to our community, it’s clear that developers need an AI-native framework offering not just flexibility, but also modular GUI tools to interact with their data and accelerate their use cases. We’re excited about a new line of AI-native apps and tools that will help developers and business users accelerate common use cases. Recommender App Our first app is a Recommender service, now in private beta. The Recommender is a fully managed, low-code way to build scalable recommendation systems. It offers configurable endpoints for item-to-item, item-to-user, and user-to-user recommendation scenarios across multimodal data. Sign up for the private beta here, and stay tuned for more Apps updates coming soon. alt Weaviate Cloud Tools Lastly, new Weaviate Cloud Tools give developers and non-technical users an easier way to manage, explore, and interact with their data within Weaviate Cloud. The Query and Collections tools are available now in the Weaviate Cloud Console. It’s been an exciting few months, and I’m ecstatic to continue learning from our community and empowering developers to build the future of AI-native possibilities. To dive deeper into our latest product updates, join our upcoming webinar. """
text = ""
with open('demo_paper_to_chunk.txt', 'r') as f:
    text = f.read()
chunker = semchunk.chunkerify('gpt-4o', chunk_size=2000)

In [26]:
chunks, offsets = chunker(text, offsets=True)

print("Generated chunks:")
for i, chunk in enumerate(chunks):
    print(f"\nChunk {i}:")
    # Chunks can be printed, not printing right now because the output will be too long
    print(f"Offset: {offsets[i]}")

chunk_embeddings = [get_embeddings(chunk) for chunk in chunks]

Generated chunks:

Chunk 0:
Offset: (0, 9338)

Chunk 1:
Offset: (9339, 19185)

Chunk 2:
Offset: (19186, 25964)

Chunk 3:
Offset: (25965, 32717)

Chunk 4:
Offset: (32718, 40961)

Chunk 5:
Offset: (40962, 49260)

Chunk 6:
Offset: (49261, 57912)

Chunk 7:
Offset: (57913, 66572)

Chunk 8:
Offset: (66573, 73308)

Chunk 9:
Offset: (73309, 80218)

Chunk 10:
Offset: (80219, 85609)


In [27]:
def find_relevant_chunks(query: str, chunks: list[str], chunk_embeddings: list[list[float]], top_k: int = 2) -> list[tuple[float, int, str]]:
    """Find the most relevant chunks for a given query."""
    query_embedding = get_embeddings(query)
    
    similarities = []
    for i, chunk_embedding in enumerate(chunk_embeddings):
        similarity = cosine_similarity(query_embedding, chunk_embedding)
        similarities.append((similarity, i, chunks[i]))
    
    similarities.sort(reverse=True)
    
    return similarities[:top_k]

query = "Explain the challenges faced in the eBay dataset for collusion detection and how KnowGraph addresses them."

print(f"\nQuery: {query}")
print("\nTop relevant chunks:")
for score, idx, chunk in find_relevant_chunks(query, chunks, chunk_embeddings):
    print(f"\nSimilarity score: {score:.4f}")
    print(f"Chunk {idx}:")
    print(chunk)


Query: Explain the challenges faced in the eBay dataset for collusion detection and how KnowGraph addresses them.

Top relevant chunks:

Similarity score: 0.7313
Chunk 6:
Used in Business Rule, “[feedback amt < 𝑎] ∧
[(seller_age < 𝑏 ∨ buyer_age < 𝑐)] ⇒
collusion”
Business Domain knowledge using labeled attributes of the
zip code, price, and gross value of a transaction
Used in Business Rule, “[(gmv − price >
𝑑] ∧ [billing_zip ≠ delivery_zip] ⇒
collusion”
5.2 Collusion detection on real-world eBay
marketplace dataset
In this section, we describe the real-world eBay marketplace dataset
and our collusion detection results and analysis in detail.
Dataset. We use a large-scale proprietary dataset on real-world
marketplace transactions from the popular online shopping website
eBay, which has more than 135 million users [76]. The dataset contains transactions from 40 days total, each with around 4 million
transactions, collected from January to February of 2022. Each transaction consists of 

# We can observe the output above has answer to the question being asked. Method 4 seems best till now. 