In [None]:
## Exercise 2: Chunk Size Impact on Retrieval

In [1]:
from sentence_transformers import SentenceTransformer, util
import numpy as np
print("Libraries imported successfully!")

Libraries imported successfully!


In [2]:
def cosine_similarity(vec1, vec2):
    """
    Calculate cosine similarity between two vectors.

    Args:
        vec1: First vector (1-D array-like accepted by ``np.dot``).
        vec2: Second vector, same length as ``vec1``.

    Returns:
        A score between -1 and 1 (higher = more similar). If either vector
        has zero magnitude the similarity is undefined, so 0.0 is returned
        rather than dividing by zero.
    """
    dot_product = np.dot(vec1, vec2)
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    # A zero vector has no direction; the unguarded division would emit a
    # RuntimeWarning and yield NaN, which makes sorting by score unreliable.
    if norm_product == 0:
        return 0.0
    return dot_product / norm_product

print("Similarity function ready!")

Similarity function ready!


In [4]:
# Long document: four short paragraphs (artificial intelligence, machine
# learning, deep learning, natural language processing) that serve as the
# text to be chunked and searched in this exercise.
document = """
Artificial intelligence (AI) is intelligence demonstrated by machines, in contrast to
the natural intelligence displayed by humans and animals. Leading AI textbooks define
the field as the study of intelligent agents: any device that perceives its environment
and takes actions that maximize its chance of successfully achieving its goals.

Machine learning is a subset of artificial intelligence that focuses on the use of data
and algorithms to imitate the way that humans learn, gradually improving its accuracy.
Machine learning is an important component of the growing field of data science.

Deep learning is part of a broader family of machine learning methods based on artificial
neural networks with representation learning. Learning can be supervised, semi-supervised
or unsupervised. Deep learning architectures such as deep neural networks, deep belief
networks, recurrent neural networks and convolutional neural networks have been applied
to fields including computer vision, speech recognition, natural language processing,
machine translation, and bioinformatics.

Natural language processing is a subfield of linguistics, computer science, and artificial
intelligence concerned with the interactions between computers and human language, in
particular how to program computers to process and analyze large amounts of natural
language data. Challenges in natural language processing frequently involve speech
recognition, natural language understanding, and natural language generation.
"""

# Function to split text into chunks
def chunk_text(text, chunk_size):
    """
    Split text into consecutive, non-overlapping fixed-width character chunks.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be a
            positive integer.

    Returns:
        A list of chunks in document order. Each chunk has leading and
        trailing whitespace stripped, and chunks that are empty after
        stripping are omitted.

    Raises:
        ValueError: If ``chunk_size`` is zero or negative (the window would
            never advance through the text).
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    chunks = []
    # Step through the text one window at a time; the final window may be
    # shorter than chunk_size.
    for start in range(0, len(text), chunk_size):
        chunk = text[start:start + chunk_size].strip()
        if chunk:
            chunks.append(chunk)
    return chunks

# Query
query = "What is machine learning?"

# Load model
model = SentenceTransformer("all-MiniLM-L6-v2")

# The query is the same for every chunk size, so embed it once up front
# instead of re-encoding it on each loop iteration.
query_embedding = model.encode([query])[0]

# Different chunk sizes
chunk_sizes = [100, 200, 400]

for size in chunk_sizes:
    print(f"\n{'='*10} Chunk Size: {size} {'='*10}")

    # Chunk document
    chunks = chunk_text(document, size)
    print(f"Number of chunks: {len(chunks)}")

    # Create embeddings for chunks (one embedding per chunk, same order)
    chunk_embeddings = model.encode(chunks)

    # Pair every chunk with its similarity to the query
    similarities = [
        (chunk, cosine_similarity(query_embedding, emb))
        for chunk, emb in zip(chunks, chunk_embeddings)
    ]

    # Sort by similarity (descending)
    similarities_sorted = sorted(similarities, key=lambda x: x[1], reverse=True)

    # Print top 3 results
    print("Top 3 chunks:")
    for chunk, score in similarities_sorted[:3]:
        print(f"- Score: {score:.4f}, Chunk: \"{chunk}\"")


Number of chunks: 16
Top 3 chunks:
- Score: 0.6617, Chunk: "nce of successfully achieving its goals.

Machine learning is a subset of artificial intelligence th"
- Score: 0.6029, Chunk: "ng its accuracy.
Machine learning is an important component of the growing field of data science.

D"
- Score: 0.4909, Chunk: "Artificial intelligence (AI) is intelligence demonstrated by machines, in contrast to
the natural i"

Number of chunks: 8
Top 3 chunks:
- Score: 0.6932, Chunk: "at focuses on the use of data
and algorithms to imitate the way that humans learn, gradually improving its accuracy.
Machine learning is an important component of the growing field of data science.

D"
- Score: 0.6098, Chunk: "ntelligent agents: any device that perceives its environment
and takes actions that maximize its chance of successfully achieving its goals.

Machine learning is a subset of artificial intelligence th"
- Score: 0.4997, Chunk: "eep learning is part of a broader family of machine learning methods b