In [4]:
from langchain_ollama import OllamaEmbeddings

In [5]:
# Connection settings for the Ollama server on localhost.
# NOTE(review): "gemma:2b" appears to be a general-purpose generative model rather
# than a dedicated embedding model — confirm it is the intended choice here.
OLLAMA_MODEL = "gemma:2b"
OLLAMA_BASE_URL = "http://localhost:11434"

embeddings = OllamaEmbeddings(model=OLLAMA_MODEL, base_url=OLLAMA_BASE_URL)

In [8]:
# Two near-identical sentences that serve as the toy corpus for this notebook.
greek_letter_docs = [
    "Alpha is the first letter of the Greek alphabet.",
    "Beta is the second letter of the Greek alphabet.",
]

# One embedding vector per input sentence, in the same order as the list above.
r1 = embeddings.embed_documents(greek_letter_docs)

In [11]:
len(r1[0]) # Length of the first document's embedding vector, i.e. its number of dimensions

2048

In [13]:
# Embed the question that the two documents will be ranked against.
query = "What is the second letter of the Greek alphabet?"
query_embedding = embeddings.embed_query(query)

# Show only the size and a short preview: printing every component of the vector
# floods the cell output without telling the reader anything useful.
print(f"{len(query_embedding)} dimensions, first 5 values: {query_embedding[:5]}")

[-0.036740024, -0.0058010593, -0.030905474, -0.00035853568, -0.008297962, -0.0029846425, -0.03560881, -0.015521313, -0.0017295667, -0.007713875, -0.0007875992, -0.0014955627, 0.024870455, 0.0160487, -0.0050935275, 0.010757694, 0.13312878, -0.009449341, -0.003391284, -0.0048198, 0.014739374, -0.013735264, 0.026499884, 0.006015866, -0.021792116, -0.021085624, 0.00047484253, -0.002157563, -0.012141372, -0.0069448524, -0.018328404, 0.017302996, 0.018818323, 0.012210767, 0.0016984342, -0.02041033, -0.023061173, 0.0053757476, 0.0070677362, 0.0026206048, -0.0048789713, 0.0312925, -0.022511631, -0.03517143, -0.03147661, -0.0019988078, 0.02732176, 0.0076930486, -0.026674971, 0.0032386386, -0.14555566, -0.064171985, -0.026723957, 0.032631326, 0.0018970622, -0.019456705, -0.0219236, 0.0014572687, -0.00043265102, -0.007091734, -0.012689769, 0.0108331395, -0.017577462, 0.0043155495, -0.062706426, -0.00028246554, 0.0009393583, -0.011708974, 0.005463172, 0.024130462, 0.026656419, -0.013587462, -0.025

In [14]:
import numpy as np

# Score each document embedding against the query embedding with a plain dot product.
dot_product_r1_0, dot_product_r1_1 = (
    np.dot(r1[doc_idx], query_embedding) for doc_idx in (0, 1)
)

print(f"Dot product of r1[0] (Alpha text) with query: {dot_product_r1_0}")
print(f"Dot product of r1[1] (Beta text) with query: {dot_product_r1_1}")

# Document 1 is reported only when it scores strictly higher; a tie reports document 0.
closest_doc = int(dot_product_r1_1 > dot_product_r1_0)

# Recap the query and both documents next to the verdict.
print(f"\nQuery: '{query}'")
print("Document 0: 'Alpha is the first letter of the Greek alphabet.'")
print("Document 1: 'Beta is the second letter of the Greek alphabet.'")
print(f"\nDocument {closest_doc} is more similar to the query (higher dot product)")
print(f"Similarity scores - Doc 0: {dot_product_r1_0:.4f}, Doc 1: {dot_product_r1_1:.4f}")


Dot product of r1[0] (Alpha text) with query: 0.5934741662961289
Dot product of r1[1] (Beta text) with query: 0.5828624567188632

Query: 'What is the second letter of the Greek alphabet?'
Document 0: 'Alpha is the first letter of the Greek alphabet.'
Document 1: 'Beta is the second letter of the Greek alphabet.'

Document 0 is more similar to the query (higher dot product)
Similarity scores - Doc 0: 0.5935, Doc 1: 0.5829


In [15]:
# Follow-up on the surprising ranking above: inspect vector sizes and norms,
# then repeat the comparison with cosine similarity.

print("=== DIAGNOSTIC INFORMATION ===")
print(f"r1[0] embedding length: {len(r1[0])}")
print(f"r1[1] embedding length: {len(r1[1])}")
print(f"query_embedding length: {len(query_embedding)}")

# L2 norms show whether the vectors differ in magnitude.
doc0_norm = np.linalg.norm(r1[0])
doc1_norm = np.linalg.norm(r1[1])
query_norm = np.linalg.norm(query_embedding)

print(f"\nr1[0] magnitude (L2 norm): {doc0_norm:.6f}")
print(f"r1[1] magnitude (L2 norm): {doc1_norm:.6f}")
print(f"query magnitude (L2 norm): {query_norm:.6f}")

# Cosine similarity divides out vector magnitude before comparing.
from sklearn.metrics.pairwise import cosine_similarity

# cosine_similarity works on 2-D inputs, so each vector is wrapped as a single row
# and the lone entry of the resulting 1x1 matrix is read back out.
query_row = [query_embedding]
cosine_sim_0 = cosine_similarity([r1[0]], query_row)[0, 0]
cosine_sim_1 = cosine_similarity([r1[1]], query_row)[0, 0]

# Document 1 is reported only when it scores strictly higher; a tie reports document 0.
closest_doc_cosine = int(cosine_sim_1 > cosine_sim_0)

print("\n=== COSINE SIMILARITY (NORMALIZED) ===")
print(f"Cosine similarity r1[0] with query: {cosine_sim_0:.6f}")
print(f"Cosine similarity r1[1] with query: {cosine_sim_1:.6f}")
print(f"Document {closest_doc_cosine} is more similar using cosine similarity")


=== DIAGNOSTIC INFORMATION ===
r1[0] embedding length: 2048
r1[1] embedding length: 2048
query_embedding length: 2048

r1[0] magnitude (L2 norm): 1.000000
r1[1] magnitude (L2 norm): 1.000000
query magnitude (L2 norm): 0.999999

=== COSINE SIMILARITY (NORMALIZED) ===
Cosine similarity r1[0] with query: 0.593474
Cosine similarity r1[1] with query: 0.582863
Document 0 is more similar using cosine similarity


In [16]:
# Probe the embedding behavior with full questions, bare letter names, and short phrases.
test_queries = [
    "What is the first letter of the Greek alphabet?",
    "What is the second letter of the Greek alphabet?",
    "Alpha",
    "Beta",
    "first letter",
    "second letter",
]

print("=== TESTING DIFFERENT QUERIES ===")
for query_number, test_query in enumerate(test_queries, start=1):
    test_embedding = embeddings.embed_query(test_query)
    # Wrapped as a single row because cosine_similarity works on 2-D inputs.
    test_row = [test_embedding]

    cos_sim_0 = cosine_similarity([r1[0]], test_row)[0, 0]
    cos_sim_1 = cosine_similarity([r1[1]], test_row)[0, 0]

    # The Alpha document is reported unless the Beta document scores strictly higher.
    if cos_sim_1 > cos_sim_0:
        winner = "Doc 1 (Beta)"
    else:
        winner = "Doc 0 (Alpha)"

    print(f"\nQuery {query_number}: '{test_query}'")
    print(f"  Alpha doc similarity: {cos_sim_0:.4f}")
    print(f"  Beta doc similarity:  {cos_sim_1:.4f}")
    print(f"  Winner: {winner}")


=== TESTING DIFFERENT QUERIES ===

Query 1: 'What is the first letter of the Greek alphabet?'
  Alpha doc similarity: 0.6233
  Beta doc similarity:  0.5854
  Winner: Doc 0 (Alpha)

Query 2: 'What is the second letter of the Greek alphabet?'
  Alpha doc similarity: 0.5935
  Beta doc similarity:  0.5829
  Winner: Doc 0 (Alpha)

Query 3: 'Alpha'
  Alpha doc similarity: 0.5196
  Beta doc similarity:  0.5098
  Winner: Doc 0 (Alpha)

Query 4: 'Beta'
  Alpha doc similarity: 0.4769
  Beta doc similarity:  0.4913
  Winner: Doc 1 (Beta)

Query 5: 'first letter'
  Alpha doc similarity: 0.6408
  Beta doc similarity:  0.6152
  Winner: Doc 0 (Alpha)

Query 6: 'second letter'
  Alpha doc similarity: 0.6705
  Beta doc similarity:  0.6516
  Winner: Doc 0 (Alpha)
