In [4]:
!pip install -q langchain-community langchain_chroma langchain-nvidia-ai-endpoints

In [2]:
import os

# Make the NVIDIA API key available to the langchain NVIDIA clients through
# the NVIDIA_API_KEY environment variable.
#
# In Colab the key is read from the notebook's secret store, exactly as before.
# Outside Colab `google.colab` cannot be imported, so instead of failing the
# cell falls back to a key that is already exported in the environment, or
# prompts for one without echoing it.
try:
    from google.colab import userdata
except ImportError:
    userdata = None  # not running inside Colab

if userdata is not None:
    os.environ['NVIDIA_API_KEY'] = userdata.get('NVIDIA_API_KEY')
elif not os.environ.get('NVIDIA_API_KEY'):
    from getpass import getpass

    os.environ['NVIDIA_API_KEY'] = getpass('NVIDIA_API_KEY: ')

In [3]:
from langchain_nvidia_ai_endpoints import ChatNVIDIA, NVIDIAEmbeddings
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Sentences to compare (a mix of English and Thai)
documents = [
    "I like chicken rice",
    "สมชายชอบกินข้าวขาหมู",
    "A lazy dog chases a furry cat",
    "The apple is sweeter than orange.",
    "ฉันชอบกินข้าวมันไก่",
]

# Embedding client used for every sentence
embedding_model = NVIDIAEmbeddings(model="nvidia/llama-3.2-nv-embedqa-1b-v2")

# Embed all sentences in a single call
print("Generating embeddings...")
embeddings = embedding_model.embed_documents(documents)

# Keep a NumPy copy of the embeddings for the similarity computation
embeddings_array = np.array(embeddings)

# Pairwise cosine similarity between all embedded sentences;
# entry [i][j] is read below as "documents[i] vs documents[j]"
similarity_matrix = cosine_similarity(embeddings_array)

# Display the similarity matrix as a table: one labelled row per document,
# one "Doc <n>" column per document.
print("\nSimilarity Matrix:")
print("=" * 50)

preview_len = 20  # characters of each sentence shown in its row label
col_width = 12    # width of every score column

# Build the row labels first so the label column can be sized to the longest
# one. A fixed width of 30 was too small ("Doc 0 - " + 20 chars + "..." is 31
# characters), which glued the first score onto the label.
row_labels = []
for i, doc in enumerate(documents):
    # Only mark the preview with "..." when the sentence was actually cut.
    preview = doc if len(doc) <= preview_len else doc[:preview_len] + "..."
    row_labels.append(f"Doc {i} - {preview}")

# +2 guarantees a visible gap between the label and the first score.
# NOTE: len() counts code points; Thai combining marks occupy no column of
# their own, so Thai rows may still render slightly narrower than English ones.
label_width = max((len(label) for label in row_labels), default=0) + 2

# Print header
print(" " * label_width, end="")
for i in range(len(documents)):
    print(f"{f'Doc {i}':<{col_width}}", end="")
print()

# Print similarity values
for label, row in zip(row_labels, similarity_matrix):
    print(f"{label:<{label_width}}", end="")
    for value in row:
        print(f"{value:<{col_width}.4f}", end="")
    print()

# Ranked listing of every distinct document pair
print("\n" + "=" * 50)
print("Top Similarity Pairs (excluding self-similarity):")
print("=" * 50)

# Upper triangle only: each unordered pair appears once and no document is
# paired with itself. Entries are (first_index, second_index, score),
# ordered from most to least similar.
doc_count = len(documents)
pairs = sorted(
    (
        (first, second, similarity_matrix[first][second])
        for first in range(doc_count)
        for second in range(first + 1, doc_count)
    ),
    key=lambda pair: pair[2],
    reverse=True,
)

# Print every pair with its rank, score and both sentences
for rank, (first, second, pair_score) in enumerate(pairs, start=1):
    print(f"{rank:2d}. Doc {first} & Doc {second}: {pair_score:.4f}")
    print(f"    '{documents[first]}'")
    print(f"    '{documents[second]}'")
    print()

# Analyze specific relationships
print("=" * 50)
print("Cross-language Analysis:")
print("=" * 50)

# Compare the food-preference sentences across English and Thai:
#   0 - English, chicken rice
#   1 - Thai, stewed pork leg rice (a different dish, kept as a near-miss contrast)
#   4 - Thai, chicken rice
rice_dish_indices = [0, 1, 4]
# Previous name kept as an alias in case another cell still reads it.
chicken_rice_indices = rice_dish_indices
print("Rice dish related sentences:")
# Cosine similarity is symmetric, so each unordered pair is printed once
# instead of once per direction.
for pos, i in enumerate(rice_dish_indices):
    for j in rice_dish_indices[pos + 1:]:
        print(f"  '{documents[i]}' vs '{documents[j]}': {similarity_matrix[i][j]:.4f}")

Generating embeddings...

Similarity Matrix:
                              Doc 0       Doc 1       Doc 2       Doc 3       Doc 4       
Doc 0 - I like chicken rice...1.0000      0.7656      0.5292      0.5985      0.8441      
Doc 1 - สมชายชอบกินข้าวขาหมู...0.7656      1.0000      0.5075      0.4970      0.8217      
Doc 2 - A lazy dog chases a ...0.5292      0.5075      1.0000      0.5271      0.5281      
Doc 3 - The apple is sweeter...0.5985      0.4970      0.5271      1.0000      0.5312      
Doc 4 - ฉันชอบกินข้าวมันไก่...0.8441      0.8217      0.5281      0.5312      1.0000      

Top Similarity Pairs (excluding self-similarity):
 1. Doc 0 & Doc 4: 0.8441
    'I like chicken rice'
    'ฉันชอบกินข้าวมันไก่'

 2. Doc 1 & Doc 4: 0.8217
    'สมชายชอบกินข้าวขาหมู'
    'ฉันชอบกินข้าวมันไก่'

 3. Doc 0 & Doc 1: 0.7656
    'I like chicken rice'
    'สมชายชอบกินข้าวขาหมู'

 4. Doc 0 & Doc 3: 0.5985
    'I like chicken rice'
    'The apple is sweeter than orange.'

 5. Doc 3 & Doc 4: 0.53

In [5]:
from langchain_nvidia_ai_endpoints import ChatNVIDIA, NVIDIAEmbeddings
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Sentences under comparison (mixed English / Thai)
documents = [
    "I like chicken rice",
    "สมชายชอบกินข้าวขาหมู",
    "A lazy dog chases a furry cat",
    "The apple is sweeter than orange.",
    "ฉันชอบกินข้าวมันไก่",
]

# Chat model for the written analysis, embedding model for the vectors
llm = ChatNVIDIA(model="meta/llama-3.2-3b-instruct")
embedder = NVIDIAEmbeddings(model="nvidia/llama-3.2-nv-embedqa-1b-v2")

# Embed every sentence in one call
print("Generating embeddings...")
embeddings = embedder.embed_documents(documents)

# Pairwise cosine similarity between all embedded sentences
similarity_matrix = cosine_similarity(embeddings)

# Legend: which index belongs to which sentence
print("\nSimilarity Matrix:")
print("Rows/Columns represent documents in order:")
for position, sentence in enumerate(documents):
    print(f"{position}: {sentence}")

print("\nSimilarity Scores (cosine similarity):")
print("Format: [row_doc_index, col_doc_index] = similarity_score")
print("-" * 50)

# Walk the upper triangle (diagonal included) so no pair is listed twice
total_docs = len(documents)
for row in range(total_docs):
    for col in range(row, total_docs):
        value = similarity_matrix[row][col]
        print(f"[{row}, {col}] = {value:.4f}")
        if row == col:
            continue  # diagonal entry: there is no sentence pair to show
        print(f"  '{documents[row]}' <-> '{documents[col]}'")

# Banner for the ranked-pairs section
print("\n" + "="*60)
print("TOP SIMILARITY PAIRS:")
print("="*60)

# Every unordered pair once, stored as
# (index_a, index_b, score, sentence_a, sentence_b) and ordered from the
# highest score to the lowest
n_docs = len(documents)
pairs = sorted(
    (
        (a, b, similarity_matrix[a][b], documents[a], documents[b])
        for a in range(n_docs)
        for b in range(a + 1, n_docs)
    ),
    key=lambda entry: entry[2],
    reverse=True,
)

# Show the five highest-scoring pairs
for rank, (a, b, pair_score, sentence_a, sentence_b) in enumerate(pairs[:5], start=1):
    print(f"{rank}. Similarity: {pair_score:.4f}")
    print(f"   '{sentence_a}'")
    print(f"   '{sentence_b}'")
    print()

import textwrap

# Analyze using LLM for semantic understanding
print("="*60)
print("SEMANTIC ANALYSIS USING LLM:")
print("="*60)

# Ask the LLM to explain the highest-scoring pair; skipped when `pairs` is empty
if pairs:
    most_similar_pair = pairs[0]
    # Positions 3 and 4 of each pair tuple hold the two sentences
    doc1, doc2 = most_similar_pair[3], most_similar_pair[4]

    # Dedent and strip the template BEFORE filling it in, so the model does not
    # receive this cell's source indentation or leading/trailing blank lines,
    # and so the sentence text itself can never influence the dedent margin.
    prompt_template = textwrap.dedent("""
        Analyze the semantic similarity between these two sentences:

        Sentence 1: "{doc1}"
        Sentence 2: "{doc2}"

        Please explain:
        1. What is the semantic relationship between these sentences?
        2. Why might they be considered similar or different?
        3. Do they share similar concepts, themes, or meanings?
    """).strip()
    prompt = prompt_template.format(doc1=doc1, doc2=doc2)

    print("Analyzing most similar pair:")
    print(f"'{doc1}'")
    print(f"'{doc2}'")
    print("\nLLM Analysis:")
    print("-" * 30)

    response = llm.invoke(prompt)
    # Print the message text when the response exposes `.content`,
    # otherwise fall back to its string form
    print(response.content if hasattr(response, 'content') else str(response))

# Additional analysis: rank the documents by similarity to a free-text query
print("\n" + "="*60)
print("QUERY SIMILARITY EXAMPLE:")
print("="*60)

query = "I enjoy eating chicken rice"
print(f"Query: '{query}'")

# Get embedding for query
query_embedding = embedder.embed_query(query)

# Score the query against all documents with a single vectorized call instead
# of one cosine_similarity call per document. The result has one row (the
# query) and one column per document, so row 0 holds the per-document scores.
# The length guard keeps the "no documents" case an empty ranking rather than
# an error from scikit-learn.
if len(embeddings) > 0:
    query_scores = cosine_similarity([query_embedding], embeddings)[0]
else:
    query_scores = []

# (document index, sentence, similarity to the query)
query_similarities = [
    (i, doc, similarity)
    for i, (doc, similarity) in enumerate(zip(documents, query_scores))
]

# Sort by similarity, most similar first
query_similarities.sort(key=lambda x: x[2], reverse=True)

print("\nSimilarity ranking:")
for rank, (idx, doc, score) in enumerate(query_similarities, 1):
    print(f"{rank}. {score:.4f} - '{doc}'")

Generating embeddings...

Similarity Matrix:
Rows/Columns represent documents in order:
0: I like chicken rice
1: สมชายชอบกินข้าวขาหมู
2: A lazy dog chases a furry cat
3: The apple is sweeter than orange.
4: ฉันชอบกินข้าวมันไก่

Similarity Scores (cosine similarity):
Format: [row_doc_index, col_doc_index] = similarity_score
--------------------------------------------------
[0, 0] = 1.0000
[0, 1] = 0.7656
  'I like chicken rice' <-> 'สมชายชอบกินข้าวขาหมู'
[0, 2] = 0.5292
  'I like chicken rice' <-> 'A lazy dog chases a furry cat'
[0, 3] = 0.5985
  'I like chicken rice' <-> 'The apple is sweeter than orange.'
[0, 4] = 0.8441
  'I like chicken rice' <-> 'ฉันชอบกินข้าวมันไก่'
[1, 1] = 1.0000
[1, 2] = 0.5075
  'สมชายชอบกินข้าวขาหมู' <-> 'A lazy dog chases a furry cat'
[1, 3] = 0.4970
  'สมชายชอบกินข้าวขาหมู' <-> 'The apple is sweeter than orange.'
[1, 4] = 0.8217
  'สมชายชอบกินข้าวขาหมู' <-> 'ฉันชอบกินข้าวมันไก่'
[2, 2] = 1.0000
[2, 3] = 0.5271
  'A lazy dog chases a furry cat' <-> 'The ap