In [5]:
!pip install -qq langchain-openai tiktoken

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/50.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.7/50.7 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [6]:
import os
import numpy as np
from langchain_openai import OpenAIEmbeddings
from sklearn.metrics.pairwise import cosine_similarity

# Pull the key from Colab's secret store when running inside Colab.
# Outside Colab the `google.colab` package is not importable; in that case we
# fall back to whatever OPENAI_API_KEY is already set in the environment
# instead of crashing on the import.
try:
    from google.colab import userdata
except ImportError:
    userdata = None

if userdata is not None:
    os.environ['OPENAI_API_KEY'] = userdata.get('OPENAI_API_KEY')

# Load the OpenAI API key from the environment
openai_api_key = os.getenv("OPENAI_API_KEY")

# Fail early with an actionable message rather than on the first API call
if not openai_api_key:
    raise ValueError("Please set the environment variable for 'OPENAI_API_KEY'")

# Sample list of sentences that queries will be matched against
sentences = [
    "The cat sits on the mat.",
    "Dogs are great friends to humans.",
    "Birds can fly high in the sky.",
    "It is raining cats and dogs.",
    "The sky is blue and the sun is shining."
]

# Pin the embedding model explicitly so that vector size and similarity scores
# do not silently change if the library's default model changes.
EMBEDDING_MODEL_NAME = "text-embedding-ada-002"

# Initialize the embeddings model
embedding_model = OpenAIEmbeddings(
    model=EMBEDDING_MODEL_NAME,
    openai_api_key=openai_api_key,
)

# Convert sentences to embeddings (one API round-trip for the whole list)
sentence_embeddings = embedding_model.embed_documents(sentences)

# Sanity check: the similarity lookup indexes `sentences` by embedding position
assert len(sentence_embeddings) == len(sentences), "expected one embedding per sentence"

# Function to find the most similar sentence
def find_most_similar(query):
    """Return the stored sentence closest to ``query`` and its cosine score.

    The query is embedded with the module-level ``embedding_model`` and compared
    against the precomputed ``sentence_embeddings``.

    Args:
        query: Text to look up.

    Returns:
        A ``(sentence, score)`` pair: the entry of ``sentences`` whose embedding
        has the highest cosine similarity to the query, and that similarity value.
    """
    query_vector = embedding_model.embed_query(query)

    # The query is wrapped in a one-row batch; row 0 of the result holds its
    # score against every stored sentence.
    scores = cosine_similarity([query_vector], sentence_embeddings)[0]

    best = np.argmax(scores)
    return sentences[best], scores[best]

# Sample usage: embed a free-form question and retrieve the closest stored sentence.
# The query below shares no words with any entry in `sentences`, so a sensible
# match has to come from the embeddings rather than from keyword overlap.
query_text = "What's a good companion animal?"
most_similar_sentence, similarity_score = find_most_similar(query_text)

# Report the query, the best match, and its cosine similarity to 4 decimal places
print(f"Query: {query_text}")
print(f"Most Similar Sentence: '{most_similar_sentence}' with similarity score: {similarity_score:.4f}")

Query: What's a good companion animal?
Most Similar Sentence: 'Dogs are great friends to humans.' with similarity score: 0.8562


In [7]:
# Show which OpenAI embedding model `embedding_model` is configured to use
embedding_model.model

'text-embedding-ada-002'

In [8]:
# Check the container type returned by embed_documents
type(sentence_embeddings)

list

In [9]:
# Number of components in a single embedding vector (measured on the first one)
len(sentence_embeddings[0])

1536

In [10]:
# Peek at the first 10 components of the embedding at index 0

print(sentence_embeddings[0][:10])

[0.004806076642125845, -0.009483648464083672, -0.016628511250019073, -0.012934001162648201, -0.012240075506269932, 0.008121498860418797, -0.007453274447470903, -0.038525719195604324, -0.007710283622145653, 0.0006778624374419451]


In [11]:
# Peek at the first 10 components of the embedding at index 1
print(sentence_embeddings[1][:10])

[0.012955625541508198, -0.0018508036155253649, 0.005934372078627348, -0.0335877388715744, -0.013849973678588867, 0.018259605392813683, 0.002693912945687771, -0.007042990997433662, -0.004701538011431694, -0.014495891518890858]


In [12]:
# Peek at the first 10 components of the embedding at index 2
print(sentence_embeddings[2][:10])

[0.004376836586743593, -0.029178909957408905, -0.0019395208219066262, -0.018847322091460228, 0.004758791998028755, 0.01974898763000965, -0.011558855883777142, -0.022641832008957863, -0.0008570522186346352, -0.020450282841920853]


In [13]:
# Compact preview: first 10 components of the first three embeddings,
# rounded to 4 decimal places for readability.

for idx in (0, 1, 2):
    leading_values = sentence_embeddings[idx][:10]
    print(np.round(leading_values, 4))

[ 0.0048 -0.0095 -0.0166 -0.0129 -0.0122  0.0081 -0.0075 -0.0385 -0.0077
  0.0007]
[ 0.013  -0.0019  0.0059 -0.0336 -0.0138  0.0183  0.0027 -0.007  -0.0047
 -0.0145]
[ 0.0044 -0.0292 -0.0019 -0.0188  0.0048  0.0197 -0.0116 -0.0226 -0.0009
 -0.0205]
