In [None]:
import os
import json

def load_or_initialize_json(filepath, default=None):
    """Load JSON data from ``filepath``, or return a default when the file is absent.

    Args:
        filepath: Path of the JSON file to read.
        default: Value returned when ``filepath`` does not exist. When omitted
            (``None``), a fresh empty list is returned.

    Returns:
        The decoded JSON content of the file, or ``default`` if the file does
        not exist.

    Raises:
        json.JSONDecodeError: If the file exists but does not contain valid JSON.
    """
    if default is None:
        # Build the fallback per call so callers never share one mutable list.
        default = []
    if os.path.exists(filepath):
        # Decode explicitly as UTF-8 rather than relying on the platform's
        # default encoding, which may not be able to read non-ASCII text.
        with open(filepath, "r", encoding="utf-8") as f:
            return json.load(f)
    return default

# File that holds the previously scraped records.
SCRAPED_CONTENTS_FILE = "scraped_contents.json"

# Falls back to an empty list when the file does not exist yet.
scraped_contents = load_or_initialize_json(SCRAPED_CONTENTS_FILE)

# Keep only the 'content' field of every record
# (records are presumably tweets with a text body -- TODO confirm schema).
text_contents = [record['content'] for record in scraped_contents]

# Quick sanity check: number of texts, then the container type.
print(len(text_contents), type(text_contents), sep="\n")


In [None]:
import os
import cohere
import numpy as np
from dotenv import load_dotenv

# Load environment variables (including the API key) from a local .env file, if any.
load_dotenv()

model_name = "embed-english-v3.0"
api_key: str | None = os.getenv("COHERE_API_KEY")
# The corpus texts are embedded as documents; the query cell uses "search_query".
input_type_embed = "search_document"

# Fail fast with a clear message when the key is not configured.
if api_key is None:
    raise ValueError("Please set the COHERE_API_KEY environment variable.")
co = cohere.Client(api_key)

# An empty corpus (e.g. the scraped contents file is missing, so the loader
# returned its empty default) would otherwise reach the API with nothing to
# embed and then crash on `embeds[0]` below with an opaque IndexError.
if not text_contents:
    raise ValueError(
        "No text contents to embed; make sure the scraped contents file exists and is not empty."
    )

# Embed every text of the corpus.
embeds: list[list[float]] = co.embed(
    texts=text_contents, model=model_name, input_type=input_type_embed
).embeddings

# Sanity check: container type, number of embeddings, embedding dimensionality.
print(type(embeds), len(embeds), len(embeds[0]))
array_embeds = np.array(embeds)
print(array_embeds.shape)
# Persist the embeddings so the query cell can load them without re-embedding.
np.save('embeddings.npy', array_embeds)

In [None]:
import numpy as np

def cosine_similarity_matrix(vectors, query_vec):
    """Compute the cosine similarity of every row of ``vectors`` with ``query_vec``.

    Args:
        vectors: 2-D array-like whose rows are the vectors to compare
            (norms are taken along ``axis=1``).
        query_vec: The vector each row is compared against; expected to be
            1-D with as many entries as ``vectors`` has columns.

    Returns:
        The dot products of the rows with ``query_vec`` divided by the product
        of the corresponding norms.

    Raises:
        ValueError: If ``query_vec`` or any row of ``vectors`` has zero norm.
    """
    # Numerators: dot product of each row with the query.
    numerators = np.dot(vectors, query_vec)

    row_lengths = np.linalg.norm(vectors, axis=1)
    query_length = np.linalg.norm(query_vec)

    # Cosine similarity needs non-zero norms on both sides of the comparison.
    all_norms_nonzero = query_length != 0 and np.all(row_lengths != 0)
    if not all_norms_nonzero:
        raise ValueError("Cosine similarity is not defined when one or both vectors are zero vectors.")

    denominators = row_lengths * query_length
    return numerators / denominators


In [None]:
import os
import cohere
from dotenv import load_dotenv

# Load environment variables (including the API key) from a local .env file, if any.
load_dotenv()

model_name = "embed-english-v3.0"
api_key: str | None = os.getenv("COHERE_API_KEY")
# The query is embedded as a search query; the corpus was embedded with "search_document".
input_type_embed = "search_query"
QUERY="LMM Tools do computation math"
# Maximum number of ranked results to print.
TOP_K = 25

# Fail fast with a clear message when the key is not configured.
if api_key is None:
    raise ValueError("Please set the COHERE_API_KEY environment variable.")
co = cohere.Client(api_key)

# Embed the single query text and flatten the (1, dim) result to a 1-D vector.
query_embed: list[list[float]] = co.embed(
    texts=[QUERY], model=model_name, input_type=input_type_embed
).embeddings
query_array = np.array(query_embed)
query_array = query_array.reshape(-1)

# Document embeddings saved by the embedding cell.
# NOTE(review): rows are assumed to line up with `scraped_contents`; re-run the
# embedding cell if the scraped data changed since 'embeddings.npy' was written.
embeds_dataset = np.load('embeddings.npy')

# Best-effort fallback: if the similarity cannot be computed (zero vectors),
# the error is printed and every score stays 0, so the ranking below is arbitrary.
similarity_results = np.zeros((embeds_dataset.shape[0],), dtype=np.float32)
try:
    similarity_results = cosine_similarity_matrix(embeds_dataset, query_array)
except ValueError as e:
    print(e)

# Rank from most to least similar and print at most TOP_K results; slicing
# avoids an IndexError when fewer than TOP_K documents are available.
sorted_indices = np.argsort(similarity_results)[::-1]
for idx in sorted_indices[:TOP_K]:
    print(similarity_results[idx], scraped_contents[idx])