In [None]:
import json

def load_jsonl(file_path):
    """Read a JSON Lines file and return its records as a list.

    Every non-blank line of the file is parsed independently with
    ``json.loads`` and the results are returned in file order.

    Args:
        file_path: Path of the ``.jsonl`` file to read.

    Returns:
        A list with one parsed JSON value per non-blank line. An empty
        file yields an empty list.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8 so accented / non-ASCII text is read the
    # same way on every platform instead of depending on the locale default.
    with open(file_path, 'r', encoding='utf-8') as file:
        # Blank or whitespace-only lines (e.g. a trailing empty line) carry
        # no record and would make json.loads fail, so they are skipped.
        return [json.loads(line) for line in file if line.strip()]

# Identifier interpolated into every data path used by this notebook.
COURS = "ATS800"
# JSONL input; going by the file name these are audio summaries (TODO confirm).
file_path = f'../data/{COURS}/embeds/summaries_audio_{COURS}.jsonl'
contents = load_jsonl(file_path)

# Quick look at what was loaded: record count, then the first record and its type.
print(len(contents))
print(contents[0], type(contents[0]), sep="\n")

# Keep only the 'content' field of each record, in file order.
text_contents = [record['content'] for record in contents]

# Same inspection for the extracted values.
print(len(text_contents), type(text_contents), sep="\n")
print(type(text_contents[0]), text_contents[0], sep="\n")


In [None]:
import os
import cohere
import numpy as np
from dotenv import load_dotenv

# Run load_dotenv() before looking up the API key in the environment.
load_dotenv()

# Embedding request settings.
model_name = "embed-multilingual-v3.0"
api_key: str | None = os.getenv("COHERE_API_KEY")
input_type_embed = "search_document"

# Fail early with a clear message when no key is configured, then build the client.
if api_key is None:
    raise ValueError("Please set the COHERE_API_KEY environment variable.")
co = cohere.Client(api_key)

# Number of texts sent per embed request.
# NOTE(review): 96 is presumably the per-call limit of the Cohere embed
# endpoint -- confirm against the API reference.
chunk_size = 96
text_chunks = [
    text_contents[start:start + chunk_size]
    for start in range(0, len(text_contents), chunk_size)
]

# Collect the vectors of all chunks, in the same order as text_contents.
embeds = []
for chunk in text_chunks:
    response = co.embed(
        texts=chunk,
        model=model_name,
        input_type=input_type_embed,
    )
    chunk_embeds: list[list[float]] = response.embeddings  # type: ignore
    embeds += chunk_embeds

# Report container type, number of vectors and length of the first vector.
print(type(embeds), len(embeds), len(embeds[0]))

# Convert to a NumPy array and show its shape.
array_embeds = np.array(embeds)
print(array_embeds.shape)

# Persist the array alongside the input file.
output_path = f'../data/{COURS}/embeds/embeddings_audio_{COURS}.npy'
np.save(output_path, array_embeds)