In [46]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings 
from langchain_core.documents import Document
import json
import os
from dotenv import load_dotenv

# Pull settings from a local .env file (if one exists) before the
# GOOGLE_API_KEY lookup below; the os.environ[...] access raises KeyError
# when the key is still unset afterwards.
load_dotenv()

# Load the raw document records. Later cells read the "page_content" and
# "metadata" keys of every record. The context manager guarantees the file
# handle is closed, and the encoding is pinned so the result does not depend
# on the platform's default locale encoding.
with open("data/docs/documents.json", encoding="utf-8") as documents_file:
    documents_json = json.load(documents_file)

# The model name is kept in its own variable because it is also written into
# the metadata of every embedded record further down.
embeddings_model_name = "models/embedding-001"
embeddings_model = GoogleGenerativeAIEmbeddings(
    model=embeddings_model_name,
    api_key=os.environ["GOOGLE_API_KEY"]
)

In [47]:
# Wrap every raw JSON record in a LangChain Document, keeping the input order.
docs = [
	Document(page_content=record["page_content"], metadata=record["metadata"])
	for record in documents_json
]

In [48]:
# Gather the text of every record (same order as documents_json) and embed
# the whole batch with a single embed_documents call.
texts = [
	record["page_content"]
	for record in documents_json
]
embeddings = embeddings_model.embed_documents(texts)

In [49]:
# Attach each embedding vector to its source record and persist the result.
#
# zip() stops at the shorter of its inputs, so a count mismatch (e.g. stale
# `embeddings` left in the kernel after `documents_json` was reloaded) would
# silently drop records from the output file. Fail loudly instead.
if len(embeddings) != len(documents_json):
	raise ValueError(
		f"Got {len(embeddings)} embeddings for {len(documents_json)} documents; "
		"re-run the embedding cell before writing the output file."
	)

embedded_docs = []
for embedding, doc_json in zip(embeddings, documents_json):
	embedding_metadata = {
		"embedding": embedding,
		# Record which model produced the vector alongside the vector itself.
		"embedding_model": embeddings_model_name,
	}
	# NOTE: no copy is made here, so the "embedding_metadata" key is also
	# added to the corresponding dict inside documents_json.
	doc_json["embedding_metadata"] = embedding_metadata
	embedded_docs.append(doc_json)

# Write all enriched records as one JSON array; the encoding is set
# explicitly so the file does not depend on the platform default.
with open("data/docs/gemini_embedded_docs.json", "w", encoding="utf-8") as f:
	json.dump(embedded_docs, f)