# Embeddings

In [None]:
#!pip install mixedbread-ai

In [11]:
from mixedbread_ai.client import MixedbreadAI
from dotenv import load_dotenv
import os
import numpy as np
import json
import pandas as pd

## Set up the embeddings model

In [2]:
# Run python-dotenv's loader first (expected to populate the process
# environment from a local .env file), then read the two settings the
# notebook needs.
load_dotenv()
MIXEDBREAD_API_KEY = os.environ.get('MIXEDBREAD_API_KEY')
DATA_DIRECTORY = os.environ.get('DATA_DIRECTORY')

# Parse the chunked textbook data from the data directory.
chunks_path = f'{DATA_DIRECTORY}/chunks.json'
with open(chunks_path, mode='r', encoding='utf-8') as chunks_file:
    textbook_chunks = json.loads(chunks_file.read())

In [3]:
# Shared API client used by every embedding request in this notebook.
mxbai = MixedbreadAI(api_key=MIXEDBREAD_API_KEY)


def get_embeddings(queries):
    """Embed ``queries`` with the ``mixedbread-ai/mxbai-embed-large-v1`` model.

    Args:
        queries: The text(s) to embed; forwarded unchanged as the ``input``
            of the embeddings request.

    Returns:
        A NumPy array built from the ``embedding`` attribute of every item
        in the response's ``data``, in response order.
    """
    res = mxbai.embeddings(
        model='mixedbread-ai/mxbai-embed-large-v1',
        input=queries,
        normalized=True,
        encoding_format='float',
        truncation_strategy='start',
    )

    # Iterate the response items directly rather than indexing by position.
    return np.array([item.embedding for item in res.data])

In [4]:
def dot_product_similarity(embedding1, embedding2):
    """Return the dot product of the inputs divided by the product of their norms.

    For two vectors this is their cosine similarity. Norms are computed with
    ``np.linalg.norm`` using its default arguments.
    """
    numerator = np.dot(embedding1, embedding2)
    denominator = np.linalg.norm(embedding1) * np.linalg.norm(embedding2)
    return numerator / denominator

## Embed each chunk

In [6]:
# Build a structure parallel to ``textbook_chunks`` in which every chapter's
# list of text chunks is replaced by the list of embedding vectors returned
# for those chunks. One embedding request is made per chapter.
embedded_chunks = {}

for book_name, book_info in textbook_chunks.items():
    # Copy the book metadata; missing fields fall back to empty defaults.
    embedded_chapters = []
    embedded_chunks[book_name] = {
        "authors": book_info.get("authors", []),
        "year": book_info.get("year", ""),
        "chapters": embedded_chapters,
    }

    for chapter in book_info['chapters']:
        for chapter_number, chunks in chapter.items():
            # Progress line, one per chapter.
            print(f"{book_name}, Chapter {chapter_number}, {len(chunks)} chunks")
            embeddings = get_embeddings(chunks)
            # Convert the array to nested lists so it can be serialized as JSON.
            embedded_chapters.append({chapter_number: embeddings.tolist()})

# Persist the embeddings next to the source chunks.
output_path = f'{DATA_DIRECTORY}/embedded_chunks.json'
with open(output_path, mode='w', encoding='utf-8') as output_file:
    json.dump(embedded_chunks, output_file, ensure_ascii=False, indent=4)

Deep Learning, Chapter 5, 51 chunks
Deep Learning, Chapter 6, 66 chunks
Deep Learning, Chapter 8, 29 chunks
Deep Learning, Chapter 9, 9 chunks
Deep Learning, Chapter 14, 2 chunks
Stanford CS229, Chapter 5, 13 chunks
Stanford CS229, Chapter 6, 23 chunks
Stanford CS229, Chapter 7, 26 chunks
Stanford CS229, Chapter 8, 10 chunks
Stanford CS229, Chapter 9, 11 chunks
Stanford CS229, Chapter 11, 2 chunks
Math for Machine Learning, Chapter 3, 13 chunks
Math for Machine Learning, Chapter 4, 47 chunks
Math for Machine Learning, Chapter 5, 7 chunks
Math for Machine Learning, Chapter 6, 14 chunks
Math for Machine Learning, Chapter 7, 15 chunks
Math for Machine Learning, Chapter 12, 34 chunks
The Elements of Statistical Learning, Chapter 7.10, 11 chunks
The Elements of Statistical Learning, Chapter 8.2, 9 chunks
The Elements of Statistical Learning, Chapter 8.3, 6 chunks
The Elements of Statistical Learning, Chapter 12, 27 chunks
An Introduction to Statistical Learning, Chapter 6.2, 19 chunks
An In

## Convert to CSV

In [15]:
# Reload both JSON files so this cell can run without re-embedding.
chunks_path = f'{DATA_DIRECTORY}/chunks.json'
embeddings_path = f'{DATA_DIRECTORY}/embedded_chunks.json'

with open(chunks_path, mode='r', encoding='utf-8') as file:
    textbook_chunks = json.load(file)
with open(embeddings_path, mode='r', encoding='utf-8') as file:
    embedded_chunks = json.load(file)

# Flatten book -> chapter -> chunk into one row per chunk, pairing each chunk
# with the embedding stored at the same position in the embeddings file.
data = []
for book_name, book_info in textbook_chunks.items():
    for chapter_index, chapter in enumerate(book_info['chapters']):
        for chapter_number, chunks in chapter.items():
            embeddings = embedded_chunks[book_name]['chapters'][chapter_index][chapter_number]
            # Ids are 1-based and continue across chapters and books.
            first_id = len(data) + 1
            for chunk_index, chunk in enumerate(chunks):
                row = {
                    'id': first_id + chunk_index,
                    'embedding': embeddings[chunk_index],
                    'content': chunk,
                    'textbook': book_name,
                    'chapter': chapter_number,
                }
                data.append(row)

df = pd.DataFrame(data)
csv_path = f'{DATA_DIRECTORY}/data.csv'
df.to_csv(csv_path, index=False)