## Vector quantization

Most vector embeddings are stored as floating-point numbers (64-bit floats in Python). We can use **quantization** to reduce the size of the embeddings by storing each dimension with fewer bits, at the cost of some precision.

In [None]:
# Load the embeddings from openai_movies.json, a mapping of {movie: [vector]}
import json

# JSON text is normally UTF-8; pass the encoding explicitly so that the file
# decodes the same way regardless of the platform's default locale encoding.
with open('openai_movies.json', encoding='utf-8') as f:
    movies = json.load(f)

In [None]:
# Quantize all the vectors to 1 byte
import numpy as np


def quantize_vector(vec):
    """Quantize a float vector to one unsigned byte per dimension.

    Each component is shifted by +1, clipped to [0, 2], scaled by 127 and
    rounded to the nearest integer, so the result lies in [0, 254]
    (-1 -> 0, 0 -> 127, 1 -> 254).

    Args:
        vec: Array-like of numbers; components outside [-1, 1] are clipped.

    Returns:
        A numpy array of dtype uint8 with the same shape as ``vec``.
    """
    shifted = np.clip(np.asarray(vec, dtype=np.float64) + 1, 0, 2)
    # Round before casting: astype() alone truncates toward zero, which biases
    # every component downward by up to one quantization step.
    return np.rint(shifted * 127).astype(np.uint8)

# Quantize every movie embedding down to one byte per dimension.
movies_1byte = {title: quantize_vector(embedding) for title, embedding in movies.items()}

# Print the first ten quantized vectors as a quick sanity check.
for k in list(movies_1byte)[:10]:
    v = movies_1byte[k]
    print(k, v)

In [None]:
# 10 most similar movies to Moana
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

def most_similar(movie, movies):
    """Rank every movie by cosine similarity to ``movie``.

    Args:
        movie: Key of the query movie in ``movies``.
        movies: Mapping of movie title to its embedding vector. All vectors
            must have the same length.

    Returns:
        A DataFrame with columns 'movie' and 'similarity', sorted from most
        to least similar. The query movie itself is included in the ranking.

    Raises:
        KeyError: If ``movie`` is not a key of ``movies``.
    """
    movie_vec = movies[movie]
    titles = list(movies.keys())
    # One batched call scores the query against every vector at once, instead
    # of paying sklearn's input validation once per movie.
    scores = cosine_similarity([movie_vec], list(movies.values()))[0]
    closest = sorted(zip(titles, scores), key=lambda pair: pair[1], reverse=True)
    return pd.DataFrame(closest, columns=['movie', 'similarity'])

# Ten nearest neighbours of Moana using the 1-byte vectors.
most_similar('Moana', movies_1byte).head(10)

In [None]:
# Same query against the original float vectors, for comparison.
most_similar('Moana', movies).head(10)

## Binary quantization

The most extreme form of quantization is to store the embeddings as binary numbers, setting each dimension to 0 or 1, based on a threshold.

In [None]:
def quantize_vector(vec, threshold):
    """Binarize a vector: 1 where a component exceeds ``threshold``, else 0.

    Note: this definition replaces the earlier one-argument
    ``quantize_vector`` for the rest of the kernel session.

    Args:
        vec: Array-like of numbers.
        threshold: Cut-off compared against each component (strictly
            greater-than).

    Returns:
        A numpy int8 array of 0s and 1s with the same shape as ``vec``
        (one byte per component, not packed bits).
    """
    above_threshold = np.array(vec) > threshold
    return above_threshold.astype(np.int8)

# Use a single scalar threshold: the mean of the per-dimension means taken
# across all movie vectors.
per_dimension_mean = np.mean(list(movies.values()), axis=0)
mean_vec = np.mean(per_dimension_mean)

# Binarize every movie embedding against that threshold.
movies_1bit = {title: quantize_vector(embedding, mean_vec) for title, embedding in movies.items()}

# Print the first ten binarized vectors as a quick sanity check.
for k in list(movies_1bit)[:10]:
    v = movies_1bit[k]
    print(k, v)

In [None]:
# Ten nearest neighbours of Moana using the binarized vectors.
most_similar('Moana', movies_1bit).head(10)

## Visualizing the quantization

In [None]:
# make a chart of a single vector
import matplotlib.pyplot as plt

# Draw one bar per embedding dimension of the Moana vector.
vector = movies['Moana']
ax = plt.gca()
ax.bar(range(len(vector)), vector)
ax.set(xlabel='Dimension', ylabel='Value')
plt.show()

In [None]:
# Histogram of the component values of a single vector
fig, ax = plt.subplots()
ax.hist(movies['Moana'])
# Label the figure so it can be read on its own when the notebook is skimmed.
ax.set_xlabel('Value')
ax.set_ylabel('Number of dimensions')
ax.set_title('Distribution of values in the Moana embedding')
plt.show()

In [None]:
# Compare the 1-byte quantized values against the original floats,
# one point per dimension.
ax = plt.gca()
ax.scatter(movies_1byte['Moana'], movies['Moana'])
ax.set(xlabel='int8', ylabel='float', title='int8 vs float')
plt.show()

In [None]:
# Compare the binarized values against the 1-byte values,
# one point per dimension.
ax = plt.gca()
ax.scatter(movies_1bit['Moana'], movies_1byte['Moana'])
ax.set(xlabel='1 bit', ylabel='1 byte', title='1 bit vs 1 byte')
plt.show()

## Size comparison

In [None]:
import sys

# Compare payload sizes in bytes. sys.getsizeof() is misleading here: for the
# Python list it counts only the list object itself (not the float objects it
# refers to), and the thresholded vector stores one int8 per dimension, so it
# would report the same size as the 1-byte vector. Measure the raw data
# buffers instead and pack the 0/1 values into real bits.
moana_float64 = np.asarray(movies['Moana'], dtype=np.float64)
moana_packed_bits = np.packbits(movies_1bit['Moana'])

pd.DataFrame({
    'float64': [moana_float64.nbytes],
    'int8': [movies_1byte['Moana'].nbytes],
    'int1': [moana_packed_bits.nbytes],
})

In [None]:
# Compression ratio of float64 storage to 1-byte storage, measured on the raw
# data bytes rather than on Python object sizes (sys.getsizeof of a list does
# not include the floats it refers to).
np.asarray(movies['Moana'], dtype=np.float64).nbytes / movies_1byte['Moana'].nbytes