In [1]:
import polars as pl
from sentence_transformers import SentenceTransformer, util
import chromadb
from chromadb.utils import embedding_functions
import os
import pinecone
from tqdm.auto import tqdm

In [2]:
# Load the sentence-embedding model once; get_embeddings() and the query cells
# further down read this global.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

caused by: ["[Errno 2] The file to load file system plugin from does not exist.: '/Users/mmenendezg/Developer/Platzi/.venv/lib/python3.11/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so'"]
caused by: ["dlopen(/Users/mmenendezg/Developer/Platzi/.venv/lib/python3.11/site-packages/tensorflow_io/python/ops/libtensorflow_io.so, 0x0006): tried: '/Users/mmenendezg/Developer/Platzi/.venv/lib/python3.11/site-packages/tensorflow_io/python/ops/libtensorflow_io.so' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/Users/mmenendezg/Developer/Platzi/.venv/lib/python3.11/site-packages/tensorflow_io/python/ops/libtensorflow_io.so' (no such file), '/Users/mmenendezg/Developer/Platzi/.venv/lib/python3.11/site-packages/tensorflow_io/python/ops/libtensorflow_io.so' (no such file)"]


In [3]:
def get_embeddings(text):
    """Encode ``text`` with the notebook-level ``model``.

    The value returned by ``model.encode`` is converted with ``.tolist()`` so
    the caller receives plain Python lists/floats rather than an array object.
    """
    encoded = model.encode(text)
    return encoded.tolist()

In [4]:
# Build the movies frame lazily and materialise it once with collect().
df = (
    pl.scan_csv("../data/imdb_top_1000.csv")
    # Drop the column whose header is the empty string
    # (presumably a saved index column — confirm against the CSV).
    .drop("")
    # Add a running row number; it becomes the record id.
    .with_row_count("ids")
    .with_columns(
        # Ids are cast to strings because they are later sent as string ids.
        pl.col("ids").cast(pl.Utf8),
        # One text field per movie: "Title Description Cast".
        pl.format("{} {} {}", "Title", "Description", "Cast").alias("text"),
    )
    # Encode each row's text individually through get_embeddings().
    .with_columns(pl.col("text").apply(get_embeddings).alias("embeddings"))
    .collect()
)

# Chroma

In [None]:
# Read the OpenAI key from the environment; a missing variable raises KeyError.
openai_api_key = os.environ["OPENAI_API_KEY"]

# OpenAI-backed embedding function. The model id is written with hyphens only:
# "text-embedding-ada-002" (an underscore after "text" is not a valid model id).
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
    api_key=openai_api_key, model_name="text-embedding-ada-002"
)

# Local Sentence-Transformers embedding function; this is the one handed to the
# Chroma collections in the cells below.
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="all-MiniLM-L6-v2"
)

In [None]:
# Default (non-persistent) client; not used by the cells visible below.
chroma_client = chromadb.Client()

# On-disk client: collections created through it live under ../data/data-embeddings.
client_persistent = chromadb.PersistentClient(path="../data/data-embeddings")

# get_or_create_collection keeps this cell re-runnable. Because the collection
# is persisted on disk, a plain create_collection fails on every run after the
# first one (the name already exists).
db = client_persistent.get_or_create_collection(
    name="movies_db", embedding_function=sentence_transformer_ef
)

In [None]:
# Every column except the id, the vector and the concatenated text is stored as
# per-movie metadata. Nulls in Certificate / Metascore are replaced with 0 first.
# NOTE(review): presumably because Chroma rejects null metadata values — confirm.
# NOTE(review): Certificate looks like a string column ("R", "PG-13"); filling it
# with the integer 0 relies on polars' casting rules — confirm this is intended.
movie_metadata = (
    df.drop(["ids", "embeddings", "text"])
    .with_columns(
        [pl.col("Certificate").fill_null(0), pl.col("Metascore").fill_null(0)]
    )
    .to_dicts()
)

# The vectors were precomputed in df["embeddings"], so they are passed explicitly.
db.add(
    ids=df["ids"].to_list(),
    embeddings=df["embeddings"].to_list(),
    metadatas=movie_metadata,
)

In [None]:
# Peek at a single stored record to confirm the insert worked.
db.peek(1)

## Chroma Embeddings

In [None]:
# Second collection, filled from raw documents instead of precomputed vectors.
# get_or_create_collection keeps the cell re-runnable: the collection is
# persisted on disk, so create_collection fails once it already exists.
db_no_embeddings = client_persistent.get_or_create_collection(
    name="movies_db_no_embeddings", embedding_function=sentence_transformer_ef
)

In [None]:
# Metadata is every column except the id, the vector and the concatenated text,
# with nulls in Certificate / Metascore replaced by 0.
# NOTE(review): presumably because Chroma rejects null metadata values — confirm.
document_metadata = (
    df.drop(["ids", "embeddings", "text"])
    .with_columns(
        [pl.col("Certificate").fill_null(0), pl.col("Metascore").fill_null(0)]
    )
    .to_dicts()
)

# Only the raw text is supplied here (no `embeddings` argument); the vectors are
# presumably computed by the collection's embedding function — confirm.
db_no_embeddings.add(
    ids=df["ids"].to_list(),
    documents=df["text"].to_list(),
    metadatas=document_metadata,
)

In [None]:
# Peek at a single record of the document-based collection.
db_no_embeddings.peek(1)

In [None]:
# Remove the record whose id is "0" to demonstrate deletion by id.
db_no_embeddings.delete(ids=["0"])

In [None]:
# Peek again after the delete above; id "0" should no longer be returned.
db_no_embeddings.peek(1)

## Chroma Query

In [None]:
# Free-text similarity search: one query text, three results requested.
results = db_no_embeddings.query(
    query_texts=["a history of the second world war"], n_results=3
)

In [None]:
# Metadata of the second hit (index 1) for the first — and only — query text.
results["metadatas"][0][1]

In [None]:
# List the title of every hit returned for the first query text.
for movie in results["metadatas"][0]:
    title = movie["Title"]
    print(f"->>{title}\n")

In [None]:
# Same kind of text query, restricted by a metadata filter: Rate >= 8.
results_where = db_no_embeddings.query(
    query_texts=["A history of a romance where one dies"],
    n_results=3,
    where={"Rate": {"$gte": 8}},
)

In [None]:
# List the titles of the filtered hits.
for movie in results_where["metadatas"][0]:
    title = movie["Title"]
    print(f"->>{title}\n")

In [None]:
# Combine two metadata conditions with $and: Rate >= 8 and Metascore >= 80.
results_where = db_no_embeddings.query(
    query_texts=["A history of a romance where one dies"],
    where={"$and": [{"Rate": {"$gte": 8}}, {"Metascore": {"$gte": 80}}]},
    n_results=3,
)

# List the titles of the hits that satisfy both conditions.
for movie in results_where["metadatas"][0]:
    title = movie["Title"]
    print(f"->>{title}\n")

## Load Chroma records

In [None]:
# Open a fresh client on the same directory to show that the data persisted.
client_persistent_2 = chromadb.PersistentClient(path="../data/data-embeddings/")

# Pass the same embedding function that was used when the collection was
# created; without it, text queries on db2 would be embedded with Chroma's
# default function instead of sentence_transformer_ef.
db2 = client_persistent_2.get_collection(
    "movies_db_no_embeddings", embedding_function=sentence_transformer_ef
)
db2.peek(1)

# Pinecone

In [5]:
# Read the Pinecone key from the environment; a missing variable raises KeyError.
pinecone_api_key = os.environ["PINECONE_API_KEY"]
pinecone.init(api_key=pinecone_api_key, environment="gcp-starter")

# The index dimension must match the vector length, so take it from the data.
dimension_embeddings = len(df["embeddings"][0])

# Create the index only when it is missing: create_index fails if an index with
# this name already exists, which would break every re-run of the notebook.
if "movies-embeddings" not in pinecone.list_indexes():
    pinecone.create_index(
        "movies-embeddings", dimension=dimension_embeddings, metric="cosine"
    )
index = pinecone.Index("movies-embeddings")

In [6]:
# Replace nulls in Certificate / Metascore with empty strings before the rows
# are sent to Pinecone as metadata.
# NOTE(review): presumably because Pinecone rejects null metadata values — confirm.
df = df.with_columns(
    pl.col("Certificate").fill_null(""),
    pl.col("Metascore").fill_null(""),
)

In [7]:
batch_size = 64
for i in tqdm(range(0, len(df), batch_size)):
    # Clamp the end of the window so the last batch may be shorter.
    i_end = min(i + batch_size, len(df))
    batch = df[i:i_end]

    # Pull the ids, the precomputed vectors and the remaining columns
    # (as metadata dicts) out of the current slice.
    ids = batch["ids"].to_list()
    emb = batch["embeddings"].to_list()
    metadata = batch.drop(["ids", "embeddings", "text"]).to_dicts()

    # One (id, vector, metadata) tuple per row; upsert inserts or overwrites.
    to_upsert = list(zip(ids, emb, metadata))
    _ = index.upsert(to_upsert)

# NOTE(review): the count shown here may lag right after an upsert — confirm.
index.describe_index_stats()

  0%|          | 0/16 [00:00<?, ?it/s]

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [8]:
# Fetch the vector stored under id "0" together with its metadata.
index.fetch(["0"])

{'namespace': '',
 'vectors': {'0': {'id': '0',
                   'metadata': {'Cast': 'Director: Frank Darabont | Stars: Tim '
                                        'Robbins, Morgan Freeman, Bob Gunton, '
                                        'William Sadler',
                                'Certificate': 'R',
                                'Description': 'Two imprisoned men bond over a '
                                               'number of years, finding '
                                               'solace and eventual redemption '
                                               'through acts of common '
                                               'decency.',
                                'Duration': '142 min',
                                'Genre': 'Drama',
                                'Info': 'Votes: 2,295,987 | Gross: $28.34M',
                                'Metascore': '80',
                                'Rate': 9.3,
                                'T

In [9]:
# Delete id "0", then fetch it again to confirm it is gone.
index.delete(ids=["0"])
index.fetch(["0"])

{'namespace': '', 'vectors': {}}

## Pinecone Query

In [10]:
# Embed the query with the model already loaded at the top of the notebook
# (through get_embeddings) instead of instantiating SentenceTransformer again.
query = "A world war I story"
query_vector = get_embeddings(query)

# Three nearest vectors, with the stored metadata included in each match.
responses = index.query(vector=query_vector, top_k=3, include_metadata=True)

In [11]:
# Display the raw query response.
responses

{'matches': [], 'namespace': ''}

## Pinecone Filters

In [12]:
# Embed the query text with the shared helper (model.encode(...).tolist()).
query = "a science fiction movie about the space"
query_vector = get_embeddings(query)

In [13]:
# Vector query restricted by a metadata filter on Genre.
# NOTE(review): $ne appears to compare against the whole Genre string, so only
# records whose Genre is exactly "Horror, Sci-Fi" would be excluded — confirm.
responses = index.query(
    vector=query_vector,
    top_k=3,
    include_metadata=True,
    filter={"Genre": {"$ne": "Horror, Sci-Fi"}},
)

In [14]:
# Display the filtered query response.
responses

{'matches': [], 'namespace': ''}

## Load Index

In [16]:
# Reconnect to the existing index by name, as a new session would do.
pinecone.init(api_key=pinecone_api_key, environment="gcp-starter")
index_2 = pinecone.Index("movies-embeddings")

# Embed the query text with the shared helper (model.encode(...).tolist()).
query = "An space journey movie"
query_vector = get_embeddings(query)

# Three nearest vectors with metadata, using the same Genre filter as before.
responses = index_2.query(
    filter={"Genre": {"$ne": "Horror, Sci-Fi"}},
    include_metadata=True,
    top_k=3,
    vector=query_vector,
)

In [17]:
# Display the matches returned through the re-opened index handle.
responses

{'matches': [{'id': '20',
              'metadata': {'Cast': 'Director: Christopher Nolan | Stars: '
                                   'Matthew McConaughey, Anne Hathaway, '
                                   'Jessica Chastain, Mackenzie Foy',
                           'Certificate': 'PG-13',
                           'Description': 'A team of explorers travel through '
                                          'a wormhole in space in an attempt '
                                          "to ensure humanity's survival.",
                           'Duration': '169 min',
                           'Genre': 'Adventure, Drama, Sci-Fi',
                           'Info': 'Votes: 1,468,447 | Gross: $188.02M',
                           'Metascore': '74',
                           'Rate': 8.6,
                           'Title': '21. Interstellar (2014)'},
              'score': 0.620471,
              'values': []},
             {'id': '427',
              'metadata': {'Cast': 'Directo