In [1]:
import os
import re
from typing import Sequence

import pandas as pd
import torch
from tqdm.auto import tqdm
from transformers import AutoModel, AutoTokenizer

In [2]:
# Disable parallelized tokenization to avoid warnings.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [3]:
model_name = "Snowflake/snowflake-arctic-embed-xs"
embedding_dim = 384
query_prefix = "Represent this sentence for searching relevant passages: "
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, add_pooling_layer=False)
model.eval()
model = model.to("mps")  # Use Mac hardware acceleration.



In [4]:
# Example.
queries  = ['what is snowflake?', 'Where can I get the best tacos?']
documents = ['The Data Cloud!', 'Mexico City of Course!']
query_embeddings = embed(queries, is_query=True)
document_embeddings = embed(documents)

scores = torch.mm(query_embeddings, document_embeddings.transpose(0, 1))
for query, query_scores in zip(queries, scores):
    doc_score_pairs = list(zip(documents, query_scores.tolist()))
    doc_score_pairs = sorted(doc_score_pairs, key=lambda x: x[1], reverse=True)
    # Output passages & scores.
    print("Query:", query)
    for document, score in doc_score_pairs:
        print(f"{score:.4f} {document}")

NameError: name 'embed' is not defined

# Pokemon time

In [None]:
df = pd.read_csv("all_the_pokemon.csv")
docs = df["description"].tolist()
df.head()

In [None]:
print(docs[500])

In [None]:
description_wc = pd.Series(len(re.split("\W+", doc)) for doc in docs)
ax = description_wc.plot.hist(bins=60)
ax.set_xlabel("Description Word Count")

In [None]:
batch_size = 16
embeddings_list = []
with tqdm(total=len(docs), desc="Embedding The Pokemon!", unit="doc", smoothing=0) as pbar:
    for start in range(0, len(docs), batch_size):
        end = start + batch_size
        batch = docs[start:end]
        embeddings = embed(batch)
        embeddings_list.append(embeddings)
        pbar.update(len(batch))
embeddings = torch.cat(embeddings_list)

In [None]:
q_emb = embed(["scary monster"], is_query=True)

In [None]:
scores = (q_emb @ embeddings.T).squeeze()

In [None]:
topk = torch.topk(scores, 15)
topk_scores = topk.values.cpu().numpy()
topk_ind = topk.indices.cpu().numpy()

In [None]:
results = df.iloc[topk_ind, :].copy()
results["score"] = topk_scores
results