#### Referencse
1. https://platform.openai.com/docs/guides/embeddings/use-cases
2. https://github.com/openai/openai-cookbook/tree/main/
3. https://cookbook.openai.com/examples/get_embeddings_from_dataset

#### Imports

In [1]:
# imports
import pandas as pd
import tiktoken

from utils.embeddings_utils import get_embedding
from keys.keys import OPENAI_KEY

import openai


In [2]:
# Set up your API credentials
openai.api_key = OPENAI_KEY

In [3]:
# embedding model parameters
embedding_model = "text-embedding-ada-002"
embedding_encoding = "cl100k_base"  # this the encoding for text-embedding-ada-002
max_tokens = 8000  # the maximum for text-embedding-ada-002 is 8191

#### Load Dataset

In [4]:
# load & inspect dataset
input_datapath = "/home/narayan/data/amazon_fine_food_reviews/Reviews.csv"  # to save space, we provide a pre-filtered dataset
df = pd.read_csv(input_datapath, index_col=0)

df = df[["Time", "ProductId", "UserId", "Score", "Summary", "Text"]]
df = df.dropna()
df["combined"] = ("Title: " + df.Summary.str.strip() + "; Content: " + df.Text.str.strip())

df.head(2)

Unnamed: 0_level_0,Time,ProductId,UserId,Score,Summary,Text,combined
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,1303862400,B001E4KFG0,A3SGXH7AUHU8GW,5,Good Quality Dog Food,I have bought several of the Vitality canned d...,Title: Good Quality Dog Food; Content: I have ...
2,1346976000,B00813GRG4,A1D87F6ZCVE5NK,1,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,Title: Not as Advertised; Content: Product arr...


In [5]:
# subsample to 1k most recent reviews and remove samples that are too long
top_n = 1000
df = df.sort_values("Time").tail(top_n * 2)  # first cut to first 2k entries, assuming less than half will be filtered out
df.drop("Time", axis=1, inplace=True)

encoding = tiktoken.get_encoding(embedding_encoding)

# omit reviews that are too long to embed
df["n_tokens"] = df.combined.apply(lambda x: len(encoding.encode(x)))
df = df[df.n_tokens <= max_tokens].tail(top_n)

print(f"df len: {len(df)}")
display(df.head(2))

df len: 1000


Unnamed: 0_level_0,ProductId,UserId,Score,Summary,Text,combined,n_tokens
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
284932,B003XPF9BO,A3R7JR3FMEBXQB,5,where does one start...and stop... with a tre...,Wanted to save some to bring to my Chicago fam...,Title: where does one start...and stop... wit...,52
220697,B003JK537S,A3JBPC3WFUT5ZP,1,Arrived in pieces,"Not pleased at all. When I opened the box, mos...",Title: Arrived in pieces; Content: Not pleased...,35


#### Get embeddings and save them for future reuse

In [11]:
# Ensure you have your API key set in your environment per the README: https://github.com/openai/openai-python#usage
df_copy = df[0:3]

# This may take a few minutes
df_copy["embedding"] = df_copy.combined.apply(lambda x: get_embedding(x, model=embedding_model))
df_copy.to_csv("data/fine_food_reviews_with_embeddings_1k.csv")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_copy["embedding"] = df_copy.combined.apply(lambda x: get_embedding(x, model=embedding_model))


In [12]:
df_copy

Unnamed: 0_level_0,ProductId,UserId,Score,Summary,Text,combined,n_tokens,embedding
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
284932,B003XPF9BO,A3R7JR3FMEBXQB,5,where does one start...and stop... with a tre...,Wanted to save some to bring to my Chicago fam...,Title: where does one start...and stop... wit...,52,"[0.007060592994093895, -0.02732112631201744, 0..."
220697,B003JK537S,A3JBPC3WFUT5ZP,1,Arrived in pieces,"Not pleased at all. When I opened the box, mos...",Title: Arrived in pieces; Content: Not pleased...,35,"[-0.023609420284628868, -0.011784634552896023,..."
107908,B000JMBE7M,AQX1N6A51QOKG,4,"It isn't blanc mange, but isn't bad . . .",I'm not sure that custard is really custard wi...,"Title: It isn't blanc mange, but isn't bad . ....",267,"[0.00016697357932571322, 0.005226491950452328,..."


#### Semantic Text Search

In [18]:
import pandas as pd
import numpy as np
from ast import literal_eval

datafile_path = "data/fine_food_reviews_with_embeddings_1k.csv"

df = pd.read_csv(datafile_path)
df["embedding"] = df.embedding.apply(literal_eval).apply(np.array)

df.head()


Unnamed: 0,Id,ProductId,UserId,Score,Summary,Text,combined,n_tokens,embedding
0,284932,B003XPF9BO,A3R7JR3FMEBXQB,5,where does one start...and stop... with a tre...,Wanted to save some to bring to my Chicago fam...,Title: where does one start...and stop... wit...,52,"[0.007060592994093895, -0.02732112631201744, 0..."
1,220697,B003JK537S,A3JBPC3WFUT5ZP,1,Arrived in pieces,"Not pleased at all. When I opened the box, mos...",Title: Arrived in pieces; Content: Not pleased...,35,"[-0.023609420284628868, -0.011784634552896023,..."
2,107908,B000JMBE7M,AQX1N6A51QOKG,4,"It isn't blanc mange, but isn't bad . . .",I'm not sure that custard is really custard wi...,"Title: It isn't blanc mange, but isn't bad . ....",267,"[0.00016697357932571322, 0.005226491950452328,..."


In [19]:
from utils.embeddings_utils import get_embedding, cosine_similarity

# search through the reviews for a specific product
def search_reviews(df, product_description, n=3, pprint=True):
    product_embedding = get_embedding(
        product_description,
        model="text-embedding-ada-002"
    )
    df["similarity"] = df.embedding.apply(lambda x: cosine_similarity(x, product_embedding))

    results = (
        df.sort_values("similarity", ascending=False)
        .head(n)
        .combined.str.replace("Title: ", "")
        .str.replace("; Content:", ": ")
    )
    if pprint:
        for r in results:
            print(r[:200])
            print()
    return results


results = search_reviews(df, "delicious beans", n=3)


where does one  start...and stop... with a treat like this:  Wanted to save some to bring to my Chicago family but my North Carolina family ate all 4 boxes before I could pack. These are excellent...c

It isn't blanc mange, but isn't bad . . .:  I'm not sure that custard is really custard without eggs.  But this comes close.  I got it for use in a "Vegan pancake" recipe.  We were having houseguests 

Arrived in pieces:  Not pleased at all. When I opened the box, most of the rings were broken in pieces. A total waste of money.



#### Vector Search
https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/embeddings/vector-search-quickstart.ipynb

https://cloud.google.com/vertex-ai/docs/vector-search/quickstart

https://cloud.google.com/vertex-ai/docs/generative-ai/embeddings/get-text-embeddings#generative-ai-get-text-embedding-python_vertex_ai_sdk

https://stackoverflow.com/questions/76505538/how-to-deploy-chroma-database-vector-database-in-production


https://www.datacamp.com/blog/the-top-5-vector-databases

https://www.kdnuggets.com/an-honest-comparison-of-open-source-vector-databases

https://coinsbench.com/experimenting-with-vector-databases-chromadb-pinecone-weaviate-and-pgvector-0f35c0356540

https://blog.apify.com/pinecone-alternatives/

https://github.com/openai/openai-cookbook/blob/main/examples/vector_databases/weaviate/getting-started-with-weaviate-and-openai.ipynb


https://github.com/openai/openai-cookbook/blob/main/examples/vector_databases/weaviate/Using_Weaviate_for_embeddings_search.ipynb

In [11]:
from utils.embeddings_utils import get_embedding, cosine_similarity

cosine_similarity(get_embedding("Walked In", model=embedding_model), 
                  get_embedding("Walk-in", model=embedding_model))
# 0.9118788322167249

cosine_similarity(get_embedding("Walked In", model=embedding_model), 
                  get_embedding("Ambulatory (Walk-in)", model=embedding_model))
# 0.8491000636559728

cosine_similarity(get_embedding("Ambulatory", model=embedding_model), 
                  get_embedding("Ambulance", model=embedding_model))
# 0.8544654641876922

cosine_similarity(get_embedding("Amb- UC Health EMS", model=embedding_model), 
                  get_embedding("Ambulance", model=embedding_model))
# 0.8819070813968349

cosine_similarity(get_embedding("Lifecare EMS", model=embedding_model), 
                  get_embedding("Life EMS", model=embedding_model))
# 0.9294797597346736

cosine_similarity(get_embedding("Walk-in", model=embedding_model), 
                  get_embedding("Life EMS", model=embedding_model))
# 0.7650713968650132

cosine_similarity(get_embedding("hello", model=embedding_model), 
                  get_embedding("Life EMS", model=embedding_model))
# 0.7662688073763755


0.8819070813968349

0.7662688073763755