### reference
https://cookbook.openai.com/examples/semantic_text_search_using_embeddings

### openai api connection

In [50]:
from dotenv import dotenv_values
from openai import OpenAI
import json

# Read settings from the relative path "../.env" via python-dotenv.
envs = dotenv_values("../.env")
# Build the API client from the OPENAI_API_KEY entry of that file.
# NOTE: only the OpenAI class is imported above, so the name `openai` refers to
# this client instance (not the openai package); get_embedding uses it as such.
openai = OpenAI(api_key = envs["OPENAI_API_KEY"])



### cosine similarity function

In [61]:
from scipy.spatial.distance import cosine
def cosine_similarity(a, b):
    """Return the cosine similarity of vectors ``a`` and ``b``.

    SciPy's ``cosine`` yields a cosine *distance*; similarity is its
    complement, ``1 - distance``.
    """
    distance = cosine(a, b)
    return 1 - distance

1.0

### get embedding function

In [64]:
import tiktoken

def get_embedding(text, encoding = "cl100k_base", model = "text-embedding-3-small"):
    """Return the embedding of ``text`` produced by ``model``.

    The text is tokenized locally with the tiktoken encoding named by
    ``encoding``; the resulting token ids (not the raw string) are sent to the
    embeddings endpoint of the module-level ``openai`` client.

    Args:
        text: String to embed.
        encoding: Name of the tiktoken encoding used to tokenize ``text``.
        model: Embedding model name passed to the API.

    Returns:
        The ``embedding`` attribute of the first item in the API response.
    """
    tokenizer = tiktoken.get_encoding(encoding)
    token_ids = tokenizer.encode(text)
    response = openai.embeddings.create(input=token_ids, model=model)
    return response.data[0].embedding

# Smoke test: embed a short string. As the cell's last expression, the returned
# vector is displayed in full below; every run issues a request through the client.
get_embedding("test")

[-0.009877119213342667,
 0.0015331337926909328,
 0.015642808750271797,
 -0.05476367473602295,
 -0.006405937951058149,
 -0.012950307689607143,
 0.009662549011409283,
 -0.013552486896514893,
 0.028683098033070564,
 0.007862933911383152,
 0.031811658293008804,
 -0.006544370204210281,
 0.0038587902672588825,
 0.010153982788324356,
 0.01405776385217905,
 0.04465814307332039,
 -0.059802599251270294,
 -0.002439863048493862,
 -0.05110907182097435,
 0.03651834651827812,
 0.03610305115580559,
 0.0244193933904171,
 0.03995145857334137,
 -0.04355068877339363,
 0.03455261141061783,
 -0.01956043392419815,
 -0.013981626369059086,
 0.01061773020774126,
 0.031202558428049088,
 -0.04155726730823517,
 0.058805886656045914,
 -0.028876902535557747,
 -0.001364419818855822,
 -0.03873325511813164,
 0.05517897009849548,
 0.00413911510258913,
 0.023574959486722946,
 0.018854429945349693,
 -0.0056687877513468266,
 -0.003966074902564287,
 -0.04144652187824249,
 -0.045018065720796585,
 0.011351418681442738,
 0.027

In [56]:
import pandas as pd
import numpy as np
from ast import literal_eval

# Relative path of the CSV holding the reviews and their embeddings.
datafile_path = "../data/result_embeddings.csv"

df = pd.read_csv(datafile_path)
# Each "embedding" cell is passed through literal_eval (i.e. the CSV stores it
# as text), and the parsed value is then wrapped in a NumPy array.
df["embedding"] = df.embedding.apply(literal_eval).apply(np.array)
# Last expression of the cell: display the loaded frame.
df

Unnamed: 0,Id,ProductId,UserId,Score,Summary,Text,combined,n_tokens,embedding
0,3,B000LQOCH0,ABXLMWJIXXAIN,4,"""Delight"" says it all",This is a confection that has been around a fe...,"Title: ""Delight"" says it all; Content: This is...",137,"[0.019770847633481026, -0.03219044208526611, -..."
1,1,B001E4KFG0,A3SGXH7AUHU8GW,5,Good Quality Dog Food,I have bought several of the Vitality canned d...,Title: Good Quality Dog Food; Content: I have ...,64,"[0.02088439092040062, -0.0002246303338324651, ..."
2,4,B000UA0QIQ,A395BORC6FGVXV,2,Cough Medicine,If you are looking for the secret ingredient i...,Title: Cough Medicine; Content: If you are loo...,59,"[-0.013130724430084229, -0.02155955694615841, ..."
3,9,B000E7L2R4,A1MZYO9TZK0BBI,5,Yay Barley,Right now I'm mostly just sprouting this so my...,Title: Yay Barley; Content: Right now I'm most...,41,"[-0.028889616951346397, 0.005978284869343042, ..."
4,8,B006K2ZZ7K,A3JRGQVEQN31IQ,5,"Wonderful, tasty taffy",This taffy is so good. It is very soft and ch...,"Title: Wonderful, tasty taffy; Content: This t...",47,"[0.011754157952964306, -0.04340575635433197, -..."
5,7,B006K2ZZ7K,A1SP2KVKFXXRU1,5,Great! Just as good as the expensive brands!,This saltwater taffy had great flavors and was...,Title: Great! Just as good as the expensive b...,84,"[0.006900141015648842, -0.029931161552667618, ..."
6,6,B006K2ZZ7K,ADT0SRK1MGOEU,4,Nice Taffy,I got a wild hair for taffy and ordered this f...,Title: Nice Taffy; Content: I got a wild hair ...,110,"[0.03261734917759895, -0.009279564023017883, -..."
7,2,B00813GRG4,A1D87F6ZCVE5NK,1,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,Title: Not as Advertised; Content: Product arr...,51,"[-0.004459121264517307, 0.0007839715690352023,..."
8,5,B006K2ZZ7K,A1UQRSCLF8GW1T,5,Great taffy,Great taffy at a great price. There was a wid...,Title: Great taffy; Content: Great taffy at a ...,50,"[0.005799959879368544, -0.05150648206472397, -..."
9,10,B00171APVA,A21BT40VZCCYT4,5,Healthy Dog Food,This is a very healthy dog food. Good for thei...,Title: Healthy Dog Food; Content: This is a ve...,37,"[0.005538227967917919, -0.029609257355332375, ..."


In [106]:
# search through the reviews for a specific product
def search_reviews(df, product_description, n=3, pprint=True):
    """Rank reviews by similarity to a query and return the top matches.

    The query is embedded with ``get_embedding`` and compared against every
    entry of ``df.embedding`` using ``cosine_similarity``.

    Side effect: a "similarity" column is written to the ``df`` object that
    was passed in (added, or overwritten if it already exists). The row order
    of the caller's frame is not changed.

    Args:
        df: DataFrame with an "embedding" column (one vector per row) and a
            "combined" column of text containing "Title: " / "; Content:"
            markers.
        product_description: Query text to search for.
        n: Maximum number of matches to return.
        pprint: When true, print each match followed by its similarity score
            and a blank line.

    Returns:
        Series holding the "combined" text of the ``n`` most similar rows,
        highest similarity first, with "Title: " removed and "; Content:"
        replaced by ": ".
    """
    query_embedding = get_embedding(
        product_description,
        encoding="cl100k_base"
    )
    df["similarity"] = df.embedding.apply(
        lambda review_embedding: cosine_similarity(review_embedding, query_embedding)
    )

    # Sort once and keep the selected rows, so the printed scores are read
    # from exactly the rows that produced `results`.
    top_matches = df.sort_values("similarity", ascending=False).head(n)
    results = (
        top_matches.combined
        # regex=False: both patterns are literal text, independent of the
        # pandas-version-specific default for `regex`.
        .str.replace("Title: ", "", regex=False)
        .str.replace("; Content:", ": ", regex=False)
    )

    if pprint:
        for text, score in zip(results, top_matches["similarity"]):
            print(text)
            print("similarity: ", score)
            print()
    return results

# Example query: request up to 10 matches for "great for Cough". pprint is left
# at its default (True), so each match and its similarity score are printed below.
results = search_reviews(df, "great for Cough", n=10)


Cough Medicine:  If you are looking for the secret ingredient in Robitussin I believe I have found it.  I got this in addition to the Root Beer Extract I ordered (which was good) and made some cherry soda.  The flavor is very medicinal.
similarity:  0.4995771254280522

Wonderful, tasty taffy:  This taffy is so good.  It is very soft and chewy.  The flavors are amazing.  I would definitely recommend you buying it.  Very satisfying!!
similarity:  0.2320653938622037

Great taffy:  Great taffy at a great price.  There was a wide assortment of yummy taffy.  Delivery was very quick.  If your a taffy lover, this is a deal.
similarity:  0.22841301966862804

"Delight" says it all:  This is a confection that has been around a few centuries.  It is a light, pillowy citrus gelatin with nuts - in this case Filberts. And it is cut into tiny squares and then liberally coated with powdered sugar.  And it is a tiny mouthful of heaven.  Not too chewy, and very flavorful.  I highly recommend this yummy t