In [1]:
# ! pip install pandas
# ! pip install pinecone
# ! pip install openai
# ! pip install python-dotenv

In [7]:
from pinecone import Pinecone, ServerlessSpec
import os
from openai import OpenAI
import pandas as pd
from time import time
import dotenv
dotenv.load_dotenv()
from together import Together

In [8]:
# Credentials and settings are read from the environment; each is None when unset.
token = os.environ.get("TOGETHER_API_KEY")
# open_ai_base_url = os.getenv("RUNPOD_EMBEDDING_URL") 
model_name = os.environ.get("MODEL_NAME")
pinecone_api_key = os.environ.get("PINECONE_API_KEY")

In [None]:
# Pinecone client, authenticated with the key read from the environment.
pc = Pinecone(api_key=pinecone_api_key)

# Together client; used below for every embeddings request.
client = Together(api_key=token)

## Try out embeddings

In [15]:
# Smoke test: embed one short string and print the raw vector.
output = client.embeddings.create(
    model="togethercomputer/m2-bert-80M-32k-retrieval",
    input=["helloo there"],
)
embedings = output.data[0].embedding
print(embedings)

[-0.0125544425, -0.044113155, -0.03084596, -0.06244851, -0.019202882, -0.0044565094, 0.057110425, -0.07059755, -0.06669056, -0.031806115, -0.0130010955, -0.035977688, -0.013222456, -0.036191795, 0.06799341, 0.060141355, -0.05900432, 0.027184704, -0.07414323, -0.023034494, -0.016627861, 0.05380352, 0.013887607, -0.036713827, -0.0025265834, -0.030425189, -0.0155652035, 0.06724184, 0.055961568, -0.024285026, 0.036119003, 0.060119923, 0.006208163, 0.033985604, -0.030067516, -0.0056329626, 0.041995272, 0.02114107, 0.04770768, 0.039945636, 0.02771009, -0.0018326284, -0.038981784, -0.042519726, 0.009817259, 0.03754165, -0.003279068, 0.047843542, 0.090597466, 0.060544927, 0.030446703, -0.0005402099, 0.056377098, -0.013878529, 0.04447856, 0.05877096, -0.017111199, 0.033119705, -0.021795923, -0.04336519, -0.04623094, -0.034499742, 0.039839417, -0.01973379, 0.026241617, 0.0014741354, 0.04716746, 0.03579321, 0.026933199, 0.01813506, -0.009512409, -0.02292165, 0.038861938, 0.027368594, -0.0655203, 

In [16]:
# Length of the test embedding vector; the index created later in this
# notebook is configured with dimension=768.
len(embedings)

768

## Wrangle dataset

In [17]:
# Load the product catalogue (JSON Lines file, parsed one record per line).
df = pd.read_json("products/products.jsonl", lines=True)

In [18]:
# Preview the first two rows to check which columns were loaded.
df.head(2)

Unnamed: 0,name,category,description,ingredients,price,rating,image_path
0,Cappuccino,Coffee,A rich and creamy cappuccino made with freshly...,"[Espresso, Steamed Milk, Milk Foam]",4.5,4.7,cappuccino.jpg
1,Jumbo Savory Scone,Bakery,"Deliciously flaky and buttery, this jumbo savo...","[Flour, Butter, Cheese, Herbs, Baking Powder, ...",3.25,4.3,SavoryScone.webp


In [19]:
# Build one descriptive string per product; this is the text that gets embedded.
# Non-string columns are cast to str so they can be concatenated.
df['text'] = (
    df['name'] + " : " + df['description']
    + " -- Ingredients: " + df['ingredients'].astype(str)
    + " -- Price: " + df['price'].astype(str)
    + " -- rating: " + df['rating'].astype(str)
)

In [20]:
# Spot-check the first few combined product strings.
df['text'].head()

0    Cappuccino : A rich and creamy cappuccino made...
1    Jumbo Savory Scone : Deliciously flaky and but...
2    Latte : Smooth and creamy, our latte combines ...
3    Chocolate Chip Biscotti : Crunchy and delightf...
4    Espresso shot : A bold shot of rich espresso, ...
Name: text, dtype: object

In [21]:
# Plain Python list of documents; later cells append non-product texts to it.
texts = df['text'].to_list()

In [22]:
# Read the shop's "about us" text, prefix it with a label, and add it to the
# corpus as a single document.
with open("products/Merry's_way_about_us.txt") as about_file:
    Merry_way_about_section = (
        "Coffee shop Merry's Way about section: " + about_file.read()
    )

texts.append(Merry_way_about_section)

In [23]:
# Read the plain-text menu listing, prefix it with a label, and add it to the
# corpus as a single document.
with open("products/menu_items_text.txt") as menu_file:
    menue_items_text = "Menu Items: " + menu_file.read()

texts.append(menue_items_text)

## Generate Embeddings

In [25]:
# Embed every document in `texts` with a single request.
output = client.embeddings.create(
    model="togethercomputer/m2-bert-80M-32k-retrieval",
    input=texts,
)

In [26]:
# Result objects from the batch request; the upsert cell zips them with `texts`
# and reads each vector via `.embedding`.
# NOTE(review): presumably one entry per input text, in input order — confirm.
embeddings = output.data

## Push data to database

In [30]:
index_name = "coffeeshop"

# Create the index only when it does not exist yet, so that re-running this
# cell does not fail because the index is already there.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=768,  # length of the vectors returned by the embedding model above
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        ),
    )

In [31]:
# The notebook's import cell does `from time import time`, which binds the name
# `time` to the function time.time; calling `time.sleep` would therefore raise
# AttributeError. Import `sleep` explicitly instead.
from time import sleep

# Wait for the index to be ready
while not pc.describe_index(index_name).status['ready']:
    sleep(1)

index = pc.Index(index_name)

# Build one record per document. The id is the label before the first ":" in
# the text (the product name, or the prefix of the about/menu documents).
# Documents that share a label would end up with the same id.
vectors = []
for text, e in zip(texts, embeddings):
    entry_id = text.split(":")[0].strip()
    vectors.append({
        "id": entry_id,
        "values": e.embedding,
        "metadata": {'text': text}
    })

# Last expression of the cell, so the upsert response is displayed.
index.upsert(
    vectors=vectors,
    namespace="ns1"
)

{'upserted_count': 20}

## Get Closest documents

In [33]:
# Embed the user question with the same model that produced the stored vectors.
output = client.embeddings.create(
    model="togethercomputer/m2-bert-80M-32k-retrieval",
    input=["Is Cappuccino lactose-free?"],
)
embeding = output.data[0].embedding

In [34]:
# Fetch the three closest stored documents for the query vector, asking for
# metadata (the original text) but not the raw vector values.
results = index.query(
    vector=embeding,
    top_k=3,
    namespace="ns1",
    include_metadata=True,
    include_values=False,
)

print(results)

{'matches': [{'id': 'Espresso shot',
              'metadata': {'text': 'Espresso shot : A bold shot of rich '
                                   'espresso, our espresso is crafted from the '
                                   'finest beans to deliver a robust flavor in '
                                   'every sip. Perfect for a quick pick-me-up, '
                                   'it can also serve as a base for your '
                                   'favorite coffee drinks. -- Ingredients: '
                                   "['Espresso'] -- Price: 2.0 -- rating: 4.9"},
              'score': 0.614803553,
              'values': []},
             {'id': 'Hazelnut syrup',
              'metadata': {'text': 'Hazelnut syrup : Add a nutty flavor to '
                                   'your drinks with our hazelnut syrup, '
                                   'perfect for lattes and desserts. Its '
                                   'smooth sweetness enhances a variety of '
     