In [4]:
from pinecone import Pinecone, ServerlessSpec
from openai import OpenAI
import pandas as pd
import time
import os
import dotenv
# Load variables from a local .env file so the os.getenv() calls in the next cell can read them.
dotenv.load_dotenv()

True

In [5]:
# Connection settings are read from the environment in one pass.
# Each name is None when its variable is not set.
token, openai_base_url, model_name, pinecone_api_key, index_name = (
    os.environ.get(var_name)
    for var_name in (
        "RUNPOD_TOKEN",          # API key passed to the OpenAI client
        "RUNPOD_EMBEDDING_URL",  # base URL passed to the OpenAI client
        "MODEL_NAME",            # embedding model name
        "PINECONE_API_KEY",      # API key passed to the Pinecone client
        "PINECONE_INDEX_NAME",   # name of the Pinecone index
    )
)


In [6]:
# Embeddings client: the OpenAI SDK configured with the token and base URL read from the environment.
client = OpenAI(base_url=openai_base_url, api_key=token)

# Vector-database client.
pc = Pinecone(api_key=pinecone_api_key)

## Tryout Embeddings


In [9]:
# Smoke test: embed one sample sentence through the embeddings endpoint
# and print the resulting vector.
sample_inputs = ["hello world"]
output = client.embeddings.create(model=model_name, input=sample_inputs)
embedding = output.data[0].embedding
print(embedding)

[0.015176702290773392, -0.02265065535902977, 0.00859504658728838, -0.0742514431476593, 0.003908572718501091, 0.0027112148236483335, -0.031238075345754623, 0.044630181044340134, 0.04405056685209274, -0.007908663712441921, -0.02519790083169937, -0.03337348997592926, 0.014375921338796616, 0.04639952629804611, 0.008610299788415432, -0.016137639060616493, 0.007561658509075642, -0.019020449370145798, -0.11458028852939606, -0.018105272203683853, 0.1262945681810379, 0.029728032648563385, 0.025274166837334633, -0.0342886708676815, -0.041091494262218475, 0.006669359747320414, 0.010303379036486149, 0.022437114268541336, 0.00444242637604475, -0.12727075815200806, -0.01610713265836239, -0.020255940034985542, 0.047375716269016266, 0.011561748571693897, 0.06815025955438614, 0.00739006232470274, -0.017998501658439636, 0.04084744676947594, -0.01028049923479557, 0.02373361587524414, 0.010509294457733631, -0.028538301587104797, 0.008137458004057407, -0.015138569287955761, 0.030948270112276077, -0.0659538

In [10]:
# Vector length of the sample embedding; the Pinecone index dimension used later must match it.
len(embedding)

384

## Wrangle Dataset



In [7]:
df = pd.read_json("products/products.jsonl", lines=True) # Load the product catalogue; lines=True reads one JSON record per line

In [8]:
# Preview the first two product rows.
df.head(2)

Unnamed: 0,name,category,description,ingredients,price,rating,image_path
0,Cappuccino,Coffee,A rich and creamy cappuccino made with freshly...,"[Espresso, Steamed Milk, Milk Foam]",4.5,4.7,cappuccino.jpg
1,Jumbo Savory Scone,Bakery,"Deliciously flaky and buttery, this jumbo savo...","[Flour, Butter, Cheese, Herbs, Baking Powder, ...",3.25,4.3,SavoryScone.webp


In [9]:
# Combine name, description, ingredients, price and rating into one string per
# product; this combined string is what gets embedded later.
# Non-string columns are converted with astype(str) before concatenation.
df['text'] = (
    df['name'] + " : " + df['description']
    + " -- Ingredients: " + df['ingredients'].astype(str)
    + " -- Price: " + df['price'].astype(str)
    + " -- Rating: " + df['rating'].astype(str)
)

In [10]:
# Check the combined text column on the first two rows.
df['text'].head(2)

0    Cappuccino : A rich and creamy cappuccino made...
1    Jumbo Savory Scone : Deliciously flaky and but...
Name: text, dtype: object

In [11]:
# Plain Python list of the texts to embed; later cells append extra documents to it.
texts = df['text'].tolist()

In [12]:
texts[:2]  # Preview the first two combined product texts in full

["Cappuccino : A rich and creamy cappuccino made with freshly brewed espresso, steamed milk, and a frothy milk cap. This delightful drink offers a perfect balance of bold coffee flavor and smooth milk, making it an ideal companion for relaxing mornings or lively conversations. -- Ingredients: ['Espresso', 'Steamed Milk', 'Milk Foam'] -- Price: 4.5 -- Rating: 4.7",
 "Jumbo Savory Scone : Deliciously flaky and buttery, this jumbo savory scone is filled with herbs and cheese, creating a mouthwatering experience. Perfect for a hearty snack or a light lunch, it pairs beautifully with your favorite coffee or tea. -- Ingredients: ['Flour', 'Butter', 'Cheese', 'Herbs', 'Baking Powder', 'Salt'] -- Price: 3.25 -- Rating: 4.3"]

In [13]:
# Read the shop's "about us" document. The explicit encoding keeps the read
# independent of the platform default (assumes the file is UTF-8 — TODO confirm).
with open("products/Velvet_hours_about_us.txt", encoding="utf-8") as f:
    about_us_section = f.read()

# Prefix a label so the embedded text carries its own context.
about_us_section = "Coffee shop Velvet Hours about section: " + about_us_section

# Guard the append so re-running this cell does not add the same document twice.
if about_us_section not in texts:
    texts.append(about_us_section)  # Add the about us text to the list

In [14]:
# Read the plain-text menu. The explicit encoding keeps the read independent
# of the platform default (assumes the file is UTF-8 — TODO confirm).
with open("products/menu_items_text.txt", encoding="utf-8") as f:
    menu_items_text = f.read()

# Prefix a label so the embedded text carries its own context.
menu_items_text = "Menu Items: " + menu_items_text

# Guard the append so re-running this cell does not add the same document twice.
if menu_items_text not in texts:
    texts.append(menu_items_text)  # Add the menu items text to the list

## Generate Embeddings

In [17]:
# Embed every entry of `texts` with one call; the response is unpacked in the next cell.
output = client.embeddings.create(input=texts, model=model_name)

In [18]:
embeddings = output.data # Result records from the embeddings call; each exposes its vector as `.embedding` (expected: one record per input text, in order — the upsert cell zips this with `texts`)

In [19]:
# Inspect the first embedding record returned for `texts`.
embeddings[0]

Embedding(embedding=[-0.014078277163207531, -0.06446248292922974, 0.003805713029578328, 0.004784325137734413, 0.030125223100185394, -0.045630402863025665, 0.03723685070872307, 0.010690334253013134, -0.027744507417082787, -0.05091071128845215, -0.052436813712120056, -0.03366577625274658, -0.0004516303597483784, -0.043768562376499176, 0.03757259249687195, -0.01718389242887497, -0.019488302990794182, -0.08063914626836777, -0.07587771862745285, -0.007950983941555023, 0.0483773872256279, -0.06122715026140213, -0.10749851912260056, -0.03699267655611038, 0.03549709916114807, -0.04856051877140999, 0.08790338784456253, -0.01983930729329586, 0.001731170224957168, -0.13075628876686096, -0.012933701276779175, 0.009805195964872837, -0.02801920473575592, -0.025821620598435402, -0.0350392684340477, 0.0010091339936479926, 0.06751468777656555, -0.07593876123428345, 0.047095462679862976, 0.0005737183964811265, -0.002592463279142976, 0.029804741963744164, -0.06415726244449615, -0.002075496595352888, -0.0

## Push Data to Pinecone Database

In [None]:
# Create the Pinecone index that stores the vectors.
# The existence check makes this cell safe to re-run (Restart & Run All):
# creating an index whose name is already taken is rejected by Pinecone.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,    # Unique name of the index
        dimension=384,      # Must match the embedding model's vector length (384 for the sample embedding above)
        metric="cosine",    # Metric used for similarity search
        spec=ServerlessSpec(    # ServerlessSpec defines where and how the index is hosted
            cloud="aws",
            region="us-east-1",
        ),
    )

# Display the index description whether it was just created or already existed.
pc.describe_index(index_name)

{
    "name": "coffeeshop",
    "metric": "cosine",
    "host": "coffeeshop-k8sedwf.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 384,
    "deletion_protection": "disabled",
    "tags": null
}

In [20]:
# Wait for the index to be ready, but give up after a bounded time so a
# misconfigured or stuck index cannot hang the notebook forever.
READY_TIMEOUT_S = 120
ready_deadline = time.monotonic() + READY_TIMEOUT_S
while not pc.describe_index(index_name).status.ready:
    if time.monotonic() > ready_deadline:
        raise TimeoutError(
            f"Pinecone index {index_name!r} was not ready after {READY_TIMEOUT_S}s"
        )
    time.sleep(1)

index = pc.Index(index_name)  # Connect to the index

# zip() stops at the shorter input, so a mismatch would silently drop documents.
if len(texts) != len(embeddings):
    raise ValueError(
        f"Expected one embedding per text, got {len(texts)} texts and "
        f"{len(embeddings)} embeddings; re-run the embedding cell."
    )

# One record per document. The ID is the part of the text before the first ":".
# NOTE(review): for product texts this keeps a trailing space (e.g. 'Cappuccino ').
# It is left unchanged on purpose so a re-run overwrites the records already
# stored under those IDs instead of creating near-duplicates.
vectors = [
    {
        "id": text.split(":")[0],
        "values": e.embedding,       # Use the embedding vector
        "metadata": {"text": text},  # Store the original text in metadata
    }
    for text, e in zip(texts, embeddings)
]

# Records that share an ID would overwrite each other, so fail loudly instead.
entry_ids = [v["id"] for v in vectors]
duplicate_ids = sorted({i for i in entry_ids if entry_ids.count(i) > 1})
if duplicate_ids:
    raise ValueError(f"Duplicate vector IDs would overwrite each other: {duplicate_ids}")

index.upsert(vectors=vectors, namespace='ns1')  # Upsert the vectors into the index

{'upserted_count': 20}

## Get Closest Documents


In [None]:
# Embed the question with the same model that produced the stored document vectors.
query_inputs = ["Is Cappuccino lactose-free?"]
output = client.embeddings.create(model=model_name, input=query_inputs)
embedding = output.data[0].embedding  # Query vector for the similarity search

In [None]:
# Fetch the three stored documents closest to the query vector,
# searching the same namespace the vectors were upserted into.
results = index.query(
    vector=embedding,       # query vector computed in the previous cell
    top_k=3,                # how many nearest neighbours to return
    namespace='ns1',
    include_metadata=True,  # return the stored text alongside each match
)

In [23]:
# Display the matches returned by the query (id, metadata text, score).
results

{'matches': [{'id': 'Cappuccino ',
              'metadata': {'text': 'Cappuccino : A rich and creamy cappuccino '
                                   'made with freshly brewed espresso, steamed '
                                   'milk, and a frothy milk cap. This '
                                   'delightful drink offers a perfect balance '
                                   'of bold coffee flavor and smooth milk, '
                                   'making it an ideal companion for relaxing '
                                   'mornings or lively conversations. -- '
                                   "Ingredients: ['Espresso', 'Steamed Milk', "
                                   "'Milk Foam'] -- Price: 4.5 -- Rating: 4.7"},
              'score': 0.734519,
              'values': []},
             {'id': 'Sugar Free Vanilla syrup ',
              'metadata': {'text': 'Sugar Free Vanilla syrup : Enjoy the sweet '
                                   'flavor of vanilla without the s