## Creating **Embeddings**

In [None]:
pip install OpenAI==1.10.0

In [None]:
pip install pinecone-client==3.0.2

In [33]:
import os
from getpass import getpass

from openai import OpenAI

# Take the key from the OPENAI_API_KEY environment variable so it is never
# saved inside the notebook; if it is not set, ask for it with a prompt that
# does not echo the input.
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")
)

# Embedding model name passed to every client.embeddings.create call.
MODEL = "text-embedding-3-large"

In [30]:
# The knowledge base is assumed to be a list of (question, answer) pairs.
dataset = [
    (
        "Tell me something about Yardstick AI",
        "At Yardstick, we possess profound expertise in AI model integrations, "
        "fine-tuning, advanced analytics, image generations, speech AI, "
        "video generations using AI, chatbot frameworks and solutions",
    ),
    (
        "Where is Yardstick AI headquartered ?",
        "Bengaluru, India",
    ),
    (
        "How can I contact you ?",
        "contact@yardstick.live",
    ),
]



In [None]:
# Map each question string to its embedding vector (a list of floats).
# Only the question is embedded; the answer is ignored in this loop.
embeddings = {}
for question, answer in dataset:
    res = client.embeddings.create(
        input=[question],
        model=MODEL
    )

    # One input was sent, so the single result is at position 0.
    question_embedding = res.data[0].embedding

    # Store the vector itself. Wrapping it in {...} would build a set around
    # a list, which raises "TypeError: unhashable type: 'list'".
    embeddings[question] = question_embedding


## **Initializing Pinecone Index**

In [32]:
import time
from pinecone import Pinecone
from pinecone import ServerlessSpec
import numpy as np

In [None]:
import os
from getpass import getpass

# Take the key from the PINECONE_API_KEY environment variable so it is never
# saved inside the notebook; if it is not set, ask for it with a prompt that
# does not echo the input.
pc = Pinecone(
    api_key=os.environ.get("PINECONE_API_KEY") or getpass("Pinecone API key: ")
)

# NOTE(review): confirm that this cloud/region pair is offered for serverless
# indexes on your Pinecone project; index creation fails if it is not.
spec = ServerlessSpec(cloud="aws", region="ap-south-1")

index_name = 'semantic-search-openai'

# Create the index only if it does not already exist.
if index_name not in pc.list_indexes().names():

    # The index dimension must equal the embedding length, so measure one of
    # the vectors computed earlier (all come from the same model).
    embedding_dimension = len(next(iter(embeddings.values())))

    pc.create_index(
        index_name,
        dimension=embedding_dimension,
        metric='dotproduct',
        spec=spec
    )

    # Poll once per second until the new index reports that it is ready.
    while not pc.describe_index(index_name).status['ready']:
        time.sleep(1)


# Handle used by later cells for upserts and queries.
index = pc.Index(index_name)
time.sleep(1)


# Last expression of the cell: display the index statistics.
index.describe_index_stats()


## **Upsert data to Pinecone**

In [None]:
from tqdm.auto import tqdm

# How many question-answer pairs go into one upsert request
batch_size = 32

# Running offset from which sequential string IDs ("0", "1", ...) are made
count = 0

# Step through the dataset one batch at a time, with a progress bar
for start in tqdm(range(0, len(dataset), batch_size)):
    # Slicing stops at the end of the list, so the last batch may be shorter
    chunk = dataset[start:start + batch_size]

    # One (id, embedding, metadata) triple per pair in this batch
    vectors = []
    for offset, (question, answer) in enumerate(chunk):
        vectors.append((
            str(count + offset),
            embeddings[question],
            {'question': question, 'answer': answer},
        ))

    # Send the batch to the Pinecone index
    index.upsert(vectors=vectors)

    # Advance the ID offset past the pairs just written
    count += len(chunk)


## **Query**

In [None]:
# Natural-language question to be answered from the indexed pairs
query = "WHat does Yardstick AI do and where it is headquartered"

# Embed the query text; the single result sits at position 0 of `data`
query_embedding_response = client.embeddings.create(model=MODEL, input=query)
xq = query_embedding_response.data[0].embedding


In [None]:
# Fetch the 2 nearest stored questions along with their metadata.
# pinecone-client 3.x accepts keyword arguments only for query(), and
# `vector` must be the flat list of floats, not a list wrapping it.
res = index.query(vector=xq, top_k=2, include_metadata=True)


## **Generating Response**

In [None]:
# Stack the retrieved question-answer pairs into one context string.
# A pinecone-client 3.x query response exposes its hits as `matches`.
stacked_pairs = "\n\n".join(
    f"Question: {match.metadata['question']}\nAnswer: {match.metadata['answer']}"
    for match in res.matches
)

# Build the prompt from the user query and the retrieved pairs
conversation_context = f"User: Hey GPT, this is my query: {query}\nAI: Based on the following question-answer pairs:\n{stacked_pairs}\nUser: give me a short response to this query.\n"

# MODEL is an embedding model and cannot generate text, so a separate
# text-completion model is used for the answer.
COMPLETION_MODEL = "gpt-3.5-turbo-instruct"

# Call the OpenAI completions endpoint; in openai>=1.0 it is reached through
# the lowercase `client.completions` resource.
response = client.completions.create(
    model=COMPLETION_MODEL,
    prompt=conversation_context,
    temperature=0.7,
    max_tokens=70
)

# Print the generated answer without surrounding whitespace
print("AI Response:", response.choices[0].text.strip())
