## Implementing RAG with a vector search database to get wine recommendations

In [None]:
# Import libraries
import pandas as pd
import json
import os

In [None]:
# Load the wine dataset from CSV and preview its first 10 rows
df = pd.read_csv(filepath_or_buffer='top_rated_wines.csv')
df.head(n=10)

In [None]:
# Summary statistics of the dataset, one row per numeric column
df.describe().T

We can see that the dataset has 1365 rows and only one numerical feature.

In [None]:
# Count the null values in each column
df.isna().sum()

In [None]:
# Drop, in place, every row that contains at least one null value
df.dropna(axis="index", inplace=True)
# Re-count the nulls per column to confirm none remain
df.isna().sum()

In [None]:
# We will work with at most 100 wines, so we sample the dataset.
# - random_state pins the sample so the indexed wines (and therefore the
#   search results further down) are the same on every notebook run.
# - min(...) avoids a ValueError from DataFrame.sample if fewer than 100 rows
#   remain after the null rows were dropped.
sampled_df = df.sample(n=min(100, len(df)), random_state=42)
sampled_df.shape

In [None]:
# Convert the sampled rows into a list of dicts (one dict per wine)
data = sampled_df.to_dict(orient='records')

Now we need to embed the dataset using a SentenceTransformer model from Hugging Face.

In [None]:
# import the useful libraries
from sentence_transformers import SentenceTransformer
from qdrant_client import models, QdrantClient

In [None]:
# Create the embedding model; `encoder` is used by the later cells to size the
# collection's vectors and to embed both the wine notes and the user query
encoder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [None]:
# Vector database client backed by an in-memory Qdrant instance
# (nothing is persisted once the kernel stops)
qdrant = QdrantClient(location=":memory:")

In [None]:
# Create the "top_wines" collection that will store the wine vectors.
# NOTE(review): recreate_collection presumably drops an existing collection of
# the same name first, and newer qdrant-client releases reportedly deprecate
# it -- confirm against the installed client version.
qdrant.recreate_collection(
    collection_name="top_wines",
    vectors_config=models.VectorParams(
        size=encoder.get_sentence_embedding_dimension(), # Vector size must match the encoder's embedding dimension
        distance=models.Distance.COSINE
    )
)

In [None]:
# Embed each wine's "notes" text and upload one point per wine:
# the point id is the wine's position in `data`, the vector is the embedding
# of its notes, and the payload is the full wine record.
qdrant.upsert(
    collection_name="top_wines",
    points=[
        models.PointStruct(
            id=idx,
            vector=encoder.encode(doc["notes"]).tolist(),
            payload=doc,
        ) for idx, doc in enumerate(data) # data is the list of wine records built from sampled_df
    ]
)

In [None]:
# The user's request; it is embedded below to query the vector database
user_prompt = "Suggest me an amazing Malbec wine from Argentina"

In [None]:
# Search time: embed the user prompt and fetch the 3 closest wines
search = qdrant.search(
    collection_name='top_wines',
    query_vector=encoder.encode(user_prompt).tolist(),
    limit=3,
)
# Show each hit's payload together with its similarity score
for hit in search:
    payload_json = json.dumps(hit.payload, indent=4)
    print(payload_json, "score:", hit.score)

In [None]:
# Keep only the payloads (the wine records) of the search hits
search_results = [hit.payload for hit in search]

In [None]:
# Now connect to the OpenAI chat completions API and generate the answer
from openai import OpenAI
# OpenAI() reads the key from the OPENAI_API_KEY environment variable.
# Export it in your shell, or uncomment the next line to set it from here:
#os.environ['OPENAI_API_KEY'] = "ENTER YOUR API KEY HERE"
client = OpenAI()

completion = client.chat.completions.create(
    model="gpt-4",  # "gpt4" (no hyphen) is not a valid OpenAI model id
    temperature=0.5,
    messages=[
        {"role": "system", "content": "You are chatbot, a wine specialist. Your top priority is to help guide users into selecting amazing wine and guide them with their requests."},
        # Reuse the prompt that was embedded for the vector search, so the
        # retrieval query and the question sent to the model cannot drift apart
        {"role": "user", "content": user_prompt},
        # The retrieved wine records are passed to the model as context
        {"role": "assistant", "content": str(search_results)}
    ]
)
# Print the text of the first (and only requested) completion choice
print(completion.choices[0].message.content)