# Set Up

In [2]:
#Imports 
import pandas as pd
import os
import openai
import sys
sys.path.append('../..')

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file

## Data Loading

In [3]:
wine_df = pd.read_csv("C:/Users/MehakGanju/Documents/Repositories/Personal/Wine Tasting RAG Model/Wine_Tasting_RAG_Model/data/winemag-data-130k-v2.csv")

In [4]:
from langchain.document_loaders import CSVLoader

loader = CSVLoader(file_path="C:/Users/MehakGanju/Documents/Repositories/Personal/Wine Tasting RAG Model/Wine_Tasting_RAG_Model/data/winemag-data-130k-v2.csv", encoding='utf8')

data = loader.load()


In [5]:
wine_df.columns

Index(['id', 'country', 'description', 'designation', 'points', 'price',
       'province', 'region_1', 'region_2', 'taster_name',
       'taster_twitter_handle', 'title', 'variety', 'winery'],
      dtype='object')

## Text Splitting

In [6]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 200,
    chunk_overlap = 15
)
splits = text_splitter.split_documents(data)

In [7]:
len(splits)

614977

In [8]:
split1 = splits[0:100000]

## Embeddings and Vector Store

In [9]:
from langchain.embeddings import HuggingFaceEmbeddings
embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

In [10]:
from langchain.vectorstores import FAISS
vectordbR = FAISS.from_documents(split1, embedding)

In [None]:
from langchain.vectorstores import FAISS
vectordbR = FAISS.from_documents(splits, embedding)

## Retrieval 

### Similarity Search

In [12]:
question = "What are some prices of Rose's around the world?"
docs = vectordbR.similarity_search(question,k=5)
list(map(lambda x: x.metadata, docs))

[{'source': 'C:/Users/MehakGanju/Documents/Repositories/Personal/Wine Tasting RAG Model/Wine_Tasting_RAG_Model/data/winemag-data-130k-v2.csv',
  'row': 14076},
 {'source': 'C:/Users/MehakGanju/Documents/Repositories/Personal/Wine Tasting RAG Model/Wine_Tasting_RAG_Model/data/winemag-data-130k-v2.csv',
  'row': 19456},
 {'source': 'C:/Users/MehakGanju/Documents/Repositories/Personal/Wine Tasting RAG Model/Wine_Tasting_RAG_Model/data/winemag-data-130k-v2.csv',
  'row': 11602},
 {'source': 'C:/Users/MehakGanju/Documents/Repositories/Personal/Wine Tasting RAG Model/Wine_Tasting_RAG_Model/data/winemag-data-130k-v2.csv',
  'row': 10481},
 {'source': 'C:/Users/MehakGanju/Documents/Repositories/Personal/Wine Tasting RAG Model/Wine_Tasting_RAG_Model/data/winemag-data-130k-v2.csv',
  'row': 16223}]

In [17]:
docs

[Document(page_content='id: 14076\ncountry: US\ndescription: Pretty dark for a rosé, and heavy in the mouth, with the density almost of a red wine. Shows soft cherry and spice flavors.\ndesignation: Rosato\npoints: 82\nprice: 18', metadata={'source': 'C:/Users/MehakGanju/Documents/Repositories/Personal/Wine Tasting RAG Model/Wine_Tasting_RAG_Model/data/winemag-data-130k-v2.csv', 'row': 14076}),
 Document(page_content="designation: \npoints: 86\nprice: 12\nprovince: Languedoc-Roussillon\nregion_1: Pays d'Oc\nregion_2: \ntaster_name: Lauren Buzzeo\ntaster_twitter_handle: @laurbuzz\ntitle: Le Grand Noir 2016 Rosé (Pays d'Oc)", metadata={'source': 'C:/Users/MehakGanju/Documents/Repositories/Personal/Wine Tasting RAG Model/Wine_Tasting_RAG_Model/data/winemag-data-130k-v2.csv', 'row': 7470}),
 Document(page_content="description: The popularity of rosé has lead to higher prices in Tavel, but this estate, one of the region's largest, has kept those increases in check. Thankfully, the wine qual

### Max Marginal Relevance Search

In [15]:
question = "What are some prices of Rose's around the world?"
docs = vectordbR.max_marginal_relevance_search(question,k=5)
list(map(lambda x: x.metadata, docs))

[{'source': 'C:/Users/MehakGanju/Documents/Repositories/Personal/Wine Tasting RAG Model/Wine_Tasting_RAG_Model/data/winemag-data-130k-v2.csv',
  'row': 14076},
 {'source': 'C:/Users/MehakGanju/Documents/Repositories/Personal/Wine Tasting RAG Model/Wine_Tasting_RAG_Model/data/winemag-data-130k-v2.csv',
  'row': 7470},
 {'source': 'C:/Users/MehakGanju/Documents/Repositories/Personal/Wine Tasting RAG Model/Wine_Tasting_RAG_Model/data/winemag-data-130k-v2.csv',
  'row': 10481},
 {'source': 'C:/Users/MehakGanju/Documents/Repositories/Personal/Wine Tasting RAG Model/Wine_Tasting_RAG_Model/data/winemag-data-130k-v2.csv',
  'row': 5059},
 {'source': 'C:/Users/MehakGanju/Documents/Repositories/Personal/Wine Tasting RAG Model/Wine_Tasting_RAG_Model/data/winemag-data-130k-v2.csv',
  'row': 11827}]

In [16]:
docs

[Document(page_content='id: 14076\ncountry: US\ndescription: Pretty dark for a rosé, and heavy in the mouth, with the density almost of a red wine. Shows soft cherry and spice flavors.\ndesignation: Rosato\npoints: 82\nprice: 18', metadata={'source': 'C:/Users/MehakGanju/Documents/Repositories/Personal/Wine Tasting RAG Model/Wine_Tasting_RAG_Model/data/winemag-data-130k-v2.csv', 'row': 14076}),
 Document(page_content="designation: \npoints: 86\nprice: 12\nprovince: Languedoc-Roussillon\nregion_1: Pays d'Oc\nregion_2: \ntaster_name: Lauren Buzzeo\ntaster_twitter_handle: @laurbuzz\ntitle: Le Grand Noir 2016 Rosé (Pays d'Oc)", metadata={'source': 'C:/Users/MehakGanju/Documents/Repositories/Personal/Wine Tasting RAG Model/Wine_Tasting_RAG_Model/data/winemag-data-130k-v2.csv', 'row': 7470}),
 Document(page_content="description: The popularity of rosé has lead to higher prices in Tavel, but this estate, one of the region's largest, has kept those increases in check. Thankfully, the wine qual

### Hybrid Search

In [20]:
from langchain.retrievers import BM25Retriever, EnsembleRetriever
retriever_vectordb = vectordbR.as_retriever(search_kwargs={"k": 5})
keyword_retriever = BM25Retriever.from_documents(split1)
keyword_retriever.k =  5
ensemble_retriever = EnsembleRetriever(retrievers=[retriever_vectordb,keyword_retriever],
                                       weights=[0.5, 0.5])

In [22]:
query="What countires are Malbecs from?"
docs_rel=ensemble_retriever.get_relevant_documents(query)
docs_rel

[Document(page_content='description: What makes this Sangiovese and Cabernet Sauvignon blend stand out are its pristine balsam notes. Those beautifully pungent aromas are followed by exotic spice, black cherry, maple syrup', metadata={'source': 'C:/Users/MehakGanju/Documents/Repositories/Personal/Wine Tasting RAG Model/Wine_Tasting_RAG_Model/data/winemag-data-130k-v2.csv', 'row': 13448}),
 Document(page_content='title: Boatique 2013 Malbec (Red Hills Lake County)\nvariety: Malbec\nwinery: Boatique', metadata={'source': 'C:/Users/MehakGanju/Documents/Repositories/Personal/Wine Tasting RAG Model/Wine_Tasting_RAG_Model/data/winemag-data-130k-v2.csv', 'row': 11915}),
 Document(page_content='title: Chacewater 2013 Malbec (Red Hills Lake County)\nvariety: Malbec\nwinery: Chacewater', metadata={'source': 'C:/Users/MehakGanju/Documents/Repositories/Personal/Wine Tasting RAG Model/Wine_Tasting_RAG_Model/data/winemag-data-130k-v2.csv', 'row': 14742}),
 Document(page_content='and with a long, spi

## QA Chain

In [30]:
# Setup GPT Key
openai.api_key  = os.environ['OPENAI_API_KEY']

In [31]:
import datetime
current_date = datetime.datetime.now().date()
if current_date < datetime.date(2023, 9, 2):
    llm_name = "gpt-4"
else:
    llm_name = "gpt-4"
print(llm_name)

gpt-4


In [32]:
from langchain.chat_models import ChatOpenAI
llm = ChatOpenAI(model_name=llm_name, temperature=0)
llm.predict("Hello world!")

'Hello! How can I assist you today?'

#### QA Chain with No Hybrid Search

In [34]:
# Build prompt
from langchain.prompts import PromptTemplate
template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, 
just say that you don't know, don't try to make up an answer. 
Use five sentences maximum. Keep the answer as concise as possible. 
Always say "thanks for asking!" at the end of the answer if you know it.
If you do not know it end the answer with "Is there something else you would like to ask?" 
{context}
Question: {question}
Helpful Answer:"""
QA_CHAIN_PROMPT = PromptTemplate(input_variables=["context", "question"],template=template,)

# Run chain
from langchain.chains import RetrievalQA
qa_chainR = RetrievalQA.from_chain_type(llm,
                                       retriever=vectordbR.as_retriever(),
                                       return_source_documents=True,
                                       chain_type_kwargs={"prompt": QA_CHAIN_PROMPT})

In [35]:
question = "Give me a reccomendation for a good white wine from Italy"
result = qa_chainR({"query": question})
result["result"]

"A good white wine from Italy is the one from the largely unexplored wine region of Liguria in northern Italy. It's described as crisp and perky, offering easy aromas of citrus, apple and brimstone. Another recommendation is the satisfying white wine from Sardinia, which pairs well with shellfish, grilled vegetables, exotic spicy dishes or Italian kitchen classics. Thanks for asking!"

In [36]:
question = "What countries are Malbec's from?"
result = qa_chainR({"query": question})
result["result"]

'The Malbecs mentioned are from the United States and France. Thanks for asking!'

In [42]:
question = "Can you give me the price of a Rose from USA verses one from France"
result = qa_chainR({"query": question})
result["result"]

'The price of the rosé from the USA is $18. The price of the rosé from France is not provided in the context given. Thanks for asking!'

#### QA Chain with Hybrid Search

In [44]:
# Build prompt
from langchain.prompts import PromptTemplate
template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, 
just say that you don't know, don't try to make up an answer. 
Use five sentences maximum. Keep the answer as concise as possible. 
Always say "thanks for asking!" at the end of the answer if you know it.
If you do not know it end the answer with "Is there something else you would like to ask?" 
{context}
Question: {question}
Helpful Answer:"""
QA_CHAIN_PROMPT = PromptTemplate(input_variables=["context", "question"],template=template,)

# Run chain
from langchain.chains import RetrievalQA
qa_chainH = RetrievalQA.from_chain_type(llm,
                                       retriever=ensemble_retriever,
                                       return_source_documents=True,
                                       chain_type_kwargs={"prompt": QA_CHAIN_PROMPT})

In [53]:
question = "What countries are Malbec's from?"
result = qa_chainH({"query": question})
result["result"]

'The Malbecs mentioned in the context are from the US and France. Thanks for asking!'

In [51]:
question = "Can you give me the price of a Pinot Noir"
result = qa_chainH({"query": question})
result["result"] 

'The prices of the Pinot Noir wines listed range from $12 to $25. Thanks for asking!'

# ChatBot