https://python.langchain.com/en/latest/modules/chains/index_examples/chat_vector_db.html
https://python.langchain.com/en/latest/modules/indexes/vectorstores/examples/chroma.html#


In [2]:
#!pip install lark
#!pip install chromadb

Collecting lark
  Downloading lark-1.1.5-py3-none-any.whl (107 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m108.0/108.0 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: lark
Successfully installed lark-1.1.5


In [9]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine, text
import json
import os

import tiktoken
from openai.embeddings_utils import get_embedding
from sklearn.cluster import AgglomerativeClustering

import openai
from dotenv import load_dotenv
load_dotenv()
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
if os.getenv("OPENAI_API_KEY") is not None:
    print ("OPENAI_API_KEY is ready")
else:
    print ("OPENAI_API_KEY environment variable not found")

# Read the ASIN values from the CSV file
asin_list_path = '/Users/vladbordei/Documents/Development/ProductExplorer/data/external/asin_list.csv'
#asin_list_path = './data/external/asin_list.csv'
asin_list = pd.read_csv(asin_list_path)['asin'].tolist()

OPENAI_API_KEY is ready


In [10]:
reviews = pd.read_csv('/Users/vladbordei/Documents/Development/ProductExplorer/data/processed/reviews_export.csv')
reviews = reviews[reviews['asin'].isin(asin_list)]

In [11]:
reviews.columns

Index(['id', 'rating', 'review_summary', 'product_facts', 'positive_sentiment',
       'negative_sentiment', 'improvements_expected', 'issues_identified',
       'how_product_is_used', 'media', 'where_product_is_used', 'sentiment',
       'anger', 'anger_reason', 'delight', 'delight_reason', 'disappointment',
       'disappointment_reason', 'time', 'season', 'weather',
       'user_description', 'title', 'review', 'asin_variant', 'asin'],
      dtype='object')

In [12]:
df = reviews[['asin', 'review', 'review_summary','rating','title']]

#### Embeding and loading the data

In [4]:
from langchain.embeddings import HuggingFaceEmbeddings

model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': False}
embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

In [14]:
from langchain.document_loaders import DataFrameLoader
loader = DataFrameLoader(df, page_content_column="review")
documents = loader.load()

#### the Database: Chroma

In [24]:
# https://python.langchain.com/en/latest/modules/indexes/retrievers/examples/chroma_self_query.html
# https://python.langchain.com/en/latest/modules/indexes/vectorstores/examples/chroma.html
from langchain.vectorstores import Chroma
persist_directory = '/Users/vladbordei/Documents/Development/ProductExplorer/data/vectorstores/chroma/db'
vectorstore = Chroma.from_documents(documents=documents, embedding=embeddings, persist_directory=persist_directory)
vectorstore

<langchain.vectorstores.chroma.Chroma at 0x285a7f880>

How you save to file and stop the database

vectorstore.persist()
vectorstore = None

Now we can load the persisted database from disk, and use it as normal. 

vectorstore = Chroma(persist_directory=persist_directory, embedding_function=embeddings)

In [39]:
retriever = vectorstore.as_retriever(search_type="mmr")

In [38]:
retriever.get_relevant_documents(query)[0]

Document(page_content='Saw a young boy using it and instantly brought myself here Its pretty fun', metadata={'asin': 'B07X7YFZWG', 'review_summary': 'Fun product that caught my attention', 'rating': 5, 'title': 'Fun Tool'})

In [42]:
query = "hate that feature"

In [43]:
docs = vectorstore.similarity_search(query)
print(docs[0].page_content)

This product is great however can get annoying after time


In [44]:
docs = vectorstore.similarity_search_with_score(query)

In [50]:
docs[2]

(Document(page_content='This product is great however can get annoying after time', metadata={'asin': 'B07XCRVK2Y', 'review_summary': 'Great product but can get annoying over time', 'rating': 5, 'title': 'Super satisfying/ can get annoying'}),
 1.3280236721038818)

#### the Chat Memory

In [51]:
from langchain.memory import ConversationBufferMemory
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

#### the Conversational Retrieval Chain

In [52]:
from langchain.llms import OpenAI
from langchain.chains import ConversationalRetrievalChain
qa = ConversationalRetrievalChain.from_llm(OpenAI(temperature=0), vectorstore.as_retriever(), memory=memory)

In [53]:
query = "What is the best feature of this product"
result = qa({"question": query})

In [54]:
result["answer"]

' The best feature of this product is that there is no mess or cleanup.'

#### Using a different model for condensing the question
This chain has two steps:
- First, it condenses the current question and the chat history into a standalone question. This is neccessary to create a standanlone vector to use for retrieval. 
- After that, it does retrieval and then answers the question using retrieval augmented generation with a separate model. 

In [62]:
from langchain.chat_models import ChatOpenAI

In [63]:
qa = ConversationalRetrievalChain.from_llm(
    ChatOpenAI(temperature=0, model='gpt-3.5-turbo'),   # model='gpt-4'),
    vectorstore.as_retriever(),
    condense_question_llm = ChatOpenAI(temperature=0, model='gpt-3.5-turbo'),
)

In [64]:
chat_history = []
query = "What is the best feature of this product"
result = qa({"question": query, "chat_history": chat_history})

In [65]:
chat_history = [(query, result["answer"])]
query = "Why is that?"
result = qa({"question": query, "chat_history": chat_history})

All toghtether now

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.llms import OpenAI
from langchain.chains import ConversationalRetrievalChain

In [None]:
from langchain.memory import ConversationBufferMemory
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

In [1]:
qa = ConversationalRetrievalChain.from_llm(
    ChatOpenAI(temperature=0, model='gpt-3.5-turbo'),   # model='gpt-4'),
    vectorstore.as_retriever(),
    condense_question_llm = ChatOpenAI(temperature=0, model='gpt-3.5-turbo'),
    memory = memory
)

NameError: name 'ConversationalRetrievalChain' is not defined