https://python.langchain.com/en/latest/modules/chains/index_examples/chat_vector_db.html
https://python.langchain.com/en/latest/modules/indexes/vectorstores/examples/chroma.html#


In [2]:
#!pip install lark
#!pip install chromadb

Collecting lark
  Downloading lark-1.1.5-py3-none-any.whl (107 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m108.0/108.0 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: lark
Successfully installed lark-1.1.5


In [2]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine, text
import json
import os

import tiktoken
from openai.embeddings_utils import get_embedding
from sklearn.cluster import AgglomerativeClustering

import openai
from dotenv import load_dotenv
# Load variables from a local .env file (if present) into the process environment.
load_dotenv()
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
# Reuse the value read above instead of querying the environment a second time.
if OPENAI_API_KEY is not None:
    print ("OPENAI_API_KEY is ready")
else:
    print ("OPENAI_API_KEY environment variable not found")

# Read the ASIN values from the CSV file.
# The location can be overridden with the ASIN_LIST_PATH environment variable so the
# notebook is not tied to a single machine; the original absolute path stays the default.
asin_list_path = os.getenv(
    'ASIN_LIST_PATH',
    '/Users/vladbordei/Documents/Development/ProductExplorer/data/external/asin_list.csv',
)
#asin_list_path = './data/external/asin_list.csv'
asin_list = pd.read_csv(asin_list_path)['asin'].tolist()

OPENAI_API_KEY is ready


In [3]:
# Load the exported reviews and keep only the rows whose ASIN is in `asin_list`.
reviews = (
    pd.read_csv('/Users/vladbordei/Documents/Development/ProductExplorer/data/processed/reviews_export.csv')
    .loc[lambda frame: frame['asin'].isin(asin_list)]
)

In [4]:
# Work on an independent (deep) copy so the filtered `reviews` frame stays untouched.
df = reviews.copy(deep=True)

#### Embedding and loading the data

In [5]:
from langchain.embeddings import HuggingFaceEmbeddings

# Sentence-transformers model used to embed each review; it is run on the CPU and
# the resulting vectors are left un-normalized.
model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = dict(device='cpu')
encode_kwargs = dict(normalize_embeddings=False)
embeddings = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs)

#### the Database: Chroma

In [9]:
from langchain.document_loaders import DataFrameLoader

# Turn every DataFrame row into a LangChain Document whose page content is the
# text of the 'review' column.
loader = DataFrameLoader(data_frame=df, page_content_column='review')
documents = loader.load()

In [147]:
######## PERSISTENT VERSION #############
# https://python.langchain.com/en/latest/modules/indexes/retrievers/examples/chroma_self_query.html
# https://python.langchain.com/en/latest/modules/indexes/vectorstores/examples/chroma.html
from langchain.vectorstores import Chroma
persist_directory = '/Users/vladbordei/Documents/Development/ProductExplorer/data/vectorstores/chroma/db'
# Chroma.from_documents embeds and inserts whatever it is given, so re-running this
# cell against an already populated directory would store every review again.
# Reopen the existing store when there is one; embed and persist only on the first run.
# Delete the directory to force a rebuild after the source reviews change.
if os.path.isdir(persist_directory) and os.listdir(persist_directory):
    vectorstore = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
else:
    vectorstore = Chroma.from_documents(documents=documents, embedding=embeddings, persist_directory=persist_directory)
    # Flush the freshly built index to disk so the next run can reopen it.
    vectorstore.persist()
vectorstore

<langchain.vectorstores.chroma.Chroma at 0x29c2b1960>

In [10]:
####### TEMPORARY VERSION ##############
# No persist_directory is passed here, so this store is built from scratch each time.
from langchain.vectorstores import Chroma
vectorstore = Chroma.from_documents(documents, embeddings)
vectorstore

<langchain.vectorstores.chroma.Chroma at 0x285fd3f70>

How to save the database to disk and release it

vectorstore.persist()
vectorstore = None

Now we can load the persisted database from disk, and use it as normal. 

vectorstore = Chroma(persist_directory=persist_directory, embedding_function=embeddings)

In [11]:
# Free-text query used by the retrieval examples that follow.
query = (
    "what I would like to see improved "
    "is the quality of the product"
)

In [12]:
# Retrieve with the "mmr" search type instead of the default similarity search.
retriever = vectorstore.as_retriever(
    search_type="mmr",
)
retriever.get_relevant_documents(query=query)

[Document(page_content='This product is great however can get annoying after time', metadata={'id': 3, 'rating': 5, 'review_summary': 'Product is great but can get annoying over time.', 'product_facts': 'Unknown', 'positive_sentiment': 0.50126034, 'negative_sentiment': 0.064283654, 'improvements_expected': 'Unknown', 'issues_identified': 'Product can get annoying over time', 'how_product_is_used': 'Unknown', 'media': '[]', 'where_product_is_used': 'Unknown', 'sentiment': 'Neutral', 'anger': 'No', 'anger_reason': nan, 'delight': 'No', 'delight_reason': nan, 'disappointment': 'No', 'disappointment_reason': nan, 'time': 'Unknown', 'season': 'Unknown', 'weather': 'Unknown', 'user_description': 'Unknown', 'title': 'Super satisfying/ can get annoying', 'asin_variant': 'B07XCRT49W', 'asin': 'B07X7YFZWG'}),
 Document(page_content='Only complaint I have is I wish the pen was connected Car use becomes a hassle if the pen gets dropped', metadata={'id': 82, 'rating': 5, 'review_summary': 'Pen conn

In [152]:
# Plain similarity search; k=4 is the library default, spelled out here for clarity.
docs = vectorstore.similarity_search(query, k=4)
len(docs)

4

In [13]:
# Same search, but every hit comes back as a (Document, score) pair.
# NOTE(review): for Chroma the score looks like a distance (lower = closer) - confirm.
docs = vectorstore.similarity_search_with_score(query, k=4)

In [18]:
# Show the first hit: a (Document, score) tuple.
docs[0]

(Document(page_content='This product is great however can get annoying after time', metadata={'id': 3, 'rating': 5, 'review_summary': 'Product is great but can get annoying over time.', 'product_facts': 'Unknown', 'positive_sentiment': 0.50126034, 'negative_sentiment': 0.064283654, 'improvements_expected': 'Unknown', 'issues_identified': 'Product can get annoying over time', 'how_product_is_used': 'Unknown', 'media': '[]', 'where_product_is_used': 'Unknown', 'sentiment': 'Neutral', 'anger': 'No', 'anger_reason': nan, 'delight': 'No', 'delight_reason': nan, 'disappointment': 'No', 'disappointment_reason': nan, 'time': 'Unknown', 'season': 'Unknown', 'weather': 'Unknown', 'user_description': 'Unknown', 'title': 'Super satisfying/ can get annoying', 'asin_variant': 'B07XCRT49W', 'asin': 'B07X7YFZWG'}),
 1.1258236169815063)

In [24]:
# Show the first hit: a (Document, score) tuple.
docs[0]

(Document(page_content='This product is great however can get annoying after time', metadata={'id': 3, 'rating': 5, 'review_summary': 'Product is great but can get annoying over time.', 'product_facts': 'Unknown', 'positive_sentiment': 0.50126034, 'negative_sentiment': 0.064283654, 'improvements_expected': 'Unknown', 'issues_identified': 'Product can get annoying over time', 'how_product_is_used': 'Unknown', 'media': '[]', 'where_product_is_used': 'Unknown', 'sentiment': 'Neutral', 'anger': 'No', 'anger_reason': nan, 'delight': 'No', 'delight_reason': nan, 'disappointment': 'No', 'disappointment_reason': nan, 'time': 'Unknown', 'season': 'Unknown', 'weather': 'Unknown', 'user_description': 'Unknown', 'title': 'Super satisfying/ can get annoying', 'asin_variant': 'B07XCRT49W', 'asin': 'B07X7YFZWG'}),
 1.1258236169815063)

#### the Chat Memory

In [16]:
from langchain.memory import ConversationBufferMemory
# Conversation buffer exposed to the chain under the "chat_history" key,
# returning message objects rather than a single string.
memory = ConversationBufferMemory(
    return_messages=True,
    memory_key="chat_history",
)

#### the Conversational Retrieval Chain

In [15]:
from langchain.llms import OpenAI
from langchain.chains import ConversationalRetrievalChain
# Conversational QA chain over the review store, with the chat memory attached.
qa = ConversationalRetrievalChain.from_llm(
    llm=OpenAI(temperature=0),
    retriever=vectorstore.as_retriever(),
    memory=memory,
)

In [53]:
# Ask a single question; no chat history is passed in the call.
query = "What is the best feature of this product"
result = qa(dict(question=query))

In [54]:
# Display only the generated answer from the chain's result.
result["answer"]

' The best feature of this product is that there is no mess or cleanup.'

#### Using a different model for condensing the question
This chain has two steps:
- First, it condenses the current question and the chat history into a standalone question. This is necessary to create a standalone vector to use for retrieval.
- After that, it does retrieval and then answers the question using retrieval augmented generation with a separate model. 

In [62]:
from langchain.chat_models import ChatOpenAI

In [63]:
# Chat-model variant: one model answers, a separately constructed model condenses
# the follow-up question. No memory is attached; history is passed per call.
qa = ConversationalRetrievalChain.from_llm(
    llm=ChatOpenAI(model='gpt-3.5-turbo', temperature=0),   # or model='gpt-4'
    retriever=vectorstore.as_retriever(),
    condense_question_llm=ChatOpenAI(model='gpt-3.5-turbo', temperature=0),
)

In [None]:
# Ask several questions in sequence, feeding each (question, answer) pair back
# into the chain as chat history for the next one.
questions = [
    "What are Heat-bath random walks with Markov base?",
    "What is the ImageBind model?",
    "How does Compositional Reasoning with Large Language Models works?",
]
chat_history = []

for question in questions:
    result = qa(dict(question=question, chat_history=chat_history))
    chat_history += [(question, result['answer'])]
    # One print call; sep="\n" reproduces the blank line between the two messages.
    print(f"-> **Question**: {question} \n", f"**Answer**: {result['answer']} \n", sep="\n")

In [64]:
# First turn of a manual conversation: start from an empty history.
query = "What is the best feature of this product"
chat_history = list()
result = qa(dict(question=query, chat_history=chat_history))

In [65]:
# Record the previous exchange first: `query` still holds the earlier question here.
chat_history = [(query, result["answer"])]
# Follow-up question that relies on the history built on the line above.
query = "Why is that?"
result = qa({"question": query, "chat_history": chat_history})

All together now

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.llms import OpenAI
from langchain.chains import ConversationalRetrievalChain

In [None]:
from langchain.memory import ConversationBufferMemory
# Fresh conversation buffer for the combined example, keyed as "chat_history".
memory = ConversationBufferMemory(
    return_messages=True,
    memory_key="chat_history",
)

In [1]:
# Chat-model chain with a separate condensing model and the conversation memory.
# The imports for ConversationalRetrievalChain / ChatOpenAI must have been run in
# the current kernel before this cell.
qa = ConversationalRetrievalChain.from_llm(
    llm=ChatOpenAI(model='gpt-3.5-turbo', temperature=0),   # or model='gpt-4'
    retriever=vectorstore.as_retriever(),
    memory=memory,
    condense_question_llm=ChatOpenAI(model='gpt-3.5-turbo', temperature=0),
)

NameError: name 'ConversationalRetrievalChain' is not defined

In [None]:
###### RETRIEVAL ##########
### VESPA ?



Retrieval Augmented Generation (RAG)
Improving Zero-Shot Ranking with Vespa Hybrid Search - Information Retrieval Evaluation


Information Retrieval
vespa.ai



Other methods:
multi-dense


BM25 (Elasticsearch) currently works better than similarity search (first impression)



Synthetic Query Generation at Spotify for improving retrieval
https://www.youtube.com/watch?v=VrL7AbrY438


"For every topic generate five questions that could help define / increase precision"


ColBERT <-- very good general-purpose retrieval
DSP Retrieval <-- can be specialized


Scalable Nearest Neighbor Search
PLAID Retrieval Engine


github.com/stanford-futuredata/ColBERT

Vespa < - hosted>


ColBERT-QA
Hindsight

DSP Programming Model
for highly specialized retrievers, which may be used by agents


Run the full cross encoder model
https://pyvespa.readthedocs.io/en/latest/deploy-vespa-cloud.html
https://docs.vespa.ai/en/vespa-quick-start.html



IN DOCUMENT CROSS-ENCODER
https://github.com/UKPLab/sentence-transformers/blob/master/examples/applications/retrieve_rerank/in_document_search_crossencoder.py


In [None]:
########
# Reference link kept as a comment: a bare URL is not valid Python in a code cell.
# https://www.youtube.com/watch?v=VrL7AbrY438

https://python.langchain.com/en/latest/modules/indexes/retrievers/examples/contextual-compression.html

In [25]:
from langchain.vectorstores import FAISS

In [None]:
# `texts` and `OpenAIEmbeddings` were never defined in this notebook (left over from
# the tutorial this cell was copied from). Index the review `documents` built earlier
# and import the embeddings class explicitly so the cell can run.
from langchain.embeddings import OpenAIEmbeddings

retriever = FAISS.from_documents(documents, OpenAIEmbeddings()).as_retriever()

In [None]:
from langchain.llms import OpenAI
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

def pretty_print_docs(docs):
    """Print the page content of each document, numbered and separated by a dashed rule.

    This helper was called but never defined in the notebook, so it is defined here.
    """
    separator = f"\n{'-' * 100}\n"
    print(separator.join(f"Document {i + 1}:\n\n" + d.page_content for i, d in enumerate(docs)))


# The LLM-backed extractor trims each retrieved document down to the parts relevant
# to the query before the documents are returned.
llm = OpenAI(temperature=0)
compressor = LLMChainExtractor.from_llm(llm)
compression_retriever = ContextualCompressionRetriever(base_compressor=compressor, base_retriever=retriever)

# NOTE(review): this query looks carried over from the tutorial and is unrelated to
# product reviews - confirm whether it should be replaced with a review question.
compressed_docs = compression_retriever.get_relevant_documents("What did the president say about Ketanji Jackson Brown")
pretty_print_docs(compressed_docs)