https://python.langchain.com/en/latest/modules/chains/index_examples/chat_vector_db.html

In [None]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import CharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import ConversationalRetrievalChain

In [None]:
from langchain.document_loaders import TextLoader
# Load the text file (path is relative to the notebook's working directory)
# into a list of documents for splitting and indexing below.
loader = TextLoader("../../state_of_the_union.txt")
documents = loader.load()

In [None]:
# Chunk the loaded documents (chunk size 1000, no overlap between chunks).
# Note: `documents` is rebound to the chunked list here.
text_splitter = CharacterTextSplitter(
    chunk_overlap=0,
    chunk_size=1000,
)
documents = text_splitter.split_documents(documents)

# Embed every chunk with OpenAI embeddings and index the result in Chroma.
embeddings = OpenAIEmbeddings()
vectorstore = Chroma.from_documents(
    documents=documents,
    embedding=embeddings,
)

In [None]:
from langchain.memory import ConversationBufferMemory
# Buffer memory that keeps the conversation under the "chat_history" key;
# it is handed to the chain below so history does not have to be passed manually.
memory = ConversationBufferMemory(
    return_messages=True,
    memory_key="chat_history",
)

In [None]:
# Conversational QA chain: deterministic LLM (temperature 0), the Chroma
# retriever, and the memory object that tracks chat history between calls.
qa = ConversationalRetrievalChain.from_llm(
    llm=OpenAI(temperature=0),
    retriever=vectorstore.as_retriever(),
    memory=memory,
)

In [None]:
# First turn: only the question is supplied; the chain was built with memory.
query = "What did the president say about Ketanji Brown Jackson"
result = qa(dict(question=query))

In [None]:
# Display the answer text returned by the chain for the first question.
result["answer"]

In [None]:
# Follow-up turn: "he"/"she" refer to the previous exchange, so this relies on
# the chain's memory. The question is spelled correctly ("succeeded") so the
# text sent to the LLM/retriever is not degraded by a typo.
query = "Did he mention who she succeeded"
result = qa({"question": query})

In [None]:
# Display the answer text returned by the chain for the follow-up question.
result['answer']

In [None]:
# Same chain as before but WITHOUT a memory object: the caller must pass
# `chat_history` explicitly on every call.
qa = ConversationalRetrievalChain.from_llm(
    llm=OpenAI(temperature=0),
    retriever=vectorstore.as_retriever(),
)

In [None]:
# First turn with manually managed history: start from an empty history list.
chat_history = list()
query = "What did the president say about Ketanji Brown Jackson"
result = qa(dict(question=query, chat_history=chat_history))

In [None]:
# Display the answer text returned for the first question (explicit-history chain).
result["answer"]

In [None]:
# Record the previous exchange as a (question, answer) pair and pass it along
# with the follow-up so the chain can resolve "he"/"she".
chat_history = [(query, result["answer"])]
# Question spelled correctly ("succeeded") so the text sent to the
# LLM/retriever is not degraded by a typo.
query = "Did he mention who she succeeded"
result = qa({"question": query, "chat_history": chat_history})

In [9]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine, text
import json
import os

import tiktoken
from openai.embeddings_utils import get_embedding
from sklearn.cluster import AgglomerativeClustering

import openai
from dotenv import load_dotenv
# Load environment variables via python-dotenv before OPENAI_API_KEY is read below.
load_dotenv()
# Read the OpenAI key from the environment and report only whether it is
# present (the key itself is never printed).
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
print(
    "OPENAI_API_KEY is ready"
    if OPENAI_API_KEY is not None
    else "OPENAI_API_KEY environment variable not found"
)

# Collect the ASINs of interest from the `asin` column of the ASIN list CSV.
# NOTE(review): this is an absolute, machine-specific path; the commented-out
# relative path below is the portable alternative.
asin_list_path = '/Users/vladbordei/Documents/Development/ProductExplorer/data/external/asin_list.csv'
#asin_list_path = './data/external/asin_list.csv'
asin_list = pd.read_csv(asin_list_path).loc[:, 'asin'].tolist()

OPENAI_API_KEY is ready


In [10]:
# Load the exported reviews, then keep only rows whose ASIN is in `asin_list`.
# NOTE(review): absolute, machine-specific path.
reviews = pd.read_csv(
    '/Users/vladbordei/Documents/Development/ProductExplorer/data/processed/reviews_export.csv'
)
reviews = reviews.loc[reviews['asin'].isin(asin_list)]

In [11]:
# Inspect the available columns of the filtered reviews frame.
reviews.columns

Index(['id', 'rating', 'review_summary', 'product_facts', 'positive_sentiment',
       'negative_sentiment', 'improvements_expected', 'issues_identified',
       'how_product_is_used', 'media', 'where_product_is_used', 'sentiment',
       'anger', 'anger_reason', 'delight', 'delight_reason', 'disappointment',
       'disappointment_reason', 'time', 'season', 'weather',
       'user_description', 'title', 'review', 'asin_variant', 'asin'],
      dtype='object')

In [12]:
# Keep only the columns needed downstream. `.copy()` makes `df` an independent
# frame, so adding columns to it later (e.g. df['embedding'] = ...) does not
# write into a slice of `reviews` and trigger pandas' SettingWithCopyWarning.
df = reviews[['asin', 'review', 'review_summary','rating','title']].copy()

In [None]:
# Encode the sentences and store the embeddings in the 'embedding' column.
# NOTE(review): `df` is built from the columns asin/review/review_summary/rating/title,
# so 'data_label' does not exist and this lookup will raise KeyError — confirm the
# intended text column. `embedder` is also not defined anywhere in the visible
# notebook (NameError on a fresh kernel) — TODO define it before this cell.
df['embedding'] = df['data_label'].apply(lambda x: embedder.encode(x))

# Normalize the embeddings to unit length (each vector divided by its own norm)
df['embedding'] = df['embedding'].apply(lambda x: x / np.linalg.norm(x))
df["embedding"] = df["embedding"].apply(np.array)  # re-wrap each value as a numpy array (values come from the division above, not from strings)
# Stack the per-row vectors into one 2-D matrix, one row per review.
matrix = np.vstack(df.embedding.values)

# Fit clusters: no fixed cluster count (n_clusters=None); the distance
# threshold of 1 decides where merging stops.
clustering = AgglomerativeClustering(n_clusters=None, distance_threshold=1)
cluster_model = clustering.fit(matrix)

# Cluster label assigned to each row of `matrix`.
cluster_assignment = cluster_model.labels_



In [6]:
from langchain.vectorstores import Chroma
from langchain.text_splitter import CharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import ConversationalRetrievalChain

In [4]:
from langchain.embeddings import HuggingFaceEmbeddings

# Hugging Face sentence-transformers embedding model, run on CPU, with
# embedding normalization switched off.
# Note: this rebinds `embeddings`, replacing any earlier embedding object.
model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = dict(device='cpu')
encode_kwargs = dict(normalize_embeddings=False)
embeddings = HuggingFaceEmbeddings(
    encode_kwargs=encode_kwargs,
    model_kwargs=model_kwargs,
    model_name=model_name,
)

In [14]:
from langchain.document_loaders import DataFrameLoader
# Turn each row of `df` into a document whose page content is the `review` column.
# presumably the remaining columns are carried as document metadata — verify against DataFrameLoader docs
loader = DataFrameLoader(df, page_content_column="review")
documents = loader.load()

In [24]:
# Reference: https://python.langchain.com/en/latest/modules/indexes/vectorstores/examples/chroma.html
# Embed the review documents into a Chroma store configured with an on-disk
# persistence directory.
# NOTE(review): absolute, machine-specific path.
persist_directory = '/Users/vladbordei/Documents/Development/ProductExplorer/data/vectorstores/chroma/db'
vectorstore = Chroma.from_documents(
    persist_directory=persist_directory,
    embedding=embeddings,
    documents=documents,
)
vectorstore

<langchain.vectorstores.chroma.Chroma at 0x285a7f880>

How to save the vector store to disk and release the in-memory database:

# Write the store to `persist_directory`, then drop the reference to the in-memory object.
vectorstore.persist()
vectorstore = None

Now we can load the persisted database from disk, and use it as normal. 

# Re-open the persisted store from disk, using the same embedding object for queries.
vectorstore = Chroma(persist_directory=persist_directory, embedding_function=embeddings)