In [1]:
import os
import requests
from dotenv import load_dotenv
from langchain.document_loaders import ConfluenceLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings, GooglePalmEmbeddings
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import GooglePalm
from langchain.chains import ConversationalRetrievalChain

# Pull settings from a .env file into the process environment; later cells read
# EMAIL, CONFLUENCE_API_TOKEN and GOOGLE_PALM_API_KEY from os.environ.
load_dotenv()

True

In [2]:
# Confluence site root (note the trailing slash -- later cells append "wiki"
# directly to it) and the account email, taken from the EMAIL environment
# variable. The API token itself is read in a later cell.
confluence_url = "https://mikesofts.atlassian.net/"
email = os.environ["EMAIL"]


In [3]:
# API credentials are read from the environment; a missing variable raises
# KeyError right here instead of surfacing later inside a request.
CONFLUENCE_API_TOKEN = os.environ["CONFLUENCE_API_TOKEN"]
GOOGLE_PALM_API_KEY = os.environ["GOOGLE_PALM_API_KEY"]

# Confluence REST API root, derived from confluence_url so the host name is
# spelled out in one place only (confluence_url already ends with "/").
base_url = f"{confluence_url}wiki/rest/api"

# HTTP session that sends (email, API token) as its auth credentials.
# NOTE(review): neither base_url nor session is referenced by the visible
# cells below -- confirm they are still needed.
session = requests.Session()
session.auth = (email, CONFLUENCE_API_TOKEN)

In [4]:
## Fetch pages from Confluence
# The loader authenticates with the account email plus API token against the
# site's "/wiki" endpoint.
loader = ConfluenceLoader(
    url=f"{confluence_url}wiki",
    username=email,
    api_key=CONFLUENCE_API_TOKEN,
)

# Load the pages of a single space.
# NOTE(review): confirm whether limit=50 caps the total number of pages or
# only the per-request batch size for the installed LangChain version.
documents = loader.load(
    space_key="~614914d4071141006ab46038",
    limit=50,
)

In [5]:
# Model handles used by the cells below. The PaLM key was already read into
# GOOGLE_PALM_API_KEY, so reuse it rather than hitting os.environ again.
llm = GooglePalm(google_api_key=GOOGLE_PALM_API_KEY, temperature=0.1)

# Two embedding back-ends are created; the vector store below is built with
# hf_embeddings. NOTE(review): palm_embeddings is not referenced by the
# visible cells -- confirm it is still wanted.
palm_embeddings = GooglePalmEmbeddings(google_api_key=GOOGLE_PALM_API_KEY)
hf_embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

In [7]:
## Split documents into chunks
# Chunks are at most 1000 characters (measured with len) with a 100-character
# overlap; add_start_index=True asks the splitter to record each chunk's
# start offset.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
    length_function=len,
    add_start_index=True,
    # Tried in order, coarsest first. The third entry is a regex look-behind
    # for ". " and is written as a raw string so that "\." is not an invalid
    # escape sequence (the runtime value is unchanged).
    # NOTE(review): newer LangChain releases treat separators as literal text
    # unless is_separator_regex=True is passed -- confirm against the
    # installed version that this entry actually splits on sentence ends.
    separators=["\n\n", "\n", r"(?<=\. )", " ", ""],
)
texts = text_splitter.split_documents(documents)

# Folder (relative to the current working directory) that holds the Chroma store.
persist_directory = "chroma_db"

# Build the store from `texts` only when the folder is missing; otherwise open
# what is already on disk. Note that in the reuse branch `texts` is ignored,
# so freshly loaded Confluence content is not indexed until the folder is
# removed.
if not os.path.isdir(persist_directory):
    print(f"The folder '{persist_directory}' does not exist in the current working directory. \n Therefore new embeddings will be created")
    cdb = Chroma.from_documents(
        texts,
        embedding=hf_embeddings,
        persist_directory=persist_directory,
    )
else:
    print(f"The folder '{persist_directory}' exists in the current working directory. \n Therefore embeddings were loaded from disk")
    cdb = Chroma(
        embedding_function=hf_embeddings,
        persist_directory=persist_directory,
    )


The folder 'chroma_db' does not exist in the current working directory. 
 Therefore new embeddings will be created


In [10]:
# IPython-only help lookup: displays the signature and docstring of
# cdb.as_retriever. Development aid -- not valid plain Python, so drop it
# before exporting the notebook as a script.
cdb.as_retriever?

[0;31mSignature:[0m [0mcdb[0m[0;34m.[0m[0mas_retriever[0m[0;34m([0m[0;34m**[0m[0mkwargs[0m[0;34m:[0m [0;34m'Any'[0m[0;34m)[0m [0;34m->[0m [0;34m'VectorStoreRetriever'[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Return VectorStoreRetriever initialized from this VectorStore.

Args:
    search_type (Optional[str]): Defines the type of search that
        the Retriever should perform.
        Can be "similarity" (default), "mmr", or
        "similarity_score_threshold".
    search_kwargs (Optional[Dict]): Keyword arguments to pass to the
        search function. Can include things like:
            k: Amount of documents to return (Default: 4)
            score_threshold: Minimum relevance threshold
                for similarity_score_threshold
            fetch_k: Amount of documents to pass to MMR algorithm (Default: 20)
            lambda_mult: Diversity of results returned by MMR;
                1 for minimum diversity and 0 for maximum. (Default: 0.5)


In [18]:
# Conversational QA chain: the PaLM LLM answers questions using chunks fetched
# by the Chroma retriever (default retriever settings).
qa_chain = ConversationalRetrievalChain.from_llm(
    llm,
    retriever=cdb.as_retriever(),
    verbose=False,
    return_source_documents=True,
)

# Running list of (question, answer) tuples passed back in on every call.
chat_history = []

In [19]:
# Ask one question and print the answer. The (question, answer) pair is then
# appended to chat_history, which is passed back in on the next call; note
# that re-running this cell appends the same exchange again.
query = "what are some barriers to becoming proficient in data science"
result = qa_chain({"question": query, "chat_history": chat_history})
answer = result["answer"]
# Interpolate rather than concatenate so a non-string answer cannot raise TypeError.
print(f"Answer: {answer}")
chat_history.append((query, answer))

Answer: lack of domain knowledge
