In [2]:
import os
import requests
from dotenv import load_dotenv
from langchain.document_loaders import ConfluenceLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings, GooglePalmEmbeddings
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import GooglePalm
from langchain.chains import ConversationalRetrievalChain

# Load variables from a local .env file into the process environment so the
# os.environ[...] lookups in the following cells can find the credentials.
load_dotenv()

True

In [6]:
# Confluence site root (trailing slash is relied on when the loader URL is
# built later) and the account email, which is read from the environment.
# A missing EMAIL variable raises KeyError here.
confluence_url = "https://mikesofts.atlassian.net/"
email = os.environ["EMAIL"]


In [7]:
# Secrets are taken from the environment; a missing variable raises KeyError
# in this cell instead of surfacing later as an authentication failure.
CONFLUENCE_API_TOKEN = os.environ["CONFLUENCE_API_TOKEN"]
GOOGLE_PALM_API_KEY = os.environ["GOOGLE_PALM_API_KEY"]

# Root of the Confluence REST API for this site.
base_url = "https://mikesofts.atlassian.net/wiki/rest/api"

# HTTP session carrying basic-auth credentials (account email + API token).
session = requests.Session()
session.auth = (email, CONFLUENCE_API_TOKEN)

In [8]:
## get documents from confluence
# The loader is pointed at the site's /wiki path and authenticates with the
# account email plus API token.
loader = ConfluenceLoader(
    url=f"{confluence_url}wiki",
    username=email,
    api_key=CONFLUENCE_API_TOKEN,
)

# Load the pages of a single space, passing limit=50 to the loader.
documents = loader.load(
    space_key="~614914d4071141006ab46038",
    limit=50,
)

In [9]:
# Read the PaLM key once and share it between the LLM and the PaLM embeddings.
palm_api_key = os.environ["GOOGLE_PALM_API_KEY"]

# Chat/completion model, configured with temperature 0.1.
llm = GooglePalm(google_api_key=palm_api_key, temperature=0.1)

# Two embedding back-ends are instantiated: PaLM and a HuggingFace
# sentence-transformer model.
palm_embeddings = GooglePalmEmbeddings(google_api_key=palm_api_key)
hf_embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

In [10]:
## split documents
# RecursiveCharacterTextSplitter tries the separators in the order given and
# uses the first one found in the text, so they must run from the coarsest
# boundary to the finest. With " " listed first, every text containing a space
# was cut at word boundaries and the newline / sentence separators were
# effectively never used. The trailing "" is a last-resort character split so
# a piece with no separator at all cannot exceed chunk_size.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
    length_function=len,
    add_start_index=True,
    separators=["\n\n", "\n", ".", ",", " ", ""],
)
texts = text_splitter.split_documents(documents)

persist_directory = "chroma_db"

# Reuse the on-disk vector store when the directory is already present;
# otherwise embed the chunks and create it. isdir() is False for a missing
# path, so a separate exists() check is not needed.
# NOTE(review): an existing directory is reused as-is even if the Confluence
# content has changed since it was built — delete it to force a rebuild.
if os.path.isdir(persist_directory):
    print(f"The folder '{persist_directory}' exists in the current working directory. \n Therefore embeddings were loaded from disk")
    cdb = Chroma(embedding_function=hf_embeddings, persist_directory=persist_directory)
else:
    print(f"The folder '{persist_directory}' does not exist in the current working directory. \n Therefore new embeddings will be created")
    # NOTE(review): with older chromadb releases an explicit cdb.persist()
    # may be required to flush to disk — confirm against the installed version.
    cdb = Chroma.from_documents(texts, embedding=hf_embeddings, persist_directory=persist_directory)


The folder 'chroma_db' does not exist in the current working directory. 
 Therefore new embeddings will be created


In [11]:
# Conversational retrieval chain: the LLM answers using chunks fetched from
# the Chroma store, and the source documents are returned with each answer.
qa_chain = ConversationalRetrievalChain.from_llm(
    llm,
    retriever=cdb.as_retriever(),
    verbose=False,
    return_source_documents=True,
)

# Running list of (question, answer) tuples passed back in on each call.
chat_history = []

In [13]:
# Ask one question, show the answer, and record the turn in the history so
# follow-up questions can refer back to it.
query = "what are some barriers to becoming proficient in data science"
result = qa_chain({"question": query, "chat_history": chat_history})
answer = result["answer"]
print("Answer: " + answer)
chat_history.append((query, answer))

Answer: Data science is a complex field that requires a wide range of skills, including statistics, programming, and data visualization. As such, it can be difficult for people to become proficient in data science without the proper training and experience. Some of the barriers to becoming proficient in data science include:

* **Lack of educational resources.** There are a limited number of educational resources available for people who want to learn data science. This can make it difficult for people to find the training they need to become proficient in the field.
* **High cost of education.** The cost of education can be a barrier to entry for many people who want to learn data science. This is especially true for people who are not able to afford to attend a traditional university or college.
* **Lack of experience.** Data science is a field that is constantly evolving, and it can be difficult for people to keep up with the latest trends and technologies. This can make it difficul