In [14]:
import os
import requests
from dotenv import load_dotenv
from langchain.document_loaders import ConfluenceLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings, GooglePalmEmbeddings
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import GooglePalm
from langchain.chains import ConversationalRetrievalChain

# Load environment variables via python-dotenv before any cell reads os.environ
# (the configuration cell looks up EMAIL, CONFLUENCE_API_TOKEN and GOOGLE_PALM_API_KEY).
# NOTE(review): presumably picks up a `.env` file next to the notebook - confirm location.
load_dotenv()

True

In [9]:
# --- Confluence / PaLM configuration ---
# Site root. The trailing slash matters: paths are appended to it directly.
confluence_url = "https://mikesofts.atlassian.net/"

# Credentials are taken from the environment; a missing variable raises KeyError here.
email = os.environ["EMAIL"]
CONFLUENCE_API_TOKEN = os.environ["CONFLUENCE_API_TOKEN"]
GOOGLE_PALM_API_KEY = os.environ["GOOGLE_PALM_API_KEY"]

# Root of the Confluence REST API, derived from the site root so the host is written once.
base_url = f"{confluence_url}wiki/rest/api"

# Shared HTTP session that sends the (email, API token) pair with every request.
session = requests.Session()
session.auth = (email, CONFLUENCE_API_TOKEN)

In [10]:
## Pull pages from Confluence
# The loader targets the wiki root of the site set in the configuration cell
# and authenticates with the same email / API token pair.
loader = ConfluenceLoader(
    url=confluence_url + "wiki",
    username=email,
    api_key=CONFLUENCE_API_TOKEN,
)

# Load the pages of the space identified by this key; `limit` is passed through to the loader.
documents = loader.load(
    space_key="~614914d4071141006ab46038",
    limit=50,
)

In [15]:
# The chat model and the embedding model use the same PaLM API key; read it once.
palm_api_key = os.environ["GOOGLE_PALM_API_KEY"]

llm = GooglePalm(google_api_key=palm_api_key, temperature=0.1)
embeddings = GooglePalmEmbeddings(google_api_key=palm_api_key)


In [32]:
## Split the loaded pages into overlapping chunks
# Chunks are measured with len() (characters): up to 1000 per chunk, 100 shared
# between neighbours, and each chunk records its start index in its metadata.
# NOTE(review): if the splitter tries separators in list order (as its docs
# describe), putting " " first means text is cut at word boundaries before
# newlines or sentence ends are considered - confirm this order is intended.
text_splitter = RecursiveCharacterTextSplitter(
    separators=[" ", "\n", ".", ","],
    chunk_size=1000,
    chunk_overlap=100,
    length_function=len,
    add_start_index=True,
)
texts = text_splitter.split_documents(documents)

# Get embeddings.
# embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Folder (relative to the working directory) where the Chroma index is stored.
# `os` is already imported in the first cell, so it is not re-imported here.
persist_directory = "chroma_db"

# Reuse the on-disk index when the folder is present; otherwise embed the chunks
# and build a new one. os.path.isdir() is False for a missing path, so a separate
# exists() check is unnecessary.
if os.path.isdir(persist_directory):
    print(f"The folder '{persist_directory}' exists in the current working directory. \n Therefore embeddings were loaded from disk")
    cdb = Chroma(embedding_function=embeddings, persist_directory=persist_directory)
else:
    print(f"The folder '{persist_directory}' does not exist in the current working directory. \n Therefore new embeddings will be created")
    cdb = Chroma.from_documents(texts, embedding=embeddings, persist_directory=persist_directory)
    # Flush the freshly built index to disk right away. In a notebook the store
    # object may never be finalised, which can leave the folder on disk without
    # the data, so the "load from disk" branch would later return no documents.
    cdb.persist()


The folder 'chroma_db' does not exist in the current working directory. 
 Therefore new embeddings will be created


In [43]:
# Conversational retrieval chain: the PaLM model answers over the Chroma retriever,
# and each result also carries the source documents that were retrieved.
qa_chain = ConversationalRetrievalChain.from_llm(
    llm,
    retriever=cdb.as_retriever(),
    verbose=False,
    return_source_documents=True,
)

# Running list of (question, answer) turns passed back to the chain on every call.
chat_history = []

In [49]:
query = "what are some barriers to becoming proficient in data science"

# Ask the chain, handing it the turns collected so far.
result = qa_chain({"question": query, "chat_history": chat_history})
print("Answer: " + result["answer"])

# Record this turn so later questions are answered with it as context.
# Note: re-running this cell appends the same turn again.
chat_history.append((query, result["answer"]))

Answer: The barriers to becoming proficient in data science include:

* **Lack of technical skills.** Data science requires a strong foundation in mathematics, statistics, programming, and data visualization. Many people who are interested in data science do not have the necessary technical skills to succeed in the field.
* **Lack of domain knowledge.** Data science is often applied to real-world problems, which means that data scientists need to have a strong understanding of the domain they are working in. This can be a challenge for people who are new to the field.
* **Lack of experience.** Data science is a relatively new field, and there are not many experienced data scientists available. This can make it difficult for people who are new to the field to find jobs.
* **Lack of access to data.** Data science requires access to large datasets in order to train models and make predictions. This can be a challenge for people who do not have access to large datasets.
* **Lack of support

In [45]:
# Call the chain with the current query and history; the full result dict
# (question, chat_history, answer, source_documents) is the cell output.
qa_chain(dict(question=query, chat_history=chat_history))

{'question': 'what are some concerns of AI',
 'chat_history': [('what are some concerns of AI',
   'the goals of the field: to build machines that can solve problems using intelligence.')],
 'answer': 'Existential risk, weaponization, technological unemployment, and regulation.',
 'source_documents': [Document(page_content='of the human race ". [268] According to the philosopher Nick Bostrom , for almost any goals that a sufficiently intelligent AI may have, it is instrumentally incentivized to protect itself from being shut down and to acquire more resources, as intermediary steps to better achieve these goals. Sentience or emotions are then not required for an advanced AI to be dangerous. In order to be safe for humanity, a superintelligence would have to be genuinely aligned with humanity\'s morality and values so that it is "fundamentally on our side". [269] The political scientist Charles T. Rubin argued that "any sufficiently advanced benevolence may be indistinguishable from mal

In [9]:

# Retriever over the Chroma store, configured with k=5.
retriever = cdb.as_retriever(search_kwargs={"k": 5})

query = "What is machine learning?"

# Fetch the documents the retriever considers relevant to the query
# and show them as the cell output.
docs = retriever.get_relevant_documents(query)
docs

[]

In [11]:
# Source URL of the first retrieved document. Retrieval can return an empty
# list, so guard the index: the cell then shows nothing instead of raising IndexError.
docs[0].metadata["source"] if docs else None

'https://mikesofts.atlassian.net/wiki/spaces/~614914d4071141006ab46038/pages/917667/Etymology'

In [8]:
# Join the text of every retrieved document with newlines and print the result.
results = "\n".join(doc.page_content for doc in docs)
print(results)

of data science, and it is considered by some to
Data science is an interdisciplinary academic
Data science is an interdisciplinary field [10]
[23] The term "data science" has been traced back
, data science often involves tasks such as data


In [2]:
# Get the list of all pages
def get_all_pages(limit=50):
    """Fetch every content item from the Confluence REST API, following pagination.

    Relies on the module-level ``session`` (a ``requests.Session`` carrying the
    credentials) and ``base_url`` defined in the configuration cell.

    Args:
        limit: Page size requested per call. The server may return fewer items
            per page than requested, so this value is not used to decide when
            to stop.

    Returns:
        list: The ``results`` entries of every page fetched, in order. If a
        request returns a non-200 status, the status is printed and the items
        collected so far are returned.
    """
    pages = []
    start = 0
    url = f"{base_url}/content"  # same endpoint for every page
    while True:
        params = {
            "start": start,
            "limit": limit,
            "expand": "version,body.view",
        }
        # Bound each request so a stalled connection cannot hang the notebook.
        response = session.get(url, params=params, timeout=30)

        if response.status_code != 200:
            print(f"Error: {response.status_code}")
            break

        data = response.json()
        batch = data.get("results", [])
        pages.extend(batch)

        # Decide whether another page exists. Comparing the batch size with the
        # *requested* limit is wrong when the server caps the page size (it
        # would stop after the first page), so prefer the "next" link and fall
        # back to the page size the server reports having applied.
        links = data.get("_links")
        if links is not None:
            has_more = "next" in links
        else:
            applied_limit = data.get("limit") or limit
            has_more = len(batch) >= applied_limit

        # An empty batch always ends the loop, so pagination cannot spin forever.
        if not batch or not has_more:
            break

        # Advance by what was actually returned, not by what was requested.
        start += len(batch)

    return pages

# Example usage: fetch everything, report the count, then list each page's title and ID.
if __name__ == "__main__":
    all_pages = get_all_pages()
    print(f"Total pages: {len(all_pages)}")

    for page in all_pages:
        title, page_id = page["title"], page["id"]
        print(f"Title: {title}, ID: {page_id}")



Total pages: 26
Title: mamba setup, ID: 33113
Title: Knowledge base, ID: 65638
Title: Template - How-to guide, ID: 65650
Title: Template - Troubleshooting article, ID: 65664
Title: Getting started in Confluence, ID: 98394
Title: Overview, ID: 98395
Title: Overview, ID: 98634
Title: Software Development, ID: 262245
Title: Template - Product requirements, ID: 262284
Title: Template - Meeting notes, ID: 262298
Title: Template - Decision documentation, ID: 262312
Title: Get the most out of your software project space, ID: 262326
Title: baby mamba, ID: 262359
Title: Data Science, ID: 884737
Title: Artificial intelligence, ID: 884784
Title: In fiction, ID: 885246
Title: Etymology, ID: 917667
Title: Ethics, ID: 918001
Title: Philosophy, ID: 918153
Title: Future, ID: 918165
Title: Foundations, ID: 983041
Title: Data Science and Data Analysis, ID: 983053
Title: Goals, ID: 1048724
Title: Tools, ID: 1048904
Title: Applications, ID: 1048916
Title: History, ID: 1049028
