In [21]:
import os
import requests
from dotenv import load_dotenv
import json
from langchain.document_loaders import ConfluenceLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import chromadb
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
# from langchain.document_transformers import (
#     LongContextReorder,
# )
# from langchain.chains import StuffDocumentsChain, LLMChain
# from langchain.prompts import PromptTemplate
# from langchain.llms import OpenAI
load_dotenv()

True

In [18]:
# Confluence site root. The trailing slash matters: URLs below are built by
# appending path segments directly to this string.
confluence_url = "https://mikesofts.atlassian.net/"

# Credentials are read from the process environment; a missing variable
# raises KeyError right here rather than failing later during a request.
email = os.environ["EMAIL"]
api_token = os.environ["API_TOKEN"]
# Note: the environment variable is named OPENAI_API_TOKEN, not OPENAI_API_KEY.
OPENAI_API_KEY = os.environ["OPENAI_API_TOKEN"]

# Root of the Confluence REST API, derived from the site root so the host
# name is written in exactly one place.
base_url = f"{confluence_url}wiki/rest/api"

# Shared HTTP session that sends the email/API-token pair as basic auth
# on every request made through it.
session = requests.Session()
session.auth = (email, api_token)

In [24]:
# embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

In [29]:


## get documents from confluence
loader = ConfluenceLoader(
    url=f"{confluence_url}wiki", username=email, api_key=api_token,
)

documents = loader.load(space_key="~614914d4071141006ab46038", limit=30)

## split documents
text_splitter = RecursiveCharacterTextSplitter(
    # Demo-sized chunks: 50 characters yields sentence fragments, so expect
    # truncated snippets in the retrieval output below.
    chunk_size = 50,
    chunk_overlap  = 10,
    length_function = len,
    add_start_index = True,
)
texts = text_splitter.split_documents(documents)

# Get embeddings.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Build a deterministic id for every chunk: "<confluence page id>-<n>", where n
# is the chunk's ordinal within its page. Without explicit ids the store
# generates fresh random ones on every run, and because the collection is
# persisted in ./chroma_db, re-running this cell appended the same chunks again
# (the retriever then returned each hit twice). With stable ids a re-run
# overwrites the existing entries instead of adding new ones.
# NOTE: entries written by earlier runs under random ids are not removed by
# this; delete ./chroma_db once to start from a clean collection.
chunk_ids = []
chunks_seen_per_page = {}
for chunk in texts:
    page_id = chunk.metadata["id"]
    ordinal = chunks_seen_per_page.get(page_id, 0)
    chunks_seen_per_page[page_id] = ordinal + 1
    chunk_ids.append(f"{page_id}-{ordinal}")

# Create a retriever
cdb = Chroma.from_documents(
    texts,
    embedding=embeddings,
    ids=chunk_ids,
    persist_directory="./chroma_db",
)
retriever = cdb.as_retriever(
    search_kwargs={"k": 5}
)
query = "What is machine learning?"

# Get relevant documents ordered by relevance score
docs = retriever.get_relevant_documents(query)
docs


[Document(page_content='Machine learning is the study of programs that', metadata={'id': '1048724', 'source': 'https://mikesofts.atlassian.net/wiki/spaces/~614914d4071141006ab46038/pages/1048724/Goals', 'start_index': 5627, 'title': 'Goals'}),
 Document(page_content='Machine learning is the study of programs that', metadata={'id': '1048724', 'source': 'https://mikesofts.atlassian.net/wiki/spaces/~614914d4071141006ab46038/pages/1048724/Goals', 'start_index': 5627, 'title': 'Goals'}),
 Document(page_content='in machine learning is the study of how to', metadata={'id': '918001', 'source': 'https://mikesofts.atlassian.net/wiki/spaces/~614914d4071141006ab46038/pages/918001/Ethics', 'start_index': 305, 'title': 'Ethics'}),
 Document(page_content='in machine learning is the study of how to', metadata={'id': '918001', 'source': 'https://mikesofts.atlassian.net/wiki/spaces/~614914d4071141006ab46038/pages/918001/Ethics', 'start_index': 305, 'title': 'Ethics'}),
 Document(page_content='[e] There 

{'ids': ['dc1847ce-627a-11ee-812a-00155d0c9b42',
  'dc184878-627a-11ee-812a-00155d0c9b42',
  'dc1848b4-627a-11ee-812a-00155d0c9b42',
  'dc1848dc-627a-11ee-812a-00155d0c9b42',
  'dc184904-627a-11ee-812a-00155d0c9b42',
  'dc18492c-627a-11ee-812a-00155d0c9b42',
  'dc18495e-627a-11ee-812a-00155d0c9b42',
  'dc184986-627a-11ee-812a-00155d0c9b42',
  'dc184a6c-627a-11ee-812a-00155d0c9b42',
  'dc184a94-627a-11ee-812a-00155d0c9b42',
  'dc184ab2-627a-11ee-812a-00155d0c9b42',
  'dc184b52-627a-11ee-812a-00155d0c9b42',
  'dc184bc0-627a-11ee-812a-00155d0c9b42',
  'dc184bf2-627a-11ee-812a-00155d0c9b42',
  'dc184c1a-627a-11ee-812a-00155d0c9b42',
  'dc184c42-627a-11ee-812a-00155d0c9b42',
  'dc184d00-627a-11ee-812a-00155d0c9b42',
  'dc184d1e-627a-11ee-812a-00155d0c9b42',
  'dc184d32-627a-11ee-812a-00155d0c9b42',
  'dc184d50-627a-11ee-812a-00155d0c9b42',
  'dc184d64-627a-11ee-812a-00155d0c9b42',
  'dc184d78-627a-11ee-812a-00155d0c9b42',
  'dc184d8c-627a-11ee-812a-00155d0c9b42',
  'dc184daa-627a-11ee-812a-

In [11]:
# Source URL recorded in the metadata of the first retrieved chunk.
docs[0].metadata["source"]

'https://mikesofts.atlassian.net/wiki/spaces/~614914d4071141006ab46038/pages/917667/Etymology'

In [8]:
# Stitch the text of every retrieved chunk together, one chunk per line,
# and print the combined string.
results = "\n".join(doc.page_content for doc in docs)
print(results)

of data science, and it is considered by some to
Data science is an interdisciplinary academic
Data science is an interdisciplinary field [10]
[23] The term "data science" has been traced back
, data science often involves tasks such as data


In [2]:
# Get the list of all pages
def get_all_pages():
    """Collect every content item returned by the Confluence ``/content`` endpoint.

    Requests ``{base_url}/content`` repeatedly through the module-level
    ``session``, advancing the ``start`` offset until the server returns a
    short or empty batch.

    Returns:
        list: The accumulated entries of each response's ``results`` list,
        in the order received. If a request comes back with a non-200
        status, the status code is printed and the items gathered so far
        are returned (best effort, no exception is raised for that case).
    """
    pages = []
    start = 0
    limit = 50  # You can adjust the limit based on your needs
    while True:
        url = f"{base_url}/content"
        params = {
            "start": start,
            "limit": limit,
            "expand": "version,body.view",
        }
        # A timeout keeps a stalled connection from hanging the notebook forever.
        response = session.get(url, params=params, timeout=30)

        if response.status_code != 200:
            print(f"Error: {response.status_code}")
            break

        data = response.json()
        batch = data.get("results", [])
        pages.extend(batch)

        # The server may apply a smaller page size than the one requested and
        # reports the value it used in "limit". Comparing against the requested
        # limit would mistake a full (but capped) batch for the last one and
        # stop early, so compare against the limit the server actually applied.
        effective_limit = data.get("limit", limit)
        if not batch or len(batch) < effective_limit:
            break

        # Advance by what was actually received, not by what was asked for,
        # so no items are skipped when the page size was capped.
        start += len(batch)

    return pages

# Example usage: fetch everything, report the count, then list each item.
if __name__ == "__main__":
    all_pages = get_all_pages()
    print(f"Total pages: {len(all_pages)}")

    # Pull out the two fields that get printed, one item at a time.
    for title, page_id in ((p['title'], p['id']) for p in all_pages):
        print(f"Title: {title}, ID: {page_id}")



Total pages: 26
Title: mamba setup, ID: 33113
Title: Knowledge base, ID: 65638
Title: Template - How-to guide, ID: 65650
Title: Template - Troubleshooting article, ID: 65664
Title: Getting started in Confluence, ID: 98394
Title: Overview, ID: 98395
Title: Overview, ID: 98634
Title: Software Development, ID: 262245
Title: Template - Product requirements, ID: 262284
Title: Template - Meeting notes, ID: 262298
Title: Template - Decision documentation, ID: 262312
Title: Get the most out of your software project space, ID: 262326
Title: baby mamba, ID: 262359
Title: Data Science, ID: 884737
Title: Artificial intelligence, ID: 884784
Title: In fiction, ID: 885246
Title: Etymology, ID: 917667
Title: Ethics, ID: 918001
Title: Philosophy, ID: 918153
Title: Future, ID: 918165
Title: Foundations, ID: 983041
Title: Data Science and Data Analysis, ID: 983053
Title: Goals, ID: 1048724
Title: Tools, ID: 1048904
Title: Applications, ID: 1048916
Title: History, ID: 1049028
