In [1]:
import os
import openai
import sys

from dotenv import load_dotenv, find_dotenv
# Load variables from the nearest .env file into the process environment.
# This is a bare statement rather than `_ = ...`: binding `_` in the notebook
# namespace stops IPython from updating its last-output shortcut. Nothing is
# echoed because this is not the last expression of the cell.
load_dotenv(find_dotenv())

# Indexing os.environ raises KeyError right here if the key is not configured.
openai.api_key = os.environ['OPENAI_API_KEY']

In [8]:
from langchain.document_loaders import ConfluenceLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# The Confluence token is read from the environment; url and username are literals.
api_key = os.environ['CONFLUENCE_API_KEY']
loader = ConfluenceLoader(
    url="https://url",
    username="user",
    api_key=api_key,
)

# Load the four listed pages, without attachments, and print how many documents came back.
documents = loader.load(
    page_ids=['505618684', '494732378', '446760275', '436012282'],
    include_attachments=False,
    limit=50,
)
print(len(documents))

# Split the documents with chunk_size=1500 and chunk_overlap=150, then print the chunk count.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=150)
splits = text_splitter.split_documents(documents)
print(len(splits))

4
7


In [9]:
# Uncomment and run to delete the persisted Chroma index under ./docs/chroma:
# !rm -rf ./docs/chroma

In [2]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma

# Directory that holds the persisted Chroma index.
persist_directory = 'docs/chroma/'

# Optional one-off setup, kept disabled:
# ! pip install chromadb
# !rm -rf ./docs/chroma  # remove old database files if any

# Embedding object handed to the vector store.
embedding = OpenAIEmbeddings()

In [10]:

# Build the index only when no persisted copy exists. Calling
# Chroma.from_documents again on an already populated persist_directory adds
# the same chunks a second time, so re-running this cell would inflate the
# collection and make similarity_search return duplicate hits.
# To force a rebuild (e.g. after changing the page list), delete the
# directory first and run this cell again.
if os.path.isdir(persist_directory) and os.listdir(persist_directory):
    # Reopen the existing index; no documents are embedded again.
    vectordb = Chroma(
        persist_directory=persist_directory,
        embedding_function=embedding,
    )
else:
    vectordb = Chroma.from_documents(
        documents=splits,
        embedding=embedding,
        persist_directory=persist_directory,
    )
    # Write the freshly built index to disk.
    vectordb.persist()

In [11]:
# Print how many entries the underlying collection holds.
# NOTE(review): `_collection` is an underscore-prefixed (private) attribute of
# the wrapper, so this may stop working after a library upgrade.
print(vectordb._collection.count())

7


In [12]:
question1 = "is there an email i can ask for help"
question2 = "what are airflow's IP addresses?"

# Run both lookups in order (k=3 each), then print each hit count on its own line.
docs1, docs2 = (
    vectordb.similarity_search(text, k=3) for text in (question1, question2)
)
print(len(docs1), len(docs2), sep="\n")

3
3


In [8]:
import datetime

# Before 2 Sep 2023 use the dated "-0301" snapshot; from that day onward use
# the unversioned model name. Once that date has passed, only the second
# value can be selected.
current_date = datetime.date.today()
llm_name = (
    "gpt-3.5-turbo-0301"
    if current_date < datetime.date(2023, 9, 2)
    else "gpt-3.5-turbo"
)
print(llm_name)



gpt-3.5-turbo-0301


In [24]:
# Look up the question in the vector store with k=3 and show how many
# documents were returned (the bare last expression is the cell output).
question = "How to install Airflow"
docs = vectordb.similarity_search(question, k=3)
len(docs)

3

In [35]:
from langchain.chat_models import ChatOpenAI

# Chat model named by `llm_name`, with temperature fixed at 0.
llm = ChatOpenAI(
    model_name=llm_name,
    temperature=0,
)

In [36]:
from langchain.chains import RetrievalQA

# Question-answering chain that takes its retriever from the vector store.
qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=vectordb.as_retriever())

In [37]:
# Run the chain on the current `question` and keep the response for inspection.
result = qa_chain({"query": question})

In [38]:
# Display the whole response; it holds the 'query' and 'result' entries.
result

{'query': 'How to install Airflow',
 'result': 'Airflow can be installed using a customized Helm chart based on the official chart. Here are the steps to install Airflow:\n\n1. SSH into the server where you want to install Airflow.\n2. Clone the customized Helm chart to your local machine using the command: `git clone <helm-chart-repo-url>`\n3. CD into the cloned directory using the command: `cd <helm-chart-directory>`\n4. Create a Kubernetes namespace for Airflow using the command: `kubectl create namespace airflow`\n5. Install Airflow using the Helm chart using the command: `helm install airflow -n airflow .`\n\nAfter the installation is complete, you can access the Airflow UI using the URL provided in the context.'}

In [53]:
from langchain.prompts import PromptTemplate

# Prompt for the QA chain: answering instructions first, then the {context}
# placeholder, then the {question} placeholder, ending on the answer cue.
template = (
    "Use the following pieces of context to answer the question at the end. "
    "If you don't know the answer, just say that you don't know, "
    "don't try to make up an answer. "
    "Use three sentences maximum. "
    "Keep the answer as concise as possible. "
    "Answer in Chinese.\n"
    "{context}\n"
    "Question: {question}\n"
    "Helpful Answer:"
)
QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

In [54]:
# Rebuild the chain so it uses QA_CHAIN_PROMPT and also returns the source documents.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=vectordb.as_retriever(),
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT},
    return_source_documents=True,
)

In [55]:
# The question is written in Chinese; in English: "Which MySQL accounts are there?"
question = "都有哪些 MySql 的账号？"
result = qa_chain({"query": question})

In [56]:
# Show only the answer text from the response.
result['result']

'有root、root_backup、sqlpad、sqlpad_real、feature_store、hive、airflow、adsuser等账号。'