In [1]:
import os
import requests
from dotenv import load_dotenv
from langchain.document_loaders import ConfluenceLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings, GooglePalmEmbeddings
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import GooglePalm
from langchain.chains import ConversationalRetrievalChain, RetrievalQA, LLMChain
import openai
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI



load_dotenv()

True

In [2]:
# Set your Confluence URL, email, and API token
confluence_url = "https://mikesofts.atlassian.net/"
email = os.environ['EMAIL']

CONFLUENCE_API_TOKEN = os.environ['CONFLUENCE_API_TOKEN']
GOOGLE_PALM_API_KEY = os.environ['GOOGLE_PALM_API_KEY']
OPENAI_API_KEY = os.environ['OPENAI_API_KEY']
# Define your Confluence API URL
base_url = "https://mikesofts.atlassian.net/wiki/rest/api"

# Create a session for authentication
session = requests.Session()
session.auth = (email, CONFLUENCE_API_TOKEN)

## get documents from confluence
loader = ConfluenceLoader(
    url=f"{confluence_url}wiki", username=email, api_key=CONFLUENCE_API_TOKEN, 
)

documents = loader.load(space_key="~614914d4071141006ab46038", limit=50)
len(documents)


14

In [35]:
documents

[Document(page_content='', metadata={'title': 'Overview', 'id': '98395', 'source': 'https://mikesofts.atlassian.net/wiki/spaces/~614914d4071141006ab46038/overview'}),
 Document(page_content='Data science is an interdisciplinary academic field [1] that uses statistics , scientific computing , scientific methods , processes, algorithms and systems to extract or extrapolate knowledge and insights from noisy, structured, and unstructured data . [2] Data science also integrates domain knowledge from the underlying application domain (e.g., natural sciences, information technology, and medicine). [3] Data science is multifaceted and can be described as a science, a research paradigm, a research method, a discipline, a workflow, and a profession. [4] Data science is a "concept to unify statistics , data analysis , informatics , and their related methods " to "understand and analyze actual phenomena " with data . [5] It uses techniques and theories drawn from many fields within the context of 

In [3]:
palm_embeddings = GooglePalmEmbeddings(google_api_key=os.environ["GOOGLE_PALM_API_KEY"])
hf_embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

llm = GooglePalm(google_api_key=os.environ["GOOGLE_PALM_API_KEY"], temperature = 0.0)

# instantiate text splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1000,
    chunk_overlap  = 100,
    length_function = len,
    add_start_index = True,
    separators=["\n\n", "\n", "(?<=\. )", " ", ""]
)

In [7]:
s1="I love women"
s2="I love ladies"
s3="what exactly is a car made of?"

e1=palm_embeddings.embed_query(s1)
e2=palm_embeddings.embed_query(s2)
e3=palm_embeddings.embed_query(s3)
import numpy as np


In [9]:
print(np.dot(e1, e2))
print(np.dot(e1, e3))
print(np.dot(e3, e2))

0.8805366720986096
0.37427416328205193
0.4397473493605388


In [32]:
toy_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 100,
    chunk_overlap  = 10,
    length_function = len,
    add_start_index = True,
    separators=["\n\n", "\n", "(?<=\. )", " ", ""]
)
text = """
Kaggle is excited to announce the launch of our Cohort 3 of the KaggleX BIPOC Mentorship Program \
    (formerly known as the BIPOC Grant Program). The goal of this program is to increase representation, \
        create career opportunities, and develop individual growth for BIPOC (Black, Indigenous, People of Color) \
    people in the data science industry. 
This will be achieved through pairing early-career Kagglers with advanced and senior-level mentors, \
    curating a space for career-related discussion and learning opportunities.

What is Kaggle?
Kaggle (rhymes with “Gaggle”) is the world’s largest data science and machine learning community. 
More than ten million registered users visit Kaggle to learn, \
    find data, compete, and collaborate on the cutting edge of machine learning. 
    Kaggle's mission is to help the world learn from data.
"""
text_from_text = toy_splitter.split_text(text)
vector_db = Chroma.from_texts(text_from_text, embedding=palm_embeddings)

vector_db._collection.count()

11

In [90]:
# Simple retriever
question="what is kagglex"
docs = vector_db.similarity_search(question, k=3)
print(docs)

# Contextual compression
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

compressor = LLMChainExtractor.from_llm(llm)
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=vector_db.as_retriever()
    )
compressed_docs = compression_retriever.get_relevant_documents(question)
print(compressed_docs)

[Document(page_content='What is Kaggle?', metadata={}), Document(page_content='Kaggle (rhymes with “Gaggle”) is the world’s largest data science and machine learning community.', metadata={}), Document(page_content="Kaggle's mission is to help the world learn from data.", metadata={})]




[Document(page_content='Kaggle', metadata={}), Document(page_content='Kaggle (rhymes with “Gaggle”) is the world’s largest data science and machine learning community.', metadata={}), Document(page_content="Kaggle's mission is to help the world learn from data.", metadata={}), Document(page_content='Kaggle', metadata={})]


In [130]:
from langchain.prompts import PromptTemplate

# Build prompt
template = """
System: You are a chatbot with knowledge about Data Science and Artificial Intelligence.

Assistant: Use the following pieces of context to answer the question from the user. 
If you don't know the answer, just say that you don't know, don't try to make up an answer. 
Keep the answer as concise as possible. 
Context: {context}
Question: {question}
Helpful Answer:"""
# template = """
# User: 'What do you know?'

# In your response, please provide an overview of your capabilities and how you can assist the user effectively. When answering the question at the end, use the provided context to ensure your responses are concise and context-specific. If you're uncertain, please let the user know you don't have enough information to answer accurately.

# Context: {context}
# Question: {question}
# Your Response: (Provide a concise and context-specific answer)
# """
QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

qaChain = RetrievalQA.from_chain_type(
    llm,
    retriever=vector_db.as_retriever(search_kwargs={'k':3}),
    return_source_documents=True,
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT}
)
question="Who are you?"
result = qaChain({'query': question})
result

{'query': 'Who are you?',
 'result': 'I am a chatbot that helps people learn about Data Science and Artificial Intelligence.',
 'source_documents': [Document(page_content='What is Kaggle?', metadata={}),
  Document(page_content='for BIPOC (Black, Indigenous, People of Color)     people in the data science industry.', metadata={}),
  Document(page_content='Kaggle (rhymes with “Gaggle”) is the world’s largest data science and machine learning community.', metadata={})]}

In [51]:
PromptTemplate.from_template?

[0;31mSignature:[0m
[0mPromptTemplate[0m[0;34m.[0m[0mfrom_template[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mtemplate[0m[0;34m:[0m [0;34m'str'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0;34m*[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtemplate_format[0m[0;34m:[0m [0;34m'str'[0m [0;34m=[0m [0;34m'f-string'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mpartial_variables[0m[0;34m:[0m [0;34m'Optional[Dict[str, Any]]'[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0;34m**[0m[0mkwargs[0m[0;34m:[0m [0;34m'Any'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m [0;34m->[0m [0;34m'PromptTemplate'[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Load a prompt template from a template.

Args:
    template: The template to load.
    template_format: The format of the template. Use `jinja2` for jinja2,
                     and `f-string` or None for f-strings.
    partial_variables: A dictionary of variables that can be used to p

In [4]:
## split documents
texts = text_splitter.split_documents(documents)

persist_directory = "chroma_db"

# Check if the folder exists in the current working directory
if os.path.exists(persist_directory) and os.path.isdir(persist_directory):
    print(f"The folder '{persist_directory}' exists in the current working directory. \n Therefore embeddings were loaded from disk")
    cdb = Chroma(embedding_function=palm_embeddings, persist_directory=persist_directory)
else:
    print(f"The folder '{persist_directory}' does not exist in the current working directory. \n Therefore new embeddings will be created")
    cdb = Chroma.from_documents(texts, embedding=palm_embeddings, persist_directory=persist_directory)

The folder 'chroma_db' exists in the current working directory. 
 Therefore embeddings were loaded from disk


In [6]:
from langchain.prompts import PromptTemplate, ChatPromptTemplate

prompt = ChatPromptTemplate.from_template(
    "Write 5 sentences that describe the following word {product}?"
)
chain = LLMChain(llm=llm, prompt=prompt)
product = "stoic"
result = chain.run(product)
print(result)

1. Stoic people are emotionally detached and unflappable.
2. They are able to endure pain or hardship without showing emotion.
3. They are not easily upset or disturbed.
4. They are calm and composed under pressure.
5. They are able to maintain a positive attitude even in difficult times.


In [124]:

# Build prompt
template = """Use the following pieces of context to answer the question at the end.\
If you don't know the answer given the context, just say that you don't know, don't try to make up an answer.\
Use three sentences maximum. Keep the answer as concise as possible. \
Always say "thanks for asking!" at the end of the answer. 
context: {context}
Question: {question}
Helpful Answer:"""
QA_CHAIN_PROMPT = ChatPromptTemplate.from_template(template)

qac = RetrievalQA.from_chain_type(llm, retriever=cdb.as_retriever(), chain_type_kwargs={"prompt": QA_CHAIN_PROMPT})
question = "what is ml?"
result = qac({"query": question})
result["result"]

NameError: name 'ChatPromptTemplate' is not defined

In [44]:
condence_prompt_template = """Given the following conversation and a follow up question,
 rephrase the follow up question to be a standalone question, in its original language.

Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question:"""
confluence_question_prompt = PromptTemplate.from_template(condence_prompt_template)
standalone_prompt_template ="""
System: You are a chatbot with knowledge about Data Science and Artificial Intelligence.

Assistant: Use the following pieces of context to answer the question from the user. 
If you don't know the answer, just say that you don't know, don't try to make up an answer. 
Keep the answer as concise as possible. 
Context: {context}
Question: {question}
Helpful Answer:"""
standalone_question_prompt = PromptTemplate.from_template(standalone_prompt_template)
from langchain.memory import ChatMessageHistory, ConversationBufferMemory
message_history = ChatMessageHistory()
memory = ConversationBufferMemory(
    memory_key="chat_history",
    output_key="answer",
    chat_memory=message_history,
    return_messages=True,
)
qa_chain = ConversationalRetrievalChain.from_llm(llm,
                                                  retriever=cdb.as_retriever(),
                                                  condense_question_prompt=confluence_question_prompt,
                                                  memory=memory,
                                                  verbose=True,
                                                  return_source_documents=True,
                                                  combine_docs_chain_kwargs={"prompt":standalone_question_prompt}
                                                  )

In [50]:
query="can you list up to two cases?"
result = qa_chain({"question":query})
print(f"Answer: " + result["answer"])




[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mGiven the following conversation and a follow up question,
 rephrase the follow up question to be a standalone question, in its original language.

Chat History:

Human: what is data
Assistant: Data is a collection of facts, figures, or other information in a structured form that can be processed by a computer.
Human: what is data science
Assistant: Data science is an interdisciplinary field that uses statistics, scientific computing, scientific methods, processes, algorithms and systems to extract or extrapolate knowledge and insights from noisy, structured, and unstructured data.
Human: how does data and data science compare?
Assistant: Data science is a more comprehensive approach that combines statistical analysis, computational methods, and machine learning to extract insights, build predictive models, and drive data-driven decision-making. Data analysis focuses on extracting insights and drawing conc