# Hackathon

Introduction to Langchain

In [3]:
# Load OpenAI key from env

import os
from dotenv import load_dotenv
load_dotenv(override=True)

azure_api_key = os.getenv('AZURE_OPENAI_API_KEY')
azure_endpoint = os.getenv('AZURE_OPENAI_ENDPOINT')


In [5]:
# create llm instance

from langchain.chat_models import AzureChatOpenAI

llm = AzureChatOpenAI(
    api_key=azure_api_key,
    api_version="2023-05-15",
    azure_deployment="gpt-35-turbo-16k",
    azure_endpoint=azure_endpoint,
)

OpenAIError: Missing credentials. Please pass one of `api_key`, `azure_ad_token`, `azure_ad_token_provider`, or the `AZURE_OPENAI_API_KEY` or `AZURE_OPENAI_AD_TOKEN` environment variables.

In [None]:
# invoke llm

llm.invoke("What do you know about Pub Quizzes?")

In [None]:
# use prompts

from langchain.prompts import ChatPromptTemplate

prompt = ChatPromptTemplate.from_template("""You are an encyclopedia.

Question: {input}""")

chain = prompt | llm

chain.invoke({"input": "What are Pub Quizzes?"})

In [None]:
# reusue prompts

chain.invoke({"input": "What are Pub Quizzes also called?"})



In [None]:
# contexts

prompt = ChatPromptTemplate.from_template("""Answer the following question based only on the provided context:

<context>
{context}
</context>

Question: {input}""")

chain = prompt | llm

chain.invoke(
    {
        "input": "What are Pub Quizzes also called?",
        "context": "A pub quiz is a quiz held in a pub or bar. These events are also called quiz nights, trivia nights, or bar trivia and may be held in other settings. The pub quiz is a modern example of a pub game, and often attempts to lure customers to the establishment on quieter days. The pub quiz has become part of British culture since its popularization in the UK in the 1970s by Burns and Porter, although the first mentions in print can be traced to 1959.[4][5] It then became a staple in Irish pub culture, and its popularity has continued to spread internationally. Although different pub quizzes can cover a range of formats and topics, they have many features in common. Most quizzes have a limited number of team members, offer prizes for winning teams, and distinguish rounds by category or theme. ",
    }
)

In [None]:
# documents

from langchain_core.documents import Document
from langchain.chains.combine_documents import create_stuff_documents_chain


documents = [
    Document(
        page_content="A pub quiz is a quiz held in a pub or bar. These events are also called quiz nights, trivia nights, or bar trivia and may be held in other settings. The pub quiz is a modern example of a pub game, and often attempts to lure customers to the establishment on quieter days. The pub quiz has become part of British culture since its popularization in the UK in the 1970s by Burns and Porter, although the first mentions in print can be traced to 1959.[4][5] It then became a staple in Irish pub culture, and its popularity has continued to spread internationally. Although different pub quizzes can cover a range of formats and topics, they have many features in common. Most quizzes have a limited number of team members, offer prizes for winning teams, and distinguish rounds by category or theme. ",
        metadata={
            "source": "wikipedia"
        }
    )
]

document_chain = create_stuff_documents_chain(
    llm=llm,
    prompt=prompt
)

document_chain.invoke(
    {
        "input": "What are Pub Quizzes also called?",
        "context": documents,
    }
)


In [None]:
document_chain.invoke(
    {
        "input": "What are Pub Quizzes also called and what is the source?",
        "context": documents,
    }
)

In [None]:
# document prompts

document_prompt = ChatPromptTemplate.from_template("""Content: {page_content}                             
Source: {source}""")

document_chain = create_stuff_documents_chain(
    llm=llm,
    prompt=prompt,
    document_prompt=document_prompt,
)

document_chain.invoke(
    {
        "input": "What are Pub Quizzes also called and what is the source?",
        "context": documents,
    }
)

## Vector Stores

In [None]:
# vector stores

from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.azure_openai import AzureOpenAIEmbeddings
from langchain.vectorstores.chroma import Chroma


# load data
loader = TextLoader(r"./data/text/pubquiz.txt", encoding="utf-8")
data = loader.load()
loader = TextLoader(r"./data/text/llm.txt", encoding="utf-8")
data.extend(loader.load())

splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20, separators=[".", "\n"])
documents = splitter.split_documents(data)

embeddings = AzureOpenAIEmbeddings(
    api_key=azure_api_key,
    api_version="2023-05-15",
    azure_deployment="text-embedding-ada-002",
    azure_endpoint=azure_endpoint,
)
db = Chroma.from_documents(documents, embeddings, persist_directory="./chroma/quiz")

for document in documents:
    print(document)
    print("------------------------")

In [None]:
# retriever

from langchain.chains import create_retrieval_chain

db = Chroma(persist_directory="./chroma/quiz", embedding_function=embeddings)
retriever = db.as_retriever()
retrieval_chain = create_retrieval_chain(retriever, document_chain)

retrieval_chain.invoke({"input": "What are Pub Quizzes also called and what is the source?"})

In [None]:
retrieval_chain.invoke({"input": "Give a concise description of a LLM and what is the source?"})

In [None]:
# add already embedded data

db_book = Chroma(persist_directory="./chroma/book", embedding_function=embeddings)
db_book_data = db_book._collection.get(include=['documents', 'metadatas', 'embeddings'])

db._collection.add(
    embeddings=db_book_data['embeddings'],
    metadatas=db_book_data['metadatas'],
    documents=db_book_data['documents'],
    ids=db_book_data['ids']
)

retriever = db.as_retriever()
retrieval_chain = create_retrieval_chain(retriever, document_chain)


In [None]:
retrieval_chain.invoke({"input": "What does Juliet answer when Paris says 'So will ye, I am sure, that you love me.'?"})

## Summary

In [None]:
summary_prompt = ChatPromptTemplate.from_template("""Please summarize the following piece of text.
Respond in a manner that a 5 year old would understand.

Text: {context}""")

summary_chain = create_stuff_documents_chain(
    llm=llm,
    prompt=summary_prompt,
    document_prompt=document_prompt,
)

summary_chain.invoke(
    {"context": documents[:3]}
)

In [None]:
# context size

summary_chain.invoke(
    {"context": documents}
)

In [None]:
from functools import partial

from langchain.chains.combine_documents import collapse_docs, split_list_of_docs
from langchain.prompts import PromptTemplate
from langchain.schema import StrOutputParser
from langchain_core.prompts import format_document
from langchain_core.runnables import RunnableParallel, RunnablePassthrough

partial_format_document = partial(format_document, prompt=document_prompt)

# The chain we'll apply to each individual document.
# Returns a summary of the document.
map_chain = (
    {"context": partial_format_document}
    | summary_prompt
    | llm
    | StrOutputParser()
)

# A wrapper chain to keep the original Document metadata
map_as_doc_chain = (
    RunnableParallel({"doc": RunnablePassthrough(), "content": map_chain})
    | (lambda x: Document(page_content=x["content"], metadata=x["doc"].metadata))
).with_config(run_name="Summarize (return doc)")

# The chain we'll repeatedly apply to collapse subsets of the documents
# into a consolidate document until the total token size of our
# documents is below some max size.
def format_docs(docs):
    return "\n\n".join(partial_format_document(doc) for doc in docs)

collapse_chain = (
    {"context": format_docs}
    | PromptTemplate.from_template("Collapse this content:\n\n{context}")
    | llm
    | StrOutputParser()
)

def get_num_tokens(docs):
    return llm.get_num_tokens(format_docs(docs))

def collapse(
    docs,
    config,
    token_max=4000,
):
    collapse_ct = 1
    while get_num_tokens(docs) > token_max:
        config["run_name"] = f"Collapse {collapse_ct}"
        invoke = partial(collapse_chain.invoke, config=config)
        split_docs = split_list_of_docs(docs, get_num_tokens, token_max)
        docs = [collapse_docs(_docs, invoke) for _docs in split_docs]
        collapse_ct += 1
    return docs

# The chain we'll use to combine our individual document summaries
# (or summaries over subset of documents if we had to collapse the map results)
# into a final summary.

reduce_chain = (
    {"context": format_docs}
    | PromptTemplate.from_template("Combine these summaries:\n\n{context}")
    | llm
    | StrOutputParser()
).with_config(run_name="Reduce")

# The final full chain
map_reduce = (map_as_doc_chain.map() | collapse | reduce_chain).with_config(run_name="Map reduce")

map_reduce.invoke(
    input=documents,
    config={"max_concurrency": 5},
)

In [None]:
# map reduce takes longer

map_reduce.invoke(
    input=documents[:3],
    config={"max_concurrency": 5},
)

## Few shotting

In [None]:
intent_prompt = ChatPromptTemplate.from_template(
"""A question can have an intent of either Document Retrieval, Scientific Question or Political Question.
Now tell me which intent the following question has.
                                            
Question: {input}""")

intent_chain = intent_prompt | llm
intent_chain.invoke({"input": "How fast can a Document fall if dropped?"})

In [None]:
intent_prompt = ChatPromptTemplate.from_template(
"""A question can have an intent of either Document Retrieval, Scientific Question or Political Question.
Now tell me which intent the following question has. Only answer with two words.
                                            
Question: {input}""")

intent_chain = intent_prompt | llm
intent_chain.invoke({"input": "How fast can a Document fall if dropped?"})

In [None]:
intent_prompt = ChatPromptTemplate.from_template(
"""A question can have an intent of either Document Retrieval, Scientific Question or Political Question.

Consider the following examples:

Question: What are the Panama Papers?
Answer: Document Retrieval

Question: What is the maximum velocity of a book when thrown?
Answer: Scientific Question

Now tell me which intent the follolwing question has. Only answer with two words.
Question: {input}""")

intent_chain = intent_prompt | llm
intent_chain.invoke({"input": "How fast can a Document fall if dropped?"})

## Agent and Tools

In [None]:
llm.invoke("What did Frank-Walter Steinmeier say in his christmas speech 2023?")

In [None]:
from langchain.tools import Tool
from langchain.tools.ddg_search import DuckDuckGoSearchRun

ddg = DuckDuckGoSearchRun()

ddg_tool = Tool.from_function(
    func = ddg.run,
    name = "DuckDuckGo Search",
    description = "Search DuckDuckGo for a query abount current events.",
)

tools = [ddg_tool]

ddg_tool.run("What did Frank-Walter Steinmeier say in his christmas speech 2023?")

In [None]:
from langchain.agents import initialize_agent, AgentType

agent = initialize_agent(
    tools,
    llm,
    agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
    verbose=True,
    handle_parsing_errors=True,
)

agent.invoke({"input": "What did Frank-Walter Steinmeier say in his christmas speech 2023?"})

In [None]:
PREFIX = """You are participating in a pubquiz. Answer in a short sentence."""

agent = initialize_agent(
    tools,
    llm,
    agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
    verbose=True,
    handle_parsing_errors=True,
    agent_kwargs={
        'prefix': PREFIX
    }
)

agent.invoke({"input": "What did Frank-Walter Steinmeier say in his christmas speech 2023?"})

In [None]:
simple_summary_prompt = ChatPromptTemplate.from_template("""Please summarize the following piece of text.
Respond in a manner that a 5 year old would understand.

Text: {input}""")
simple_summary_chain = {"input": RunnablePassthrough()} | simple_summary_prompt | llm

summary_tool = Tool(
    name="Summary Tool",
    func=simple_summary_chain.invoke,
    description="Use this tool to do a summary. Make sure you get the text to do a summary of first."
)
tools.append(summary_tool)

agent = initialize_agent(
    tools,
    llm,
    agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
    verbose=True,
    handle_parsing_errors=True,
)

agent.invoke({"input": "Summarize the christmas speech 2023 of Frank-Walter Steinmeier?"})

In [None]:
llm.invoke("What is 1000 + 1234?")

In [None]:
llm.invoke("What is 13 raised to the .3432 power?")

## Audio

In [1]:
import openai

azure_api_key_whisper = os.getenv('AZURE_OPENAI_API_KEY_WHISPER')
azure_endpoint_whisper = os.getenv('AZURE_OPENAI_ENDPOINT_WHISPER')

client = openai.AzureOpenAI(
    api_key=azure_api_key_whisper,
    azure_endpoint=azure_endpoint_whisper,
    azure_deployment="whisper",
    api_version="2023-09-01-preview",
)

def get_transcript(audio_file):
    if not os.path.exists(audio_file):
        audio_file = "./data/" + audio_file
    client.audio.with_raw_response
    return client.audio.transcriptions.create(
        file=open(audio_file, "rb"),            
        model="whisper",
        language="de",
    ).text

audio_test_file = "./data/audio/newyear2023.mp3"
get_transcript(audio_test_file)


NameError: name 'os' is not defined

In [None]:
# audio tool

def available_audio_files(input):
    return "./data/audio/newyear2023.mp3 - new years eve speech 2023 of Olaf Scholz\n"\
        "./data/audio/newyear2016.mp3 - new years eve speech 2016 of Angela Merkel\n" \
        "./data/audio/christmas2019.mp3 - christmas speech 2019 of Frank-Walter Steinmeyer\n"

audio_file_tool = Tool(
    name="Audio File Tool",
    func=available_audio_files,
    description="Use this tool to see which new years eve speeches are available as an audio file."
)

# long summary tool

def split_text_to_docs(text):
    doc = Document(
        page_content=text,
        metadata={
            "source": "text"
        }
    )
    splitter = RecursiveCharacterTextSplitter(chunk_size=2500, chunk_overlap=20, separators=[".", "\n"])
    return splitter.split_documents([doc])

def get_splitted_transcript(audio_file):
    return split_text_to_docs(get_transcript(audio_file))

long_summary_chain = get_splitted_transcript | map_reduce

audio_tool = Tool(
    name="Audio Tool",
    func=long_summary_chain.invoke,
    description="Use this tool to get the summary of the new years speech in a given audio file."
)

agent = initialize_agent(
    [
        summary_tool,
        audio_file_tool,
        audio_tool,
        ddg_tool
    ],
    llm,
    agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
    verbose=True,
    handle_parsing_errors=True,
)

agent.invoke({"input": "Summarize the new years eve speech 2023 of Olaf Scholz using the audio and summary tool?"})
