# openai-document-summarizer
### September 27, 2023

This notebook uses LangChain and Azure OpenAI to summarize a document. LangChain reads a PDF and converts it to text; that text is appended to an instruction to form a prompt, and the prompt is passed to AzureChatOpenAI, which summarizes the document.

In [None]:
# Define the Azure storage widgets for this notebook.
# Each entry is (widget name, default value, label shown next to the widget).
# NOTE(review): these values are presumably read by the storage-mount notebook
# that is %run later — confirm against /Source/Common/storageMountPython.
for widget_name, default_value, widget_label in (
    ("storageAccountName", "dlsgciitedausagedev1", "1) Storage Account:"),
    ("containerName", "hackathon", "2) Container:"),
):
    dbutils.widgets.text(widget_name, default_value, widget_label)

In [None]:
%pip install azure-search-documents==11.4.0b8 azure-identity pypdf langchain==0.0.302 pdfplumber

In [None]:
# Restart the Python process so that the libraries installed by the %pip cell
# above are the ones picked up by the imports that follow.
# NOTE(review): a restart presumably discards Python variables defined earlier
# in the session, so keep this cell ahead of any imports or state — confirm.
dbutils.library.restartPython()

In [None]:
%run /Source/Common/storageMountPython

In [None]:
# Import required libraries
import openai
import os
from langchain.chat_models import AzureChatOpenAI
from langchain.document_loaders import PyPDFLoader
import langchain

# Global LangChain debug flag; left off for normal runs.
langchain.debug = False

# --- Azure OpenAI configuration ---------------------------------------------
# Values written as "<insert ...>" are placeholders to fill in before running.

# Databricks secret scope that holds the OpenAI key.
SECRET_SCOPE = "<insert your Azure keyvault name here>"

# Base URL of the Azure OpenAI resource. This is the single source of truth for
# the endpoint: it feeds both the OPENAI_API_BASE environment variable and the
# openai module setting below, so the two cannot drift apart.
OPENAI_API_ENDPOINT = "https://openai-gci-eda-ds-dev-01.openai.azure.com/"

# Name of the Azure OpenAI chat deployment used by AzureChatOpenAI.
OPENAI_DEPLOYMENT = "<insert the name of your azure openai deployment here>"

# Embedding model name. Not referenced by the cells visible in this notebook;
# kept in case other code relies on it.
model: str = "<insert the name of your OpenAI embedder here>"

# Connection settings exported as environment variables.
# NOTE(review): AzureChatOpenAI is not given the endpoint or API version
# explicitly below, so it presumably picks them up from these — confirm.
os.environ["OPENAI_API_TYPE"] = "azure"
os.environ["OPENAI_API_VERSION"] = "2023-05-15"
os.environ["OPENAI_API_BASE"] = OPENAI_API_ENDPOINT

# Fetch the API key from the secret scope rather than hard-coding it.
OPENAI_KEY_VALUE = dbutils.secrets.get(
    scope=SECRET_SCOPE,
    key="<insert the name of your openai key here>",
)

# Configure the openai module as well, for any direct use of it.
# The endpoint setting on the openai module is `api_base`; assigning to
# `api_endpoint` would only create an attribute that the library never reads.
openai.api_key = OPENAI_KEY_VALUE
openai.api_base = OPENAI_API_ENDPOINT

# Load the Azure OpenAI chat model. temperature=0 is passed so that summaries
# are as repeatable as the service allows.
chatbot = AzureChatOpenAI(
    deployment_name=OPENAI_DEPLOYMENT,
    temperature=0,
    openai_api_key=OPENAI_KEY_VALUE,
)


In [None]:
# Use langchain to read a PDF. loader.load() returns a sequence of langchain
# Document objects (presumably one per page — the extraction cell below treats
# each element as a page and reads its page_content).
# NOTE(review): filename presumably needs to be a path readable from this
# cluster, e.g. under the storage mount set up earlier — confirm.
filename = "<insert the name of the PDF to summarize here>"
loader = PyPDFLoader(filename)
pdfdata = loader.load()

In [None]:
# Flatten the loaded Documents into a single string for the prompt.
# Newlines are replaced with a space rather than removed outright, so the last
# word on one line is not fused with the first word on the next; pages are
# joined with a space for the same reason. An empty pdfdata yields "".
docContent = " ".join(
    page.page_content.replace("\n", " ") for page in pdfdata
)

# `content` is the name the prompt cells read.
content = docContent

In [None]:
# Display the extracted text from the PDF so the extraction can be checked by
# eye before it is sent to the model (a bare expression on the last line of a
# cell is shown as the cell's output).
content

In [None]:
# Define the prompt schema
from langchain.schema import (
    AIMessage,
    HumanMessage,
    SystemMessage
)

In [None]:
# Create the summarization prompt: a fixed instruction followed by the full
# document text.
prompt_prefix = (
    " \n"
    "  Summarize the following document into a set of no more than twenty bullet points.  List the names of any companies in the document and their addresses.  Also identify the prices of any services:\n"
)

prompt = f"{prompt_prefix}{content}"
print(prompt)  # echo the exact text that will be sent to the model

In [None]:
# Send the prompt to the chat model as a single human message, then print the
# text of the message it returns.
messages = [HumanMessage(content=prompt)]
botresponse = chatbot(messages)
print(botresponse.content)

In [None]:
# Try a trickier one: ask the model to look for legal risks in the same
# document text, then print its reply.
prompt_prefix = (
    " \n"
    "  Identify any legal risks in this contract:\n"
)

prompt = f"{prompt_prefix}{content}"
messages = [HumanMessage(content=prompt)]
botresponse = chatbot(messages)
print(botresponse.content)