In [33]:
from langchain_openai import AzureOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import SimpleJsonOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader



import json
import tiktoken
import os

In [34]:
API_KEY = os.getenv("AZURE_OPENAI_API_KEY") 
RESOURCE_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT") 

azure_client = AzureOpenAI(
                api_key=API_KEY
                ,api_version='2023-05-15'
                ,azure_endpoint=RESOURCE_ENDPOINT
                ,azure_deployment='gpt-35-turbo-instruct'
                ,temperature=0
                ,max_tokens=1000
                #,model_kwargs={"response_format": {"type": "json_object"}}
)

In [35]:
loader = PyPDFLoader("./lotr.pdf")

pages = loader.load_and_split()



In [36]:


def num_tokens_from_string(string: str, encoding_name: str = "cl100k_base") -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens


In [37]:
prologue_pages = pages[24:39]
# Python for loop in one line
prologue_text = ''
for page in prologue_pages : prologue_text += page.page_content


In [38]:
text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size = 1500,
    chunk_overlap  = 150
)

docs = text_splitter.create_documents([prologue_text])
print(len(docs))

28


In [41]:
output_parser = StrOutputParser()
#output_parser = JsonOutputParser(pydantic_object=Theme)
prompt = PromptTemplate(
    template="Identify the most important themes covered in the following text.  Take all information from the text only.  If a piece of information is not in the text, do not include it.  Results format: <theme>: <detailed summary of text that supports the theme in narrative form.>}}] :\n {text}",
    input_variables=["text"],

)

responses = []
for doc in docs:
    chain = (
    {"text": RunnablePassthrough()} 
        | prompt
        | azure_client
        | output_parser
    )

    responses.append(chain.invoke(doc.page_content).replace('"','\''))



In [42]:
print(docs[2].page_content)
print(responses[2])


mingled with the other kinds that had preceded them, but being 
somewhat bolder and more adventurous, they were often found as 
leaders or chieftains among clans of Harfoots or Stoors. Even in 
Bilbo’s time the strong Fallohidish strain could still be noted among4 TH E L ORD OF THE RI NGS 
the greater families, such as the Tooks and the Masters of Buckland. 
In the westlands of Eriador, between the Misty Mountains and 
the Mountains of Lune, the Hobbits found both Men and Elves. 
Indeed, a remnant still dwelt there of the Du´nedain, the kings of 
Men that came over the Sea out of Westernesse; but they were dwin-
dling fast and the lands of their North Kingdom were falling far and 
wide into waste. There was room and to spare for incomers, and ere 
long the Hobbits began to settle in ordered communities. Most of 
their earlier settlements had long disappeared and been forgotten in 
Bilbo’s time; but one of the ﬁrst to become important still endured, 
though reduced in size; this was at 

In [43]:
reduce_prompt_text = """
### Instructions:
Below is a collection of themes and supporting text summarize the all of supporting text in narrative form as if telling a story. 
Your summary will be added to other summaries so do not give an introduction to the summary, assume the reader has context for what you are telling them. 
Do not include the theme title in the summary.
Return your response in valid JSON format.
### JSON Output Key and Value
"text": "<summary of supporting text>"
### Themes and Supporting Text:
{text}"
"""

In [44]:

final_summary = []
output_parser = StrOutputParser()
prompt = ChatPromptTemplate.from_template(reduce_prompt_text)

for response in responses:
    chain = (
    {"text": RunnablePassthrough()} 
        | prompt
        | azure_client
        | output_parser
    )

    final_summary.append(chain.invoke(response))

print(' '.join(final_summary))


"text": "The Hobbit history and origins theme is supported by the text's mention of the Hobbits' migration from Anduin to Eriador and the three different breeds of Hobbits - Harfoots, Stoors, and Fallohides. It also discusses their own records beginning after the settlement of the Shire and their earliest tales of living in Anduin." 
"text": "The Hobbits are a diverse group, with three distinct types - the Stoors, Fallohides, and Harfoots. Each type has their own physical characteristics and preferences for certain types of land. They are known for their skills in various activities and their tendency to settle in one place and preserve their ancestral habits. Despite their differences, the Hobbits have a history of migration and settlement, with each type moving to different areas. They also have interactions with other races, such as Dwarves, Men, and Elves, and their attitudes towards these races vary. The Hobbits also have cultural differences, with each type having their own pref

In [45]:
print(len(final_summary))

28


In [46]:
final_text = ''

for fs in final_summary:
    if len(fs) > 0:
        raw_js = '{' + fs + '}'
        js = json.loads(raw_js)
        final_text += '\n\n' + js["text"]



In [47]:
with open("./prologue_text.txt", "w") as prologue:
    # Writing data to a file
    prologue.write(prologue_text)

with open("./prologue_summary_text.txt", "w") as summary:
    # Writing data to a file
    summary.write(final_text)