# setup notebook

In [1]:
import os
from getpass import getpass
import openai

from dotenv import load_dotenv

# Pick up variables from a local .env file, if one exists.
load_dotenv()

# Only prompt when the key is not already present in the environment.
if os.getenv("OPENAI_API_KEY") is None:
    running_in_vscode = any('VSCODE' in name for name in os.environ)
    if running_in_vscode:
        # In VS Code the getpass prompt is shown at the top of the window.
        print('Please enter password in the VS Code prompt at the top of your VS Code window!')
    os.environ["OPENAI_API_KEY"] = getpass(
        "Paste your OpenAI key from: https://platform.openai.com/account/api-keys\n"
    )
    openai.api_key = os.getenv("OPENAI_API_KEY", "")

# Cheap sanity check on the key's prefix before any request is made.
assert os.getenv("OPENAI_API_KEY", "").startswith("sk-"), (
    "This doesn't look like a valid OpenAI API key"
)
print("OpenAI API key configured")

OpenAI API key configured


# load data into memory

In [2]:
# The LlamaIndex document type is used here because each loaded item carries
# metadata alongside its text (both are read by later cells).
from llama_index import SimpleDirectoryReader
reader = SimpleDirectoryReader("data/jd_salinger/pdf")
novel = reader.load_data()

In [3]:
# What is the maximum length (in characters) of a single page in the novel?
# `default=0` keeps the result at 0 when there are no pages at all.
max_length = max((len(page.text) for page in novel), default=0)

max_length

4086

In [4]:
# Number of characters in the text of the first loaded page.
len(novel[0].text)

2864

In [5]:
# Print the raw text of the first loaded page.
print(novel[0].text)

THE CATCHER IN THE RYE 
by J.D. Salinger 
 
 
 
  T O  
  M Y  
  MOTHER 
 
  
 
1 
 
 If you really want to hear about it, the first thing you'll probably want to know is 
where I was born, an what my lousy childhood was like, and how my parents were 
occupied and all before they had me, and all that David Copperfield kind of crap, but I 
don't feel like going into it, if you want to know the truth. In the first place, that stuff 
bores me, and in the second place, my parents would have about two hemorrhages apiece 
if I told anything pretty personal about them. They're quite touchy about anything like that, especially my father. They're nice and all--I'm not saying that--but they're also 
touchy as hell. Besides, I'm not going to tell you my whole goddam autobiography or 
anything. I'll just tell you about this madman stuff that happened to me around last 
Christmas just before I got pretty run-down and had to come out here and take it easy. I 
mean that's all I told D.B. about, and 

In [6]:
# algorithm for loading data requests.
class Chunk:
    """Bundle several pages into one text block for use in a prompt.

    Every page must expose ``metadata["page_label"]`` and ``text``; both are
    read by ``load_chunk``.
    """

    # receives a list of pages to chunk together.
    def __init__(self, pages: list):
        self.pages = pages

    def load_chunk(self):
        """Render each page as a ``{page: <label>, content: '<text>'}`` record.

        Returns:
            str: all records concatenated in page order, each preceded by a
            blank line; the empty string when there are no pages.

        Raises:
            KeyError: if a page's metadata has no ``"page_label"`` entry.
        """
        records = []
        for page in self.pages:
            # Bind the label first: reusing double quotes inside a
            # double-quoted f-string is a SyntaxError before Python 3.12.
            page_label = page.metadata["page_label"]
            # The quote opened before the content is closed before the brace
            # so every record is balanced.
            records.append(
                f"\n\n{{\npage: {page_label}, \ncontent: '{page.text}'}}"
            )
        # A single join avoids repeated string concatenation in the loop.
        return "".join(records)

    def return_chunk(self):
        """Return the rendered chunk (thin wrapper around ``load_chunk``)."""
        return self.load_chunk()

In [7]:
# Take the first four pages of the novel and wrap them as one chunk.
pages = [novel[index] for index in range(4)]
chunk = Chunk(pages=pages)

# defining entity models

In [8]:
import instructor

from openai import OpenAI

# Wrap the OpenAI client with instructor.patch; later cells call
# `client.chat.completions.create(...)` with a `response_model=` argument.
client = instructor.patch(OpenAI())

In [9]:
# Every import in this cell was listed twice; each name is now imported once,
# grouped standard library first, then third-party.
from typing import List, Iterable

import instructor
from instructor import llm_validator
from openai import OpenAI
from pydantic import BaseModel, Field, AfterValidator, WithJsonSchema
from typing_extensions import Literal, Annotated

# Patched client used by the request helpers in the cells below.
client = instructor.patch(OpenAI())

# Analyzer

In [10]:
# Structured response schema for the analysis request: it is passed as
# `response_model` in `generate_analysis`.
# NOTE(review): documentation is kept in `#` comments on purpose; a class
# docstring would presumably be included in the schema sent to the model —
# confirm before adding one.
class Analysis(BaseModel):
    chain_of_thought: str  # free-text reasoning field
    analysis: str  # the drafted analysis text
    events: List[str]  # event descriptions, one string per event
    characters: List[str]  # character names mentioned in the excerpt
    locations: List[str]  # location names mentioned in the excerpt

In [11]:
def generate_analysis(system_message, user_prompt, source_chunk=None):
    """Request a structured ``Analysis`` of a novel excerpt.

    Args:
        system_message: Text sent as the system message.
        user_prompt: Extra instruction appended after the excerpt in the
            user message (may be empty).
        source_chunk: Object with a ``return_chunk()`` method that yields the
            excerpt text. Defaults to the notebook-level ``chunk`` so existing
            two-argument calls behave exactly as before.

    Returns:
        Whatever ``client.chat.completions.create`` returns for
        ``response_model=Analysis``.
    """
    if source_chunk is None:
        # Backward-compatible fallback to the global defined in an earlier cell.
        source_chunk = chunk

    excerpt = source_chunk.return_chunk()
    return client.chat.completions.create(
        model="gpt-4-turbo-preview",
        temperature=0.6,
        response_model=Analysis,
        messages=[
            {
                "role": "system",
                "content": system_message
            },
            {
                "role": "user",
                "content": (
                    f"{excerpt}\n\n" + user_prompt
                )
            }
        ]
    )

In [12]:
# System prompt for the analysis request (passed to `generate_analysis`).
# NOTE(review): the text contains several typos ("arbituary", "higly",
# "comlete", "setences", "wiil") and announces a "Four-step process" while
# listing five steps. It is left untouched here because editing the prompt
# changes what is sent to the model — fix deliberately and re-run if desired.
# NOTE(review): the variable name itself is misspelled ("messsage").
analysis_system_messsage = """# MISSION
You are a fiction analysing function. You will be given an arbituary number of pages or chunks of a novel. You are to create a higly distinct list of notes highlighting events, characters, locations and their intersection/interactions. You should endeavour to write everything in comlete, informative, and comprehensive setences and paragraphs such that little to no outside context is needed to understand. You wiil closely inspect each paragraph of prose and dialogue, to identify events present. You are to take all the time you need to fully unpack and articulate.

# RULES
Identify the main character and their situation. Next, identify events and settings. Events should be ordered as presented.

# EVENT TYPES
The types of events present are Narrative Development, Character Interaction, Action Sequences, Plot Dynamics, Climax and Resolution, Humor and Tone, Conflict and Resolution, Character Development, and Contextual Flashback or Information.

# PROCESS
The first thing you should do is a Four-step process. 

First, you need to restate what the user is asking for in your own words. Use this first step to clarify and distill the overall flow. 

Second, you need to reflect on how you will execute the analysis in order to convey all necessary material completely and effectively. 

Third, you should brainstorm an outline for the analysis.

Four, You should relate characters mentioned, however minor or even if just once, to at least one event.

Finally, you should draft the analysis as specified."""

# Disabled prompt section, kept for reference only:
#   # CHANCES OF DANGER TO LIFE
#   if you forget even a single character mentioned.

# No extra user instruction: generate_analysis adds the excerpt itself.
analysis_user_prompt = ""

analysis = generate_analysis(
    system_message=analysis_system_messsage,
    user_prompt=analysis_user_prompt,
)

In [13]:
# Print the full object returned by generate_analysis.
print(analysis)

chain_of_thought="First, I need to restate the task: I am to analyze the given text from 'The Catcher in the Rye' by J.D. Salinger, focusing on identifying and listing events, characters, locations, and their interactions within the text. Second, my approach will involve closely reading the text, paragraph by paragraph, to extract relevant information, ensuring a comprehensive understanding of the narrative's elements. Third, my outline will include an introduction to the main character and setting, followed by a detailed breakdown of events, character interactions, and notable locations. I will ensure each character mentioned is tied to at least one event. Fourth, the analysis will be drafted as specified, aiming for clarity and completeness in conveying the narrative's components." analysis="In the opening section of 'The Catcher in the Rye' by J.D. Salinger, the narrator, Holden Caulfield, introduces himself in a rather unconventional manner, eschewing the typical autobiographical d

In [14]:
# Heading plus one event per line, emitted as a single newline-joined block.
print("\n".join(["Events:", *analysis.events]))

# Raw list of character names.
print(analysis.characters)

Events:
Holden Caulfield introduces himself and expresses his disdain for sharing personal history.
Holden mentions his brother D.B.'s success in Hollywood and his own contempt for the movie industry.
The narrative shifts to Holden's past at Pencey Prep, including his recent expulsion.
Holden reflects on the culture and environment of Pencey Prep.
Holden decides to visit Mr. Spencer, his history teacher, to say goodbye after being expelled.
Holden recounts the mistake that led to the cancellation of a fencing meet in New York.
Holden describes his journey to Mr. Spencer's house, including his physical discomfort in the cold.
Holden's interactions with Mr. and Mrs. Spencer at their home are detailed.
The conversation with Mr. Spencer turns to the topic of Holden's expulsion.
['Holden Caulfield', 'D.B. Caulfield', 'Mr. Spencer', 'Mrs. Spencer', 'Selma Thurmer', 'Robert Tichener', 'Paul Campbell', 'Mr. Zambesi']


# event extraction

In [15]:
# Structured record for one character; used as the element type of
# `Event.characters`.
# NOTE(review): documentation is kept in `#` comments; a class docstring would
# presumably be included in the schema sent to the model — confirm before
# adding one.
class Character(BaseModel):
    # The description previously talked about "the event" / "multiple events",
    # a copy-paste slip on the Character model; it now refers to characters.
    id: int = Field(
        ...,
        description="Unique identifier for the character, used for deduplication, design a scheme that allows for multiple characters"
    )
    name: str = Field(
        ...,
        description="For sake of precision and deduplication, should be the actual name of the character, if not provided, should be 'Not Available'"
    )
    # Restricted to these three literal values.
    gender: Literal["Male", "Female", "N/A"]
    # Alternative names for the same character.
    aliases: List[str]


# Structured record for one extracted event; `extract_events` requests an
# iterable of these as its response model.
class Event(BaseModel):
    id: int
    name: str
    # Disabled field, kept for reference:
    # page: int = Field(
    #     ...,
    #     description="The page the event began"
    # )
    # One value per category listed in the analysis system prompt.
    eventType: Literal[
        "Narrative Development",
        "Character Interaction",
        "Action Sequences",
        "Plot Dynamics",
        "Climax and Resolution",
        "Humor and Tone",
        "Conflict and Resolution",
        "Character Development",
        "Contextual Flashback or Information",
    ]
    summary: str
    characters: List[Character] = Field(
        ...,
        description="List of characters possessing a remote connection or participation to the event.",
    )
    locations: List[str]

In [25]:
def extract_events(analysis: Analysis, system_message, user_prompt, source_chunk=None):
    """Request structured ``Event`` records for an excerpt and its analysis.

    Args:
        analysis: Object exposing ``chain_of_thought``, ``analysis``,
            ``events``, ``characters`` and ``locations``.
        system_message: Text sent as the system message.
        user_prompt: Extra instruction placed at the end of the user message
            (may be empty).
        source_chunk: Object with a ``return_chunk()`` method that yields the
            excerpt text. Defaults to the notebook-level ``chunk`` so existing
            three-argument calls behave exactly as before.

    Returns:
        Whatever ``client.chat.completions.create`` returns for
        ``response_model=Iterable[Event]``.
    """
    if source_chunk is None:
        # Backward-compatible fallback to the global defined in an earlier cell.
        source_chunk = chunk

    # The four-space indent on the continuation lines is part of the prompt
    # text as originally written and is reproduced unchanged.
    analysis_insert = (
        "chain_of_thought: {chain_of_thought}\n"
        "    Analysis: {analysis}\n"
        "    Events: {events}\n"
        "    characters: {characters}\n"
        "    Locations: {locations}"
    ).format(
        chain_of_thought=analysis.chain_of_thought,
        analysis=analysis.analysis,
        events=list(analysis.events),
        characters=list(analysis.characters),
        locations=list(analysis.locations),
    )

    # A non-empty user prompt used to be glued straight onto the locations
    # list; separate it with a blank line. An empty prompt adds nothing, so
    # the message is unchanged in that case.
    prompt_suffix = f"\n\n{user_prompt}" if user_prompt else ""

    return client.chat.completions.create(
        model="gpt-4-turbo-preview",
        temperature=0.6,
        # stream=True,
        max_retries=2,
        response_model=Iterable[Event],
        messages=[
            {
                "role": "system",
                "content": system_message
            },
            {
                "role": "user",
                "content": (
                    f"Excerpt:\n{source_chunk.return_chunk()}\n\n"
                    + analysis_insert
                    + prompt_suffix
                ),
            },
        ],
    )

In [26]:
# System prompt for the event-extraction request.
event_extraction_system_message = """# MISSION
You are an event extractor. You will be given an arbituary number of pages or chunks of a novel along with an analysis, and lists of events, characters, and locations. You are to map these items together to build a structured narrative around the events.

# RULES
Maintain the order of events as presented. Assign every character to an event"""

# No extra user instruction for this run.
event_extraction_user_prompt = ""

events = extract_events(
    analysis,
    system_message=event_extraction_system_message,
    user_prompt=event_extraction_user_prompt,
)

# Show each extracted event on its own line.
for event in events:
    print(event)

id=1 name='Holden Caulfield introduces himself' eventType='Narrative Development' summary='Holden Caulfield introduces himself and expresses his disdain for sharing personal history, focusing instead on recent events.' characters=[Character(id=1, name='Holden Caulfield', gender='Male', aliases=['Holden'])] locations=["Holden's current location of recuperation"]
id=2 name="D.B.'s Success in Hollywood" eventType='Character Development' summary="Holden mentions his brother D.B.'s success in Hollywood and his own contempt for the movie industry." characters=[Character(id=1, name='Holden Caulfield', gender='Male', aliases=['Holden']), Character(id=2, name='D.B. Caulfield', gender='Male', aliases=['D.B.'])] locations=['Hollywood']
id=3 name='Expulsion from Pencey Prep' eventType='Plot Dynamics' summary="The narrative shifts to Holden's past at Pencey Prep, including his recent expulsion." characters=[Character(id=1, name='Holden Caulfield', gender='Male', aliases=['Holden'])] locations=['Pen