In [None]:
import os
import re
import tiktoken
from langchain_openai import ChatOpenAI
from time import sleep

In [None]:
# Load KEY=VALUE pairs from the local .env file into os.environ.
with open('.env') as env_file:
    for raw_line in env_file.read().splitlines():
        line = raw_line.strip()
        # Blank lines and comment lines carry no setting.
        if not line or line.startswith("#"):
            continue
        # Split on the first "=" only so that values may themselves contain "=".
        key, sep, value = line.partition('=')
        if not sep:
            # Not a KEY=VALUE line; ignore it instead of failing the whole load.
            continue
        os.environ[key] = value

In [None]:
# Chat model client; the key is required, the base URL override is optional.
oai_llm = ChatOpenAI(
    api_key=os.environ["OAI_KEY_1"],
    model="gpt-4",
    base_url=os.environ.get("OAI_BASE_URL"),
)
# Tokenizer matching the model, used to measure chapter length in tokens.
tokenizer = tiktoken.encoding_for_model("gpt-4")
# When True, generate() reads the reply as a stream of chunks.
stream = False

In [None]:
def generate(messages, **kwargs):
    """Send ``messages`` to the chat model and return the reply text.

    Extra keyword arguments are forwarded to the model call. When the
    module-level ``stream`` flag is set, the reply is read chunk by chunk
    and the chunk contents are concatenated; otherwise a single blocking
    call is made.
    """
    if not stream:
        return oai_llm.invoke(messages, **kwargs).content

    pieces = [chunk.content for chunk in oai_llm.stream(messages, **kwargs)]
    return "".join(pieces)


In [None]:
# System message placed at the start of every prompt built in this notebook.
system_prompt = "You are a helpful storywriting assistant."

Identify the characters in the chapter so that we can later mark which character's turn each line is.

In [None]:
def get_characters(chapter, max_tokens=3584):
    """Ask the model for a numbered list of the named characters in ``chapter``.

    Args:
        chapter: Full text of the chapter.
        max_tokens: Largest chapter size, in tokenizer tokens, that is accepted.

    Returns:
        The model's reply with anything before the first ``1.`` removed. If the
        reply contains no ``1.`` it is returned unchanged.

    Raises:
        ValueError: If the chapter is longer than ``max_tokens`` tokens.
    """
    if len(tokenizer.encode(chapter)) > max_tokens:
        raise ValueError(f'Chapter is too big, please keep it under {max_tokens} tokens')

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"""Give the following story, please point out each named character present in the Scene. List only the characters in a list like the following, include any other way they are referred to in the story
1. Character 1, Other name used in chapter
2. Character 2

{chapter}"""}
    ]

    characters = generate(messages)
    # Drop any preamble the model wrote before the numbered list starts.
    return re.sub(r".*?(1\..*)", r"\1", characters, flags=re.DOTALL)

In [None]:
def get_lines(lines, i, context=3):
    """Return line ``i`` with up to ``context`` lines before and after it.

    The window is clipped at both ends of ``lines`` and the selected lines
    are joined with newlines.
    """
    first = i - context if i > context else 0
    last = i + context + 1
    if last > len(lines):
        last = len(lines)
    window = lines[first:last]
    return "\n".join(window)

Here we go over each line of the chapter adding character markers to indicate the turn of the actions and speech. We add surrounding lines and the list of characters as context.

In [None]:
def mark_speakers(chapter, characters):
    """Prefix every line of ``chapter`` with markers naming the acting character.

    Each non-blank line is sent to the model on its own, together with its
    surrounding lines and the character list, after a fixed set of few-shot
    examples.

    Args:
        chapter: Full text of the chapter.
        characters: Character list text inserted into the prompt.

    Returns:
        The model replies concatenated in line order, each followed by a newline.
    """
    lines = chapter.splitlines()

    # The few-shot turns are identical for every line, so build them once.
    few_shot = [
        { "role": "system", "content": system_prompt },
        { "role": "user", "content": """Please insert markers for when the character acting changes. You should only add markers to the single linge given.
Here are the surroinding lines that will help:
The detective rapidly summarizes the patient's status, concluding, "He hasn't been stabilized. He's in circulatory shock and unconscious."
"Who's the doctor in charge?" asks Erik.
"Daniella Richards."
"She's extremely capable. I'm sure she can—"
"She was the one who asked me to call you. She needs your help. It's urgent."
When Erik returns to the bedroom to get his clothes, Simone is lying on her back, looking at him with a strange, empty expression. A strip of light from the streetlamp is shining in between the blinds.
"I didn't mean to wake you," he says softly.
         
Here are the characters in the schene:
1. Erik Maria Bark, Erik
2. Simone, Sixan
3. Joona Linna, the detective
4. Daniella Richards, the doctor in charge
5. Erik's son, not named
6. The fifteen-year-old boy, the patient, the boy
7. Magnus, Daniella's husband (mentioned but not physically present)
8. The blond man, the lanky fellow
9. Two uniformed cops (not individually named)

Here is the line that you must add the markers to:
"Daniella Richards."
The markers should only indicate indicate the character which is acting, and should ignore if another character is mentioned in the line but is not acting any actions."""},
        { "role": "assistant", "content": """Joona Linna: "Daniella Richards.\""""},
        { "role": "user", "content": """Please insert markers for when the character acting changes. You should only add markers to the single linge given.
Here are the surroinding lines that will help:
The detective rapidly summarizes the patient's status, concluding, "He hasn't been stabilized. He's in circulatory shock and unconscious."
"Who's the doctor in charge?" asks Erik.
"Daniella Richards."
"She's extremely capable. I'm sure she can—"
"She was the one who asked me to call you. She needs your help. It's urgent."
When Erik returns to the bedroom to get his clothes, Simone is lying on her back, looking at him with a strange, empty expression. A strip of light from the streetlamp is shining in between the blinds.
"I didn't mean to wake you," he says softly.
         
Here are the characters in the schene:
1. Erik Maria Bark, Erik
2. Simone, Sixan
3. Joona Linna, the detective
4. Daniella Richards, the doctor in charge
5. Erik's son, not named
6. The fifteen-year-old boy, the patient, the boy
7. Magnus, Daniella's husband (mentioned but not physically present)
8. The blond man, the lanky fellow
9. Two uniformed cops (not individually named)

Here is the line that you must add the markers to:
"She's extremely capable. I'm sure she can—"
The markers should only indicate indicate the character which is acting, and should ignore if another character is mentioned in the line but is not acting any actions."""},
        { "role": "assistant", "content": """Erik Maria Bark: "She's extremely capable. I'm sure she can—\""""},
        { "role": "user", "content": """Please insert markers for when the character acting changes. You should only add markers to the single linge given.
Here are the surroinding lines that will help:
"Daniella Richards."
"She's extremely capable. I'm sure she can—"
"She was the one who asked me to call you. She needs your help. It's urgent."
When Erik returns to the bedroom to get his clothes, Simone is lying on her back, looking at him with a strange, empty expression. A strip of light from the streetlamp is shining in between the blinds.
"I didn't mean to wake you," he says softly.
"Who was that?" she asks.
"Police... a detective... I didn't catch his name."
         
Here are the characters in the schene:
1. Erik Maria Bark, Erik
2. Simone, Sixan
3. Joona Linna, the detective
4. Daniella Richards, the doctor in charge
5. Erik's son, not named
6. The fifteen-year-old boy, the patient, the boy
7. Magnus, Daniella's husband (mentioned but not physically present)
8. The blond man, the lanky fellow
9. Two uniformed cops (not individually named)

Here is the line that you must add the markers to:
When Erik returns to the bedroom to get his clothes, Simone is lying on her back, looking at him with a strange, empty expression. A strip of light from the streetlamp is shining in between the blinds.
The markers should only indicate indicate the character which is acting, and should ignore if another character is mentioned in the line but is not acting any actions."""},
        { "role": "assistant", "content": """Erik Maria Bark: When Erik returns to the bedroom to get his clothes, 
Simone: Simone is lying on her back, looking at him with a strange, empty expression. A strip of light from the streetlamp is shining in between the blinds."""},
    ]

    marked = []
    for i, line in enumerate(lines):
        # A blank line has nothing to attribute; asking the model about it
        # wastes a request and invites an invented reply. The line still stays
        # in `lines`, so the context windows of its neighbours are unchanged.
        if not line.strip():
            continue

        messages = few_shot + [
            { "role": "user", "content": f"""Please insert markers for when the character acting changes. You should only add markers to the single linge given.
Here are the surroinding lines that will help:
{get_lines(lines, i)}        
         
Here are the characters in the schene:
{characters}

Here is the line that you must add the markers to:
{line}
The markers should only indicate indicate the character which is acting, and should ignore if another character is mentioned in the line but is not acting any actions."""},
        ]

        marked.append(generate(messages) + "\n")
    return "".join(marked)

In [None]:
def interrogate_speakers(marked_text, characters):
    """Re-check the speaker marker of every line of ``marked_text``.

    Each non-blank line is shown to the model with its surrounding marked
    lines and the character list, and the model is asked to confirm or correct
    the speaker. Keeping every request this small is meant to improve accuracy,
    as the context is not filled with unrelated text.

    Args:
        marked_text: Text whose lines have the form ``Speaker: line``.
        characters: Character list text inserted into the prompt.

    Returns:
        A list of ``{"from": speaker, "value": text}`` dicts, one per non-blank
        line. The original marker is kept when the model answers with a
        narrator or with nothing. A line that has no ``Speaker: `` prefix keeps
        its full text and falls back to the speaker ``"Unknown"``.
    """
    lines = marked_text.splitlines()

    # The few-shot turns are identical for every line, so build them once.
    few_shot = [
        { "role": "system", "content": system_prompt },
        { "role": "user", "content": """Given the following context please check and correct the given lines speaker.
Only respond with the correct speaker.
The lines will be in the format of:
```
Speaker: line
```
         
The characters that are present in this chapter are:
1. Erik Maria Bark, Erik
2. Simone, Sixan
3. Joona Linna, the detective
4. Daniella Richards, the doctor in charge
5. Erik's son, not named
6. The fifteen-year-old boy, the patient, the boy
7. Magnus, Daniella's husband (mentioned but not physically present)
8. The blond man, the lanky fellow
9. Two uniformed cops (not individually named)

Here is the surrounding lines to help clarify:
Daniella Richards: "Have some coffee," she says.
Erik Maria Bark: "Do we have time?" asks Erik.
Daniella Richards: "I've got the bleed in the liver under control," she replies.
The Blond Man: A man of about forty-five, dressed in jeans and a black jacket, is thumping the coffee machine. He has tousled blond hair, and his lips are serious, clamped firmly together. 
Erik Maria Bark: Erik thinks maybe this is Daniella's husband, Magnus. He has never met him; he has only seen a photograph in her office.

Here is the line to identify:
Daniella Richards: "I've got the bleed in the liver under control," she replies."""},
        { "role": "assistant", "content": "Daniella Richards" },
        { "role": "user", "content": """Given the following context please check and correct the given lines speaker.
Only respond with the correct speaker.
The lines will be in the format of:
```
Speaker: line
```
         
The characters that are present in this chapter are:
1. Erik Maria Bark, Erik
2. Simone, Sixan
3. Joona Linna, the detective
4. Daniella Richards, the doctor in charge
5. Erik's son, not named
6. The fifteen-year-old boy, the patient, the boy
7. Magnus, Daniella's husband (mentioned but not physically present)
8. The blond man, the lanky fellow
9. Two uniformed cops (not individually named)
        
Here is the surrounding lines to help clarify:
Joona Linna: The detective rapidly summarizes the patient's status, concluding, "He hasn't been stabilized. He's in circulatory shock and unconscious."
Erik Maria Bark: "Who's the doctor in charge?" asks Erik.
Erik Maria Bark: "Daniella Richards."
Erik Maria Bark: "She's extremely capable. I'm sure she can—"
Joona Linna: "She was the one who asked me to call you. She needs your help. It's urgent."

Here is the line to identify
Erik Maria Bark: "Daniella Richards.\""""},
        { "role": "assistant", "content": "Joona Linna\nThe text is of a conversation between Erik and Joona, and as such should go in turns." },
    ]

    conversation = []
    for i, line in enumerate(lines):
        # Blank lines hold neither a speaker nor text to attribute.
        if not line.strip():
            continue

        messages = few_shot + [
            # The context window must be cut from the list of lines; slicing
            # the raw string would yield single characters instead.
            { "role": "user", "content": f"""Given the following context please check and correct the given lines speaker.
Only respond with the correct speaker. If none of the characters in the list match, please respond with the orignal speaker in the line.
The lines will be in the format of:
```
Speaker: line
```
         
The characters that are present in this chapter are:
{characters}

Here is the surrounding lines to help clarify:
{get_lines(lines, i)}

Here is the line to identify
{line}"""},
        ]

        response = generate(messages)

        orig_speaker, marker, message = line.partition(": ")
        if not marker:
            # No "Speaker: " prefix on this line: keep the whole text and use
            # the same "Unknown" label the character-clarification prompt uses.
            orig_speaker, message = "Unknown", line

        # The reply may carry an explanation after the name (the few-shot
        # example does), so only the first line names the speaker.
        reply_lines = response.strip().splitlines()
        speaker = reply_lines[0].split(": ")[0].strip() if reply_lines else ""
        if not speaker or "narrator" in speaker.lower():
            speaker = orig_speaker

        conversation.append({
            "from": speaker,
            "value": message,
        })
    return conversation


In [None]:
def split_quoted_and_non_quoted(s):
    """Split ``s`` into its double-quoted and unquoted segments, in order.

    Args:
        s: Text that may contain ``"``-delimited speech.

    Returns:
        A list of ``{"value": text, "type": "quoted" | "unquoted"}`` dicts.
        Values are stripped of surrounding whitespace and quoted values do not
        include the quote characters. Unquoted segments consisting only of
        whitespace (for example the gap between two adjacent quotes) are left
        out, so callers never receive an empty piece of narration.
    """
    # Group 1 captures the inside of a quoted run, group 2 a run without quotes.
    pattern = re.compile(r'"([^"]+)"|([^"]+)')

    result = []
    for match in pattern.finditer(s):
        quoted, unquoted = match.group(1), match.group(2)
        if quoted:
            result.append({"value": quoted.strip(), "type": "quoted"})
        elif unquoted:
            text = unquoted.strip()
            if text:
                result.append({"value": text, "type": "unquoted"})

    return result

Clarify each character name in case of slight hallucinations, such as a misspelled name or a name with part of it left out.

In [None]:
def clarify_character(conversation, characters):
    """Map every speaker label in ``conversation`` onto a name from ``characters``.

    The model is asked which listed character a label refers to. Each distinct
    label is resolved once per call and the answer reused, so the same label
    always maps to the same name and repeated labels cost no extra requests.

    Args:
        conversation: List of ``{"from": label, "value": text}`` dicts.
        characters: Character list text inserted into the prompt.

    Returns:
        A new list of ``{"from": name, "value": text}`` dicts in the same order.
        Parenthesised remarks and anything after the first comma are removed
        from the model's answer.
    """
    # The few-shot turns are identical for every label, so build them once.
    few_shot = [
        { "role": "system", "content": system_prompt },
        { "role": "user", "content": """Given the following list of characters:
1. Erik Maria Bark, Erik
2. Simone, Sixan
3. Joona Linna, the detective
4. Daniella Richards, the doctor in charge
5. Erik's son, not named
6. The fifteen-year-old boy, the patient, the boy
7. Magnus, Daniella's husband (mentioned but not physically present)
8. The blond man, the lanky fellow
9. Two uniformed cops (not individually named)

The character list is given in the following format
1. Name, alias
2. Name, alias

Which character is the following referring to, use only the characters as in the list. Use their full name if possible and no other title or description. Your response must be either a single (full) name or "Unknown". No other text or numbers.
Joona"""},
        { "role": "assistant", "content": "Joona Linna" },
        { "role": "user", "content": """Given the following list of characters:
1. Erik Maria Bark, Erik
2. Simone, Sixan
3. Joona Linna, the detective
4. Daniella Richards, the doctor in charge
5. Erik's son, not named
6. The fifteen-year-old boy, the patient, the boy
7. Magnus, Daniella's husband (mentioned but not physically present)
8. The blond man, the lanky fellow
9. Two uniformed cops (not individually named)

The character list is given in the following format
1. Name, alias
2. Name, alias

Which character is the following referring to, use only the characters as in the list. Use their full name if possible and no other title or description. Your response must be either a single (full) name or "Unknown". No other text or numbers.
Maria Bark"""},
        { "role": "assistant", "content": "Erik Maria Bark" },
    ]

    resolved = {}  # raw speaker label -> cleaned name returned by the model
    results = []
    for message in conversation:
        label = message["from"]
        if label not in resolved:
            messages = few_shot + [
                { "role": "user", "content": f"""Given the following list of characters:
{characters}

The character list is given in the following format
1. Name, alias
2. Name, alias

Which character is the following referring to, use only the characters as in the list. Use their full name if possible and no other title or description. Your response must be either a single (full) name or "Unknown". No other text or numbers.
{label}"""},
            ]
            # Stop at the first newline so only the name line is generated.
            output = generate(messages, stop=["\n"])
            # Strip "(...)" remarks first, then any ", alias" tail.
            resolved[label] = re.sub(r",.*", "", re.sub(r"\(.+?\)", "", output)).strip()
        results.append({
            "from": resolved[label],
            "value": message["value"]
        })
    return results

Here we rewrite the non-spoken text (narration and actions) into first person, e.g. Erik does xyz -> I do xyz  
We give it the full context of the character as was originally pointed out in the initial prompt to hopefully get things such as nicknames or similar in the context as well.

In [None]:
def rewrite_perspective(char, message):
    """Rewrite one piece of narration into first person, present tense.

    Args:
        char: Name of the character who is acting in ``message``.
        message: The narration to rewrite (no quoted speech).

    Returns:
        The model's rewritten text. A ``message`` that is empty or only
        whitespace is returned unchanged without calling the model.
    """
    # Nothing to rewrite; a request on empty text would only invite invention.
    if not message.strip():
        return message

    messages = [
        { "role": "system", "content": system_prompt },
        # Both worked examples are about Erik, so they name Erik throughout
        # rather than mixing in the character of the current request.
        { "role": "user", "content": """Please rewrite the following line to be from a first person present tense perspective. The character acting out the following line is Erik
Avoid using the characters name in the rewritten text, keep in mind that pronouns may refer to other characters and should as such stay the same.
Do not clarify who Erik is, like with "I, Erik" or "I, occupation"

Here is the line to rewrite:
says Erik""" },
        { "role": "assistant", "content": "I say" },
        { "role": "user", "content": """Please rewrite the following line to be from a first person present tense perspective. The character acting out the following line is Erik
Avoid using the characters name in the rewritten text, keep in mind that pronouns may refer to other characters and should as such stay the same.
Do not clarify who Erik is, like with "I, Erik" or "I, occupation"

Here is the line to rewrite:
When Erik returns to the bedroom to get his clothes,""" },
        { "role": "assistant", "content": "When I return to the bedroom to get my clothes," },
        { "role": "user", "content": f"""Please rewrite the following line to be from a first person present tense perspective. The character acting out the following line is {char}
Avoid using the characters name in the rewritten text, keep in mind that pronouns may refer to other characters and should as such stay the same.
Do not clarify who {char} is, like with "I, {char}" or "I, occupation"
         
Here is the line to rewrite:
{message}""" },
    ]

    return generate(messages)

def rewrite_perspective_conversation(conversation, characters=None):
    """Rewrite the narration of every message into the speaker's first person.

    Quoted speech is kept verbatim; only the unquoted parts of each message
    are passed to ``rewrite_perspective``. The parts are then joined back in
    their original order with single spaces.

    Args:
        conversation: List of ``{"from": name, "value": text}`` dicts.
        characters: Optional character list. Accepted so that the call with
            two arguments made by the pipeline cell works; it is not used here.

    Returns:
        A new list of ``{"from": name, "value": rewritten_text}`` dicts.
    """
    results = []
    for message in conversation:
        rewritten = []
        for part in split_quoted_and_non_quoted(message["value"]):
            if part["type"] == "unquoted":
                # An empty stretch of narration has nothing to rewrite.
                if not part["value"]:
                    continue
                content = rewrite_perspective(message["from"], part["value"])
            else:
                # Speech keeps its wording; only the quotes are restored.
                content = f'"{part["value"]}"'
            rewritten.append(content)
        results.append({ "from": message["from"], "value": " ".join(rewritten) })
    return results

Concatenate consecutive messages from the same character

In [None]:
def merge_speakers(conversation):
    """Merge runs of consecutive messages that share the same speaker.

    Args:
        conversation: List of ``{"from": name, "value": text}`` dicts.

    Returns:
        A new list in which each run of messages with the same ``from`` is one
        dict whose ``value`` is the run's values joined by single spaces. An
        empty conversation gives an empty list. The input dicts are not
        modified.
    """
    results = []
    for message in conversation:
        # Compare against the last emitted turn, so that a speaker with an
        # empty name is still kept as a turn of its own.
        if results and results[-1]["from"] == message["from"]:
            results[-1]["value"] += f" {message['value']}"
        else:
            results.append({"from": message["from"], "value": message["value"]})
    return results

format into ChatML for visualization purposes

In [None]:
def chatml_dbg(conversation):
    """Print ``conversation`` as ChatML-style turns, one turn after another.

    Every message becomes ``<|im_start|>`` followed by the speaker, a newline,
    the four-space-indented text and ``<|im_end|>``.
    """
    rendered = []
    for entry in conversation:
        rendered.append(f"<|im_start|>{entry['from']}\n    {entry['value']}<|im_end|>")
    print("\n".join(rendered))

In [None]:
import json
from glob import glob
from tqdm import tqdm

def extract_number(chapter):
    """Return the first run of digits in ``chapter`` as an int, or 0 if there is none."""
    digits = re.search(r'\d+', chapter)
    return int(digits.group()) if digits else 0

# Run the whole pipeline over every chapter file, in chapter-number order, and
# write the resulting messages to one JSON Lines file (one message per line).
with open("larskepler_thehypnotist.jsonl", "w") as outfile:
    for chapter_filename in tqdm(sorted(glob("out/chapter_*.txt"), key=extract_number)):
        print(f"""Chapter: {chapter_filename.replace("out/", "")}""")
        # Read the text and close the file right away; the stages below make
        # many model requests and do not need the handle.
        with open(chapter_filename, "r") as chapter_file:
            chapter = chapter_file.read()

        print("Getting characters...")
        characters = get_characters(chapter)
        print("Marking speakers...")
        marked_text = mark_speakers(chapter, characters)
        print("Interrogating about speakers...")
        conversation = interrogate_speakers(marked_text, characters)
        print("Clarifying Speakers...")
        conversation = clarify_character(conversation, characters)
        print("Rewriting perspective...")
        # Only the conversation is passed: the rewrite step works from each
        # message's own speaker name.
        conversation = rewrite_perspective_conversation(conversation)
        print("Merging speakers...")
        conversation = merge_speakers(conversation)

        print("Writing conversation...")
        for message in conversation:
            outfile.write(json.dumps(message) + "\n")