### BooookScore Summarization and Evaluation

In [3]:
import json
import os
import subprocess
import sys

from booookscore.score import Scorer

# Define variables
# book_name = "reminiscences-of-pioneer-days-in-st-paul"
book_name = "a_brief_history_of_time"
model = "gpt-4o"
# model = "gpt-3.5-turbo"
method = "hier"  # or "increasing"

# Configuration variables
api = "openai"
api_key_path = "api.txt"
chunk_size = 4096
max_context_len = 8192

# Create results directory structure
data_dir = "../../../data/full_content_parsed_epubs/"
results_dir = "results/" + model + "/" + book_name + "/" + method + "/"

pickle_book_path = data_dir + book_name + ".pkl"
chunked_output_path = results_dir + "chunked_book.pkl"
summaries_output_path = results_dir + "summaries.json"
postprocessed_summaries_output_path = summaries_output_path.replace('.json', '_cleaned.json')
annotations_output_path = results_dir + "annotations.json"
score_output_path = results_dir + "score.json"


# Ensure the directories exist
os.makedirs(results_dir, exist_ok=True)

# Define commands
chunk_command = (f"python -m booookscore.chunk --chunk_size {chunk_size} --input_path {pickle_book_path} --output_path {chunked_output_path}")
summ_command = (
    f"python -m booookscore.summ --book_path {chunked_output_path} --summ_path {summaries_output_path} --model {model} --api {api} "
    f"--api_key {api_key_path} --method {method} --chunk_size {chunk_size} --max_context_len {max_context_len}"
)
postprocess_command = (f"python -m booookscore.postprocess --input_path {summaries_output_path}")

# Execute commands
print("Chunking the book...")
os.system(chunk_command)

print("Summarizing the book...")
os.system(summ_command)

print("Postprocessing the summaries...")
os.system(postprocess_command)

print("Scoring the summaries...")
scorer = Scorer(model=model,api=api,api_key=api_key_path,summ_path=postprocessed_summaries_output_path,annot_path=annotations_output_path,
    template_path="prompts/get_annotations.txt",v2=False,)
score = scorer.get_score()
print(f"BooookScore = {score}")

# Open the summaries json file which contains a dictionary of the summaries 
with open(postprocessed_summaries_output_path, "r") as f:
    summary_dict = json.load(f)
n_words = len(summary_dict[book_name].split())
print(f"Summary Length (Words): {n_words}")

# Save the score and summary length to a json file
results = {"BooookScore": score, "n_words": n_words}
with open(score_output_path, "w") as f:
    json.dump(results, f, indent=4)

print(f"All files have been saved to {results_dir}")

Chunking the book...


  0%|          | 0/1 [00:00<?, ?it/s]
  0%|          | 0/108 [00:00<?, ?it/s][A
 11%|█         | 12/108 [00:03<00:25,  3.84it/s][A
 17%|█▋        | 18/108 [00:05<00:27,  3.29it/s][A
 29%|██▊       | 31/108 [00:08<00:21,  3.54it/s][A
 35%|███▌      | 38/108 [00:11<00:22,  3.08it/s][A
 44%|████▎     | 47/108 [00:12<00:15,  3.90it/s][A
 49%|████▉     | 53/108 [00:17<00:21,  2.56it/s][A
 55%|█████▍    | 59/108 [00:17<00:14,  3.40it/s][A
100%|██████████| 108/108 [00:18<00:00,  5.90it/s][A


a_brief_history_of_time chunk sizes: [3101, 4084, 4094, 4068, 4093, 1621, 4087, 4093, 4068, 4096, 4066, 4085, 4073, 4075, 4070, 4060, 4083, 3492, 4080, 4065, 2673]


100%|██████████| 1/1 [00:18<00:00, 18.55s/it]


Summarizing the book...


Iterating over books: 100%|██████████| 1/1 [02:34<00:00, 154.49s/it]


Postprocessing the summaries...


Iterating over books: 100%|██████████| 1/1 [00:00<00:00, 6887.20it/s]


Scoring the summaries...
No annotations found, getting annotations...


Iterating over sentences: 100%|██████████| 27/27 [04:37<00:00, 10.26s/it]
Iterating over summaries: 100%|██████████| 1/1 [04:37<00:00, 277.06s/it]

BooookScore = 1.0
Summary Length (Words): 610
All files have been saved to results/gpt-4o/a_brief_history_of_time/hier/



