### BooookScore Summarization and Evaluation

In [10]:
import json
import os
import subprocess
import sys

from booookscore.score import Scorer

# --- Run selection ---------------------------------------------------------
# The book, model and summarization method for this run. All three are also
# used as path components of the results directory below.
book_name = "eat_that_frog"
model = "gpt-4o"  # alternative tried previously: "gpt-3.5-turbo"
method = "hier"  # or "increasing"

# --- API and chunking settings ---------------------------------------------
api = "openai"
api_key_path = "api.txt"
chunk_size = 4096
max_context_len = 8192

# --- Input / output locations ----------------------------------------------
# Every artifact of a run lives under results/<model>/<book_name>/<method>/.
data_dir = "../../../data/full_content_parsed_epubs/"
results_dir = f"results/{model}/{book_name}/{method}/"

pickle_book_path = f"{data_dir}{book_name}.pkl"
chunked_output_path = f"{results_dir}chunked_book.pkl"
summaries_output_path = f"{results_dir}summaries.json"
# Derived from the summaries path by substituting ".json" -> "_cleaned.json".
postprocessed_summaries_output_path = summaries_output_path.replace(
    ".json", "_cleaned.json"
)
annotations_output_path = f"{results_dir}annotations.json"
score_output_path = f"{results_dir}score.json"

# Create the results directory (and any missing parents); a directory that
# already exists is not an error.
os.makedirs(results_dir, exist_ok=True)

# Define the pipeline stages as argument lists rather than shell strings, so
# no shell is involved and paths or model names containing spaces or shell
# metacharacters are passed to each stage verbatim.
# sys.executable launches the stages with the interpreter running this cell,
# instead of whichever `python` happens to be first on PATH.
chunk_command = [
    sys.executable, "-m", "booookscore.chunk",
    "--chunk_size", str(chunk_size),
    "--input_path", pickle_book_path,
    "--output_path", chunked_output_path,
]
summ_command = [
    sys.executable, "-m", "booookscore.summ",
    "--book_path", chunked_output_path,
    "--summ_path", summaries_output_path,
    "--model", model,
    "--api", api,
    "--api_key", api_key_path,
    "--method", method,
    "--chunk_size", str(chunk_size),
    "--max_context_len", str(max_context_len),
]
postprocess_command = [
    sys.executable, "-m", "booookscore.postprocess",
    "--input_path", summaries_output_path,
]

# Execute the stages in order. Each stage consumes the file written by the
# previous one, so check=True is used: a non-zero exit status raises
# subprocess.CalledProcessError and stops the run, instead of letting later
# stages operate on missing or stale files (os.system silently ignored
# failures).
pipeline_stages = (
    ("Chunking the book...", chunk_command),
    ("Summarizing the book...", summ_command),
    ("Postprocessing the summaries...", postprocess_command),
)
for stage_message, stage_command in pipeline_stages:
    # Flush so the banner is emitted before the child process starts writing.
    print(stage_message, flush=True)
    subprocess.run(stage_command, check=True)

# Score the postprocessed summaries; annotations are handled by the Scorer
# using the annotation path and prompt template given here.
print("Scoring the summaries...")
scorer = Scorer(
    model=model,
    api=api,
    api_key=api_key_path,
    summ_path=postprocessed_summaries_output_path,
    annot_path=annotations_output_path,
    template_path="prompts/get_annotations.txt",
    v2=False,
)
score = scorer.get_score()
print(f"BooookScore = {score}")

# Load the postprocessed summaries (a JSON object indexed here by book name)
# and count the whitespace-separated words in this book's summary.
with open(postprocessed_summaries_output_path, "r") as summaries_file:
    summary_dict = json.load(summaries_file)
summary_words = summary_dict[book_name].split()
n_words = len(summary_words)
print(f"Summary Length (Words): {n_words}")

# Persist the score together with the summary length as JSON.
results = {"BooookScore": score, "n_words": n_words}
with open(score_output_path, "w") as score_file:
    json.dump(results, score_file, indent=4)

print(f"All files have been saved to {results_dir}")

Chunking the book...


  0%|          | 0/1 [00:00<?, ?it/s]
  0%|          | 0/1034 [00:00<?, ?it/s][A
 10%|█         | 108/1034 [00:00<00:00, 1058.52it/s][A
 21%|██        | 214/1034 [00:00<00:01, 631.68it/s] [A
 28%|██▊       | 287/1034 [00:00<00:01, 621.48it/s][A
 34%|███▍      | 354/1034 [00:00<00:01, 556.35it/s][A
 40%|███▉      | 413/1034 [00:00<00:01, 530.07it/s][A
 47%|████▋     | 483/1034 [00:00<00:00, 575.26it/s][A
 53%|█████▎    | 543/1034 [00:00<00:00, 528.12it/s][A
 59%|█████▉    | 608/1034 [00:01<00:00, 557.44it/s][A
 65%|██████▍   | 667/1034 [00:01<00:00, 565.82it/s][A
 70%|███████   | 725/1034 [00:01<00:00, 561.54it/s][A
 77%|███████▋  | 798/1034 [00:01<00:00, 609.26it/s][A
 84%|████████▍ | 873/1034 [00:01<00:00, 648.00it/s][A
 91%|█████████ | 939/1034 [00:01<00:00, 595.69it/s][A
100%|██████████| 1034/1034 [00:01<00:00, 556.72it/s][A
100%|██████████| 1/1 [00:01<00:00,  1.98s/it]


eat_that_frog chunk sizes: [4050, 4071, 4071, 4071, 4091, 4030, 4063, 3805]
Summarizing the book...


Iterating over books:   0%|          | 0/1 [00:00<?, ?it/s]