# Plato Texts to pkl files 
Each `.pkl` file contains a `list[str]`.

Run this notebook to convert Plato's texts to lists of strings. Each element of a list is intended to be placed in its own notebook cell. The texts were downloaded from gutenberg.org and lightly edited to make processing easier (e.g., removing the header/footer and stray newlines).


In [1]:
from pathlib import Path
import dill
from utils import create_dialogue_entry, TITLES

# Make sure the output folder for the pickle files exists before any cell writes to it.
# exist_ok=True keeps re-running this cell harmless when the folder is already there.
Path("pickles").mkdir(parents=False, exist_ok=True)


## Euthyphro

In [None]:
# Load the Euthyphro text and split it into blank-line separated paragraphs.
# The "utf-8-sig" codec drops a leading byte-order mark if the file has one.
with open(TITLES["Euthyphro"]["text"], "r", encoding="utf-8-sig") as file:
    # strip() removes leading/trailing whitespace so the final paragraph does not
    # carry trailing newlines into the pickle (same handling as the other dialogues).
    text_content = file.read().strip().split("\n\n")

# Verify that the paragraphs strictly alternate EUTHYPHRO / SOCRATES, starting with
# EUTHYPHRO. Every paragraph is checked, including the last one, so a malformed
# ending cannot slip through unnoticed.
for i, paragraph in enumerate(text_content):
    expected_speaker = "EUTHYPHRO:" if i % 2 == 0 else "SOCRATES:"
    if not paragraph.startswith(expected_speaker):
        raise ValueError(f"Text at index {i} is not an EUTHYPHRO-SOCRATES pair. {text_content[i]}")

# The check above guarantees strict alternation, so each EUTHYPHRO paragraph can be
# paired with the paragraph that follows it. A trailing EUTHYPHRO paragraph with no
# reply is paired with None.
dialogue_pairs = []
for i, text in enumerate(text_content):
    if text.startswith("EUTHYPHRO:"):
        dialogue_pairs.append(
            create_dialogue_entry(
                text,
                text_content[i + 1] if i + 1 < len(text_content) else None,
            )
        )


# Save the dialogue pairs to a binary file using dill
with open(TITLES["Euthyphro"]["pickle"], "wb") as file:
    dill.dump(dialogue_pairs, file)


## Apology


In [None]:
# Read the whole Apology text; "utf-8-sig" drops a leading byte-order mark if present.
with open(TITLES["Apology"]["text"], "r", encoding="utf-8-sig") as source_file:
    raw_text = source_file.read()

# Paragraphs are separated by a blank line. No speaker validation is needed here:
# every paragraph is stored as-is in the list, so nothing is skipped.
text_content = raw_text.strip().split("\n\n")

# Persist the list of paragraphs with dill.
with open(TITLES["Apology"]["pickle"], "wb") as pickle_file:
    dill.dump(text_content, pickle_file)


## Crito


In [None]:
# Load the Crito text and split it into blank-line separated paragraphs.
# The "utf-8-sig" codec drops a leading byte-order mark if the file has one.
with open(TITLES["Crito"]["text"], "r", encoding="utf-8-sig") as file:
    text_content = file.read().strip().split("\n\n")

# Verify that the paragraphs strictly alternate SOCRATES / CRITO, starting with
# SOCRATES. Every paragraph is checked, including the last one, so a malformed
# ending cannot slip through unnoticed.
for i, paragraph in enumerate(text_content):
    expected_speaker = "SOCRATES:" if i % 2 == 0 else "CRITO:"
    if not paragraph.startswith(expected_speaker):
        raise ValueError(f"Text at index {i} is not a SOCRATES-CRITO pair. {text_content[i]}")

# The check above guarantees strict alternation, so each SOCRATES paragraph can be
# paired with the paragraph that follows it. A trailing SOCRATES paragraph with no
# reply is paired with None.
dialogue_pairs = []
for i, text in enumerate(text_content):
    if text.startswith("SOCRATES:"):
        dialogue_pairs.append(
            create_dialogue_entry(
                text,
                text_content[i + 1] if i + 1 < len(text_content) else None,
            )
        )


# Save the dialogue pairs to a binary file using dill
with open(TITLES["Crito"]["pickle"], "wb") as file:
    dill.dump(dialogue_pairs, file)


## Phaedo

In [None]:
# Load the Phaedo text and split it into blank-line separated paragraphs.
# The "utf-8-sig" codec drops a leading byte-order mark if the file has one.
with open(TITLES["Phaedo"]["text"], "r", encoding="utf-8-sig") as file:
    text_content = file.read().strip().split("\n\n")

# Verify that every ECHECRATES paragraph is immediately answered by a PHAEDO paragraph.
# The bounds check makes a trailing ECHECRATES paragraph fail with the same ValueError
# instead of an IndexError.
for i, line in enumerate(text_content):
    if line.startswith("ECHECRATES:"):
        has_reply = i + 1 < len(text_content) and text_content[i + 1].startswith("PHAEDO:")
        if not has_reply:
            raise ValueError(f"Text at index {i} is not a ECHECRATES-PHAEDO pair. {text_content[i]}")

# Paragraphs without a speaker prefix are narration. Narration of at least this many
# characters becomes its own entry; shorter fragments are merged into the previous entry.
MIN_STANDALONE_LENGTH = 140

# Extract ECHECRATES-PHAEDO pairs and handle the narrated sections of the Phaedo.
dialogue_pairs = []
for i, text in enumerate(text_content):
    if text.startswith("ECHECRATES:"):
        # The validation above guarantees that the next paragraph exists and is PHAEDO's reply.
        dialogue_pairs.append(create_dialogue_entry(text, text_content[i + 1]))

    elif text.startswith("PHAEDO:") and i > 0 and text_content[i - 1].startswith("ECHECRATES:"):
        # Already stored as the reply half of the preceding ECHECRATES entry.
        continue

    elif len(text) >= MIN_STANDALONE_LENGTH:
        # Long narration (or a PHAEDO paragraph that answers no question) is kept whole.
        dialogue_pairs.append(text)

    elif dialogue_pairs:
        # Short fragment: append it to the previous entry.
        # NOTE(review): this assumes create_dialogue_entry returns a str — confirm in utils.
        dialogue_pairs[-1] += " " + text

    else:
        # A short fragment at the very start has no previous entry to be merged into.
        raise ValueError(
            f"Text at index {i} is too short to stand alone and has no previous entry to attach to. {text}"
        )

# Save the dialogue pairs to a binary file using dill
with open(TITLES["Phaedo"]["pickle"], "wb") as file:
    dill.dump(dialogue_pairs, file)
