In [1]:
    import pandas as pd
    import numpy as np
    from glob import glob
    import re
    import nltk
    import plotly_express as px
    import configparser
    import os
    # Parse the course environment file.
    # NOTE(review): none of the parsed values are read in the visible cells; the
    # directories below are hard-coded instead.
    config = configparser.ConfigParser()
    config.read("../../../env-sample.ini")
    # Absolute, machine-specific locations of the corpus files, the output folder and
    # the course helper library.
    # NOTE(review): these paths only resolve on the original author's machine; consider
    # sourcing them from `config` or a relative project directory.
    data_home = '/Users/muhur/OneDrive/Desktop/Muhurto/Data Science Grad School/DS5001/KafkaFinal/data'
    output_dir = '/Users/muhur/OneDrive/Desktop/Muhurto/Data Science Grad School/DS5001/KafkaFinal/output'  # not referenced in the visible cells
    local_lib = '/Users/muhur/OneDrive/Desktop/Muhurto/Data Science Grad School/DS5001/DS5001-2025-01-R/lessons/lib'
    # Put the helper directory on the import path before importing from it.
    import sys
    sys.path.append(local_lib)
    from textparser import TextParser  # presumably provided by local_lib (TODO confirm); not used in the visible cells

    # Patterns matching the "*** START OF" / "*** END OF" marker lines.
    # They are not used in this cell; presumably they are handed to TextParser
    # later to clip boilerplate around the body text (TODO confirm).
    clip_pats = [
        r"\*\*\*\s*START OF",
        r"\*\*\*\s*END OF"
    ]

    # Reusable fragments for heading patterns.
    roman = '[IVXLCM]+'
    caps = "[A-Z';, -]+"  # not referenced by any pattern below; kept for any external use

    # One (book_id, chapter-heading regex) pair per work.
    # "NOCHAPTERS" is a placeholder for works that have no chapter pattern; if it
    # is used as a regex it only matches that literal text.
    #
    # An earlier `ohco_pat_list` literal used to be defined here and was then
    # unconditionally overwritten by `chapter_regexes`. That dead table has been
    # removed so this is the single source of truth; the bindings of
    # `chapter_regexes` and `ohco_pat_list` are unchanged.
    chapter_regexes = [
        (5200,   rf"^\s*{roman}\s*$"),  # Metamorphosis
        (7849,   r"^\s*Chapter\s+(?:One|Two|Three|Four|Five|Six|Seven|Eight|Nine|Ten)\s*$"),  # The Trial
        (6969,   r"^\s*\d+\s*$"),  # The Castle
        (6262,   r"^\s*\d+\s*$"),  # Amerika
        (6161,   "NOCHAPTERS"),  # The Judgement
        (6060,   "NOCHAPTERS"),  # Dearest Father
        (6363,   "NOCHAPTERS"),  # In the Penal Colony
        (6464,   "NOCHAPTERS"),  # The Hunger Artist
        (6565,   "NOCHAPTERS"),  # The Jackals and Arabs
        (6666,   "NOCHAPTERS"),  # A Country Doctor
        (6767,   "NOCHAPTERS"),  # An Imperial Message
        (5959,   "NOCHAPTERS"),  # A Report for an Academy
        (5858,   "NOCHAPTERS"),  # The Great Wall of China
        (5757,   "NOCHAPTERS"),  # The Hunter Gracchus
        (5656,   "NOCHAPTERS"),  # Up in the Gallery
        (5555,   "NOCHAPTERS"),  # Before the Law
        (5454,   "NOCHAPTERS"),  # Josephine the Songstress
        (5353,   "NOCHAPTERS"),  # The Burrow
        (5252,   "NOCHAPTERS"),  # Blumfeld
        # Meditation: a line consisting solely of one of the listed piece titles.
        (23532,  r"^(Children on the country road|Unmasking a con artist|The Sudden Walk|Resolutions|The trip to the mountains|The Bachelor's Misfortune|The Merchant|Distracted Looking Out|The Way Home|The Passers-by|Passenger|Dresses|The rejection|Food for thought for gentlemen riders|The Alley Window|Desire to become an Indian|The Trees|Unhappiness)$")
    ]
    # Alias under the name the rest of the notebook reads.
    ohco_pat_list = chapter_regexes
    # List every file in the corpus directory (sorted so the order is stable).
    source_files = f'{data_home}'
    source_file_list = sorted(glob(f"{source_files}/*.*"))

    # A corpus file must be named exactly like 'pg5353.txt'; the digits are the book id.
    book_file_pat = re.compile(r"pg(\d+)\.txt")

    book_data = []
    for source_file_path in source_file_list:
        # File name only, e.g. 'pg5353.txt'
        filename = os.path.basename(source_file_path)
        id_match = book_file_pat.fullmatch(filename)
        if id_match is None:
            # The directory glob also returns stray files (sync/metadata files,
            # notes, ...). Previously such a name crashed the int() conversion;
            # now it is reported and skipped.
            print(f"Skipping non-book file: {source_file_path}")
            continue
        book_id = int(id_match.group(1))
        # Raw title is the file name without its extension, e.g. 'pg5353'
        book_title = os.path.splitext(filename)[0]
        # One record per book: (book_id, path, raw title)
        book_data.append((book_id, source_file_path, book_title))

    # Library table indexed by book_id. Explicit column names keep the frame
    # well-formed even when no book file was found.
    LIB = pd.DataFrame(book_data, columns=['book_id', 'source_file_path', 'raw_title']) \
            .set_index('book_id') \
            .sort_index()
    # Human-readable titles keyed by book id.
    book_titles = {
        5200: "Metamorphosis",
        7849: "The Trial",
        6969: "The Castle",
        6262: "Amerika",
        6161: "The Judgement",
        6060: "Dearest Father",
        6363: "In the Penal Colony",
        6464: "The Hunger Artist",
        6565: "The Jackals and Arabs",
        6666: "A Country Doctor",
        6767: "An Imperial Message",
        5959: "A Report for an Academy",
        5858: "The Great Wall of China",
        5757: "The Hunter Gracchus",
        5656: "Up in the Gallery",
        5555: "Before the Law",
        5454: "Josephine the Songstress",
        5353: "The Burrow",
        5252: "Blumfeld",
        23532: "Meditation"
    }
    # Re-key by raw title ('pg<id>') so the mapping can be applied to LIB.raw_title.
    book_titles = {f'pg{key}': value for key, value in book_titles.items()}

    # LIB is rebuilt earlier in this cell, so `raw_title` is always present here.
    # The former `try/except AttributeError: pass` could only hide a real
    # failure (and defer it to a confusing KeyError on LIB['title'] further
    # down), so the steps now run unguarded.
    LIB['author'] = 'KAFKA, FRANZ'
    # Raw titles without an entry in book_titles are kept as-is (upper-cased).
    LIB['title'] = LIB['raw_title'].replace(book_titles).str.upper()
    LIB = LIB.drop('raw_title', axis=1)

    # Attach each book's chapter regex by book_id; ids without an entry get NaN.
    LIB['chap_regex'] = LIB.index.map(pd.Series(dict(ohco_pat_list)))
# Lower-cased titles grouped under the work-type label they receive.
titles_by_work_type = {
    "novel": [
        "amerika",
        "the castle",
        "the trial",
    ],
    "novella": [
        "metamorphosis",
        "the burrow",
        "in the penal colony",
        "the hunger artist",
    ],
    "short story": [
        "blumfeld",
        "josephine the songstress",
        "before the law",
        "up in the gallery",
        "the hunter gracchus",
        "the great wall of china",
        "a report for an academy",
        "the judgement",
        "the jackals and arabs",
        "a country doctor",
        "an imperial message",
        "meditation",
    ],
    "letter": [
        "dearest father",
    ],
}
# Flatten to the title -> label lookup used for the replacement.
replacemap = {
    title: label
    for label, titles in titles_by_work_type.items()
    for title in titles
}
# Titles absent from the lookup keep their lower-cased title as the value.
LIB['work_type'] = LIB['title'].str.lower().replace(replacemap)
LIB

# Lower-cased title -> "named" / "unnamed" label for the protagonist_name column.
kafka_named_character_map = dict.fromkeys(
    [
        "amerika",
        "the trial",
        "metamorphosis",
        "blumfeld",
        "josephine the songstress",
        "the hunter gracchus",
        "the judgement",
        "dearest father",
    ],
    "named",
)
kafka_named_character_map.update(
    dict.fromkeys(
        [
            "the castle",
            "the burrow",
            "in the penal colony",
            "the hunger artist",
            "before the law",
            "up in the gallery",
            "the great wall of china",
            "a report for an academy",
            "the jackals and arabs",
            "a country doctor",
            "an imperial message",
            "meditation",
        ],
        "unnamed",
    )
)
# Titles absent from the lookup keep their lower-cased title as the value.
LIB['protagonist_name'] = LIB['title'].str.lower().replace(kafka_named_character_map)
# Lower-cased titles split by the ending label they receive.
not_happy_titles = (
    "the castle",
    "the trial",
    "metamorphosis",
    "the burrow",
    "in the penal colony",
    "the hunger artist",
    "josephine the songstress",
    "before the law",
    "the hunter gracchus",
    "the judgement",
    "the jackals and arabs",
    "a country doctor",
    "an imperial message",
    "dearest father",
)
ambiguous_titles = (
    "amerika",
    "blumfeld",
    "up in the gallery",
    "the great wall of china",
    "a report for an academy",
    "meditation",
)
# Title -> ending label lookup.
kafka_ending_map = {
    **dict.fromkeys(not_happy_titles, "not happy"),
    **dict.fromkeys(ambiguous_titles, "ambiguous"),
}

# Titles absent from the lookup keep their lower-cased title as the value.
LIB['ending_type'] = LIB['title'].str.lower().replace(kafka_ending_map)
LIB

Unnamed: 0_level_0,source_file_path,author,title,chap_regex,work_type,protagonist_name,ending_type
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
5200,/Users/muhur/OneDrive/Desktop/Muhurto/Data Sci...,"KAFKA, FRANZ",METAMORPHOSIS,^\s*[IVXLCM]+\s*$,novella,named,not happy
5252,/Users/muhur/OneDrive/Desktop/Muhurto/Data Sci...,"KAFKA, FRANZ",BLUMFELD,NOCHAPTERS,short story,named,ambiguous
5353,/Users/muhur/OneDrive/Desktop/Muhurto/Data Sci...,"KAFKA, FRANZ",THE BURROW,NOCHAPTERS,novella,unnamed,not happy
5454,/Users/muhur/OneDrive/Desktop/Muhurto/Data Sci...,"KAFKA, FRANZ",JOSEPHINE THE SONGSTRESS,NOCHAPTERS,short story,named,not happy
5555,/Users/muhur/OneDrive/Desktop/Muhurto/Data Sci...,"KAFKA, FRANZ",BEFORE THE LAW,NOCHAPTERS,short story,unnamed,not happy
5656,/Users/muhur/OneDrive/Desktop/Muhurto/Data Sci...,"KAFKA, FRANZ",UP IN THE GALLERY,NOCHAPTERS,short story,unnamed,ambiguous
5757,/Users/muhur/OneDrive/Desktop/Muhurto/Data Sci...,"KAFKA, FRANZ",THE HUNTER GRACCHUS,NOCHAPTERS,short story,named,not happy
5858,/Users/muhur/OneDrive/Desktop/Muhurto/Data Sci...,"KAFKA, FRANZ",THE GREAT WALL OF CHINA,NOCHAPTERS,short story,unnamed,ambiguous
5959,/Users/muhur/OneDrive/Desktop/Muhurto/Data Sci...,"KAFKA, FRANZ",A REPORT FOR AN ACADEMY,NOCHAPTERS,short story,unnamed,ambiguous
6060,/Users/muhur/OneDrive/Desktop/Muhurto/Data Sci...,"KAFKA, FRANZ",DEAREST FATHER,NOCHAPTERS,letter,named,not happy


In [2]:
import os

def average_document_length_in_characters(file_paths):
    """Return the mean number of characters across the readable files.

    Each path that is an existing regular file is read as UTF-8 text and its
    length in characters is recorded. Any other path is reported on stdout
    with a "File not found" message and left out of the average.

    Args:
        file_paths: iterable of file system paths.

    Returns:
        The average character count of the files that were read, or 0 when
        none of the paths could be read.
    """
    lengths = []

    for path in file_paths:
        if not os.path.isfile(path):
            print(f"File not found: {path}")
            continue
        with open(path, 'r', encoding='utf-8') as handle:
            lengths.append(len(handle.read()))

    if not lengths:
        return 0

    return sum(lengths) / len(lengths)

# Average size, in characters, of every corpus file registered in LIB.
source_file_paths = LIB.loc[:, 'source_file_path'].tolist()
avg_length = average_document_length_in_characters(source_file_paths)

print(f"Average document length: {avg_length:.2f} characters")


Average document length: 117223.80 characters


In [3]:
#LIB.to_csv('LIB.csv')