In [None]:
import os
from booknlp.booknlp import BookNLP
import pandas as pd
import numpy as np
import json
from collections import Counter, defaultdict
from tqdm import tqdm
import string
from unidecode import unidecode

In [None]:
# --- Configuration ---
# Inputs
# Directory containing the .txt files to process
input_dir = "Your_Input_DIR"
# CSV of target locations; the loader reads the columns
# Location, X_Coord, Y_Coord, Z_Coord and Aliases (aliases separated by ';')
location_coords_file = "Your_location_file.csv"

# Outputs
# Base directory for BookNLP outputs (one sub-directory per book)
output_base_dir = "booknlp_output"
# Directory for the per-book edge/node CSV files
analysis_output_dir = "analysis_data_output"

# Context window: number of sentences taken before AND after the sentence
# that mentions a location
window_size = 1

In [None]:
# --- Initialize BookNLP ---
# One shared BookNLP instance is created here and reused for every book.
model_params = dict(
    pipeline="entity,quote,supersense,event,coref",
    model="big",
)
booknlp = BookNLP("en", model_params)

# Both output locations must exist before the per-book loop writes into them
# (exist_ok=True makes re-running this cell harmless).
os.makedirs(output_base_dir, exist_ok=True)
os.makedirs(analysis_output_dir, exist_ok=True)


In [None]:
# Normalize accents, remove leading/trailing whitespace and punctuation, lowercase
def normalize_text(text):
    """Return a canonical lookup key for a location name or a token.

    Parameters
    ----------
    text : any
        Raw value (typically a string). Missing values (None / NaN) are accepted.

    Returns
    -------
    str
        ASCII-transliterated, lower-cased text with leading/trailing whitespace
        and ASCII punctuation removed; '' for missing input. Characters inside
        the text (e.g. the dot and space in "St. Petersburg") are kept.
    """
    if pd.isna(text):
        return ''
    # Transliterate first: string.punctuation only contains ASCII characters, so
    # stripping before transliteration would let non-ASCII punctuation through
    # and leave its ASCII replacement on the key.
    ascii_text = unidecode(str(text))
    # Strip whitespace and punctuation in one pass so that mixed edges such as
    # "London ." or "( Paris )" are fully cleaned.
    return ascii_text.strip(string.punctuation + string.whitespace).lower()

# Load locations with coordinates and create a lookup for aliases.
# location_lookup maps a normalized canonical name or alias to
# {"canonical_name", "x", "y", "z"}; canonical_locs holds the normalized canonical names.
location_lookup = {}
canonical_locs = set()
locations_df = pd.read_csv(location_coords_file, encoding='utf-8')

skipped_location_rows = 0
for _, row in locations_df.iterrows():
    can_raw = row['Location']
    canonical = normalize_text(can_raw)
    if not canonical:
        # A missing / punctuation-only name normalizes to ''. Storing it would
        # make every token that also normalizes to '' (e.g. punctuation) look
        # like a location mention, so the row is ignored.
        skipped_location_rows += 1
        continue
    x, y, z = row['X_Coord'], row['Y_Coord'], row['Z_Coord']
    canonical_locs.add(canonical)
    # Store canonical in lookup
    location_lookup[canonical] = {"canonical_name": can_raw, "x": x, "y": y, "z": z}
    # Add all aliases (';'-separated), normalized. row.get tolerates a CSV
    # that has no 'Aliases' column at all.
    raw_aliases = row.get('Aliases')
    aliases = "" if pd.isna(raw_aliases) else str(raw_aliases)
    for alias in aliases.split(';'):
        alias_norm = normalize_text(alias)
        if not alias_norm:
            # Empty or punctuation-only alias: same reasoning as above
            continue
        # NOTE: on a key collision the entry written last wins
        location_lookup[alias_norm] = {"canonical_name": can_raw, "x": x, "y": y, "z": z}

if skipped_location_rows:
    print(f"Skipped {skipped_location_rows} location rows with an empty name.")
print(f"Loaded {len(canonical_locs)} geocoded canonical locations and all normalized aliases.")


In [None]:
# Define "Helper" Functions

# ------------------- Supersense Helper -----------------------
def make_supersense_lookup(tokens, supersense_df):
    """Map every token index covered by a supersense span to its category.

    `tokens` is accepted for call compatibility but is not used. Each row of
    `supersense_df` contributes the inclusive range start_token..end_token;
    when spans overlap, the row that comes later in the frame wins.
    """
    lookup = {}
    for span in supersense_df.itertuples(index=False):
        first, last = int(span.start_token), int(span.end_token)
        lookup.update(dict.fromkeys(range(first, last + 1), span.supersense_category))
    return lookup

# ------------------- Label/Metadata Helpers ------------------
def get_main_label_from_entities(entities, coref):
    """Pick a display label for a character from its PER mentions.

    Returns the first PER mention text of `coref` that is not one of the listed
    pronouns; if every mention is a pronoun, the first mention text; if there
    are no mention texts at all, the placeholder "CHAR_<coref>".
    """
    pronouns = frozenset((
        "i", "me", "my", "he", "him", "his", "she", "her",
        "they", "them", "their", "you", "your", "we", "us", "our",
    ))
    # The COREF column holds integers, so the id is compared as an int
    is_character_mention = (entities['COREF'] == int(coref)) & (entities['cat'] == "PER")
    candidates = [
        str(value)
        for value in entities.loc[is_character_mention, 'text']
        if pd.notna(value)
    ]
    if not candidates:
        return f"CHAR_{coref}"
    first_non_pronoun = next(
        (name for name in candidates if name.strip().lower() not in pronouns),
        None,
    )
    return candidates[0] if first_non_pronoun is None else first_non_pronoun

def get_character_metadata_from_bookjson(book_json, coref):
    """Look up one character in the parsed .book JSON and summarise it.

    Returns a dict with keys "label", "gender", "mention_count",
    "most_common_mod" and "mod_count". Ids are compared as strings. When the
    character is not found (or there is no "characters" list) every field is
    empty / zero, including an empty-string gender; a character that is found
    but has no gender information gets gender "unknown".
    """
    wanted_id = str(coref)
    characters = book_json["characters"] if "characters" in book_json else ()
    character = next((c for c in characters if str(c["id"]) == wanted_id), None)

    if character is None:
        return {
            "label": "",
            "gender": "",
            "mention_count": 0,
            "most_common_mod": "",
            "mod_count": 0
        }

    # Gender: the "argmax" entry of "g" when present and non-empty
    gender_info = character.get("g")
    argmax = gender_info.get("argmax") if gender_info else None
    gender = argmax if argmax else "unknown"

    # Label: the "n" field of the first proper-name mention, if any
    proper_mentions = character.get("mentions", {}).get("proper", [])
    has_name = bool(proper_mentions) and "n" in proper_mentions[0]
    label = proper_mentions[0]["n"] if has_name else ""

    # Most frequent modifier, summing the per-entry counts.
    # NOTE(review): BookNLP's .book file may store modifier entries under other
    # keys than "d"/"c" (possibly "w"/"i") — confirm against a real .book file;
    # entries without "d" are ignored here.
    modifier_totals = Counter()
    for entry in character.get("mod", []):
        if "d" not in entry:
            continue
        modifier_totals[entry["d"]] += entry.get("c", 1)
    top = modifier_totals.most_common(1)
    most_common_mod, mod_count = top[0] if top else ("", 0)

    return {
        "label": label,
        "gender": gender,
        "mention_count": character.get("count", 0),
        "most_common_mod": most_common_mod,
        "mod_count": mod_count
    }


In [None]:
# -------------- MAIN LOOP: PER BOOK -----------------
# For each .txt file in `input_dir`: run BookNLP, load its outputs, extract
# character-character edges around every mention of a known location, and
# write <book_id>_edges.csv and <book_id>_nodes.csv to `analysis_output_dir`.

# Explicit column lists keep the CSV headers stable and let a book that yields
# no edges / no nodes still produce a well-formed (header-only) file instead of
# failing in drop_duplicates("Id") on a column-less frame.
EDGE_COLUMNS = [
    "Source", "Target", "Weight", "EdgeType", "Location",
    "X_Coord", "Y_Coord", "Z_Coord", "BookID", "LocationWindow_Sentence",
    "ContextWinStart", "ContextWinEnd", "Events", "Supersense", "QuoteText",
]
NODE_COLUMNS = [
    "Id", "Label", "Gender", "MentionCount", "Type", "BookID",
    "X_Coord", "Y_Coord", "Z_Coord",
]
# Dependency labels treated as the verb's subject / object
SUBJECT_RELATIONS = ['nsubj', 'nsubjpass']
OBJECT_RELATIONS = ['dobj', 'obj', 'iobj']


def _edge_record(source, target, edge_type, window_fields, events="", supersense="", quote_text=""):
    """Build one edge row; `window_fields` holds the location/window columns shared by all edges of a window."""
    return {
        "Source": source,
        "Target": target,
        "Weight": 1,
        "EdgeType": edge_type,
        **window_fields,
        "Events": events,
        "Supersense": supersense,
        "QuoteText": quote_text,
    }


# Sorted so that books are processed in a reproducible order; filtering first
# makes the progress bar count books rather than directory entries.
txt_files = sorted(name for name in os.listdir(input_dir) if name.endswith(".txt"))
for filename in tqdm(txt_files, desc="Processing Books (.txt)"):
    book_id = os.path.splitext(filename)[0]
    input_file = os.path.join(input_dir, filename)
    book_output_dir = os.path.join(output_base_dir, book_id)
    os.makedirs(book_output_dir, exist_ok=True)

    # ----- Run BookNLP -----
    try:
        booknlp.process(input_file, book_output_dir, book_id)
    except Exception as e:
        print(f"ERROR running BookNLP on {book_id}: {e}")
        continue

    # ----- Load BookNLP outputs -----
    try:
        # quoting=3 is csv.QUOTE_NONE. With the default quoting a field that is
        # (or starts with) a literal '"' opens a quoted field and swallows the
        # following tab(s), shifting every later column of that row.
        tokens = pd.read_csv(os.path.join(book_output_dir, f"{book_id}.tokens"), sep="\t", quoting=3)
        entities = pd.read_csv(os.path.join(book_output_dir, f"{book_id}.entities"), sep="\t", quoting=3)
        supersenses = pd.read_csv(os.path.join(book_output_dir, f"{book_id}.supersense"), sep="\t", quoting=3)
        tok2ss = make_supersense_lookup(tokens, supersenses)
        # The quotes file keeps the default quoting so that QuoteText values are
        # parsed exactly as before.
        # NOTE(review): consider QUOTE_NONE here too if quote rows ever look misparsed.
        quotes_path = os.path.join(book_output_dir, f"{book_id}.quotes")
        quotes = pd.read_csv(quotes_path, sep="\t") if os.path.isfile(quotes_path) else None
        book_json_path = os.path.join(book_output_dir, f"{book_id}.book")
        book_json = {}
        if os.path.isfile(book_json_path):
            with open(book_json_path, encoding='utf-8') as f:
                book_json = json.load(f)
    except Exception as e:
        print(f"ERROR loading BookNLP outputs for {book_id}: {e}")
        continue

    # ----- Per-book indexes (built once, reused by every window) -----
    per_entities = entities[entities['cat'] == "PER"]
    token2coref = {}                 # token id -> coref of the FIRST PER entity row covering it
    token2corefs = defaultdict(set)  # token id -> corefs of ALL PER entity rows covering it
    for coref_val, start_tok, end_tok in zip(
        per_entities['COREF'], per_entities['start_token'], per_entities['end_token']
    ):
        coref_id = str(coref_val)
        for t in range(int(start_tok), int(end_tok) + 1):
            token2coref.setdefault(t, coref_id)
            token2corefs[t].add(coref_id)

    # sentence id -> positional row indices into `tokens`
    sent_positions = tokens.groupby('sentence_ID').indices
    doc_token_ids = tokens['token_ID_within_document'].to_numpy()

    # Quotes with a known speaker as (first token, last token, speaker coref, text).
    # Rows without a speaker (or without bounds) can never produce an edge.
    quote_spans = []
    if quotes is not None:
        has_quote_text = 'quote' in quotes.columns
        for _, qrow in quotes.iterrows():
            if pd.isna(qrow['char_id']) or pd.isna(qrow['quote_start']) or pd.isna(qrow['quote_end']):
                continue
            quote_text = qrow['quote'] if has_quote_text and not pd.isna(qrow['quote']) else ""
            quote_spans.append((
                int(qrow['quote_start']),
                int(qrow['quote_end']),
                str(int(qrow['char_id'])),
                quote_text,
            ))

    # ========== Edge Extraction Loop ==========
    # NOTE: locations are matched one token at a time, so a lookup key that
    # spans several tokens cannot match here (assuming each row of the .tokens
    # file is a single token — confirm).
    edges = []
    location_nodes = []
    seen_locations = set()
    used_windows = set()
    for word, s_id in zip(tokens['word'], tokens['sentence_ID']):
        loc_surface = normalize_text(word)
        if loc_surface not in location_lookup:
            continue
        locus = location_lookup[loc_surface]
        canonical_loc = locus['canonical_name']
        x, y, z = locus["x"], locus["y"], locus["z"]

        # One location node per canonical location, created at its first
        # mention (whether or not that mention yields any edge).
        if canonical_loc not in seen_locations:
            seen_locations.add(canonical_loc)
            location_nodes.append({
                "Id": f"LOC_{canonical_loc.replace(' ', '_')}",
                "Label": canonical_loc,
                "Gender": "",
                "MentionCount": "",
                "Type": "Location",
                "BookID": book_id,
                "X_Coord": x,
                "Y_Coord": y,
                "Z_Coord": z
            })

        # Each (sentence, location) pair is expanded into a window only once
        win_key = (s_id, canonical_loc)
        if win_key in used_windows:
            continue
        used_windows.add(win_key)
        win_start = max(0, s_id - window_size)
        win_end = s_id + window_size
        window_sent_ids = range(win_start, win_end + 1)

        window_token_ids = set()
        for sid in window_sent_ids:
            positions = sent_positions.get(sid)
            if positions is not None:
                window_token_ids.update(doc_token_ids[positions].tolist())

        # --- Corefs present in this window ---
        corefs_in_window = set()
        for t in window_token_ids:
            corefs_in_window.update(token2corefs.get(t, ()))
        if len(corefs_in_window) < 2:
            continue

        # Sorted (as strings) so that edge order is reproducible between runs
        coref_list = sorted(corefs_in_window)
        window_fields = {
            "Location": canonical_loc,
            "X_Coord": x,
            "Y_Coord": y,
            "Z_Coord": z,
            "BookID": book_id,
            "LocationWindow_Sentence": s_id,
            "ContextWinStart": win_start,
            "ContextWinEnd": win_end,
        }

        # --------- Scene edges: every unordered pair of characters ---------
        for i, source in enumerate(coref_list):
            for target in coref_list[i + 1:]:
                edges.append(_edge_record(source, target, "scene_copresence", window_fields))

        # --------- Agent/Patient (verb) edges ---------------
        for sent_id in window_sent_ids:
            positions = sent_positions.get(sent_id)
            if positions is None:
                continue
            stoks = tokens.iloc[positions]
            verbs = stoks[stoks['POS_tag'].astype(str) == 'VERB']
            for _, verb_row in verbs.iterrows():
                verb_token_id = verb_row['token_ID_within_document']
                verb_lemma = verb_row['lemma']
                verb_ss = tok2ss.get(verb_token_id, "NA")

                # Direct dependents of the verb within the same sentence
                dependents = stoks[stoks['syntactic_head_ID'] == verb_token_id]
                subj_rows = dependents[dependents['dependency_relation'].isin(SUBJECT_RELATIONS)]
                obj_rows = dependents[dependents['dependency_relation'].isin(OBJECT_RELATIONS)]
                if subj_rows.empty or obj_rows.empty:
                    continue

                for subj_token_id in subj_rows['token_ID_within_document']:
                    subj_coref = token2coref.get(subj_token_id)
                    if subj_coref is None:
                        continue  # subject is not part of a PER entity
                    for obj_token_id in obj_rows['token_ID_within_document']:
                        obj_coref = token2coref.get(obj_token_id)
                        if obj_coref is None or obj_coref == subj_coref:
                            continue  # not a PER entity, or a self-loop
                        edges.append(_edge_record(
                            subj_coref, obj_coref, "action", window_fields,
                            events=verb_lemma, supersense=verb_ss,
                        ))

        # --------- Dialogue edges (quotes) ---------------
        if quote_spans:
            win_tok_min = min(window_token_ids)
            win_tok_max = max(window_token_ids)
            for q_start, q_end, speaker, quote_text in quote_spans:
                # Cheap range test before the exact overlap check
                if q_end < win_tok_min or q_start > win_tok_max:
                    continue
                if speaker not in corefs_in_window:
                    continue
                if window_token_ids.isdisjoint(range(q_start, q_end + 1)):
                    continue
                # The speaker is linked to every other character in the window
                for other_coref in coref_list:
                    if other_coref != speaker:
                        edges.append(_edge_record(
                            speaker, other_coref, "dialogue", window_fields,
                            quote_text=quote_text,
                        ))

    # ================= NODE TABLE ========================
    # Character nodes: every coref that appears in at least one edge
    character_corefs_in_edges = {str(edge[end]) for edge in edges for end in ("Source", "Target")}
    nodes = []
    for coref in sorted(character_corefs_in_edges):
        node_attr = get_character_metadata_from_bookjson(book_json, coref) if book_json else {}
        label = node_attr.get("label") or get_main_label_from_entities(entities, coref)
        nodes.append({
            "Id": coref,
            "Label": label,
            "Gender": node_attr.get("gender", ""),
            "MentionCount": node_attr.get("mention_count", 0),
            "Type": "Character",
            "BookID": book_id
        })

    # Location nodes were collected during the token pass above
    nodes.extend(location_nodes)

    # =============== OUTPUT ===========================
    edges_df = pd.DataFrame(edges, columns=EDGE_COLUMNS)
    nodes_df = pd.DataFrame(nodes, columns=NODE_COLUMNS).drop_duplicates("Id")
    edges_df.to_csv(os.path.join(analysis_output_dir, f"{book_id}_edges.csv"), index=False)
    nodes_df.to_csv(os.path.join(analysis_output_dir, f"{book_id}_nodes.csv"), index=False)
    print(f"Done: {book_id}: {len(edges_df)} edges, {len(nodes_df)} nodes.")

print("\n=== ALL BOOKS FINISHED ===")
print(f"Edges/Nodes are in: {analysis_output_dir}")
