In [None]:
"""
This script takes the nodes and edges CSV files produced by the BookNLP social-network
detection script and converts them into an event-based social network, written as
nodes and edges CSV files that are usable by the Cytoscape network analysis software.

Parenthetical Note: The original script was set up to have outputs useable by Gephi, 
but the author experienced software errors in Gephi that were deleting networks, 
so he switched to Cytoscape and an event-based network.

"""

import pandas as pd
import glob
import os

FOLDER = "Your_working_directory"
EDGE_SUFFIX = "_edges.csv"
NODE_SUFFIX = "_nodes.csv"
# NOTE(review): not referenced by the code shown here; kept for compatibility.
NODE_COL_KEEP = ['ID', 'Label', 'Gender', 'MentionCount', 'Type', 'BookID']

# Suffixes of the files written per book. The "Cytospace" spelling is kept
# from the original output names so existing downstream paths keep working.
NODE_OUT_SUFFIX = "_nodes_Cytospace.csv"
EDGE_OUT_SUFFIX = "_edges_Cytospace.csv"

# Column order of the output node table.
NODE_COLUMNS = [
    'ID', 'Label', 'Gender', 'MentionCount', 'Type', 'BookID',
    'Location', 'EventID', 'X_Coord', 'Y_Coord', 'Z_Coord', 'PersonID',
]
# Leading columns of the output edge table; any other input edge columns
# follow them in their original order.
EDGE_LEAD_COLUMNS = [
    'Source', 'Target', 'EdgeType', 'EventID',
    'Location', 'X_Coord', 'Y_Coord', 'Z_Coord',
]


def build_event_network(edges, nodes, basename):
    """Convert one book's person-to-person network into an event-based one.

    Every row of ``edges`` becomes an event ``EVT_<basename>_<row number>``.
    For each person in the row an "Occurrence" node ``<person>@<event>`` is
    created and linked to its master "Person" node (``appearance-of``) and to
    the row's "Location" node (``at-location``).  The two occurrences of a
    row are joined by a co-presence edge, and consecutive occurrences of the
    same person are chained with ``appearance-sequence`` edges.

    Args:
        edges: Edge table with string values and at least the columns
            ``Source``, ``Target``, ``Location``, ``X_Coord`` and ``Y_Coord``.
            ``Z_Coord``, ``EdgeType`` and ``BookID`` are used when present.
        nodes: Node table with string values and an ``ID`` column.  When an
            ID is listed more than once, the first row wins.
        basename: Book identifier embedded in the generated event IDs.

    Returns:
        A ``(nodes_df, edges_df)`` pair of DataFrames ready to be written
        as CSV.

    Raises:
        KeyError: If a required column is missing from ``edges`` or ``nodes``.
    """
    # Plain-dict lookup of person metadata.  Dropping duplicate IDs first
    # guarantees one record per person (``.loc`` on a duplicated index would
    # return a DataFrame instead of a single row).
    person_meta = (
        nodes.drop_duplicates(subset="ID", keep="first")
        .set_index("ID")
        .to_dict("index")
    )

    node_records = []        # master person nodes and occurrence nodes
    known_persons = set()
    location_records = {}    # loc_id -> location node record
    copresence_edges = []
    appearance_edges = []
    location_edges = []
    occurrences_by_person = {}  # pid -> occurrence IDs in row order

    for row_number, row in enumerate(edges.to_dict("records"), start=1):
        event_id = f"EVT_{basename}_{row_number}"
        loc_name = row['Location']
        loc_id = f"LOC_{loc_name.replace(' ','_')}"
        x_coord = row['X_Coord']
        y_coord = row['Y_Coord']
        z_coord = row.get('Z_Coord', "")

        # One location node per distinct location; first row seen wins.
        if loc_id not in location_records:
            location_records[loc_id] = {
                'ID': loc_id,
                'Label': loc_name,
                'Type': 'Location',
                'Gender': '',
                'MentionCount': '',
                'BookID': row.get('BookID', ""),
                'Location': loc_name,
                'EventID': '',
                'X_Coord': x_coord,
                'Y_Coord': y_coord,
                'Z_Coord': z_coord,
                'PersonID': '',
            }

        # dict.fromkeys de-duplicates while keeping order, so a self-loop row
        # (Source == Target) yields a single occurrence rather than duplicate
        # edges and an occurrence-to-itself sequence edge.
        for pid in dict.fromkeys((row['Source'], row['Target'])):
            occ_id = f"{pid}@{event_id}"
            meta = person_meta.get(pid, {})
            shared = {
                'Label': meta.get('Label', pid),
                'Gender': meta.get('Gender', ""),
                'MentionCount': meta.get('MentionCount', ""),
                'BookID': meta.get('BookID', ""),
                'PersonID': pid,
            }

            if pid not in known_persons:
                known_persons.add(pid)
                node_records.append({
                    'ID': pid,
                    'Type': 'Person',
                    'Location': '',
                    'EventID': '',
                    'X_Coord': '',
                    'Y_Coord': '',
                    'Z_Coord': '',
                    **shared,
                })

            node_records.append({
                'ID': occ_id,
                'Type': 'Occurrence',
                'Location': loc_name,
                'EventID': event_id,
                'X_Coord': x_coord,
                'Y_Coord': y_coord,
                'Z_Coord': z_coord,
                **shared,
            })
            appearance_edges.append({
                'Source': pid,
                'Target': occ_id,
                'EdgeType': 'appearance-of',
            })
            location_edges.append({
                'Source': occ_id,
                'Target': loc_id,
                'EdgeType': 'at-location',
            })
            occurrences_by_person.setdefault(pid, []).append(occ_id)

        # Co-presence edge between the two occurrences of this event.
        copresence = {
            'Source': f"{row['Source']}@{event_id}",
            'Target': f"{row['Target']}@{event_id}",
            'EdgeType': row.get('EdgeType', '').strip() or 'scene_copresence',
            'EventID': event_id,
            'Location': loc_name,
            'X_Coord': x_coord,
            'Y_Coord': y_coord,
            'Z_Coord': z_coord,
        }
        # Carry the remaining input columns through.  EdgeType is excluded so
        # the 'scene_copresence' default above is not overwritten by an empty
        # input value.
        copresence.update(
            (col, value) for col, value in row.items()
            if col not in ('Source', 'Target', 'EdgeType')
        )
        copresence_edges.append(copresence)

    # Chain each person's occurrences; rows are visited in file order, so the
    # per-person lists are already chronological.
    sequence_edges = [
        {'Source': earlier, 'Target': later, 'EdgeType': 'appearance-sequence'}
        for occ_ids in occurrences_by_person.values()
        for earlier, later in zip(occ_ids, occ_ids[1:])
    ]

    # Passing ``columns`` keeps the frame well-formed when there are no rows.
    all_nodes = (
        pd.DataFrame(node_records + list(location_records.values()),
                     columns=NODE_COLUMNS)
        .drop_duplicates(subset=['ID'])
        .fillna('')
        .reset_index(drop=True)
    )

    all_edges = pd.DataFrame(
        copresence_edges + appearance_edges + location_edges + sequence_edges
    )
    extra_columns = [c for c in all_edges.columns if c not in EDGE_LEAD_COLUMNS]
    for col in EDGE_LEAD_COLUMNS:
        if col not in all_edges.columns:
            all_edges[col] = ""
    all_edges = all_edges[EDGE_LEAD_COLUMNS + extra_columns].fillna('')

    return all_nodes, all_edges


def main():
    """Convert every ``*_edges.csv`` / ``*_nodes.csv`` pair found in FOLDER."""
    os.chdir(FOLDER)

    for ef in sorted(glob.glob(f"*{EDGE_SUFFIX}")):
        # Strip only the trailing suffix; str.replace would also remove any
        # earlier occurrence of it inside the file name.
        basename = ef[:-len(EDGE_SUFFIX)]
        nf = basename + NODE_SUFFIX
        if not os.path.exists(nf):
            print(f"WARNING: Node file {nf} not found for {ef}")
            continue
        print(f"Processing: {ef} + {nf}")

        # Read everything as text so IDs and coordinates pass through as-is.
        edges = pd.read_csv(ef, dtype=str).fillna("")
        nodes = pd.read_csv(nf, dtype=str).fillna("")

        all_nodes, all_edges = build_event_network(edges, nodes, basename)

        nodes_out = f"{basename}{NODE_OUT_SUFFIX}"
        edges_out = f"{basename}{EDGE_OUT_SUFFIX}"
        all_nodes.to_csv(nodes_out, index=False)
        all_edges.to_csv(edges_out, index=False)
        print(f"Output: {nodes_out} and {edges_out}")

    print("DONE")


if __name__ == "__main__":
    main()