In [None]:
"""
This script takes a social network exported from Cytoscape and converts it into
a network of sequential movement flows through the narrative of each text. The
flows are ordered by the sentence location already encoded for each
social-network-detection-at-locationABC record, which was extracted with the
BookNLP-based social network detection process.

"""

import pandas as pd
import numpy as np

INPUT  = "MergedNetwork_MovementFlows_WorkingCopy.csv"
OUTPUT = "MergedNetwork_MovementFlows_Final.csv"

# Columns this script reads from the input CSV; read_csv raises if one is absent
required_cols = [
    "BookID", "Location", "LocationSentence",
    "X_Coord", "Y_Coord", "Z_Coord"
]

# Read only the columns needed to build the movement-flow edges
df = pd.read_csv(INPUT, usecols=required_cols)

# Fix the specific accented name to its ASCII version. Two spellings are mapped:
# the correctly accented one and a garbled variant (NOTE(review): the garbled
# form looks like mis-decoded UTF-8 -- confirm against the export).
df["Location"] = df["Location"].replace({
    "CollÃ¨ge CÃ©venol": "College Cevenol",
    "Collège Cévenol": "College Cevenol"
})

# Ensure numeric coordinate types (floats); invalid values -> NaN
coord_cols_in = ["X_Coord", "Y_Coord", "Z_Coord"]
for coord_col in coord_cols_in:
    df[coord_col] = pd.to_numeric(df[coord_col], errors="coerce")

# Sort within each book by LocationSentence. The numeric helper column makes
# "10" sort after "9"; the raw column breaks ties among non-numeric values, and
# mergesort keeps the original row order for anything still tied.
df["_loc_num"] = pd.to_numeric(df["LocationSentence"], errors="coerce")
df = df.sort_values(by=["BookID", "_loc_num", "LocationSentence"], kind="mergesort")

# Pair every row with the next row of the same book: one grouped shift yields
# the following location and its coordinates (NaN on each book's last row).
next_row = df.groupby("BookID")[["Location"] + coord_cols_in].shift(-1)
df["Target"] = next_row["Location"]
for coord_col in coord_cols_in:
    df[f"{coord_col}_Target"] = next_row[coord_col]

# Keep only rows that form a complete edge. Requiring the row's own Location as
# well as the next one prevents edges with an empty Source, which would appear
# in the per-edge CSV but be silently dropped by any later groupby on
# Source/Target.
has_both_endpoints = df["Location"].notna() & df["Target"].notna()
edges = df[has_both_endpoints].copy()

# The row's own location/coordinates become the "Source" side of each edge
source_renames = {"Location": "Source"}
source_renames.update(
    {f"{axis}_Coord": f"{axis}_Coord_Source" for axis in ("X", "Y", "Z")}
)
edges = edges.rename(columns=source_renames)

# Output coordinate columns: the Source triple first, then the Target triple
coord_cols_out = [
    f"{axis}_Coord_{endpoint}"
    for endpoint in ("Source", "Target")
    for axis in ("X", "Y", "Z")
]

# Force every output coordinate column to float64 (good for ArcGIS)
edges = edges.astype({col: "float64" for col in coord_cols_out})

# Compute great-circle distance (km) using haversine (WGS84, X=lon, Y=lat)
R = 6371.0088  # mean Earth radius in km (WGS84)


def _haversine_km(lat1_deg, lon1_deg, lat2_deg, lon2_deg, radius_km=R):
    """Return the great-circle distance in km between paired points.

    Parameters
    ----------
    lat1_deg, lon1_deg, lat2_deg, lon2_deg : array-like or pandas.Series
        Latitudes and longitudes in decimal degrees, element-wise paired.
    radius_km : float
        Sphere radius in kilometres.

    Returns
    -------
    Same container type as the inputs; NaN wherever any input is NaN.
    """
    lat1 = np.radians(lat1_deg)
    lon1 = np.radians(lon1_deg)
    lat2 = np.radians(lat2_deg)
    lon2 = np.radians(lon2_deg)

    half_dlat = (lat2 - lat1) / 2.0
    half_dlon = (lon2 - lon1) / 2.0
    a = np.sin(half_dlat) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(half_dlon) ** 2

    # Rounding can push `a` a hair outside [0, 1] (e.g. near-antipodal points),
    # which would make sqrt(1 - a) NaN for perfectly valid coordinates. Clamp it;
    # np.maximum/np.minimum propagate NaN, so missing coordinates stay NaN.
    a = np.minimum(np.maximum(a, 0.0), 1.0)

    central_angle = 2.0 * np.arctan2(np.sqrt(a), np.sqrt(1.0 - a))
    return radius_km * central_angle


# Assumes Y_Coord_* hold latitude and X_Coord_* hold longitude in degrees;
# Z coordinates are not used in the distance.
edges["DistanceKM"] = _haversine_km(
    edges["Y_Coord_Source"], edges["X_Coord_Source"],
    edges["Y_Coord_Target"], edges["X_Coord_Target"],
).astype("float64")  # NaNs propagate automatically for missing coords

# If you prefer to drop any edge with missing coords, uncomment:
# edges = edges.dropna(subset=coord_cols_out)

# Export layout: endpoints, their coordinates, then distance and provenance
final_columns = [
    "Source",
    "Target",
    *coord_cols_out,
    "DistanceKM",
    "BookID",
    "LocationSentence",
]
out = edges[final_columns]

# One row per consecutive-location edge; the index carries no information
out.to_csv(OUTPUT, index=False)
print(f"Wrote {len(out):,} rows to {OUTPUT}")


In [None]:
# --- Aggregate identical Source–Target edges ---
# Requires 'out' and 'OUTPUT' from the edge-building cell.

# Derive the aggregated path by swapping only a trailing ".csv". A plain
# str.replace would rewrite every ".csv" occurrence in the path and, when there
# is none, return OUTPUT unchanged -- so this cell would overwrite the per-edge
# file it was derived from.
edges_stem = OUTPUT[:-len(".csv")] if OUTPUT.lower().endswith(".csv") else OUTPUT
OUTPUT_AGG = f"{edges_stem}_Aggregated_Edges.csv"

# Columns carried over from the first non-null value in each Source/Target group
float_cols = [
    "X_Coord_Source", "Y_Coord_Source", "Z_Coord_Source",
    "X_Coord_Target", "Y_Coord_Target", "Z_Coord_Target",
    "DistanceKM"
]

# One row per distinct (Source, Target) pair; Weight counts the edges merged.
# Rows with a missing Source or Target are excluded by groupby's default.
agg_out = out.groupby(["Source", "Target"], as_index=False).agg(
    **{col: (col, "first") for col in float_cols},
    Weight=("Source", "size"),  # count of edges consolidated
)

# Enforce dtypes (floats for coords/dist, int for Weight)
agg_out[float_cols] = agg_out[float_cols].astype("float64")
agg_out["Weight"] = agg_out["Weight"].astype("int64")

agg_out.to_csv(OUTPUT_AGG, index=False)
print(f"Wrote {len(agg_out):,} aggregated rows to {OUTPUT_AGG}")

In [None]:
# --- Create Nodes CSV from aggregated edges ---
# Builds a node list of distinct locations with their coordinates.
# Requires 'agg_out' from the previous aggregation block.

# Swap only a trailing ".csv" so the nodes path can never equal OUTPUT
# (str.replace returns the input unchanged when ".csv" is absent).
nodes_stem = OUTPUT[:-len(".csv")] if OUTPUT.lower().endswith(".csv") else OUTPUT
NODES_OUT = f"{nodes_stem}_Aggregated_Nodes.csv"  # -> MergedNetwork_MovementFlows_Final_Aggregated_Nodes.csv

node_coord_cols = ["X_Coord", "Y_Coord", "Z_Coord"]

# Source endpoints of every aggregated edge, under the common node schema
nodes_src = agg_out[["Source", "X_Coord_Source", "Y_Coord_Source", "Z_Coord_Source"]].rename(
    columns={
        "Source": "name",
        "X_Coord_Source": "X_Coord",
        "Y_Coord_Source": "Y_Coord",
        "Z_Coord_Source": "Z_Coord",
    }
)

# Target endpoints of every aggregated edge, under the same schema
nodes_tgt = agg_out[["Target", "X_Coord_Target", "Y_Coord_Target", "Z_Coord_Target"]].rename(
    columns={
        "Target": "name",
        "X_Coord_Target": "X_Coord",
        "Y_Coord_Target": "Y_Coord",
        "Z_Coord_Target": "Z_Coord",
    }
)

nodes = pd.concat([nodes_src, nodes_tgt], ignore_index=True)
nodes = nodes.dropna(subset=["name"])

# Keep one row per name. Coordinates are assumed consistent per name, but a
# given occurrence may have had its coordinates coerced to NaN; rank each
# name's rows by how many coordinates are missing so the most complete row is
# the one kept. The stable sort preserves the original order among ties.
nodes = (
    nodes.assign(_n_missing=nodes[node_coord_cols].isna().sum(axis=1))
    .sort_values(["name", "_n_missing"], kind="mergesort")
    .drop_duplicates(subset=["name"], keep="first")
    .drop(columns="_n_missing")
)
nodes = nodes[["name"] + node_coord_cols]

# Enforce float dtype for coordinates
nodes[node_coord_cols] = nodes[node_coord_cols].astype("float64")

nodes.to_csv(NODES_OUT, index=False)
print(f"Wrote {len(nodes):,} nodes to {NODES_OUT}")