## 01c - Data preparation: Household-to-household edgelist

- **Project:** _Families, households, networks: Rethinking the relational structure of families through large-scale network data_ <br>
- **Authors:** Nicolás Soler (ORCID 0009-0001-4239-9396), Tom Emery, Agnieszka Kanas <br>
- **Last updated:** January 2026 <br>
- **Full research article published in journal:** _Demography_ (2026)

In [None]:
import yaml
import polars as pl

In [None]:
# Load YAML configuration
# The encoding is stated explicitly so that parsing the file does not depend
# on the platform's default locale encoding.
path_config = 'config.yml'
with open(path_config, 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

In [None]:
# Load the spine, keeping only the person id, the household id and the ego flag
dtypes_spine = dict(
    RINPERSOON=pl.String,
    id_hhd=pl.Int64,
    is_ego=pl.Int64,
)

spine = (
    pl.scan_csv(
        config["data"]["spine"],
        separator=",",
        encoding="utf8",
        schema_overrides=dtypes_spine,
    )
    # Restrict the scan to the columns declared above
    .select(list(dtypes_spine))
    .collect()
)

In [None]:
# Read parent-child and partner edges
dtypes_fam_core = {
    "RINPERSOON":pl.String,
    "RINPERSOONRELATIE":pl.String
}

# Only the two person-id columns are used when the household edgelist is
# built, so the scan is restricted to them (as is done for the spine) instead
# of materialising every column of the edge file.
fam_core = (
    pl
    .scan_csv(config["data"]["edges_fam_core"], separator=",", encoding="utf8", schema_overrides=dtypes_fam_core)
    .select(list(dtypes_fam_core))
    .collect()
)

# 1 - Create household-to-household edgelist

In [None]:
# Build the household-level edgelist: both endpoints of every person-level
# edge in fam_core are replaced by the household id found in the spine
edges_hhd = (
    fam_core
    # Household of the focal person (adds column "id_hhd")
    .join(
        spine.select("RINPERSOON", "id_hhd"),
        on="RINPERSOON",
        how="left",
    )
    # Household of the related person (name clash, so it arrives as "id_hhd_right")
    .join(
        spine.select("RINPERSOON", "id_hhd"),
        left_on="RINPERSOONRELATIE",
        right_on="RINPERSOON",
        how="left",
    )
    # Keep only the two household columns, named as edge endpoints
    .select(
        pl.col("id_hhd").alias("ego"),
        pl.col("id_hhd_right").alias("alter"),
    )
    # Drop edges that start and end in the same household
    # NOTE(review): endpoints without a spine match stay null after the left
    # joins; such rows presumably fall out here as well because a null
    # comparison is not True - confirm
    .filter(pl.col("ego") != pl.col("alter"))
    # One row per distinct (ego, alter) pair
    .unique(keep="any")
)

In [None]:
# Reciprocity check: keep every edge (a, b) for which no reverse edge (b, a)
# exists in the household edgelist
check_reciprocity = edges_hhd.join(
    edges_hhd,
    left_on=["ego", "alter"],
    right_on=["alter", "ego"],
    how="anti",
)

# Report how many edges lack their reverse (0 means all are reciprocated)
print(f"Number of non-reciprocated household edges: {len(check_reciprocity)}")

In [None]:
# Write the household-to-household edgelist to the path given in the config
edges_hhd.write_csv(
    config["data"]["edges_hhd"],
    separator=",",
    line_terminator="\n",
)

# 2 - Household-level alters at distances 1, 2, 3, and 4

In [None]:
# Identify the household id of egos
# The spine has one row per RINPERSOON, so several flagged persons may share
# the same id_hhd. Household ids are de-duplicated so that each ego household
# appears once; otherwise every join on this table repeats that household's
# edges. The original row order is kept.
ego_hhd = (
    spine
    .filter(pl.col("is_ego") == 1)
    .select(pl.col("id_hhd").alias("ego"))
    .unique(maintain_order=True)
)

## 2.1 - Distance 1

In [None]:
# Distance 1: the household edges whose source is an ego household,
# ordered by ego household id
edges_hhd_d1 = ego_hhd.join(edges_hhd, on="ego", how="inner").sort("ego")

## 2.2 - Distance 2

In [None]:
# Distance 2: households adjacent to a distance-1 alter
edges_hhd_d2 = (
    edges_hhd_d1
    # Look up the neighbours of every distance-1 alter (they arrive as "alter_right")
    .join(edges_hhd, left_on="alter", right_on="ego", how="left")
    .lazy()
    # Drop the intermediate distance-1 alter and keep its neighbour as the new alter
    .select(pl.col("ego"), pl.col("alter_right").alias("alter"))
    # One row per distinct pair
    .unique(keep="any")
    # A path that leads back to the ego household is not an alter
    .filter(pl.col("ego") != pl.col("alter"))
    .collect()
)

# Remove pairs that are already connected at distance 1
edges_hhd_d2 = edges_hhd_d2.join(edges_hhd_d1, on=["ego", "alter"], how="anti")

## 2.3 - Distance 3

In [None]:
# Distance 3: households adjacent to a distance-2 alter
edges_hhd_d3 = (
    edges_hhd_d2
    # Look up the neighbours of every distance-2 alter (they arrive as "alter_right")
    .join(edges_hhd, left_on="alter", right_on="ego", how="left")
    .lazy()
    # Drop the intermediate distance-2 alter and keep its neighbour as the new alter
    .select(pl.col("ego"), pl.col("alter_right").alias("alter"))
    # One row per distinct pair
    .unique(keep="any")
    # A path that leads back to the ego household is not an alter
    .filter(pl.col("ego") != pl.col("alter"))
    .collect()
)

# Remove pairs that are already connected at a shorter distance (1 or 2)
for closer_edges in (edges_hhd_d1, edges_hhd_d2):
    edges_hhd_d3 = edges_hhd_d3.join(closer_edges, on=["ego", "alter"], how="anti")

## 2.4 - Distance 4

In [None]:
# Distance 4: households adjacent to a distance-3 alter
edges_hhd_d4 = (
    edges_hhd_d3
    # Look up the neighbours of every distance-3 alter (they arrive as "alter_right")
    .join(edges_hhd, left_on="alter", right_on="ego", how="left")
    .lazy()
    # Drop the intermediate distance-3 alter and keep its neighbour as the new alter
    .select(pl.col("ego"), pl.col("alter_right").alias("alter"))
    # One row per distinct pair
    .unique(keep="any")
    # A path that leads back to the ego household is not an alter
    .filter(pl.col("ego") != pl.col("alter"))
    .collect()
)

# Remove pairs that are already connected at a shorter distance (1, 2 or 3)
for closer_edges in (edges_hhd_d1, edges_hhd_d2, edges_hhd_d3):
    edges_hhd_d4 = edges_hhd_d4.join(closer_edges, on=["ego", "alter"], how="anti")

## 2.5 - Store and tidy

In [None]:
# Write each distance-specific edgelist to its configured path
for out_key, edges_at_distance in (
    ("edges_hhd_d1", edges_hhd_d1),
    ("edges_hhd_d2", edges_hhd_d2),
    ("edges_hhd_d3", edges_hhd_d3),
    ("edges_hhd_d4", edges_hhd_d4),
):
    edges_at_distance.write_csv(config["data"][out_key], separator=",", line_terminator="\n")

In [None]:
# Drop the references to the person-level input tables
del fam_core
del spine

# 3 - Ego-networks file

In [None]:
# Build one edgelist holding the ego-network of every ego household

# Step 1 - list the nodes of each ego-network

# Each ego gets a row pointing at itself so that its own edges are looked up
# together with those of its alters
self_edges = ego_hhd.with_columns(pl.col("ego").alias("alter"))

# Alters at distances 1 to 4 plus the ego itself, ordered by network and node
egonets = pl.concat(
    [edges_hhd_d1, edges_hhd_d2, edges_hhd_d3, edges_hhd_d4, self_edges]
).sort(["ego", "alter"])

# Step 2 - attach the household edges of every node in each network

egonets = (
    egonets
    .join(edges_hhd, left_on="alter", right_on="ego", how="left")
    # id_hhd = network owner, ego = node in that network, alter = neighbour of that node
    .select(
        pl.col("ego").alias("id_hhd"),
        pl.col("alter").alias("ego"),
        pl.col("alter_right").alias("alter"),
    )
)

In [None]:
# 3 - Exclude self-edges and edges that concern alters not in the ego-network (> distance 4)

# Nodes that may appear as the target of an edge within each network
allowed_alters = (
    egonets
    # The nodes of a network are stored in "ego"; relabel them as "alter"
    .select(pl.col("id_hhd"), pl.col("ego").alias("alter"))
    # Do not allow self-edges: the network's own ego household is not an allowed target
    .filter(pl.col("id_hhd") != pl.col("alter"))
    # Keep only one instance per alter
    .unique(subset=["id_hhd", "alter"], keep="any")
)

# Keep only edges whose target is an allowed alter of the same network
# NOTE(review): because the ego household is excluded from allowed_alters,
# rows whose "alter" equals id_hhd (an alter's edge back to the ego) are
# dropped here, while the ego's own rows towards its alters are kept -
# confirm that this asymmetry is intended
egonets = egonets.join(allowed_alters, on=["id_hhd", "alter"], how="semi")

In [None]:
# Write the ego-network edgelist to the path given in the config
egonets.write_csv(
    config["data"]["egonets"],
    separator=",",
    line_terminator="\n",
)

In [None]:
# Drop the references to the ego-network tables
del egonets
del allowed_alters