## 02a - Variable calculation: Network size

- **Project:** _Families, households, networks: Rethinking the relational structure of families through large-scale network data_ <br>
- **Authors:** Nicolás Soler (ORCID 0009-0001-4239-9396), Tom Emery, Agnieszka Kanas <br>
- **Last updated:** January 2026 <br>
- **Full research article published in journal:** _Demography_ (2026)

In [2]:
import yaml
import polars as pl

In [3]:
# Load YAML configuration
path_config = 'config.yml'
# Decode the file explicitly as UTF-8 instead of relying on the platform's
# locale encoding, so non-ASCII characters (e.g. in paths) are read the same
# way on every machine. The CSV reads in this notebook also pin UTF-8.
with open(path_config, 'r', encoding='utf-8') as f:
    # safe_load only builds plain Python objects (dicts, lists, scalars)
    config = yaml.safe_load(f)

In [4]:
# Read spine
# Explicit dtypes for the spine columns, so they are not left to CSV type inference
dtypes_spine = {
    "RINPERSOON": pl.String,
    "gender_female": pl.Int64,
    "birth_date": pl.Date,
    "death_date": pl.Date,
    "age": pl.Int64,
    "SOORTOBJECTNUMMER": pl.String,
    "RINOBJECTNUMMER": pl.String,
    "gemeente": pl.String,
    "wijk": pl.String,
    "buurt": pl.String,
    "id_hhd": pl.Int64,
    "hhd_size": pl.Int64,
    "is_ego": pl.Int64,
    "is_ego_child": pl.Int64,
    "id_child": pl.String,
}

# Scan the CSV lazily, then materialise it as an in-memory DataFrame
spine = (
    pl.scan_csv(
        config["data"]["spine"],
        separator=",",
        encoding="utf8",
        schema_overrides=dtypes_spine,
    )
    .collect()
)

# 1 - Network size

In [5]:
# Create df to store variables: keep only the spine rows where is_ego equals 1
sample = spine.filter(pl.col("is_ego").eq(1))

In [6]:
# Read household-level edgelists (four files, d1 to d4), all with the same schema
dtypes_edges = {"ego": pl.Int64, "alter": pl.Int64}

# One read per config key; the results are unpacked in the same d1..d4 order
edges_hhd_d1, edges_hhd_d2, edges_hhd_d3, edges_hhd_d4 = (
    pl.scan_csv(
        config["data"][edges_key],
        separator=",",
        encoding="utf8",
        schema_overrides=dtypes_edges,
    ).collect()
    for edges_key in ("edges_hhd_d1", "edges_hhd_d2", "edges_hhd_d3", "edges_hhd_d4")
)

In [7]:
# Calculate network size per distance: number of edgelist rows per "ego",
# with "ego" renamed to "id_hhd" so the counts can be joined onto the sample
size_hhd_d1, size_hhd_d2, size_hhd_d3, size_hhd_d4 = (
    edges.group_by("ego").len().rename({"ego": "id_hhd", "len": size_col})
    for edges, size_col in (
        (edges_hhd_d1, "net_size_hhd_1"),
        (edges_hhd_d2, "net_size_hhd_2"),
        (edges_hhd_d3, "net_size_hhd_3"),
        (edges_hhd_d4, "net_size_hhd_4"),
    )
)

# Attach the four counts; left joins keep every row of the sample
sample = sample.join(size_hhd_d1, how="left", on="id_hhd")
sample = sample.join(size_hhd_d2, how="left", on="id_hhd")
sample = sample.join(size_hhd_d3, how="left", on="id_hhd")
sample = sample.join(size_hhd_d4, how="left", on="id_hhd")

# Rows without a match in an edgelist come out of the join as null: set them to 0
sample = sample.with_columns(
    pl.col(
        ["net_size_hhd_1", "net_size_hhd_2", "net_size_hhd_3", "net_size_hhd_4"]
    ).fill_null(strategy="zero")
)

In [8]:
# Total network size: sum of the four per-distance sizes
sample = sample.with_columns(
    (
        pl.col("net_size_hhd_1")
        + pl.col("net_size_hhd_2")
        + pl.col("net_size_hhd_3")
        + pl.col("net_size_hhd_4")
    ).alias("net_size_hhd")
)

In [9]:
# Free memory: drop the edgelists and the per-distance counts, whose values
# are now stored as columns of `sample`
del edges_hhd_d1, edges_hhd_d2, edges_hhd_d3, edges_hhd_d4
del size_hhd_d1, size_hhd_d2, size_hhd_d3, size_hhd_d4

In [10]:
# Store: write the sample, including the network-size columns, to the CSV path from the config
sample.write_csv(
    config["data"]["sample"],
    separator=",",
    line_terminator="\n",
)