## 03h - Output: Miscellaneous statistics

- **Project:** _Families, households, networks: Rethinking the relational structure of families through large-scale network data_ <br>
- **Authors:** Nicolás Soler (ORCID 0009-0001-4239-9396), Tom Emery, Agnieszka Kanas <br>
- **Last updated:** January 2026 <br>
- **Full research article published in journal:** _Demography_ (2026)

In [None]:
import yaml
import polars as pl
from scipy.stats import pearsonr

In [None]:
# Load YAML configuration
# The encoding is pinned to UTF-8 so that parsing does not depend on the
# platform's locale default encoding.
path_config = 'config.yml'
with open(path_config, 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

In [None]:
# Read the spine file, keeping only the columns RINPERSOON and id_hhd
dtypes_spine = {
    "RINPERSOON": pl.String,
    "id_hhd": pl.Int64,
}

spine = (
    pl.scan_csv(
        config["data"]["spine"],
        separator=",",
        encoding="utf8",
        schema_overrides=dtypes_spine,
    )
    # Restrict the lazy scan to the columns declared above before collecting
    .select(list(dtypes_spine))
    .collect()
)

In [None]:
# Read the sample file: identifiers, the is_ego_child flag, and the
# network size, density and overlap columns (one column per distance)
dtypes_sample = {
    "RINPERSOON": pl.String,
    "id_hhd": pl.Int64,
    "is_ego_child": pl.Int64,
    # net_size_hhd_1 ... net_size_hhd_4
    **{f"net_size_hhd_{d}": pl.Int64 for d in range(1, 5)},
    # density_2 ... density_4
    **{f"density_{d}": pl.Float64 for d in range(2, 5)},
    # overlap_1_child ... overlap_4_child
    **{f"overlap_{d}_child": pl.Int64 for d in range(1, 5)},
}

sample = (
    pl.scan_csv(
        config["data"]["sample"],
        separator=",",
        encoding="utf8",
        schema_overrides=dtypes_sample,
    )
    # Restrict the lazy scan to the columns declared above before collecting
    .select(list(dtypes_sample))
    .collect()
)

In [None]:
# Read the four edge lists (config keys edges_hhd_d1 ... edges_hhd_d4)
dtypes_edges = {
    "ego": pl.Int64,
    "alter": pl.Int64,
}

# One frame per distance, read in the order d1, d2, d3, d4
edges_hhd_d1, edges_hhd_d2, edges_hhd_d3, edges_hhd_d4 = (
    pl.scan_csv(
        config["data"][f"edges_hhd_d{distance}"],
        separator=",",
        encoding="utf8",
        schema_overrides=dtypes_edges,
    ).collect()
    for distance in range(1, 5)
)

In [None]:
# Keep unique households in sample
# 1) keep rows flagged is_ego_child == 1
sample = sample.filter(pl.col("is_ego_child") == 1)

# 2) keep only one row per ego-household with child 0-4
#    (keep="any": which of the duplicate rows survives is not specified)
sample = sample.unique(subset=["id_hhd"], keep="any")

# 3) replace nulls in the overlap columns with zero
sample = sample.with_columns(
    pl.col(
        "overlap_1_child",
        "overlap_2_child",
        "overlap_3_child",
        "overlap_4_child",
    ).fill_null(strategy="zero")
)

# 1 - Number of nodes, alters, edges, etc

In [None]:
# Stack the four edge lists (d1 to d4) into a single frame
edges = pl.concat(
    [edges_hhd_d1, edges_hhd_d2, edges_hhd_d3, edges_hhd_d4],
    how="vertical",
)

In [None]:
# One-column frame with the household ids (id_hhd) present in the sample
mothers = sample.select(pl.col("id_hhd"))

In [None]:
# Keep only the edges whose ego matches an id_hhd in `mothers`
# (semi join: filters `edges` without adding any column)
edges = edges.join(mothers, left_on="ego", right_on="id_hhd", how="semi")

In [None]:
# Calculate number of unique alters
# The edge lists are read without a column selection, so they may carry
# columns besides ego/alter. Deduplicate on "alter" alone so that such extra
# columns cannot make the same alter count more than once.
len(edges.select("alter").unique(keep="any"))

In [None]:
# Calculate number of unique nodes / households
# Select the id column explicitly (instead of dropping the other one) so each
# frame holds exactly one column, "node", even if the edge lists carry extra
# columns; otherwise the deduplication below would run over those as well.
egos = edges.select(pl.col("ego").alias("node"))
alters = edges.select(pl.col("alter").alias("node"))
nodes = pl.concat([egos, alters])
nodes = nodes.unique(keep="any")
len(nodes)

In [None]:
# Calculate number of unique individuals
nodes_ind = (
    nodes
    .join(spine, how="left", left_on="node", right_on="id_hhd")
    # The left join leaves RINPERSOON null for nodes without a match in the
    # spine; drop those rows so the null is not counted as an individual.
    .drop_nulls(subset=["RINPERSOON"])
    .unique(subset=["RINPERSOON"], keep="any")
)
len(nodes_ind)

# 2 - Network size

In [None]:
# Total network size: row-wise sum of the four per-distance size columns
sample = sample.with_columns(
    net_size_hhd_total=pl.sum_horizontal(
        "net_size_hhd_1",
        "net_size_hhd_2",
        "net_size_hhd_3",
        "net_size_hhd_4",
    )
)

In [None]:
# Summary statistics of the network size columns (per distance and total)
sample.select(
    pl.col(
        "net_size_hhd_1",
        "net_size_hhd_2",
        "net_size_hhd_3",
        "net_size_hhd_4",
        "net_size_hhd_total",
    )
).describe(percentiles=[0.25, 0.5, 0.75, 0.95])

In [None]:
# Number of egos whose network size at distance 1 is zero
sample.filter(pl.col("net_size_hhd_1") == 0).height

In [None]:
# Number of egos with 5 or fewer connections at distance 3
sample.filter(pl.col("net_size_hhd_3") <= 5).height

In [None]:
# Number of egos with 25 or more connections at distance 3
# NOTE(review): this comment used to say "50 or more" while the filter below
# uses 25. The comment now follows the code; confirm which threshold is the
# intended one and adjust the filter if it should be 50.
len(sample.filter(pl.col("net_size_hhd_3")>=25))

# 3 - Density

In [None]:
# Summary statistics of the density columns (distances 2 to 4)
sample.select(
    pl.col("density_2", "density_3", "density_4")
).describe(percentiles=[0.25, 0.5, 0.75, 0.95])

In [None]:
# Among egos with density_2 == 0, the % that are isolates
# (isolate = network size at distance 1 is zero)
(
    sample.filter(
        (pl.col("density_2") == 0) & (pl.col("net_size_hhd_1") == 0)
    ).height
    / sample.filter(pl.col("density_2") == 0).height
    * 100
)

In [None]:
# Pearson correlation between total network size and density at d4
# NOTE(review): density_4 is not null-filled in the visible cells; if it can
# contain nulls, check how they are handled before interpreting the result.
pearsonr(
    sample.get_column("net_size_hhd_total"),
    sample.get_column("density_4"),
)

# 4 - Overlap

In [None]:
# Summary statistics of the overlap columns (distances 1 to 4)
sample.select(
    pl.col(
        "overlap_1_child",
        "overlap_2_child",
        "overlap_3_child",
        "overlap_4_child",
    )
).describe(percentiles=[0.25, 0.5, 0.75, 0.99])

In [None]:
# % of mothers with at least 1 overlapping network at d1
sample.filter(pl.col("overlap_1_child") >= 1).height / sample.height * 100

In [None]:
# % of mothers with at least 5 overlapping networks at d1
sample.filter(pl.col("overlap_1_child") >= 5).height / sample.height * 100

In [None]:
# % of mothers with at least 50 overlapping networks at d3
sample.filter(pl.col("overlap_3_child") >= 50).height / sample.height * 100

In [None]:
# % of mothers with at least 100 overlapping networks at d3
sample.filter(pl.col("overlap_3_child") >= 100).height / sample.height * 100

In [None]:
# Pearson correlation between total network size and overlap at d4
pearsonr(
    sample.get_column("net_size_hhd_total"),
    sample.get_column("overlap_4_child"),
)