## 03g - Output: Descriptive statistics of sample

- **Project:** _Families, households, networks: Rethinking the relational structure of families through large-scale network data_ <br>
- **Authors:** Nicolás Soler (ORCID 0009-0001-4239-9396), Tom Emery, Agnieszka Kanas <br>
- **Last updated:** January 2026 <br>
- **Full research article published in journal:** _Demography_ (2026)

In [None]:
import yaml
import polars as pl
import pandas as pd

In [None]:
# Load the YAML configuration; later cells read input paths from
# config["data"] and the output path from config["output"].
# The encoding is given explicitly so the file is decoded the same way on every
# platform instead of falling back to the locale's default encoding.
path_config = "config.yml"
with open(path_config, "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

In [None]:
# Read spine
# Dtypes enforced while parsing the spine CSV; only the columns listed here are
# kept. RINPERSOON and id_child are parsed as strings, the rest as 64-bit
# integers. Every dtype is given as the dtype class (`pl.String`, not
# `pl.String()`) so the schema is written consistently.
dtypes_spine = {
    "RINPERSOON": pl.String,
    "id_hhd": pl.Int64,
    "id_child": pl.String,
    "is_ego_child": pl.Int64,
    "age": pl.Int64,
    "gender_female": pl.Int64,
    "hhd_size": pl.Int64,
}

# Scan lazily, keep only the declared columns, then materialise the frame.
spine = (
    pl.scan_csv(
        config["data"]["spine"],
        separator=",",
        encoding="utf8",
        schema_overrides=dtypes_spine,
    )
    .select(dtypes_spine.keys())
    .collect()
)

In [None]:
# Read sample
# Dtypes enforced while parsing the sample CSV; the key order below is also the
# column order of the resulting frame.
dtypes_sample = {
    "RINPERSOON": pl.String,
    "id_hhd": pl.Int64,
    "is_ego_child": pl.Int64,
    # net_size_hhd_1 ... net_size_hhd_4
    **{f"net_size_hhd_{k}": pl.Int64 for k in range(1, 5)},
    "density_4": pl.Float64,
    "overlap_4_child": pl.Int64,
}

# Scan lazily, restrict to the declared columns, then materialise.
sample = (
    pl.scan_csv(
        config["data"]["sample"],
        separator=",",
        encoding="utf8",
        schema_overrides=dtypes_sample,
    )
    .select(dtypes_sample.keys())
    .collect()
)

# 1 - Prepare data

In [None]:
# Base table: one row per household id among the spine rows flagged
# is_ego_child == 1 (ego-households with a child aged 0-4), carrying the
# household id and the child id, ordered by household id.
table = (
    spine
    .filter(pl.col("is_ego_child").eq(1))
    # Which of a household's rows survives is arbitrary (keep="any")
    .unique(subset="id_hhd", keep="any")
    .select("id_hhd", "id_child")
    .sort("id_hhd")
)

In [None]:
# Attach household size: take one (arbitrary) spine row per household and
# left-join its hhd_size onto the table.
hhd_sizes = spine.unique(subset="id_hhd", keep="any").select("id_hhd", "hhd_size")
table = table.join(hhd_sizes, on="id_hhd", how="left")

In [None]:
# Number of children aged 0-4: count the spine rows with age <= 4 in each
# household and left-join the count onto the table.
n_children = (
    spine
    .filter(pl.col("age").le(4))
    .group_by("id_hhd")
    .len()
    .rename({"len": "n_children"})
)
table = table.join(n_children, on="id_hhd", how="left")

In [None]:
# Age and gender of the youngest child in each household

# Distinct, non-null child ids found in the spine
children = (
    spine
    .select("id_child")
    .filter(pl.col("id_child").is_not_null())
    .unique(subset="id_child", keep="any")
)

youngest = (
    spine
    .select("RINPERSOON", "id_hhd", "age", "gender_female")
    # Semi-join: retain only persons whose RINPERSOON is one of the child ids
    .join(children, left_on="RINPERSOON", right_on="id_child", how="semi")
    # Ascending sort by household then age, so the first row per household
    # is its youngest child
    .sort("id_hhd", "age")
    .unique(subset="id_hhd", keep="first")
    .rename({"age": "youngest_age", "gender_female": "youngest_gender"})
)

# The join also brings the youngest child's RINPERSOON into `table`; the
# parent lookup further down selects that column.
table = table.join(youngest, on="id_hhd", how="left")

In [None]:
# Mean household age: average `age` over all spine rows of each household.
mean_age = (
    spine
    .group_by("id_hhd")
    .agg(pl.col("age").mean().alias("mean_hhd_age"))
)
table = table.join(mean_age, on="id_hhd", how="left")

In [None]:
# Full network size, density, and overlap
# net_size is the row-wise sum of net_size_hhd_1 ... net_size_hhd_4;
# density_4 and overlap_4_child are carried over under shorter names, with
# missing overlap values replaced by zero. One (arbitrary) row is kept per
# household before joining onto the table.
net_size_cols = [f"net_size_hhd_{k}" for k in range(1, 5)]

net_vars = (
    sample
    .select(
        "id_hhd",
        pl.sum_horizontal(net_size_cols).alias("net_size"),
        pl.col("density_4").alias("net_density"),
        pl.col("overlap_4_child").fill_null(strategy="zero").alias("net_overlap"),
    )
    .unique(subset="id_hhd", keep="any")
)
table = table.join(net_vars, on="id_hhd", how="left")

In [None]:
# Get parents of children
# Dtypes enforced while parsing the family edge list.
dtypes_edges_fam_uni = {
    "RINPERSOON": pl.String,
    "RINPERSOONRELATIE": pl.String,
    "RELATIE": pl.Int64,
}

# Persons currently in the table (the youngest child of each household)
child_ids = table.select("RINPERSOON").lazy()

# Edges with RELATIE == 301 whose RINPERSOON is one of those children; the
# related person is renamed to `parent`.
# NOTE(review): 301 is presumably the code for a parent relation - confirm
# against the edge-list codebook.
parent_edges = (
    pl.scan_csv(
        config["data"]["edges_fam_uniplex"],
        separator=",",
        encoding="utf8",
        schema_overrides=dtypes_edges_fam_uni,
    )
    .filter(pl.col("RELATIE").eq(301))
    .join(child_ids, on="RINPERSOON", how="semi")
    .drop("RELATIE")
    .rename({"RINPERSOONRELATIE": "parent"})
)

# From long to wide: collect each child's parents into a list, then spread the
# list over the columns parent_1 and parent_2.
# NOTE(review): this relies on to_struct producing both fields even when a
# child has a single listed parent - confirm with the installed polars version.
parents = (
    parent_edges
    .group_by("RINPERSOON")
    .agg(pl.col("parent"))
    .with_columns(pl.col("parent").list.to_struct(fields=["parent_1", "parent_2"]))
    .unnest("parent")
    .collect()
)

In [None]:
# Number of parents + Do they coreside
# Person -> household lookup used to find each parent's household.
# NOTE(review): assumes RINPERSOON is unique in the spine; otherwise the left
# joins below would duplicate rows - confirm.
person_hhd = spine.select("RINPERSOON", "id_hhd")

# Building blocks of the two derived columns
parent_present = pl.col(["parent_1", "parent_2"]).is_not_null()
same_household = pl.col("hhd_1") == pl.col("hhd_2")
no_second_parent = pl.col("parent_2").is_null()

parents = (
    parents
    # Household of the first parent
    .join(person_hhd, left_on="parent_1", right_on="RINPERSOON", how="left")
    .rename({"id_hhd": "hhd_1"})
    # Household of the second parent
    .join(person_hhd, left_on="parent_2", right_on="RINPERSOON", how="left")
    .rename({"id_hhd": "hhd_2"})
    .with_columns(
        # Count of non-null parent ids (0, 1 or 2)
        parents_number=pl.sum_horizontal(parent_present),
        # 1 when both parents share a household id or there is no second
        # parent; 0 otherwise (including when a household id is unknown)
        parents_coreside=pl.when(same_household | no_second_parent).then(1).otherwise(0),
    )
    .select("RINPERSOON", "parents_number", "parents_coreside")
)

table = table.join(parents, on="RINPERSOON", how="left")

In [None]:
# Copy of the table without the identifier columns.
# NOTE(review): the Table 1 cell below selects from `table` directly, so this
# frame is not consumed by any cell visible here.
table_output = table.drop("id_hhd", "id_child", "RINPERSOON")

In [None]:
# Mapping from internal column names to the labels shown in Table 1.
# The insertion order is the order in which the variables are selected.
dict_tab = dict(
    hhd_size="Household size",
    n_children="# of children aged 0-4",
    youngest_age="Age of youngest",
    youngest_gender="Gender of youngest (Girl = 1)",
    parents_number="# of living parents of youngest",
    parents_coreside="All parents coreside (Yes = 1)",
    mean_hhd_age="Mean household age",
    net_size="Full network size",
    net_density="Full network density",
    net_overlap="Full network overlap",
)

In [None]:
# Create Table 1
# Select the variables listed in dict_tab, relabel them, and compute summary
# statistics with the 5th/25th/50th/75th/95th percentiles. The "min" and "max"
# rows are dropped and the statistic column is capitalised for display.
tab1 = (
    table
    .select(dict_tab.keys())
    .rename(dict_tab)
    .describe(percentiles=[0.05, 0.25, 0.5, 0.75, 0.95])
    # Negate the expression with `~` rather than comparing it to False
    .filter(~pl.col("statistic").is_in(["min", "max"]))
    .rename({"statistic": "Statistic"})
)

In [None]:
# Table 1: round, convert to pandas, and transpose so that each variable is a
# row and each statistic is a column.

# Every column after the first ("Statistic") is rounded to two decimals
value_cols = tab1.columns[1:]
tab1 = tab1.with_columns(pl.col(value_cols).round(2)).to_pandas()

# After transposing, the first row holds the statistic names; use it as the
# header and keep the remaining rows as the table body.
transposed = tab1.T.reset_index()
header_row = transposed.iloc[0]
tab1 = transposed.iloc[1:]
tab1.columns = header_row

In [None]:
# Write Table 1 to the CSV path given in the configuration, without the
# pandas row index.
path_tab1 = config["output"]["tab_descriptives"]
tab1.to_csv(path_tab1, sep=",", index=False, encoding="utf8")