## 02c - Variable calculation: Network composition

- **Project:** _Families, households, networks: Rethinking the relational structure of families through large-scale network data_ <br>
- **Authors:** Nicolás Soler (ORCID 0009-0001-4239-9396), Tom Emery, Agnieszka Kanas <br>
- **Last updated:** January 2026 <br>
- **Full research article published in journal:** _Demography_ (2026)

In [None]:
import yaml
import polars as pl

In [None]:
# Load the YAML configuration; later cells look up file paths under
# config["data"] and config["output"]
path_config = 'config.yml'
with open(path_config, mode='r') as f:
    config = yaml.safe_load(f)

In [None]:
# Read sample
# Dtypes are declared explicitly instead of being inferred from the CSV
dtypes_sample = {
    "RINPERSOON": pl.String,
    "gender_female": pl.Int64,
    "birth_date": pl.Date,
    "death_date": pl.Date,
    "age": pl.Int64,
    "SOORTOBJECTNUMMER": pl.String,
    "RINOBJECTNUMMER": pl.String,
    "gemeente": pl.String,
    "wijk": pl.String,
    "buurt": pl.String,
    "id_hhd": pl.Int64,
    "hhd_size": pl.Int64,
    "is_ego": pl.Int64,
    "is_ego_child": pl.Int64,
    "id_child": pl.String,
    "net_size_hhd_1": pl.Int64,
    "net_size_hhd_2": pl.Int64,
    "net_size_hhd_3": pl.Int64,
    "net_size_hhd_4": pl.Int64,
    "net_size_hhd": pl.Int64,
    "density_2": pl.Float64,
    "density_3": pl.Float64,
    "density_4": pl.Float64,
}

sample = (
    pl
    .scan_csv(
        config["data"]["sample"],
        separator=",",
        encoding="utf8",
        schema_overrides=dtypes_sample,
    )
    .collect()
)

In [None]:
# Read relevant spine variables for hhd composition
# Only the columns listed here are kept from the spine file
dtypes_spine = {
    "RINPERSOON": pl.String,
    "id_hhd": pl.Int64,
    "hhd_size": pl.Int64,
    "is_ego": pl.Int64,
    "is_ego_child": pl.Int64,
    "id_child": pl.String,
}

spine = (
    pl
    .scan_csv(
        config["data"]["spine"],
        separator=",",
        encoding="utf8",
        schema_overrides=dtypes_spine,
    )
    .select(list(dtypes_spine))
    .collect()
)

The household and network composition figures only concern households with children aged 0-4. Therefore, from here onwards, I only calculate composition for such households.

# 1 - Household composition

In [None]:
# Get ids of ego children: one id_child per household (id_hhd), exposed as RINPERSOON
# NOTE(review): keep="any" retains an arbitrary row per household; presumably each
# household has a single ego child so the choice is immaterial - confirm.
id_egos = (
    sample
    .filter(pl.col("is_ego_child") == 1)
    .unique(subset=["id_hhd"], keep="any")
    .select(pl.col("id_child").alias("RINPERSOON"))
)
print(f"Number of households with children aged 0-4: {id_egos.height}")

In [None]:
# Read housemate edges, keeping only the edges whose RINPERSOON is a target child
dtypes_edges_housemate = {
    "RINPERSOON": pl.String,
    "RINPERSOONRELATIE": pl.String,
}

edges_housemate = (
    pl
    .scan_csv(
        config["data"]["edges_house_full"],
        separator=",",
        encoding="utf8",
        schema_overrides=dtypes_edges_housemate,
    )
    # Semi join: filters rows on RINPERSOON without adding columns
    .join(id_egos.lazy(), on="RINPERSOON", how="semi")
    .collect()
)

In [None]:
# Read uniplex family edge types, keeping only the edges whose RINPERSOON is a target child
dtypes_edges_fam_uni = {
    "RINPERSOON": pl.String,
    "RINPERSOONRELATIE": pl.String,
    "RELATIE": pl.String,
}

edges_fam_uni = (
    pl
    .scan_csv(
        config["data"]["edges_fam_uniplex"],
        separator=",",
        encoding="utf8",
        schema_overrides=dtypes_edges_fam_uni,
    )
    # Semi join: filters rows on RINPERSOON without adding columns
    .join(id_egos.lazy(), on="RINPERSOON", how="semi")
    .collect()
)

In [None]:
# Join family edge types to the household edges
# The number of rows is printed before and after the left join so that any
# change in the number of housemate edges caused by the join is visible
print(len(edges_housemate))
edges_house_fam = (
    edges_housemate
    .join(edges_fam_uni, how="left", on=["RINPERSOON","RINPERSOONRELATIE"])
    # Housemates without a family edge get the code "0". The string "0" is used
    # (not the integer 0) so that RELATIE keeps a single String dtype instead of
    # relying on implicit integer-to-string coercion
    .with_columns(pl.col("RELATIE").fill_null("0"))
)
print(len(edges_house_fam))

In [None]:
# Check edge types: frequency of each RELATIE code among housemate edges
# (the printed table row limit is raised to 30 first)
pl.Config.set_tbl_rows(30)
print(edges_house_fam.get_column("RELATIE").value_counts(sort=True))

In [None]:
# Check edge types for children with <=15 housemates
# Same selection as for all ego children, restricted to hhd_size <= 15
# NOTE(review): keep="any" retains an arbitrary row per household - presumably
# there is one ego child per household; confirm.
id_egos_smallhhd = (
    sample
    .filter((pl.col("hhd_size") <= 15) & (pl.col("is_ego_child") == 1))
    .unique(subset=["id_hhd"], keep="any")
    .select(pl.col("id_child").alias("RINPERSOON"))
)
check_smallhhd = edges_house_fam.join(id_egos_smallhhd, on="RINPERSOON", how="semi")
print(check_smallhhd.get_column("RELATIE").value_counts(sort=True))

In [None]:
# Recoding kin based on kin proximity
# Number prefixes are used for sorting and disposed of later
# The mapping is written label -> codes and then inverted to code -> label
_codes_by_label = {
    "10Parent": ["301"],
    "20Partner": ["302", "312", "313"],
    "30Grandparent": ["303"],
    "50Child": ["304"],
    "60Grandchild": ["305"],
    "70Sibling": ["306", "307", "308"],
    "80Step-kin": ["317", "318", "319"],
    "90Extended kin": ["400", "309", "310", "311", "320", "321", "322"],
    "98In-law": ["314", "315", "316"],
    # "0" marks housemates without a family edge
    "99Other": ["0"],
}
recode_edges = {
    code: label
    for label, codes in _codes_by_label.items()
    for code in codes
}

edges_house_fam_rec = edges_house_fam.with_columns(
    pl.col("RELATIE").cast(pl.String).replace(recode_edges)
)

In [None]:
# Household types based on cohabiting kin
# Result: one row per ego with a string listing the distinct housemate types
hhd_types = (
    edges_house_fam_rec
    .lazy()
    # Keep only one edge per type and ego
    .unique(["RINPERSOON", "RELATIE"], keep="any")
    # Collect all housemate types per ego into a list
    .group_by("RINPERSOON", maintain_order=True)
    .agg(edge_types=pl.col("RELATIE"))
    .with_columns(
        pl.col("edge_types")
        # Sort by the number prefixes so equal combinations get equal strings
        .list.sort()
        # From list to string to be able to check unique households
        .list.join(" | ")
        # Remove number prefixes used for sorting
        .str.replace_all(r"[0-9]+", "")
    )
    .collect()
)

In [None]:
# In smaller households: keep only the egos listed in id_egos_smallhhd
hhd_types_smallhhd = hhd_types.join(id_egos_smallhhd, on="RINPERSOON", how="semi")
print(hhd_types_smallhhd.get_column("edge_types").value_counts(sort=True))

In [None]:
# Prepare output by collapsing combinations with less than 10 households into one row
hhd_comp_output = hhd_types_smallhhd.get_column("edge_types").value_counts(sort=True)
# Combinations with at least 10 households are reported as they are
hhd_comp_output_above = hhd_comp_output.filter(pl.col("count") >= 10)
# The remaining combinations are summed into a single "Other combinations" row
hhd_comp_output_below = (
    hhd_comp_output
    .filter(pl.col("count") < 10)
    .select(
        pl.lit("Other combinations").alias("edge_types"),
        pl.col("count").sum(),
    )
)
hhd_comp_output = pl.concat([hhd_comp_output_above, hhd_comp_output_below])


In [None]:
# Store table for smaller households at the path configured under output/tab_hhd_comp
hhd_comp_output.write_csv(
    config["output"]["tab_hhd_comp"],
    separator=",",
    line_terminator="\n",
)

# 2 - Network composition

In [None]:
# Prepare RINPERSOON-id_hhd pairs from the spine
id_hhd = spine.select("id_hhd", "RINPERSOON")

In [None]:
# Prepare child-id_hhd pairs: attach the household id to every ego child
id_egos_hhds = id_egos.join(id_hhd, on="RINPERSOON", how="left")

In [None]:
# Recode RELATIE in uniplex family edgelist
# Maps each RELATIE code to the label used for the kin count columns
recode_counts = {
    "301": "parent",
    "302": "coparent",
    "312": "partner",
    "313": "partner",
    "303": "grandparent",
    "400": "great-grandparent",
    "304": "child",
    "305": "grandchild",
    "306": "sibling",
    "307": "sibling",
    "308": "sibling",
    "309": "cousin",
    "310": "nephew",
    "311": "uncle",
    "320": "cousin-in-law",
    "321": "nephew-in-law",
    "322": "uncle-in-law",
    "314": "parent-in-law",
    "315": "child-in-law",
    "316": "sibling-in-law",
    "317": "step-parent",
    "318": "step-child",
    "319": "step-sibling",
}

edges_fam_counts = edges_fam_uni.with_columns(
    pl.col("RELATIE").cast(pl.String).replace(recode_counts)
)

In [None]:
def count_kin_per_distance(egos_hhds, distance, edges_family, household_ids):
    '''
    Function to count the number of each type of family member present
    at a specified distance in the household network of egos.

    The household edgelist is read from the file configured under
    config["data"]["edges_hhd_d<distance>"], so the module-level `config`
    must be loaded before this function is called.

    Inputs:
    * egos_hhds: Dataframe containing the RINPERSOONs of the target egos
                as well as their household ids (columns "RINPERSOON"
                and "id_hhd").
    * distance: An integer specifying the network distance at which kin
                should be counted.
    * edges_family: Uniplex family edgelist used to identify types of kin
                (columns "RINPERSOON", "RINPERSOONRELATIE" and "RELATIE").
    * household_ids: Dataframe containing all RINPERSOONs in the population
                and their respective household ids (columns "RINPERSOON"
                and "id_hhd").

    Output:
    * kin_counts: Dataframe containing one row per ego that has at least
                one household edge at the specified distance, with:
                - "RINPERSOON": the ego;
                - one column "<type>_<distance>" per type of family member
                  indicating the count of that type at the specified
                  distance (alters without a family edge to the ego are
                  counted under "other_<distance>");
                - "net_size_ind_<distance>": the number of distinct alters
                  at the specified distance.
    '''

    # Load household edges at distance, keeping only the edges that start in
    # the household of an ego and labelling them with the ego's RINPERSOON
    dtypes_edges_hhd = {
        "ego":pl.Int64,
        "alter":pl.Int64
    }
    edges_path = f"edges_hhd_d{distance}"
    edges_hhd = (
        pl
        .scan_csv(config["data"][edges_path], separator=",", encoding="utf8", schema_overrides=dtypes_edges_hhd)
        .join(egos_hhds.lazy(), how="inner", left_on="ego", right_on="id_hhd")
        .select(["RINPERSOON","alter"])
        .rename({"RINPERSOON":"ego"})
        .collect()
    )

    # Replace alter-households by their members, i.e. alter-individuals
    # NOTE(review): this is a left join, so an alter-household without any
    # member in household_ids yields a null alter that is still counted
    # below - presumably this does not occur; confirm.
    edges_ind = (
        edges_hhd
        .join(household_ids, how="left", left_on="alter", right_on="id_hhd")
        .select(["ego","RINPERSOON"])
        .rename({"RINPERSOON":"alter"})
        # An alter is counted only once per ego
        .unique(["ego","alter"], keep="any")
    )

    # Join family relationship to alter
    edges_ind = (
        edges_ind
        .join(edges_family, how="left", left_on=["ego","alter"], right_on=["RINPERSOON","RINPERSOONRELATIE"])
        .select(["ego","alter","RELATIE"])
        # Alters without a family edge to the ego are labelled "other"
        .with_columns(pl.col("RELATIE").fill_null("other"))
    )

    # Count kin
    kin_counts = (
        edges_ind
        .group_by(["ego","RELATIE"])
        .len()
        .rename({"len":"count"})
        # Pivot from long to wide
        .pivot("RELATIE",index="ego",values="count")
        # Fill nulls with zeroes
        .fill_null(strategy="zero")
        # Rename ego
        .rename({"ego":"RINPERSOON"})
    )

    # Suffix every count column with the distance. The mapping is built
    # directly from the original names, so it is valid for any distance
    # (including distances with more than one digit)
    cols_rename = {
        col: f"{col}_{distance}"
        for col in kin_counts.columns
        if col != "RINPERSOON"
    }
    kin_counts = kin_counts.rename(cols_rename)

    # Calculate degree / network size as well
    net_size = (
        edges_ind
        .group_by("ego")
        .len()
        .rename({"ego":"RINPERSOON", "len":f"net_size_ind_{distance}"})
    )

    kin_counts = kin_counts.join(net_size, how="left", on="RINPERSOON")

    return kin_counts

In [None]:
# Count kin at distances 1-4 (one dataframe per distance, computed in increasing order)
kin_counts_1, kin_counts_2, kin_counts_3, kin_counts_4 = (
    count_kin_per_distance(id_egos_hhds, distance, edges_fam_counts, id_hhd)
    for distance in range(1, 5)
)

In [None]:
# Count kin at distance 0 (in the household)
kin_counts_0 = (
    edges_housemate
    .join(edges_fam_counts, on=["RINPERSOON", "RINPERSOONRELATIE"], how="left")
    # Housemates without a family edge to the ego are labelled "other"
    .with_columns(pl.col("RELATIE").fill_null("other"))
    # Count housemates per ego and relation type
    .group_by(["RINPERSOON", "RELATIE"])
    .len()
    .rename({"len": "count"})
    # Pivot from long to wide: one column per relation type
    .pivot("RELATIE", index="RINPERSOON", values="count")
    # Relation types absent for an ego get a count of zero
    .fill_null(strategy="zero")
)

# Suffix every count column (all but the leading RINPERSOON) with "_0"
cols_rename = {col: col + "_0" for col in kin_counts_0.columns[1:]}
kin_counts_0 = kin_counts_0.rename(cols_rename)

# Calculate net_size_ind_0: number of housemate edges per ego
net_size_d0 = (
    edges_housemate
    .group_by("RINPERSOON")
    .len()
    .rename({"len": "net_size_ind_0"})
)

kin_counts_0 = kin_counts_0.join(net_size_d0, on="RINPERSOON", how="left")

In [None]:
# Merge all: start from the full list of egos and left-join the counts of
# every distance, so that egos without alters at some distance are kept
kin_counts = id_egos
for counts_at_distance in (kin_counts_0, kin_counts_1, kin_counts_2, kin_counts_3, kin_counts_4):
    kin_counts = kin_counts.join(counts_at_distance, on="RINPERSOON", how="left")

# Counts missing after the joins are set to zero
kin_counts = kin_counts.with_columns(pl.all().fill_null(strategy="zero"))

In [None]:
# Store the kin counts at the path configured under data/kincounts
kin_counts.write_csv(
    config["data"]["kincounts"],
    separator=",",
    line_terminator="\n",
)