## 02b - Variable calculation: Network density

- **Project:** _Families, households, networks: Rethinking the relational structure of families through large-scale network data_ <br>
- **Authors:** Nicolás Soler (ORCID 0009-0001-4239-9396), Tom Emery, Agnieszka Kanas <br>
- **Last updated:** January 2026 <br>
- **Full research article published in journal:** _Demography_ (2026)

In [None]:
import yaml
import polars as pl

In [None]:
# Load YAML configuration
path_config = 'config.yml'
# Decode the file explicitly as UTF-8 so that the parsed values do not depend
# on the platform's default locale encoding
with open(path_config, 'r', encoding='utf-8') as f:
    # safe_load only builds plain Python objects (no arbitrary object construction)
    config = yaml.safe_load(f)

In [None]:
# Read sample
# Column name -> polars dtype, overriding type inference when the CSV is parsed
dtypes_sample = {
    "RINPERSOON": pl.String,
    "gender_female": pl.Int64,
    "birth_date": pl.Date,
    "death_date": pl.Date,
    "age": pl.Int64,
    "SOORTOBJECTNUMMER": pl.String,
    "RINOBJECTNUMMER": pl.String,
    "gemeente": pl.String,
    "wijk": pl.String,
    "buurt": pl.String,
    "id_hhd": pl.Int64,
    "hhd_size": pl.Int64,
    "is_ego": pl.Int64,
    "is_ego_child": pl.Int64,
    "id_child": pl.String,
    "net_size_hhd_1": pl.Int64,
    "net_size_hhd_2": pl.Int64,
    "net_size_hhd_3": pl.Int64,
    "net_size_hhd_4": pl.Int64,
    "net_size_hhd": pl.Int64,
}

# Scan lazily, then materialise the whole file as an in-memory DataFrame
sample = (
    pl.scan_csv(
        config["data"]["sample"],
        separator=",",
        encoding="utf8",
        schema_overrides=dtypes_sample,
    )
    .collect()
)

In [None]:
# Read edgelists
# Both endpoints of an edge are parsed as 64-bit integers
dtypes_edges = {"ego": pl.Int64, "alter": pl.Int64}

# Every edgelist is parsed with the same settings. The config keys below are
# listed in the same order as the variables they are unpacked into.
edges_hhd, edges_hhd_d1, edges_hhd_d2, edges_hhd_d3, edges_hhd_d4 = (
    pl.scan_csv(
        config["data"][key],
        separator=",",
        encoding="utf8",
        schema_overrides=dtypes_edges,
    ).collect()
    for key in (
        "edges_hhd",
        "edges_hhd_d1",
        "edges_hhd_d2",
        "edges_hhd_d3",
        "edges_hhd_d4",
    )
)

# 1 - Calculate density

In [None]:
# Calculate cumulative network size
# net_size_hhd_cum_d = net_size_hhd_1 + ... + net_size_hhd_d, for d = 2, 3, 4
densities = (
    sample
    .select(
        "id_hhd",
        *[
            pl.sum_horizontal(
                *[f"net_size_hhd_{k}" for k in range(1, d + 1)]
            ).alias(f"net_size_hhd_cum_{d}")
            for d in (2, 3, 4)
        ],
    )
    # Keep each distinct (id_hhd, cumulative sizes) combination only once
    .unique(keep="any")
)

In [None]:
# Density of the household ego-networks at cumulative distance d
def calculate_density(df_storage, df_edges, list_df_edges, distance):
    """
    Compute the density of each household ego-network up to a cumulative distance.

    Density = (number of edges observed between the members of an ego-network) /
    (n * (n - 1)), where n is the cumulative network size of the household.

    Inputs:
    * df_storage = Dataframe with an `id_hhd` column and a
      `net_size_hhd_cum_{distance}` column; the results are joined onto it.
    * df_edges = Edgelist (`ego`, `alter`) of edges between all households
      in the population.
    * list_df_edges = List of edgelists (`ego`, `alter`), one per distance, that
      together hold the household network edges up to the cumulative distance.
    * distance = Cumulative distance; it is only used to build column names.

    Output:
    * df_storage with three additional columns: `density_numerator_{distance}`,
      `density_denominator_{distance}` and `density_{distance}`.
    """
    size_col = f"net_size_hhd_cum_{distance}"
    numerator_col = f"density_numerator_{distance}"
    denominator_col = f"density_denominator_{distance}"
    density_col = f"density_{distance}"

    # 1 - Membership table: one row per (network id, member household)
    if len(list_df_edges) > 1:
        members = pl.concat(list_df_edges, how="vertical")
    else:
        members = list_df_edges[0]
    # The household owning the network becomes `id_hhd`; each of its alters takes
    # the `ego` role so it can be matched against the population edgelist
    members = members.sort(["ego", "alter"]).rename({"ego": "id_hhd", "alter": "ego"})

    # 2 - Restrict the population edgelist to edges whose two endpoints both
    # occur as a member of at least one ego-network
    member_ids = members.select("ego")
    edges_between_members = (
        df_edges
        .join(member_ids, on="ego", how="semi")
        .join(member_ids, left_on="alter", right_on="ego", how="semi")
    )

    # 3 - Attach to every member the households it is connected to
    # (members without any edge keep a single row with a null `alter`)
    member_edges = members.join(edges_between_members, on="ego", how="left")

    # 4 - Keep an edge only if its target belongs to the same ego-network.
    # Edges pointing back at the network's own household drop out as well,
    # provided that household is not listed among its own alters.
    same_network = members.select(["id_hhd", "ego"]).rename({"ego": "alter"})
    member_edges = member_edges.join(same_network, on=["id_hhd", "alter"], how="semi")

    # 5 - Density numerator: number of retained edges per ego-network
    edge_counts = member_edges.group_by("id_hhd").agg(pl.len().alias(numerator_col))

    # 6 - Density denominator: number of possible edges given the network size
    df_storage = (
        df_storage
        .join(edge_counts, on="id_hhd", how="left")
        .with_columns(
            (pl.col(size_col) * (pl.col(size_col) - 1)).alias(denominator_col)
        )
    )

    # 7 - Density, set to 0 where the plain ratio is undefined
    ratio = pl.col(numerator_col) / pl.col(denominator_col)
    # No edge was counted for the network although a denominator exists
    no_edges = pl.col(numerator_col).is_null() & pl.col(denominator_col).is_not_null()
    # Edges were counted but the denominator is zero
    zero_denominator = (
        pl.col(numerator_col).is_not_null() & (pl.col(denominator_col) == 0)
    )

    return df_storage.with_columns(
        pl.when(no_edges).then(0)
        .when(zero_denominator).then(0)
        .otherwise(ratio)
        .alias(density_col)
    )

In [None]:
# Prepare for loop
# The edgelists for cumulative distance d are those of distances 1 to d
_edges_per_distance = [edges_hhd_d1, edges_hhd_d2, edges_hhd_d3, edges_hhd_d4]
list_dist = [2, 3, 4]
list_df_edges = [_edges_per_distance[:d] for d in list_dist]

In [None]:
# Calculate density at d
# Each pass adds the numerator, denominator and density columns for one
# cumulative distance to `densities`
for df, d in zip(list_df_edges, list_dist):
    densities = calculate_density(
        df_storage=densities,
        df_edges=edges_hhd,
        list_df_edges=df,
        distance=d,
    )

In [None]:
# Join to sample and store
density_cols = ["density_2", "density_3", "density_4"]
density_measures = densities.select(["id_hhd", *density_cols])

# The sample is written back to the same path it was read from. If this notebook
# has been run before, the sample already carries density columns; drop them so
# the join replaces them instead of adding suffixed duplicates.
stale_cols = [col for col in density_cols if col in sample.columns]
if stale_cols:
    sample = sample.drop(stale_cols)

# Left join keeps every row of the sample; households without a match get nulls
sample = sample.join(density_measures, how="left", on="id_hhd")
sample.write_csv(config["data"]["sample"], separator=",", line_terminator="\n")