## 02e - Variable calculation: Network overlap

- **Project:** _Families, households, networks: Rethinking the relational structure of families through large-scale network data_ <br>
- **Authors:** Nicolás Soler (ORCID 0009-0001-4239-9396), Tom Emery, Agnieszka Kanas <br>
- **Last updated:** January 2026 <br>
- **Full research article published in journal:** _Demography_ (2026)

In [1]:
import yaml
import polars as pl

In [2]:
# Load the YAML configuration into a plain Python object
path_config = 'config.yml'
with open(path_config) as config_file:
    config = yaml.safe_load(config_file)

In [3]:
# Read sample
# Column types are declared explicitly instead of being left to inference
dtypes_sample = {
    "RINPERSOON": pl.String,
    "gender_female": pl.Int64,
    "birth_date": pl.Date,
    "death_date": pl.Date,
    "age": pl.Int64,
    "SOORTOBJECTNUMMER": pl.String,
    "RINOBJECTNUMMER": pl.String,
    "gemeente": pl.String,
    "wijk": pl.String,
    "buurt": pl.String,
    "id_hhd": pl.Int64,
    "hhd_size": pl.Int64,
    "is_ego": pl.Int64,
    "is_ego_child": pl.Int64,
    "id_child": pl.String,
    "net_size_hhd_1": pl.Int64,
    "net_size_hhd_2": pl.Int64,
    "net_size_hhd_3": pl.Int64,
    "net_size_hhd_4": pl.Int64,
    "net_size_hhd": pl.Int64,
    "density_2": pl.Float64,
    "density_3": pl.Float64,
    "density_4": pl.Float64,
    "is_ego_survey": pl.Int64,
}

# Scan lazily, then materialise the whole file in memory
sample = (
    pl.scan_csv(
        config["data"]["sample"],
        separator=",",
        encoding="utf8",
        schema_overrides=dtypes_sample,
    )
    .collect()
)

In [4]:
# Read egonets
# All three columns are read as 64-bit integers
dtypes_egonets = {
    "id_hhd": pl.Int64,
    "ego": pl.Int64,
    "alter": pl.Int64,
}

# Scan lazily, then materialise the whole file in memory
egonets = (
    pl.scan_csv(
        config["data"]["egonets"],
        separator=",",
        encoding="utf8",
        schema_overrides=dtypes_egonets,
    )
    .collect()
)

In [5]:
# Read edgelists
dtypes_edges = {
    "ego": pl.Int64,
    "alter": pl.Int64,
}

# The four edgelists share the same read options; only the config key differs
edges_hhd_d1, edges_hhd_d2, edges_hhd_d3, edges_hhd_d4 = (
    pl.scan_csv(
        config["data"][edges_key],
        separator=",",
        encoding="utf8",
        schema_overrides=dtypes_edges,
    ).collect()
    for edges_key in ("edges_hhd_d1", "edges_hhd_d2", "edges_hhd_d3", "edges_hhd_d4")
)

# 1 - Prepare data

In [6]:
# Restrict the egonets and the edgelists to networks whose ego is a child
# Goal: checking overlap only for and between networks with ego children

# Ids flagged as ego children, one row per id
children = (
    sample
    .filter(pl.col("is_ego_child") == 1)
    .select("id_hhd")
    .unique("id_hhd", keep="any")
)

# Semi-joins keep the matching rows without adding any column
egonets_child = egonets.join(children, on="id_hhd", how="semi")
(
    edges_hhd_d1_child,
    edges_hhd_d2_child,
    edges_hhd_d3_child,
    edges_hhd_d4_child,
) = (
    edges.join(children, left_on="ego", right_on="id_hhd", how="semi")
    for edges in (edges_hhd_d1, edges_hhd_d2, edges_hhd_d3, edges_hhd_d4)
)

In [7]:
# Restrict the edgelists to egos that belong to the survey sample
# Goal: checking overlap only for survey networks but between networks of all women
# Facilitates computation

# Ids flagged as survey egos, one row per id
survey_women = (
    sample
    .filter(pl.col("is_ego_survey") == 1)
    .select("id_hhd")
    .unique("id_hhd", keep="any")
)

# Semi-joins keep the matching rows without adding any column
(
    edges_hhd_d1_survey,
    edges_hhd_d2_survey,
    edges_hhd_d3_survey,
    edges_hhd_d4_survey,
) = (
    edges.join(survey_women, left_on="ego", right_on="id_hhd", how="semi")
    for edges in (edges_hhd_d1, edges_hhd_d2, edges_hhd_d3, edges_hhd_d4)
)

In [8]:
# Build lookups from each alter to the egonets (id_hhd) in which it appears:
# one row per alter-network pair, sorted by alter and then by network

# Women 18-40
alter_to_net = (
    egonets
    .unique(subset=["alter", "id_hhd"], keep="any")
    .drop("ego")
    .select(["alter", "id_hhd"])
    .sort(by=["alter", "id_hhd"])
)

# Children
alter_to_net_child = (
    egonets_child
    .unique(subset=["alter", "id_hhd"], keep="any")
    .drop("ego")
    .select(["alter", "id_hhd"])
    .sort(by=["alter", "id_hhd"])
)

In [9]:
# Free memory: drop the id lists, the egonets and the unfiltered edgelists
del children, survey_women
del egonets, egonets_child
del edges_hhd_d1, edges_hhd_d2, edges_hhd_d3, edges_hhd_d4

# 2 - Calculate overlap

In [10]:
# Function to calculate overlap
def identify_overlapping_nets(alter_to_net, edges_hhd_d):
    '''
    Function to identify all ego-networks with which the network
    of ego overlaps at a specified distance.

    Inputs:
    * alter_to_net: Dataframe with the columns "alter" and "id_hhd",
        identifying each of the ego-networks in which an alter can
        be found.
    * edges_hhd_d: Edgelist with the columns "ego" and "alter",
        holding the alters of egos at the specified distance.

    Output:
    * overlap_nets: Ego-to-network edgelist (the "alter" column is
        replaced by "id_net") with one row per distinct ego-network
        pair. Alters that appear in no network contribute no row.
    '''
    # NOTE(review): rows where id_net identifies ego's own network are not
    # removed here - confirm whether the measure should include them.
    overlap_nets = (
        edges_hhd_d
        .lazy()
        # Join network of each alter. An inner join is used so that alters
        # without any network do not yield a null id_net, which would
        # otherwise survive as an (ego, null) pair and be counted as a network
        .join(alter_to_net.lazy(), how="inner", on="alter")
        .rename({"id_hhd": "id_net"})
        # Keep one row per ego-to-network pair
        .unique(["ego", "id_net"], keep="any")
        .drop("alter")
        .collect()
    )

    return overlap_nets

In [11]:
def calculate_overlap(list_overlap_nets, sample_suffix, distance):
    '''
    Function to count, for each ego, the number of distinct networks
    with which its network overlaps at a specified cumulative distance.

    Inputs:
    * list_overlap_nets: List of dataframes, each with the columns
        "ego" and "id_net", containing the networks with which the
        network of ego overlaps at a given distance. The list is
        pooled to obtain the specified cumulative distance.
    * sample_suffix: Suffix string to add to the column name to identify
        the sample for which the measure was calculated.
    * distance: Cumulative distance at which overlap should be calculated.
        Only used to build the name of the output column.

    Output:
    * overlap: Dataframe with one row per ego and the column
        "overlap_{sample_suffix}_{distance}" holding the count.
    '''

    # Stack the per-distance edgelists into a single frame
    pooled_nets = pl.concat(list_overlap_nets)

    # A network reached at several distances must be counted only once
    distinct_pairs = pooled_nets.unique(["ego", "id_net"], keep="any")

    # Number of overlapping networks per ego
    counts_per_ego = distinct_pairs.group_by("ego").len()

    column_name = f"overlap_{sample_suffix}_{distance}"
    return counts_per_ego.rename({"len": column_name})

In [12]:
# Calculate overlap child

# Overlapping networks at each single distance (index 0 is distance 1)
overlap_nets_child = [
    identify_overlapping_nets(alter_to_net_child, edges)
    for edges in (
        edges_hhd_d1_child,
        edges_hhd_d2_child,
        edges_hhd_d3_child,
        edges_hhd_d4_child,
    )
]

# Cumulative distance d pools the first d per-distance frames
overlap_1_child, overlap_2_child, overlap_3_child, overlap_4_child = (
    calculate_overlap(overlap_nets_child[:dist], "child", dist)
    for dist in (1, 2, 3, 4)
)

del overlap_nets_child

In [13]:
# Calculate overlap women 18-40 in survey

# Overlapping networks at each single distance (index 0 is distance 1)
overlap_nets_survey = [
    identify_overlapping_nets(alter_to_net, edges)
    for edges in (
        edges_hhd_d1_survey,
        edges_hhd_d2_survey,
        edges_hhd_d3_survey,
        edges_hhd_d4_survey,
    )
]

# Cumulative distance d pools the first d per-distance frames
overlap_1_survey, overlap_2_survey, overlap_3_survey, overlap_4_survey = (
    calculate_overlap(overlap_nets_survey[:dist], "survey", dist)
    for dist in (1, 2, 3, 4)
)

del overlap_nets_survey

In [14]:
# Join to sample
overlap_dfs = [
    overlap_1_survey,
    overlap_2_survey,
    overlap_3_survey,
    overlap_4_survey,
    overlap_1_child,
    overlap_2_child,
    overlap_3_child,
    overlap_4_child,
]

# Left joins keep every sample row; rows without a matching ego get a null
for overlap_df in overlap_dfs:
    sample = sample.join(overlap_df, left_on="id_hhd", right_on="ego", how="left")

In [15]:
# Fill in missings
# A row gets 0 when its network is non-empty, its overlap value is null and it
# is flagged as an ego of the corresponding sample; every other row keeps the
# value it already had (nulls included).
# NOTE(review): the values are read from overlap_<sample>_<distance> but
# written to overlap_<distance>_<sample>, so both sets of columns remain in
# the frame - confirm that this is intended.
sample = sample.with_columns(
    [
        pl.when(
            (pl.col("net_size_hhd") > 0)
            & pl.col(f"overlap_{group}_{dist}").is_null()
            & (pl.col(ego_flag) == 1)
        )
        .then(0)
        .otherwise(pl.col(f"overlap_{group}_{dist}"))
        .alias(f"overlap_{dist}_{group}")
        for group, ego_flag in (("child", "is_ego_child"), ("survey", "is_ego_survey"))
        for dist in (1, 2, 3, 4)
    ]
)

In [16]:
# Store
# NOTE(review): the output path is the same config entry the sample was read
# from, so the input file is overwritten - confirm that re-running the
# notebook on the already-extended file is safe.
sample.write_csv(
    config["data"]["sample"],
    separator=",",
    line_terminator="\n",
)