## 03d - Output: Network overlap

- **Project:** _Families, households, networks: Rethinking the relational structure of families through large-scale network data_ <br>
- **Authors:** Nicolás Soler (ORCID 0009-0001-4239-9396), Tom Emery, Agnieszka Kanas <br>
- **Last updated:** January 2026 <br>
- **Full research article published in journal:** _Demography_ (2026)

In [None]:
import yaml
import polars as pl
import pandas as pd
import seaborn.objects as so
from seaborn import axes_style
import matplotlib as mpl

In [None]:
# Load YAML configuration
# The file is opened with an explicit UTF-8 encoding so that parsing does not
# depend on the platform's default locale encoding.
path_config = 'config.yml'
with open(path_config, 'r', encoding='utf-8') as f:
    # safe_load only builds plain Python objects (no arbitrary object construction)
    config = yaml.safe_load(f)

In [None]:
# Read sample
# Columns to load, each forced to Int64: the household id plus the four
# overlap_child_<n> columns (n = 1..4).
dtypes_overlap = {
    "id_hhd": pl.Int64,
    **{f"overlap_child_{n}": pl.Int64 for n in range(1, 5)},
}

# Lazy scan of the sample file; nothing is read until .collect()
sample_lazy = pl.scan_csv(
    config["data"]["sample"],
    separator=",",
    encoding="utf8",
    schema_overrides=dtypes_overlap,
)

overlap = (
    sample_lazy
    # Rows flagged with is_ego_child == 1
    .filter(pl.col("is_ego_child") == 1)
    # Only the id and overlap columns declared above
    .select(list(dtypes_overlap))
    # Keep only one row per ego-household with child 0-4
    # (keep="any": which of the duplicate rows survives is not specified)
    .unique(subset=["id_hhd"], keep="any")
    .collect()
)

# 1 - Prepare dataframe

In [None]:
# From wide to long and back to Polars
# pandas' wide_to_long turns the overlap_child_<dist> columns into a single
# "overlap_child" column, with the numeric suffix stored in "dist".
stub = "overlap_child"
overlap_plot = pd.wide_to_long(
    overlap.to_pandas(),
    stubnames=stub,
    i="id_hhd",
    j="dist",
    sep="_",
)
# wide_to_long puts id_hhd and dist in the index; include_index brings them
# back as regular columns in the Polars frame.
overlap_plot = pl.from_pandas(overlap_plot, include_index=True)

# Keep only overlap values with at least 10 observations per dist
min_obs = 10
overlap_plot = (
    overlap_plot
    # Number of rows sharing the same (dist, overlap value) pair
    .with_columns(n_obs=pl.col("dist").len().over(["dist", stub]))
    .filter(pl.col("n_obs") >= min_obs)
    .drop("n_obs")
    # Sort once, at the end, on both keys: the windowed count does not depend
    # on row order, and sorting on id_hhd alone would leave the order of the
    # dist rows within a household unspecified.
    .sort(["id_hhd", "dist"])
)


# 2 - Plot figure

In [None]:
# Initialise figure with matplotlib
fig_5 = mpl.figure.Figure(figsize=(8, 6), layout="tight", dpi=400)

# Plot main content with seaborn objects (so)
bins = 50
p = (
    # Clustering by distance
    so.Plot(data=overlap_plot, x="overlap_child", color="dist")
    # One subplot per dist value, wrapped over 2 columns
    .facet("dist", wrap=2)
    # Add histogram
    .add(
        so.Bars(alpha=1),
        so.Hist(
            bins=bins,
            stat="count",
        ),
    )
    # Adjust axes
    .scale(
        # Ticks
        x=so.Continuous().tick(every=200),
        y=so.Continuous(trans="symlog").tick(at=[10, 100, 1000, 10000, 100000, 1000000]),
        # Colorblind-friendly palette
        color=["#785EF0", "#DC267F", "#FE6100", "#FFB000"],
    )
    # Axes limits
    .limit(x=(0, 1200), y=(10, 1000000))
    # Label axes
    .label(
        x=None,
        y="Count",
        # Called with each facet value to build the subplot title
        title="By distance {}".format,
    )
    # Configure theme
    .theme({**axes_style("whitegrid"), "axes.edgecolor": "black", "grid.linestyle": ":", "legend.frameon": False, "legend.fontsize": 20})
    # Plot on figure
    .on(fig_5)
).plot()

# The plot was drawn on fig_5 (see .on(fig_5) above), so the remaining tweaks
# are applied to fig_5 directly rather than through the private p._figure.

# Tick labels: one label per y tick position set in .scale() above, applied to
# the first subplot's y axis.
# NOTE(review): presumably the other facets pick these up through the shared
# y axis - confirm on the rendered figure.
fig_5.axes[0].yaxis.set_ticklabels(["10", "100", "1k", "10k", "100k", "1M"])

# X label (a single label for the whole figure, since x=None above)
fig_5.supxlabel("Number of networks that overlap with ego's network")

# No legend
fig_5.legends.clear()

p.save(config["output"]["fig_5_overlap"], dpi=400)