## 03b - Output: Network size

- **Project:** _Families, households, networks: Rethinking the relational structure of families through large-scale network data_ <br>
- **Authors:** Nicolás Soler (ORCID 0009-0001-4239-9396), Tom Emery, Agnieszka Kanas <br>
- **Last updated:** January 2026 <br>
- **Full research article published in journal:** _Demography_ (2026)

In [None]:
import yaml
import polars as pl
import polars.selectors as cs
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.legend_handler import HandlerLine2D
from mpl_toolkits.axes_grid1.inset_locator import inset_axes
import matplotlib.patches as patches
import seaborn as sns
import seaborn.objects as so
from seaborn import axes_style
import matplotlib as mpl

In [None]:
# Load YAML configuration
path_config = 'config.yml'
# Decode explicitly as UTF-8 so that parsing does not depend on the
# platform's default locale encoding
with open(path_config, 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

In [None]:
# Read network size

# Column types: the household identifier plus one size column per
# network distance (suffixes 1 to 4)
dtypes_size = {
    "id_hhd": pl.Int64,
    **{f"net_size_hhd_{k}": pl.Float64 for k in range(1, 5)},
}

# Lazily scan the sample file with the column types above
sample_scan = pl.scan_csv(
    config["data"]["sample"],
    separator=",",
    encoding="utf8",
    schema_overrides=dtypes_size,
)

# Keep rows where is_ego_child equals 1, restricted to the typed columns
ego_rows = sample_scan.filter(pl.col("is_ego_child") == 1).select(dtypes_size.keys())

# Keep only one row per ego-household with child 0-4
size = ego_rows.unique(subset=["id_hhd"], keep="any").collect()

# 1 - Prepare dataframe

In [None]:
# Dataframe for grouped plot

# Stub shared by the wide columns; the numeric suffix becomes "dist"
degree_var = "net_size_hhd"
# Household identifier followed by the four wide size columns
list_cols = ["id_hhd"] + [f"{degree_var}_{k}" for k in range(1, 5)]

# Missing values count as degree = 0
size_wide = size.select(list_cols).fill_null(strategy="zero")

# Reshape from wide to long with pandas (polars -> pandas -> polars)
size_long = pd.wide_to_long(
    size_wide.to_pandas(),
    stubnames=degree_var,
    i="id_hhd",
    j="dist",
    sep="_",
)

# Back to polars, keeping the index as columns, and tidy the row order
size_plot = pl.from_pandas(size_long, include_index=True).sort(["id_hhd", "dist"])

In [None]:
# Frequency dataframe for hhd-level figure

# Number of rows for every (distance, network size) combination;
# the count is stored in the column "len"
size_counts = size_plot.group_by(["dist", degree_var]).len()

# Drop combinations with fewer than 10 observations (CBS privacy
# regulation), then order by distance and network size
size_plot_hhd = (
    size_counts
    .filter(pl.col("len") >= 10)
    .sort(["dist", degree_var])
)

# 2 - Plot figure

In [None]:
# Plot

# Initialise figure with matplotlib
fig_3, ax = plt.subplots(figsize=(7, 5), layout="constrained")

# Colorblind-friendly palette, one colour per network distance (1-4).
# NOTE(review): deliberately kept as a tuple, exactly as before; seaborn may
# infer a different colour scale from a list, so check the rendered figure
# before changing the container type.
palette_fig_3 = ("#785EF0", "#DC267F", "#FE6100", "#FFB000")

# Plot main content with seaborn objects (so)
(
    # Plot by distance
    so.Plot(x="net_size_hhd", y="len", data=size_plot_hhd, color="dist")
    # Add data points
    .add(so.Dots(pointsize=4, fillalpha=1))
    # Add line connecting data points
    .add(so.Line(linewidth=1, alpha=0.7))
    # Square-root axes. Axis ticks and labels are not set here because they
    # are defined once, on the matplotlib axes, further down.
    .scale(
        x=so.Continuous(trans="sqrt"),
        y=so.Continuous(trans="sqrt"),
        color=palette_fig_3,
    )
    # Attach to figure
    .on(ax)
    .plot()
)

# Take the legend that the plot attached to the figure and rebuild it
# inside the axes with custom formatting
so_legend = fig_3.legends.pop(0)
legend = ax.legend(
    # Title
    title="Network distances:",
    # Location
    loc="upper right",
    # Box format
    ncol=4,
    columnspacing=0.06,
    handletextpad=-0.4,
    # Values and labels
    handles=so_legend.legend_handles,
    labels=[t.get_text() for t in so_legend.texts],
    fontsize=11,
    # Legend box
    frameon=True,
    fancybox=False,
    facecolor="white",
    edgecolor="black",
)

# Figure colour
ax.set_facecolor("white")
# Grid
ax.grid(True, color="#DED3D1", linestyle=":")
# Axes labels and label size
ax.set_xlabel("Network size", fontdict={"fontsize": 12})
ax.set_ylabel("Count", fontdict={"fontsize": 12})
# Ticks
ax.set_xticks([0, 5, 10, 25, 50, 100, 200, 400])
ax.set_yticks([10, 1000, 10000, 25000, 50000, 100000, 200000])
ax.set_yticklabels([10, "1k", "10k", "25k", "50k", "100k", "200k"])
ax.tick_params(axis="both", which="both", labelsize=12)
# Figure borders
for side in ["top", "bottom", "right", "left"]:
    ax.spines[side].set_color("black")
    ax.spines[side].set_linewidth(0.8)

# Largest network size present in size_plot_hhd for each distance
max_d1, max_d2, max_d3, max_d4 = (
    size_plot_hhd.filter(pl.col("dist") == d)["net_size_hhd"].cast(pl.Int64).max()
    for d in (1, 2, 3, 4)
)

# Limit tails of distributions; set on `ax` explicitly rather than on
# pyplot's implicit current axes
ax.set_xlim(-0.2, 500)

# Print maxima to show that the tails have been limited
ax.add_patch(
    patches.Rectangle(
        (272.5, 146000), 216, 61000,
        facecolor="white", edgecolor="black", alpha=1, zorder=2,
    )
)
ax.annotate("X-axis maxima:", xy=(284, 182000), fontsize=10)
# One coloured number per distance; all but the last are followed by a comma
maxima_labels = [f"{max_d1},", f"{max_d2},", f"{max_d3},", f"{max_d4}"]
for x_pos, text, colour in zip((284, 320, 371, 428), maxima_labels, palette_fig_3):
    ax.annotate(text, xy=(x_pos, 158000), fontsize=11, weight="bold", color=colour)

In [None]:
# Store
# Output location of the network-size figure, taken from the configuration
path_fig_3 = config["output"]["fig_3_size"]
fig_3 = fig_3.get_figure()
fig_3.savefig(path_fig_3, dpi=400, bbox_inches="tight")

# 3 - Histogram version

In [None]:
# Prepare data

# Number of rows sharing the same (distance, network size) combination,
# attached to every row as the helper column "len"
cell_count = pl.col("dist").len().over(["dist", "net_size_hhd"]).alias("len")

size_plot_hist = (
    size_plot
    .sort(["id_hhd", "dist"])
    .with_columns(cell_count)
    # Exclude data points with fewer than 10 cases,
    # following CBS privacy legislation
    .filter(pl.col("len") >= 10)
    .sort("id_hhd")
    # The helper column is no longer needed once the filter is applied
    .drop("len")
)

In [None]:
# Initialise figure with matplotlib
fig_3b = mpl.figure.Figure(figsize=(8, 6), layout="tight", dpi=400)

# Plot main content with seaborn objects (so)
bins = 50
p = (
    # Clustering by distance
    so.Plot(data=size_plot_hist, x="net_size_hhd", color="dist")
    # 4 subplots
    .facet("dist", wrap=2)
    # Add histogram
    .add(so.Bars(alpha=1), so.Hist(bins=bins, stat="count"))
    # Adjust axes
    .scale(
        x=so.Continuous(),
        y=so.Continuous(trans="symlog"),
        # Colorblind-friendly palette
        color=["#785EF0", "#DC267F", "#FE6100", "#FFB000"],
    )
    # Axes limits
    .limit(x=(0, 400), y=(10, 1000000))
    # Label axes; each subplot title is built from its distance value
    .label(
        x="Network size",
        y="Count",
        title="Distance {}".format,
    )
    # Configure theme
    .theme({
        **axes_style("whitegrid"),
        "axes.edgecolor": "black",
        "grid.linestyle": ":",
        "legend.frameon": False,
        "legend.fontsize": 20,
    })
    # Plot on figure
    .on(fig_3b)
).plot()

# The plot was drawn on fig_3b, so work with that figure directly instead of
# going through the plotter's private `_figure` attribute.

# Tick labels: pin the tick positions first so that each label is guaranteed
# to sit on the value it names, instead of being assigned by index to
# whatever ticks the automatic locator happens to produce
ax_first = fig_3b.axes[0]
ax_first.set_yticks([10, 100, 1000, 10000, 100000, 1000000])
ax_first.set_yticklabels(["10", "100", "1k", "10k", "100k", "1M"])

# No legend
fig_3b.legends.clear()

p.save(config["output"]["fig_3_size_hist"], dpi=400)