## 03a - Output: Network composition

- **Project:** _Families, households, networks: Rethinking the relational structure of families through large-scale network data_ <br>
- **Authors:** Nicolás Soler (ORCID 0009-0001-4239-9396), Tom Emery, Agnieszka Kanas <br>
- **Last updated:** January 2026 <br>
- **Full research article published in journal:** _Demography_ (2026)

In [None]:
import yaml
import polars as pl
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load YAML configuration
# The encoding is passed explicitly: without it, open() falls back to the
# platform's locale encoding, so the same file could be decoded differently
# from one machine to another.
path_config = 'config.yml'
with open(path_config, 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

In [None]:
# Read kin counts
# The CSV is scanned lazily; the "RINPERSOON" column is removed before the
# data is materialised, so every remaining column can be summed later on.
path_kincounts = config["data"]["kincounts"]
lazy_counts = pl.scan_csv(path_kincounts, separator=",", encoding="utf8")
kin_counts = lazy_counts.drop("RINPERSOON").collect()

# 1 - Prepare dataframe

In [None]:
# Collapse the table to a single row holding the total of every column
expr_col_totals = pl.all().sum()
agg_counts = kin_counts.select(expr_col_totals)

In [None]:
# Calculate sum of sums columns
#
# Columns are grouped by their stem (the column name minus its last two
# characters) and one total per stem is added to the single-row frame.
# NOTE(review): the two-character slice presumably removes a "_<distance>"
# suffix with a single-digit distance - confirm against the input file.

# Sorted column names (re-used by later cells)
list_cols = sorted(agg_counts.columns)

# Map each stem to the columns that share it
dict_cols = {}
for col in list_cols:
    dict_cols.setdefault(col[:-2], []).append(col)

# Add every stem total in a single pass over the frame
agg_counts = agg_counts.with_columns(
    [pl.sum_horizontal(cols).alias(stem) for stem, cols in dict_cols.items()]
)

In [None]:
# Calculate % of tie type at distance

# Name of the percentage column for every count column
list_cols_perc = ["perc_" + col for col in list_cols]

# Each count column mapped to its total column (name minus last two characters)
dict_count_total = {col: col[:-2] for col in list_cols}

# Share of each count within its total, expressed in percent
agg_counts = agg_counts.with_columns(
    [
        (pl.col(col) / pl.col(dict_count_total[col]) * 100).alias(perc)
        for perc, col in zip(list_cols_perc, list_cols)
    ]
)

In [None]:
# Restrict the frame to the percentage columns
matrix = agg_counts.select(*list_cols_perc)

In [None]:
# From wide to long with pandas

# Stub names = percentage column names without their last two characters.
# They are sorted because iterating a set of strings yields a different order
# from one interpreter session to the next (string hashing is randomised),
# which would make the reshaped column order vary between runs.
list_stubs = sorted({name[:-2] for name in list_cols_perc})

# wide_to_long needs an id column, so a constant column "i" is added first
matrix = matrix.with_columns(i = 1).to_pandas()
matrix = pd.wide_to_long(matrix, i = "i", j = "dist", stubnames = list_stubs, sep = "_")

Pandas from here onwards.

In [None]:
# Rename columns and format df

# Prefix carried by every percentage column
prefix_perc = "perc_"

# Index rows by network distance, order them by it, fill missings.
# Keeping "dist" as the index (instead of a positional 0..n-1 index) makes the
# columns of the transposed matrix carry the real distance values, so the
# axis labels and the "Co-resident" rename below do not rely on row order.
matrix = (
    matrix
    .reset_index(level = "dist")
    .set_index("dist")
    .sort_index()
    .rename_axis(None) # Drop the index name so it does not end up as an axis title
    .fillna(0) # Nulls = Ties that don't appear at a given dist
)

# Strip the "perc_" prefix from the tie names
matrix.columns = [col[len(prefix_perc):] for col in matrix.columns]
matrix = matrix.rename(columns = {"net_size_ind": "all alters"})

# Re-order columns
matrix = matrix.loc[:, ["parent","step-parent","sibling","step-sibling","sibling-in-law","grandparent",
                        #"great-grandparent",
                        "uncle","uncle-in-law","cousin","cousin-in-law","nephew",
                        "nephew-in-law",
                        #"other",
                        "all alters"]]

# Capitalise column names
matrix.columns = [col.capitalize() for col in matrix.columns]

# Transpose df: relationship types become rows, distances become columns
matrix = matrix.T

# Rename column 0 (distance 0)
matrix = matrix.rename(columns = {0:"Co-resident"})

# Add a very small float to avoid weird zero formatting for nulls
matrix = matrix + 0.0000000000001

# 2 - Plot matrix

In [None]:
# Plot matrix

# Figure size is applied through the seaborn settings
sns.set(rc = {"figure.figsize": (7, 6.5)})

# Heatmap appearance: colour scale fixed to 0-100, white lines between cells,
# cell annotations taken from the matrix itself with two decimals
heatmap_kwargs = dict(
    annot = matrix,
    fmt = ".2f",
    cbar = True,
    cmap = "Purples",
    vmin = 0,
    vmax = 100,
    linewidths = 0.8,
    linecolor = "white",
)
fig_2 = sns.heatmap(matrix, **heatmap_kwargs)

# Axis titles
plt.xlabel("Network distance")
plt.ylabel("Relationship type")

In [None]:
# Save figure
# The heatmap call returned an Axes; the figure that owns it is written to the
# path configured under output -> fig_2_kincounts.
fig_2_out = fig_2.get_figure()
path_fig_2 = config["output"]["fig_2_kincounts"]
fig_2_out.savefig(path_fig_2, dpi = 400, bbox_inches = "tight")