## 01d - Data preparation: Encrypted LISS survey

- **Project:** _Families, households, networks: Rethinking the relational structure of families through large-scale network data_ <br>
- **Authors:** Nicolás Soler (ORCID 0009-0001-4239-9396), Tom Emery, Agnieszka Kanas <br>
- **Last updated:** January 2026 <br>
- **Full research article published in journal:** _Demography_ (2026)

In [None]:
import yaml
import polars as pl

In [None]:
# Load YAML configuration (the cells below read their file paths from it)
path_config = "config.yml"
# Name the encoding explicitly: without it open() falls back to the platform's
# locale encoding, so the same config file could parse differently (or fail)
# on another machine. YAML files are conventionally UTF-8.
with open(path_config, "r", encoding="utf-8") as f:
    # safe_load only builds plain Python objects (dicts, lists, scalars)
    config = yaml.safe_load(f)

In [None]:
# Read the encrypted survey export (parsed here as CSV), give the two encrypted
# id columns the shorter names used in the rest of the notebook, and store it
survey_raw = (
    pl.scan_csv(config["data"]["survey_raw"], ignore_errors=True)
    .rename(
        {
            "nomem_encr_crypt": "nomem_encr",
            "nohouse_encr_crypt": "nohouse_encr",
        }
    )
    .collect()
)
survey_raw.write_csv(
    config["data"]["survey"],
    separator=",",
    line_terminator="\n",
)

In [None]:
# Read survey

# Identifier columns, forced to string when parsing
dtypes_survey_vars = {
    "nomem_encr": pl.String,
    "nohouse_encr": pl.String,
}

# Per-alter survey items. Each template is expanded for alters 1-25, which
# yields the same column names, in the same order, as writing all 125 out.
_ALTER_COLUMN_TEMPLATES = (
    "alter{}",
    "relation_alter{}_new",
    "contact_f2f_alter{}",
    "help_child_alter{}",
    "talk_child_alter{}",
)

dtypes_survey_alters = {"nomem_encr": pl.String}
dtypes_survey_alters.update(
    {
        template.format(i): pl.String
        for template in _ALTER_COLUMN_TEMPLATES
        for i in range(1, 26)
    }
)


def _read_survey_columns(dtypes):
    """Read the columns named in *dtypes* from the stored survey CSV.

    Args:
        dtypes: Mapping of column name to polars dtype. The keys decide which
            columns are kept (and their order); the values override the
            dtype that would otherwise be inferred.

    Returns:
        A polars DataFrame holding exactly the requested columns. The
        strings "NA" and "" are read as null.
    """
    return (
        pl
        .scan_csv(
            config["data"]["survey"],
            separator=",",
            null_values=["NA", ""],
            encoding="utf8",
            schema_overrides=dtypes,
        )
        .select(list(dtypes))
        .collect()
    )


survey_vars = _read_survey_columns(dtypes_survey_vars)
survey_alters = _read_survey_columns(dtypes_survey_alters)

# 1 - Tidy alter data

In [None]:
# Column lists for easy manipulation and recoding
def _with_respondent_id(template):
    """Return "nomem_encr" followed by *template* filled in for alters 1-25."""
    return ["nomem_encr"] + [template.format(i) for i in range(1, 26)]


cols_alters = _with_respondent_id("alter{}")
cols_relation = _with_respondent_id("relation_alter{}_new")
cols_relation_new = _with_respondent_id("type{}")
# Old name -> new name, pairing the two lists position by position
cols_relation_rename = dict(zip(cols_relation, cols_relation_new))
cols_contact = _with_respondent_id("contact_f2f_alter{}")
cols_help = _with_respondent_id("help_child_alter{}")
cols_talk = _with_respondent_id("talk_child_alter{}")

In [None]:
# Relation codes (kept as strings) that mark an edge as kin
kin_edge_types = {"1", "2", "3", "4", "5"}

In [None]:
# Reshape the alter columns so that each row is one respondent / alter slot
_alter_slots = survey_alters.select(cols_alters).unpivot(
    index="nomem_encr",
    variable_name="alter",
)

pivot_alters = (
    _alter_slots
    # Turn the former column label (e.g. "alter7") into its numeric alter id
    .with_columns(
        pl.col("alter").str.extract(r"alter([0-9]+)", group_index=1).cast(pl.Int64)
    )
    .sort(["nomem_encr", "alter"])
    # 1 when the slot holds a value, 0 when it is empty
    .with_columns(
        reported=pl.when(pl.col("value").is_not_null()).then(1).otherwise(0)
    )
    .drop("value")
)

# Relation type per alter, reduced to a single kin code (0 when the alter has
# no kin code at all)
pivot_relation = (
    survey_alters
    .select(cols_relation)
    # Change names for easier pivot
    .rename(cols_relation_rename)
    # Wide to long
    .unpivot(index="nomem_encr", variable_name="type")
    # Create alter id column
    .with_columns(pl.col("type").str.extract(r"type([0-9]+)").cast(pl.Int64).alias("alter"))
    # Extract only one edge type per alter, prioritising family types
    .with_columns(pl.col("value").str.split(by=","))  # Comma-separated codes -> list
    .with_columns(pl.col("value").list.set_intersection(kin_edge_types).alias("type"))  # Keep kin codes only
    # The element order of a set intersection is not guaranteed, so sort
    # explicitly before taking the first code; otherwise the kept code could
    # depend on answer order or set iteration order. Codes are sorted as
    # strings, which equals numeric order for the single-digit kin codes.
    # Alters without any kin code (empty or null list) become 0.
    .with_columns(pl.col("type").list.sort().list.first().fill_null("0").cast(pl.Int64))
    # Flag if edge is family
    .with_columns(is_fam = pl.when(pl.col("type")>0).then(1).otherwise(0))
    .drop("value")
    # Tidy
    .sort(["nomem_encr","alter"])
    .select(["nomem_encr","alter","type","is_fam"])
)

def _pivot_recoded_item(wide, columns, stem, out_name, codes):
    """Reshape one per-alter survey item from wide to long and recode it.

    Args:
        wide: DataFrame with one row per respondent.
        columns: "nomem_encr" plus the item's per-alter column names.
        stem: Part of the column name that precedes the alter number,
            e.g. "contact_f2f_alter".
        out_name: Name given to the recoded value column.
        codes: Mapping of answer label to integer code, applied in order.

    Returns:
        DataFrame with columns nomem_encr, <out_name> (Int64) and alter
        (Int64), sorted by respondent and alter id. Null answers stay null.
        An answer that is neither in *codes* nor numeric text makes the
        final (strict) integer cast fail.
    """
    long = (
        wide
        .select(columns)
        # Wide to long
        .unpivot(index="nomem_encr", variable_name=stem)
        # Create alter id column from the former column label
        .with_columns(pl.col(stem).str.extract(stem + r"([0-9]+)").cast(pl.Int64).alias("alter"))
        .sort(["nomem_encr", "alter"])
        .rename({"value": out_name})
    )
    # Recode values one label at a time; labels not matched are left untouched
    for label, code in codes.items():
        long = long.with_columns(
            pl.when(pl.col(out_name) == label)
            .then(code)
            .otherwise(pl.col(out_name))
            .alias(out_name)
        )
    return long.with_columns(pl.col(out_name).cast(pl.Int64)).drop(stem)


# Face-to-face contact: 1 for the two most frequent categories, 0 for the rest
pivot_contact = _pivot_recoded_item(
    survey_alters,
    cols_contact,
    "contact_f2f_alter",
    "contact",
    {
        "Daily": 1,
        "Several times a week": 1,
        "Several times a month": 0,
        "About once a month": 0,
        "A few times a year or less": 0,
    },
)

# Childcare support: 1 when the respondent could ask this alter for help
pivot_supp = _pivot_recoded_item(
    survey_alters,
    cols_help,
    "help_child_alter",
    "supp",
    {
        "Could ask for help in caring for child": 1,
        "Could not ask for help in caring for child": 0,
    },
)

# Talking about having children: 1 when discussed with this alter
pivot_talk = _pivot_recoded_item(
    survey_alters,
    cols_talk,
    "talk_child_alter",
    "talk_child",
    {
        "Do discuss having children with this person": 1,
        "Do not discuss having children with this person": 0,
    },
)

# Merge: start from the alter slots and attach each attribute table in turn,
# matching on respondent and alter id and keeping every alter slot
survey_edges = pivot_alters
for _edge_attributes in (pivot_relation, pivot_contact, pivot_supp, pivot_talk):
    survey_edges = survey_edges.join(
        _edge_attributes,
        on=["nomem_encr", "alter"],
        how="left",
    )

In [None]:
# Write the long-format edge table to the path given in the configuration
survey_edges.write_csv(
    config["data"]["survey_edges"],
    separator=",",
    line_terminator="\n",
)

# 2 - Calculate outcomes

## 2.1 - All edges

In [None]:
# Network size: number of reported alters per respondent
survey_size_all = (
    survey_edges
    .group_by("nomem_encr")
    .agg(size_all=pl.col("reported").sum())
)

# Family edges only: count them per respondent, then express contact, support
# and talk as a percentage of that count (rounded to two decimals)
_n_fam_all = pl.col("y_count_fam_all")

survey_fam_all = (
    survey_edges
    .filter(pl.col("is_fam") == 1)
    .group_by("nomem_encr")
    .agg(
        y_count_fam_all=pl.col("is_fam").sum(),
        count_contact_all=pl.col("contact").sum(),
        count_supp_all=pl.col("supp").sum(),
        count_talk_all=pl.col("talk_child").sum(),
    )
    .with_columns(
        y_perc_contact_all=(pl.col("count_contact_all") / _n_fam_all * 100).round(2),
        y_perc_supp_all=(pl.col("count_supp_all") / _n_fam_all * 100).round(2),
        y_perc_talk_all=(pl.col("count_talk_all") / _n_fam_all * 100).round(2),
    )
)

## 2.2 - Only complete edges

In [None]:
# Keep only complete edges: rows in which every column is non-null
survey_edges_comp = survey_edges.filter(
    pl.all_horizontal(pl.all().is_not_null())
)

In [None]:
# Network size among complete edges: reported alters per respondent
survey_size = (
    survey_edges_comp
    .group_by("nomem_encr")
    .agg(size_comp=pl.col("reported").sum())
)

# Complete family edges only: count them per respondent, then express contact,
# support and talk as a percentage of that count (rounded to two decimals)
_n_fam_comp = pl.col("y_count_fam_comp")

survey_fam = (
    survey_edges_comp
    .filter(pl.col("is_fam") == 1)
    .group_by("nomem_encr")
    .agg(
        y_count_fam_comp=pl.col("is_fam").sum(),
        count_contact_comp=pl.col("contact").sum(),
        count_supp_comp=pl.col("supp").sum(),
        count_talk_comp=pl.col("talk_child").sum(),
    )
    .with_columns(
        y_perc_contact_comp=(pl.col("count_contact_comp") / _n_fam_comp * 100).round(2),
        y_perc_supp_comp=(pl.col("count_supp_comp") / _n_fam_comp * 100).round(2),
        y_perc_talk_comp=(pl.col("count_talk_comp") / _n_fam_comp * 100).round(2),
    )
)

## 2.3 - % fam

In [None]:
# Merge: attach every respondent-level outcome table to the survey
# identifiers, keeping all respondents
survey_outcomes = survey_vars
for _outcome_table in (survey_size_all, survey_fam_all, survey_size, survey_fam):
    survey_outcomes = survey_outcomes.join(
        _outcome_table,
        on="nomem_encr",
        how="left",
    )

# % fam in network: family edges as a percentage of network size, for all
# edges and for complete edges, rounded to two decimals
survey_outcomes = survey_outcomes.with_columns(
    y_perc_fam_all=(pl.col("y_count_fam_all") / pl.col("size_all") * 100).round(2),
    y_perc_fam_comp=(pl.col("y_count_fam_comp") / pl.col("size_comp") * 100).round(2),
)

In [None]:
# Write the respondent-level outcomes to the path given in the configuration
survey_outcomes.write_csv(
    config["data"]["survey_outcomes"],
    separator=",",
    line_terminator="\n",
)