## 02d - Variable calculation: Survey identifiers for linkage

- **Project:** _Families, households, networks: Rethinking the relational structure of families through large-scale network data_ <br>
- **Authors:** Nicolás Soler (ORCID 0009-0001-4239-9396), Tom Emery, Agnieszka Kanas <br>
- **Last updated:** January 2026 <br>
- **Full research article published in journal:** _Demography_ (2026)

In [None]:
import yaml
import polars as pl
import pandas as pd

In [None]:
# Load YAML configuration
# The encoding is given explicitly so that reading the file does not depend on
# the platform's locale default (which is not UTF-8 on every system)
path_config = 'config.yml'
with open(path_config, 'r', encoding='utf-8') as f:
    # safe_load only builds plain Python objects from the YAML content
    config = yaml.safe_load(f)

In [None]:
# Read sample

# Column types enforced when parsing the sample CSV
dtypes_sample = {
    "RINPERSOON": pl.String,
    "gender_female": pl.Int64,
    "birth_date": pl.Date,
    "death_date": pl.Date,
    "age": pl.Int64,
    "SOORTOBJECTNUMMER": pl.String,
    "RINOBJECTNUMMER": pl.String,
    "gemeente": pl.String,
    "wijk": pl.String,
    "buurt": pl.String,
    "id_hhd": pl.Int64,
    "hhd_size": pl.Int64,
    "is_ego": pl.Int64,
    "is_ego_child": pl.Int64,
    "id_child": pl.String,
    "net_size_hhd_1": pl.Int64,
    "net_size_hhd_2": pl.Int64,
    "net_size_hhd_3": pl.Int64,
    "net_size_hhd_4": pl.Int64,
    "net_size_hhd": pl.Int64,
    "density_2": pl.Float64,
    "density_3": pl.Float64,
    "density_4": pl.Float64,
}

# Scan the file lazily with the types above, then materialise it as a DataFrame
sample = (
    pl
    .scan_csv(
        config["data"]["sample"],
        separator=",",
        encoding="utf8",
        schema_overrides=dtypes_sample,
    )
    .collect()
)

In [None]:
# Read survey identifiers

# Only the identifier column is needed; it is parsed as a string
dtypes_ids_survey = {"nomem_encr": pl.String}

# Scan the survey file lazily and keep just the columns named in the dtype map
ids_survey = (
    pl
    .scan_csv(
        config["data"]["survey_outcomes"],
        separator=",",
        encoding="utf8",
        schema_overrides=dtypes_ids_survey,
    )
    .select(dtypes_ids_survey.keys())
    .collect()
)

In [None]:
# Read CBS-LISS linkage file and merge to ids_survey

# Read the SPSS file with pandas, drop the RINPERSOONS column and rename the
# survey identifier so that it matches the column name used in ids_survey
link_file = (
    pd.read_spss(config["data"]["link_file"])
    .drop(columns=["RINPERSOONS"])
    .rename(columns={"nomem_encr_crypt": "nomem_encr"})
)

# Convert to polars, turn empty strings in string columns into nulls and then
# drop every row that contains a null
link_file = (
    pl.from_pandas(link_file)
    .with_columns(
        pl.when(pl.col(pl.String) == "")
        .then(None)
        .otherwise(pl.col(pl.String))
        .name.keep()
    )
    .drop_nulls()
)

# Attach the linkage columns to the survey identifiers; survey rows without a
# match in the linkage file keep nulls in those columns
ids_survey = ids_survey.join(link_file, on="nomem_encr", how="left")

In [None]:
# Flag survey RINPERSOON and join to sample

# Keep only survey rows without nulls (i.e. those matched in the linkage file)
# and collapse repeated rows before flagging them. Without de-duplication, a
# respondent listed more than once in the survey file would repeat the matching
# sample rows in the left join below.
# NOTE(review): if several distinct nomem_encr values map to the same
# RINPERSOON, the sample row is still repeated once per value -- confirm that
# the linkage is one-to-one.
ids_survey = (
    ids_survey
    .drop_nulls()
    .unique(maintain_order=True)
    .with_columns(is_ego_survey=1)
)

# Left join keeps every sample row; rows without a survey match get nulls in
# the columns coming from ids_survey
sample = sample.join(ids_survey, how="left", on="RINPERSOON")

# Sample rows without a survey match get a flag of 0 instead of null
sample = sample.with_columns(pl.col("is_ego_survey").fill_null(strategy="zero"))

In [None]:
# Write the updated sample to the sample path from the configuration
sample.write_csv(
    config["data"]["sample"],
    separator=",",
    line_terminator="\n",
)