In [40]:
import pandas as pd

In [41]:
# -------- settings --------
PRIVATE = "../data/private_dataL.xlsx"           # survey

# -------- load --------
s = pd.read_excel(PRIVATE)

In [42]:
s

Unnamed: 0,name,sex,evote,dob,zip,education,citizenship,marital_status,party
0,"el-Baig, Hawraa",Female,0,1992-03-14,2100,Primary education,Denmark,Married/separated,Green
1,"Del Rosario, Jacqueline",Female,0,1989-08-31,2200,Vocational bachelors educations,Denmark,Never married,Green
2,"Benavides, Rosemerry",Female,0,1934-06-22,2200,Vocational Education and Training (VET),Poland,Divorced,Green
3,"Yeddanapudy, Natalie",Female,0,1988-06-02,2100,Vocational Education and Training (VET),Bolivia,Never married,Green
4,"Ramirez-Moran, Jose",Male,0,1965-03-07,2400,Vocational Education and Training (VET),Denmark,Married/separated,Red
...,...,...,...,...,...,...,...,...,...
195,"Maclellan, Ariana",Female,1,1997-03-18,2100,Masters programmes,Denmark,Never married,Green
196,"el-Ahmad, Amaan",Male,0,1990-08-24,2300,Vocational Education and Training (VET),Denmark,Never married,Green
197,"al-Hamidi, Mufliha",Female,0,2004-12-31,2200,Primary education,Denmark,Never married,Green
198,"Hutcheson, Twanna",Female,0,1996-05-29,2200,Bachelors programmes,Denmark,Married/separated,Red


# Convert DOB -> age

In [43]:
from datetime import datetime
import pandas as pd
import numpy as np

# Convert dob to datetime
s['dob'] = pd.to_datetime(s['dob'], format='%d.%m.%Y', errors='coerce')

# Calculate age correctly
today = pd.Timestamp.today()
s['age'] = (
    today.year - s['dob'].dt.year
    - ((today.month < s['dob'].dt.month) | ((today.month == s['dob'].dt.month) & (today.day < s['dob'].dt.day)))
)

# Add controlled random noise to age
np.random.seed(42)
noise = np.random.randint(-5, 5, size=len(s))
s['age_noisy'] = (s['age'] + noise).clip(lower=18, upper=90)

# Convert noisy age into groups
s["age_a"] = pd.cut(
    s["age_noisy"],
    bins=[0, 30, 50, 65, 150],
    labels=["<30", "30–49", "50–64", "65+"],
    right=False
)


# Convert education into categories

In [44]:
def anonymize_edu(x):
    x = str(x).lower()
    if any(word in x for word in ["primary", "basic", "lower", "secondary", "vocational", "vet", "short"]):
        return "Lower"
    if any(word in x for word in ["bachelor", "master", "phd", "university", "tertiary", "postsecondary"]):
        return "Higher"
    return "Other"


s["edu_a"] = s["education"].map(anonymize_edu)


# Convert country to EU, non EU

In [45]:
def anonymize_citizenship(x):
    x = str(x).lower()
    eu_countries = [
        "austria", "belgium", "bulgaria", "croatia", "cyprus", "czech republic",
        "denmark", "estonia", "finland", "france", "germany", "greece", "hungary",
        "ireland", "italy", "latvia", "lithuania", "luxembourg", "malta",
        "netherlands", "poland", "portugal", "romania", "slovakia", "slovenia",
        "spain", "sweden"
    ]
    if x in eu_countries:
        return "EU"
    return "non EU"

s["citizenship_a"] = s["citizenship"].map(anonymize_citizenship)

# Anonymize marital status

In [46]:
def anonymize_marital(x):
    x = str(x).lower()
    if "married" in x:
        return "Married"
    return "Single"

s["maritalstatus_a"] = s["marital_status"].map(anonymize_marital)


# Anonymize zip code

In [47]:
s["zip_a"] = s["zip"].astype(str).str[:2] + "xx"


## Add * as zip code where sample unique

In [48]:
QID = ['sex','age_a','edu_a','citizenship_a','maritalstatus_a','zip_a']

# recompute equivalence class sizes
s['eq_size'] = s.groupby(QID)['sex'].transform('size')

# mask rows that are unique
mask_uniques = s['eq_size'] == 1

# replace their zip with suppression symbol
s.loc[mask_uniques, 'zip_a'] = '*'

  s['eq_size'] = s.groupby(QID)['sex'].transform('size')


# Drop non-anonymous cols

In [49]:
s = s.drop(columns=["name", "dob", "zip", "education", "citizenship", "marital_status", "age"])
s

Unnamed: 0,sex,evote,party,age_noisy,age_a,edu_a,citizenship_a,maritalstatus_a,zip_a,eq_size
0,Female,0,Green,34,30–49,Lower,EU,Married,21xx,2
1,Female,0,Green,34,30–49,Lower,EU,Married,22xx,5
2,Female,0,Green,90,65+,Lower,EU,Single,22xx,2
3,Female,0,Green,36,30–49,Lower,non EU,Married,*,1
4,Male,0,Red,61,50–64,Lower,EU,Married,24xx,9
...,...,...,...,...,...,...,...,...,...,...
195,Female,1,Green,30,30–49,Higher,EU,Married,21xx,2
196,Male,0,Green,34,30–49,Lower,EU,Married,23xx,5
197,Female,0,Green,18,<30,Lower,EU,Married,22xx,10
198,Female,0,Red,25,<30,Higher,EU,Married,*,1


#

# Calculate disclosure risk

In [50]:
QID = ['sex','age_a','edu_a','citizenship_a','maritalstatus_a','zip_a']

# drop rows where any QID is NaN, or keep them but they won't form ECs
s2 = s.dropna(subset=QID)

# sizes per record
s2['eq_size'] = s2.groupby(QID, observed=True)['sex'].transform('size')
s2['id_risk'] = 1 / s2['eq_size']

avg_risk  = s2['id_risk'].mean()
prop_uniques = (s2['eq_size'] == 1).mean()
k_min = int(s2['eq_size'].min())
prop_high = (s2['id_risk'] > 0.2).mean()

ec_sizes = (
    s2.groupby(QID, observed=True)
      .size()
      .value_counts()
      .sort_index()
      .rename_axis('equivalence_class_size')
      .reset_index(name='num_classes')
)

avg_risk, prop_uniques, k_min, prop_high, ec_sizes

(0.34,
 0.085,
 1,
 0.515,
    equivalence_class_size  num_classes
 0                       1           17
 1                       2           24
 2                       3           10
 3                       4            2
 4                       5            4
 5                       6            7
 6                       7            1
 7                       9            2
 8                      10            1)

In [51]:
ec = (s.groupby(QID).size().reset_index(name='n').sort_values('n'))
uniques = ec[ec['n'] == 1]  # inspect these patterns
uniques

  ec = (s.groupby(QID).size().reset_index(name='n').sort_values('n'))


Unnamed: 0,sex,age_a,edu_a,citizenship_a,maritalstatus_a,zip_a,n
440,Male,65+,Lower,EU,Married,*,1
445,Male,65+,Lower,EU,Single,*,1
210,Female,65+,Lower,non EU,Married,*,1
20,Female,<30,Lower,EU,Married,*,1
40,Female,<30,Other,EU,Married,*,1
205,Female,65+,Lower,EU,Single,*,1
455,Male,65+,Lower,non EU,Single,*,1
130,Female,50–64,Higher,non EU,Married,*,1
0,Female,<30,Higher,EU,Married,*,1
270,Male,<30,Lower,non EU,Married,*,1


## Save dataset

In [52]:
s = s.drop(columns=["eq_size", "age_noisy"])
s.to_csv('../data/survey_anonymised.csv', index=False)

In [53]:
s

Unnamed: 0,sex,evote,party,age_a,edu_a,citizenship_a,maritalstatus_a,zip_a
0,Female,0,Green,30–49,Lower,EU,Married,21xx
1,Female,0,Green,30–49,Lower,EU,Married,22xx
2,Female,0,Green,65+,Lower,EU,Single,22xx
3,Female,0,Green,30–49,Lower,non EU,Married,*
4,Male,0,Red,50–64,Lower,EU,Married,24xx
...,...,...,...,...,...,...,...,...
195,Female,1,Green,30–49,Higher,EU,Married,21xx
196,Male,0,Green,30–49,Lower,EU,Married,23xx
197,Female,0,Green,<30,Lower,EU,Married,22xx
198,Female,0,Red,<30,Higher,EU,Married,*
