In [343]:
import pandas as pd

In [344]:
# ---- input file locations ----
PRIVATE = "data/anonymised_dataC.csv"         # anonymised survey data
PUBLIC = "data/public_data_registerC.xlsx"    # public register (Excel workbook)
LEAK = "data/survey_listC.csv"                # leaked list; semicolon-separated

# ---- read every input into its own DataFrame ----
s = pd.read_csv(filepath_or_buffer=PRIVATE)
p = pd.read_excel(io=PUBLIC)
l = pd.read_csv(filepath_or_buffer=LEAK, sep=";")

In [345]:
# Display the survey frame loaded above.
s

Unnamed: 0,sex,age_band,married,education_level,evote,party
0,Male,34-48,Other,Higher,1,Green
1,Female,63+,Other,Lower,0,Red
2,Female,49-63,Other,Higher,0,Red
3,Male,49-63,Other,Higher,1,Red
4,Male,49-63,Other,Higher,1,Green
...,...,...,...,...,...,...
183,Male,34-48,Other,Higher,0,Red
184,Female,63+,Other,Higher,0,Red
185,Male,63+,Other,Lower,1,Green
186,Male,49-63,Never married,Lower,0,Green


# Calculate age from DOB

In [346]:
from datetime import datetime
import pandas as pd

# Parse `dob` as YYYY-MM-DD; anything that does not parse becomes NaT.
p['dob'] = pd.to_datetime(p['dob'], format='%Y-%m-%d', errors='coerce')

# Age in completed years, measured on the day this cell is executed:
# the year difference, minus one when this year's birthday is still ahead.
today = pd.Timestamp.today()
birth_month = p['dob'].dt.month
birth_day = p['dob'].dt.day
earlier_month = today.month < birth_month
same_month_earlier_day = (today.month == birth_month) & (today.day < birth_day)
p['age'] = today.year - p['dob'].dt.year - (earlier_month | same_month_earlier_day)

# Bucket ages into labelled bands. Intervals are right-closed, so age 63
# lands in '49-63'; the lowest edge (0) is included in the first band.
age_edges = [0, 33, 48, 63, 150]
age_labels = ['18-33', '34-48', '49-63', '63+']
p['age_band'] = pd.cut(
    p['age'],
    bins=age_edges,
    labels=age_labels,
    right=True,
    include_lowest=True,
)


# Anonymize marital status

In [347]:
def anonymize_marital(x):
    """Collapse a marital-status value into a coarse category.

    Matching is case-insensitive and substring-based.

    Parameters
    ----------
    x : any
        Raw marital-status value; it is converted with ``str()`` first, so
        missing values (e.g. NaN) are handled without raising.

    Returns
    -------
    str
        ``"Other"`` for values containing "married/separated", "divorced" or
        "widowed", and for the already-collapsed label "Other" itself;
        ``"Never married"`` for values containing "never married";
        ``"Unknown"`` for everything else.
    """
    label = str(x).lower()
    # Accept the function's own "Other" output as well, so applying the
    # mapping a second time (e.g. re-running the cell that overwrites the
    # column) is a no-op instead of turning every "Other" into "Unknown".
    if label.strip() == "other" or any(
        word in label for word in ("married/separated", "divorced", "widowed")
    ):
        return "Other"
    elif "never married" in label:
        return "Never married"
    return "Unknown"  # fallback in case of missing or strange entries

# Overwrite the column with the collapsed categories; the raw register
# values are no longer available in `p` after this line runs.
p["marital_status"] = p["marital_status"].map(anonymize_marital)

# Filter the public register down to the people named in the leaked survey list

In [348]:
# Inner join on `name`: keep only the register rows whose name also
# appears in the leaked list.
filtered_p = pd.merge(p, l, how='inner', on='name')
filtered_p

Unnamed: 0,name,sex,dob,zip,citizenship,marital_status,last_voted,age,age_band
0,"Namkoong, Alexander",Male,2001-08-10,2200,Denmark,Never married,1,24,18-33
1,"Harper Williams, Myah",Female,2003-09-17,2100,Denmark,Never married,1,22,18-33
2,"Mills, Morgan",Female,2001-08-10,2100,Denmark,Never married,1,24,18-33
3,"Akquia, Lee",Male,1999-07-06,2100,Ukraine,Never married,1,26,18-33
4,"Tran, Christopher",Male,2003-02-28,2300,Denmark,Never married,1,22,18-33
...,...,...,...,...,...,...,...,...,...
195,"Perez, Anthony",Male,1989-07-13,2100,Denmark,Never married,0,36,34-48
196,"Chavez, Manuel",Male,1943-09-20,2300,Denmark,Other,0,82,63+
197,"Kwon, Sungchoul",Male,1956-02-05,2300,Denmark,Other,0,69,63+
198,"el-Shams, Rafeeda",Female,1962-02-06,2300,Denmark,Other,0,63,49-63


In [349]:
# Compare the marital-status distribution of the survey with that of the
# filtered register (survey first, register second).
for frame, column in ((s, 'married'), (filtered_p, 'marital_status')):
    print(frame[column].value_counts())

married
Other            126
Never married     62
Name: count, dtype: int64
marital_status
Other            136
Never married     64
Name: count, dtype: int64


# Calculate disclosure risk

In [350]:
# Quasi-identifiers: the columns whose combined values define an
# equivalence class (EC).
QID = ['sex','age_band','education_level','married']

# Rows with a missing quasi-identifier are dropped before forming ECs.
# `.copy()` makes `s2` an independent frame, so the column assignments below
# do not trigger SettingWithCopyWarning and never write through to `s`.
s2 = s.dropna(subset=QID).copy()

# Per-record EC size (number of rows sharing the same QID combination) and
# the corresponding identification risk 1 / size.
s2['eq_size'] = s2.groupby(QID, observed=True)['sex'].transform('size')
s2['id_risk'] = 1 / s2['eq_size']

avg_risk  = s2['id_risk'].mean()                 # mean per-record risk
prop_uniques = (s2['eq_size'] == 1).mean()       # share of records alone in their EC
k_min = int(s2['eq_size'].min())                 # smallest EC size
prop_high = (s2['id_risk'] > 0.2).mean()         # share with risk > 0.2, i.e. EC size < 5

# Distribution of EC sizes: for each size, how many classes have that size.
ec_sizes = (
    s2.groupby(QID, observed=True)
      .size()
      .value_counts()
      .sort_index()
      .rename_axis('equivalence_class_size')
      .reset_index(name='num_classes')
)

avg_risk, prop_uniques, k_min, prop_high, ec_sizes

(np.float64(0.14361702127659573),
 np.float64(0.0),
 2,
 np.float64(0.18617021276595744),
     equivalence_class_size  num_classes
 0                        2            3
 1                        3            7
 2                        4            2
 3                        5            3
 4                        6            2
 5                        9            2
 6                       11            2
 7                       13            1
 8                       14            3
 9                       15            1
 10                      16            1)

In [351]:
# show only records where equivalence class size == 2
ec_size_2 = s2[s2['eq_size'] == 6]

ec_size_2 = ec_size_2.sort_values(QID + ['eq_size'])

# preview
ec_size_2[QID + ['eq_size']].head(20)

Unnamed: 0,sex,age_band,education_level,married,eq_size
29,Female,34-48,Higher,Other,6
40,Female,34-48,Higher,Other,6
41,Female,34-48,Higher,Other,6
64,Female,34-48,Higher,Other,6
90,Female,34-48,Higher,Other,6
127,Female,34-48,Higher,Other,6
98,Male,34-48,Lower,Other,6
102,Male,34-48,Lower,Other,6
120,Male,34-48,Lower,Other,6
134,Male,34-48,Lower,Other,6


In [352]:
# Register rows for women in the 18-33 band whose marital status contains "never".
is_female = filtered_p['sex'].str.lower() == 'female'
in_youngest_band = filtered_p['age_band'] == '18-33'
never_married = filtered_p['marital_status'].str.lower().str.contains('never')
filtered_p[is_female & in_youngest_band & never_married]

Unnamed: 0,name,sex,dob,zip,citizenship,marital_status,last_voted,age,age_band
1,"Harper Williams, Myah",Female,2003-09-17,2100,Denmark,Never married,1,22,18-33
2,"Mills, Morgan",Female,2001-08-10,2100,Denmark,Never married,1,24,18-33
5,"Merchant, Caylin",Female,1998-11-15,2200,Denmark,Never married,1,26,18-33
9,"Hasan, Hannah",Female,1992-01-28,2200,Denmark,Never married,1,33,18-33
11,"el-Allee, Wafaaa",Female,1996-05-12,2200,Denmark,Never married,1,29,18-33
12,"al-Azam, Hameeda",Female,1995-10-17,2200,Denmark,Never married,1,30,18-33
13,"Asad, Alice",Female,1997-01-10,2300,Denmark,Never married,1,28,18-33
14,"al-Nazar, Sahar",Female,1996-09-09,2200,Turkey,Never married,1,29,18-33
15,"Cruz, Samantha",Female,1994-10-01,2300,Denmark,Never married,1,31,18-33
17,"Trainer, Genevieve",Female,1994-06-12,2100,Denmark,Never married,1,31,18-33


In [353]:
# Survey rows for women in the 18-33 band whose `married` value contains "never".
survey_is_female = s['sex'].str.lower() == 'female'
survey_in_youngest_band = s['age_band'] == '18-33'
survey_never_married = s['married'].str.lower().str.contains('never')
s[survey_is_female & survey_in_youngest_band & survey_never_married]

Unnamed: 0,sex,age_band,married,education_level,evote,party
15,Female,18-33,Never married,Higher,1,Green
19,Female,18-33,Never married,Higher,1,Red
24,Female,18-33,Never married,Higher,0,Green
26,Female,18-33,Never married,Higher,1,Red
34,Female,18-33,Never married,Lower,1,Green
43,Female,18-33,Never married,Higher,0,Green
49,Female,18-33,Never married,Lower,1,Green
63,Female,18-33,Never married,Lower,1,Green
76,Female,18-33,Never married,Lower,0,Green
80,Female,18-33,Never married,Lower,0,Green
