# Linkage attack

In [1]:
import requests
import pandas as pd

# Base URL of the local API that serves the three datasets used below.
API_BASE_URL = 'http://localhost:8000/api'
# Seconds to wait for the API before failing, so a dead server cannot hang the kernel.
REQUEST_TIMEOUT_S = 30


def fetch_json(endpoint):
    """Fetch ``endpoint`` (relative to ``API_BASE_URL``) and return its decoded JSON body.

    Raises ``requests.HTTPError`` on a 4xx/5xx response, so an error payload
    is never passed on as if it were data.
    """
    response = requests.get(f'{API_BASE_URL}/{endpoint}', timeout=REQUEST_TIMEOUT_S)
    response.raise_for_status()
    return response.json()


med_data = fetch_json('meddata/deidentified')
med_data_k_anonymous = fetch_json('meddata/k_anonymous')
work_data = fetch_json('workdata')

In [2]:
# Wrap each raw JSON payload in a DataFrame for the analysis below.
med_df, work_df, med_k_anonymous_df = (
    pd.DataFrame(payload)
    for payload in (med_data, work_data, med_data_k_anonymous)
)

In [3]:
# Preview the first rows of the de-identified medical records.
med_df.head()

Unnamed: 0,id,age,gender,postalCode,diagnosis
0,7366e3cb-1a0a-49ff-a129-3edfeddc5df5,20 - 24,Male,4967700**,V9107XA
1,dfb1f260-df04-489f-bdfc-18f8593a9d4e,20 - 24,Female,2072119**,T82328A
2,8db05a66-8ebf-4931-a878-56755cceb175,25 - 29,Male,9393034**,S92403K
3,38c9b58f-2712-4e58-89f0-7b7912f6213e,15 - 19,Female,2952691**,C8197
4,839f1301-1cc5-44b3-a12f-fd83e301356f,20 - 24,Male,4206415**,V9206XA


In [4]:
# Preview the first rows of the k-anonymous medical records.
med_k_anonymous_df.head()

Unnamed: 0,id,age,gender,postalCode,diagnosis
0,f5254e46-e1d0-4a7c-ae6d-5bdc6a7f6364,30-65,Male,606***,T23419A
1,a94db4f8-6b73-4e06-b1d1-d5c735dd41c9,30-65,Female,606***,K4131
2,22231286-51b2-4261-bea2-7f8097dee95c,30-65,Male,606***,S83269S
3,f7755ea3-d227-4666-9cda-c63ca60438ca,30-65,Male,606***,S23428
4,b56f82a2-9da2-4cc9-adf5-4346a203a6cc,30-65,Female,606***,S93122


In [5]:
# Preview the first rows of the work records, which carry names (fname, lname).
work_df.head()

Unnamed: 0,id,postalCode,education,gender,workplace,department,fname,lname
0,1,133261301,Fachhochschule Bonn-Rhein-Sieg,Female,Schamberger Group,Product Management,Tamqrah,Shawdforth
1,2,64744,Chaudhary Charan Singh University,Female,Gutmann,Cummings and Cormier,Ardys,Ferenczi
2,3,62606,University of Sudbury,Female,Doyle,Powlowski and Leuschke,Margette,Dibner
3,4,593301943,University College Cork,Female,Jenkins Inc,Training,Libby,Gandrich
4,5,6311003,University of West Hungary,Male,VonRueden-Dooley,Marketing,Lennard,Castelin


In [6]:
def degeneralize_age(age_range):
    """Return the lower bound of an age range string as an int.

    Accepts ranges written with or without spaces around the dash,
    e.g. ``"20 - 24"`` or ``"30-65"``. A string that does not split into
    exactly two parts on ``"-"`` raises ``ValueError``.
    """
    compact = "".join(age_range.split(" "))
    lower_bound, _upper_bound = compact.split("-")
    return int(lower_bound)

In [7]:
def suppress_post_code(code, visible_digits=3):
    """Generalize a postal code to its leading characters followed by ``"***"``.

    Parameters
    ----------
    code : str or int
        The postal code; it is converted with ``str()`` first, so numeric
        codes and already partly masked codes (e.g. ``"4967700**"``) both work.
    visible_digits : int, default 3
        How many leading characters to keep. The default reproduces the
        ``"606***"`` format.

    Returns
    -------
    str
        The kept prefix followed by ``"***"``.

    Raises
    ------
    ValueError
        If ``visible_digits`` is negative (a negative slice bound would keep
        almost the whole code instead of suppressing it).
    """
    if visible_digits < 0:
        raise ValueError("visible_digits must be non-negative")
    return str(code)[:visible_digits] + "***"

In [8]:
# Bring every dataset to the same postal-code granularity so the codes can
# be compared for equality when the frames are joined.
for frame in (work_df, med_df, med_k_anonymous_df):
    frame['postalCode'] = frame['postalCode'].apply(suppress_post_code)

In [9]:
# Check that the postal codes in the work records are now suppressed.
work_df.head()

Unnamed: 0,id,postalCode,education,gender,workplace,department,fname,lname
0,1,133***,Fachhochschule Bonn-Rhein-Sieg,Female,Schamberger Group,Product Management,Tamqrah,Shawdforth
1,2,647***,Chaudhary Charan Singh University,Female,Gutmann,Cummings and Cormier,Ardys,Ferenczi
2,3,626***,University of Sudbury,Female,Doyle,Powlowski and Leuschke,Margette,Dibner
3,4,593***,University College Cork,Female,Jenkins Inc,Training,Libby,Gandrich
4,5,006***,University of West Hungary,Male,VonRueden-Dooley,Marketing,Lennard,Castelin


In [10]:
def linkage_attack(med_df, work_df):
    """Link work records to medical records through shared quasi-identifiers.

    The two frames are inner-joined on ``postalCode`` and ``gender``. A work
    record counts as re-identified when exactly one medical record matches it.

    Parameters
    ----------
    med_df : pandas.DataFrame
        Medical records; must contain ``postalCode`` and ``gender``.
    work_df : pandas.DataFrame
        Work records; must contain ``postalCode`` and ``gender``.

    Returns
    -------
    tuple[pandas.DataFrame, int]
        The joined rows of the work records that have a single candidate
        medical record (columns present in both frames get the ``_work`` /
        ``_med`` suffixes), and the number of such rows.
    """
    # Tag every work record with its own row number. Grouping on this key
    # rather than on (fname, lname) keeps two different people who share a
    # name apart, and does not drop records whose name is missing.
    row_key = '__work_row__'
    keyed_work_df = work_df.assign(**{row_key: range(len(work_df))})

    matches = pd.merge(
        keyed_work_df,
        med_df,
        on=['postalCode', 'gender'],
        how='inner',
        suffixes=('_work', '_med'),
    )

    # Number of candidate medical records for the work record behind each row.
    candidates_per_record = matches.groupby(row_key)[row_key].transform('size')

    # Keep unambiguous links only, and remove the helper column so the
    # result has the same columns as a plain merge.
    unique_matches = matches[candidates_per_record == 1].drop(columns=row_key)
    count = len(unique_matches)
    return unique_matches, count

# Deidentified dataset

In [11]:
# Run the attack against the de-identified medical records.
matches, count = linkage_attack(med_df, work_df)

In [12]:
# Preview the matches that linkage_attack kept as unique.
matches.head()

Unnamed: 0,id_work,postalCode,education,gender,workplace,department,fname,lname,id_med,age,diagnosis
0,1,133***,Fachhochschule Bonn-Rhein-Sieg,Female,Schamberger Group,Product Management,Tamqrah,Shawdforth,9d37f125-d0eb-4caa-8c18-fda186c31c83,25 - 29,S52372Q
10,6,670***,Open University of Hong Kong,Male,Yundt-Monahan,Human Resources,Nicol,Hacard,ab327c3c-ab97-4d4f-ae39-573fa40480b9,70 - 74,H7011
11,7,161***,Payame Noor University,Non-binary,Schaefer,Fahey and Mills,Phil,Abelov,05db39ac-59a2-4d13-831f-cb1858a20331,25 - 29,T3370XA
12,8,764***,Zayed University,Polygender,Johns Group,Marketing,Susy,Reuther,c2fc459b-76b4-4760-845a-6c5a8f4f3405,35 - 39,M14849
13,9,816***,Ghana Christian University College,Female,Jast,Rogahn and Dicki,Vania,Klousner,a01094f8-7aa7-4c6c-acc0-2c4c8f0c859c,40 - 44,L940


In [13]:
# Number of unique matches found in the de-identified data.
count

362

The linkage attack managed to re-identify 362 individuals, which makes it a very successful attack.

# K-anonymous dataset


In [14]:
# Run the same attack against the k-anonymous medical records.
matches, count = linkage_attack(med_k_anonymous_df, work_df)

In [15]:
# Preview the matches that linkage_attack kept as unique for the k-anonymous data.
matches.head()

Unnamed: 0,id_work,postalCode,education,gender,workplace,department,fname,lname,id_med,age,diagnosis
12,35,212***,Reformed Bible College,Female,Swaniawski-Rutherford,Sales,Rhiamon,Molloy,828c276e-928d-452a-bff3-6159defd863b,30-70,V877XXS
13,40,982***,National Sanskrit University,Male,Effertz Inc,Human Resources,Tirrell,Joskowitz,45c6f982-92a4-4963-b406-75274b6db7a8,25-70,V0929
14,58,850***,Hebei University of Economics and Trade,Female,Nader-Shanahan,Sales,Melantha,Crookall,aef42579-7f91-4570-ab86-ab0eff7eee54,45-85,K50818
32,168,427***,Pedagogical University of Bydgoszcz,Male,Leuschke-Waelchi,Training,Orton,Haresnape,086b8a45-7486-4dba-abfc-681e855cc43a,30-80,S92053G
36,177,480***,Akhbar El Yom Academy,Polygender,Hamill,Olson and Wolf,Eberto,Falloon,473d4909-b6c8-470b-b10a-d593cc8cead4,50-75,T83591D


In [16]:
# Number of unique matches found in the k-anonymous data.
count

25

The linkage attack managed to re-identify only 25 individuals, which is significantly fewer than on the dataset without k-anonymity.