# Linkage attack

In [1]:
import requests
import pandas as pd

API_BASE_URL = 'http://localhost:8000/api'
REQUEST_TIMEOUT_S = 30  # fail rather than hang the kernel if the API stops responding


def fetch_json(path):
    """GET ``API_BASE_URL/path`` and return the decoded JSON body.

    Raises ``requests.HTTPError`` on a 4xx/5xx response so that an error
    payload is never mistaken for a dataset further down the notebook.
    """
    response = requests.get(f'{API_BASE_URL}/{path}', timeout=REQUEST_TIMEOUT_S)
    response.raise_for_status()
    return response.json()


med_data = fetch_json('meddata/deidentified')
med_data_k_anonymous = fetch_json('meddata/k_anonymous')
work_data = fetch_json('workdata')

In [2]:
# Wrap each fetched JSON payload in its own DataFrame.
med_df = pd.DataFrame(med_data)
med_k_anonymous_df = pd.DataFrame(med_data_k_anonymous)
work_df = pd.DataFrame(work_data)

In [3]:
med_df.head()

Unnamed: 0,id,age,gender,postalCode,diagnosis
0,7fc7f226-249f-4384-98d1-fc75a8b19008,20 - 24,Male,496770***,V9107XA
1,314e95ab-2715-44fe-9ccb-3114ab61d7fd,20 - 24,Female,207211***,T82328A
2,17d0c6b2-6ea1-4f41-840f-84665888a2df,25 - 29,Male,939303***,S92403K
3,e4e72cdd-6e70-40b6-add9-ce09b2e623db,15 - 19,Female,295269***,C8197
4,d0b1ecff-7d1a-41be-ac48-bb966e2fd6e3,20 - 24,Male,420641***,V9206XA


In [4]:
med_k_anonymous_df.head()

Unnamed: 0,id,age,gender,postalCode,diagnosis
0,927c3780-900f-4230-a7c5-843fb4f3c93b,28-36,*,00*******,V80919A
1,51323ee8-c535-4200-b974-7cd6d3f611ec,28-36,*,00*******,H30043
2,7d50e5ea-15ec-43d3-be25-10cc70df38a8,28-36,*,00*******,S948X1A
3,b8331cb4-a5f1-449c-85e4-f1d5454f6f9d,28-36,*,00***,I090
4,de8bf190-ca7a-45f3-a4ec-180dc8aa4d73,28-36,*,00*******,S60476


In [5]:
work_df.head()

Unnamed: 0,id,postalCode,education,gender,workplace,department,fname,lname
0,1,133261301,Fachhochschule Bonn-Rhein-Sieg,Female,Schamberger Group,Product Management,Tamqrah,Shawdforth
1,2,64744,Chaudhary Charan Singh University,Female,Gutmann,Cummings and Cormier,Ardys,Ferenczi
2,3,62606,University of Sudbury,Female,Doyle,Powlowski and Leuschke,Margette,Dibner
3,4,593301943,University College Cork,Female,Jenkins Inc,Training,Libby,Gandrich
4,5,6311003,University of West Hungary,Male,VonRueden-Dooley,Marketing,Lennard,Castelin


In [6]:
def degeneralize_age(age_range):
    """Return the lower bound of an age-range string as an int.

    Spaces are ignored, so both ``"20 - 24"`` and ``"28-36"`` are accepted.
    Raises ``ValueError`` when the string does not split into exactly two
    parts on ``"-"`` or when the lower part is not an integer.
    """
    compact = age_range.replace(" ", "")
    # Two-way unpacking deliberately rejects anything that is not "low-high".
    lower_bound, _upper_bound = compact.split("-")
    return int(lower_bound)

In [7]:
def suppress_post_code(code, keep=2):
    """Generalize a postal code by keeping a short prefix and masking the rest.

    Args:
        code: Postal code as an int or str; it is converted with ``str``.
        keep: Number of leading characters to retain. Defaults to 2, which is
            the prefix length this function has always used.

    Returns:
        The first ``keep`` characters of ``str(code)`` followed by ``"***"``.
        A code shorter than ``keep`` is kept whole before the mask.
    """
    # The mask is always exactly three asterisks, however many characters were dropped.
    return str(code)[:keep] + "***"

In [8]:
# Generalize the work-data postal codes so they can be used as a join key.
work_df['postalCode'] = work_df['postalCode'].map(suppress_post_code)
work_df.head()

Unnamed: 0,id,postalCode,education,gender,workplace,department,fname,lname
0,1,13***,Fachhochschule Bonn-Rhein-Sieg,Female,Schamberger Group,Product Management,Tamqrah,Shawdforth
1,2,64***,Chaudhary Charan Singh University,Female,Gutmann,Cummings and Cormier,Ardys,Ferenczi
2,3,62***,University of Sudbury,Female,Doyle,Powlowski and Leuschke,Margette,Dibner
3,4,59***,University College Cork,Female,Jenkins Inc,Training,Libby,Gandrich
4,5,00***,University of West Hungary,Male,VonRueden-Dooley,Marketing,Lennard,Castelin


In [9]:
def linkage_attack(med_df, work_df):
    """Join work records to medical records on shared quasi-identifiers.

    ``work_df`` (left) is inner-joined with ``med_df`` on ``postalCode`` and
    ``gender``; columns present in both frames get the ``_work`` / ``_med``
    suffixes. A joined row is kept only when its ``(fname, lname)`` pair
    occurs in exactly one joined row, i.e. that name links to a single
    medical record.

    Note: grouping is by name, so distinct people sharing the same first and
    last name are treated as one group.

    Returns:
        A tuple ``(unique_matches, count)`` where ``unique_matches`` is the
        filtered joined DataFrame and ``count`` is its number of rows.
    """
    joined = work_df.merge(
        med_df,
        how='inner',
        on=['postalCode', 'gender'],
        suffixes=('_work', '_med'),
    )
    # Per-row size of the (fname, lname) group that the row belongs to.
    rows_per_name = joined.groupby(['fname', 'lname'])['fname'].transform('size')
    unique_matches = joined[rows_per_name == 1]
    return unique_matches, len(unique_matches)

# Deidentified dataset

In [10]:
matches, count = linkage_attack(med_df, work_df)

In [11]:
matches.head()

Unnamed: 0,id_work,postalCode,education,gender,workplace,department,fname,lname,id_med,age,diagnosis
0,2,64***,Chaudhary Charan Singh University,Female,Gutmann,Cummings and Cormier,Ardys,Ferenczi,fdeb7856-bd94-47fc-ab09-3aa974ebf084,50 - 54,T343XXS
1,5,00***,University of West Hungary,Male,VonRueden-Dooley,Marketing,Lennard,Castelin,35fb7243-d9ef-42b1-ac57-b0dedad67e21,40 - 44,S93302A
2,6,67***,Open University of Hong Kong,Male,Yundt-Monahan,Human Resources,Nicol,Hacard,1490dbbc-03f4-4728-91e4-0423f4e3de77,70 - 74,R25
12,18,77***,Ulyanovsk State Technical University,Male,Powlowski-Watsica,Sales,Meade,Yukhnevich,1aeabd2f-cf69-4a81-8fe2-9349ac0c80e8,40 - 44,Z0541
13,20,92***,Columbia College of Missouri,Female,Trantow,Kozey and Kertzmann,Bobbe,Hagstone,9f8759ef-8350-4066-a908-25fb0a0056bb,45 - 49,T485X1S


In [12]:
count

194

The linkage attack re-identified 194 individuals (the `count` shown above), which makes it a very successful attack.

# K anonymous dataset


In [13]:
matches, count = linkage_attack(med_k_anonymous_df, work_df)

In [14]:
matches.head()

Unnamed: 0,id_work,postalCode,education,gender,workplace,department,fname,lname,id_med,age,diagnosis


In [15]:
count

0

The linkage attack re-identified 0 individuals (the `count` shown above), which is significantly fewer than on the dataset without k-anonymity.