##### Deanonymising group B's dataset 

Anonymisation techniques: 
- Pram on evote, zip, citizenship and party 
- Grouping on age -> age groups 
- 10% chance of each record switching age groups to one above or one below 
- Grouping on education levels -> broader categories 
- Recoding citizenship into boolean Danish citizenship 
- Removed name, dob 
- Masking the party variable + randomly morphed the 2 entries marked as "Invalid Vote" ??
- Masked the zip variable - assigninng a random zip region to each zip code 
- k-anonomity of 2 (supression) 

Importing libraries 

In [1]:
import pandas as pd
import functions 
from datetime import date

Loading in their data

In [2]:
# global recoding on age and pram on sex 
anonymised_data = pd.read_csv("deanon_data/anonymised_dataB.csv")
# global recoding on age, global recoding on marital status 
register_data = pd.read_excel("deanon_data/public_data_registerB.xlsx")
# global recodign on age, global recoding on marital status, and pram on sex 
results_data = pd.read_excel("deanon_data/public_data_resultsB.xlsx")

Preparing the results data 
- Converting age -> age groups 
- Converting citizenship into boolean Danish_citizenship
- Masking zip code into region ???

In [3]:
# convert dob to age 
def calculate_age(born):
    today = date.today()
    return today.year - born.year - ((today.month, today.day) < (born.month, born.day))
register_data["dob"] = pd.to_datetime(register_data["dob"])
register_data['age'] = register_data['dob'].apply(lambda x: calculate_age(x))

# convert age to age groups 
age_bins = [18, 29, 39, 49, 59, 69, 1000]  # Adjust as needed
age_labels = ["18-29", "30-39", "40-49", "50-59", "60-69", "70+"] 
register_data['age_group'] = pd.cut(register_data['age'], bins=age_bins,right=True, labels=age_labels, include_lowest=True)

In [4]:
# convert citizenship into Danish_citizenship 
def danish(x): 
    if x == "Denmark": 
        return True 
    else: 
        return False 

register_data["Danish_Citizenship"]= register_data["citizenship"].apply(lambda x: danish(x))

In [5]:
# convert zip code into zip region 
def zip_region1(x): 
    if x == 2200: 
        return "Region 1"
    elif x== 2300: 
        return "Region 2"
    elif x==2100: 
        return "Region 3"
    elif x==2400: 
        return "Region 4"
register_data["zip_region1"]= register_data["zip"].apply(lambda x: zip_region1(x))

# convert zip code into zip region 
def zip_region2(x): 
    if x == 2200: 
        return "Region 1"
    elif x== 2400: 
        return "Region 2"
    elif x==2100: 
        return "Region 3"
    elif x==2300: 
        return "Region 4"
register_data["zip_region2"]= register_data["zip"].apply(lambda x: zip_region2(x))

Getting survey voters from register data 

In [6]:
quasi = ['sex', 'marital_status', 'age_group', 'Danish_Citizenship']
with open("deanon_data/survey_listB.txt", "r") as my_file:
    # Read the file content
    data = my_file.read()
    
    # Split the text into a list by newline ('\n')
    data_into_list = data.split("\n")

survey_voters = register_data.query('name in @data_into_list')
survey_voters = survey_voters[quasi+["name"]]
survey_voters

Unnamed: 0,sex,marital_status,age_group,Danish_Citizenship,name
0,Female,Never married,18-29,False,"Dang, Lila"
12,Female,Never married,18-29,True,"Rivera, Gabriela"
14,Male,Never married,18-29,True,"Vogel, William"
18,Female,Never married,18-29,True,"Palacios, Mireya"
25,Male,Never married,18-29,False,"Mcclain, Vaughn"
...,...,...,...,...,...
1503,Male,Never married,60-69,True,"Lau, Francis"
1517,Male,Divorced,60-69,True,"Hanshaw, William"
1518,Female,Widowed,70+,True,"al-Dar, Waneesa"
1521,Male,Married/separated,70+,True,"al-Tabatabai, Tammaam"


Computing k-anonimity violations

In [7]:
functions.k_anonymity_violations(anonymised_data, quasi)

{2: (19, 9.595959595959595),
 3: (45, 22.727272727272727),
 5: (62, 31.313131313131315)}

In [8]:
k2_violations_anon = functions.identify_k_anonymity_violations(anonymised_data, quasi)

Re-identifying people based on the quasi-identifiers: sex, age_group, Danish_Citizenship, and marital_status 

In [33]:
df_matches = []
for _, record in survey_voters.iterrows():
    matches = anonymised_data[
                        (anonymised_data['sex'] == record['sex']) &
                        (anonymised_data['age_group'] == record['age_group']) &
                        (anonymised_data['Danish_Citizenship'] == record['Danish_Citizenship']) &
                        (anonymised_data['marital_status'] == record['marital_status'])]
    matches = matches.copy()  # Avoid SettingWithCopyWarning
    matches["name"] = record["name"]
    #print(f"Matches for record {record.to_dict()}:")
    #print(matches)
    df_matches.append(matches)

# Combine all matches into a single DataFrame
df_matches = pd.concat(df_matches, ignore_index=True)

num_unique=df_matches.groupby("name")[["party"]].nunique()
unique = num_unique[num_unique["party"]==1].reset_index()
filtered_df = df_matches[df_matches['name'].isin(unique['name'])][["name", "party"]]
sim_identifiable = filtered_df.groupby(["name", "party"]).count()
sim_identifiable=sim_identifiable.reset_index()
sim_identifiable
#valid_matches['IsInDf2'] = valid_matches['name'].isin(sim_identifiable['name'])

Unnamed: 0,name,party
0,"Adams, Samantha",Green
1,"Anderson, Quianah",Green
2,"Brodie, Mariah",Red
3,"Brown, Shnika",Red
4,"Bullard, Matthew",Green
5,"Cardoza, Spring",Green
6,"Dahlberg, Chelsea",Red
7,"Dang, Lila",Green
8,"Garcia, Shamika",Green
9,"Hanshaw, William",Green


Make it into .csv file for handin

In [10]:
final_df = sim_identifiable.copy()
final_df['name'] = final_df['name'].apply(lambda x: ' '.join(x.split(', ')[::-1]))
#final_df

In [11]:
final_df.to_csv('deanon/deanonymised.csv', index=False)

Re-identifying people based on the quasi-identifiers: sex, age_group, Danish_Citizenship, and marital_status and zip region - for both versions of the zip regions 

In [12]:
survey_voters_region = register_data.query('name in @data_into_list')
survey_voters_region = survey_voters_region[quasi+["name", "zip_region1", "zip_region2"]]

In [13]:
# for zip region version 1 
df_matches1 = []
for _, record in survey_voters_region.iterrows():
    matches = anonymised_data[
                        (anonymised_data['sex'] == record['sex']) &
                        (anonymised_data['age_group'] == record['age_group']) &
                        (anonymised_data['Danish_Citizenship'] == record['Danish_Citizenship']) &
                        (anonymised_data['marital_status'] == record['marital_status'])&
                        (anonymised_data['zip_region'] == record['zip_region1'])]
    matches = matches.copy()  # Avoid SettingWithCopyWarning
    matches["name"] = record["name"]
    #print(f"Matches for record {record.to_dict()}:")
    #print(matches)
    df_matches1.append(matches)

# Combine all matches into a single DataFrame
df_matches1 = pd.concat(df_matches1, ignore_index=True)

num_unique1=df_matches1.groupby("name")[["party"]].nunique()
unique1 = num_unique1[num_unique1["party"]==1].reset_index()
filtered_df1 = df_matches1[df_matches1['name'].isin(unique1['name'])][["name", "party"]]
sim_identifiable1 = filtered_df1.groupby(["name", "party"]).count()
sim_identifiable1=sim_identifiable1.reset_index()
sim_identifiable1

Unnamed: 0,name,party
0,"Adams, Samantha",Green
1,"Ailes, Yvette",Red
2,"Anderson, Quianah",Green
3,"Aragon, Margarita",Green
4,"Aragon, Matthew",Green
...,...,...
90,"el-Mohammed, Aaisha",Red
91,"el-Muhammad, Fateena",Green
92,"el-Saadeh, Marzooqa",Green
93,"el-Siddiqui, Thamra",Red


In [14]:
# for zip region version 2 
df_matches2 = []
for _, record in survey_voters_region.iterrows():
    matches = anonymised_data[
                        (anonymised_data['sex'] == record['sex']) &
                        (anonymised_data['age_group'] == record['age_group']) &
                        (anonymised_data['Danish_Citizenship'] == record['Danish_Citizenship']) &
                        (anonymised_data['marital_status'] == record['marital_status'])&
                        (anonymised_data['zip_region'] == record['zip_region2'])]
    matches = matches.copy()  # Avoid SettingWithCopyWarning
    matches["name"] = record["name"]
    #print(f"Matches for record {record.to_dict()}:")
    #print(matches)
    df_matches2.append(matches)

# Combine all matches into a single DataFrame
df_matches2 = pd.concat(df_matches2, ignore_index=True)

num_unique2=df_matches2.groupby("name")[["party"]].nunique()
unique2 = num_unique2[num_unique2["party"]==1].reset_index()
filtered_df2 = df_matches2[df_matches2['name'].isin(unique2['name'])][["name", "party"]]
sim_identifiable2 = filtered_df2.groupby(["name", "party"]).count()
sim_identifiable2=sim_identifiable2.reset_index()
sim_identifiable2
#valid_matches['IsInDf2'] = valid_matches['name'].isin(sim_identifiable['name'])

Unnamed: 0,name,party
0,"Adams, Samantha",Green
1,"Allen, Jackie",Green
2,"Anderson, Quianah",Green
3,"Anderson, Rachel",Green
4,"Aragon, Margarita",Green
...,...,...
106,"el-Masih, Insaaf",Green
107,"el-Muhammad, Fateena",Green
108,"el-Saadeh, Marzooqa",Green
109,"el-Siddiqui, Thamra",Red


Finding the intersection 

In [15]:
intersection  = pd.merge(sim_identifiable2, sim_identifiable1, how ='inner', on =["name", "party"])
intersection 

Unnamed: 0,name,party
0,"Adams, Samantha",Green
1,"Anderson, Quianah",Green
2,"Aragon, Margarita",Green
3,"Aragon, Matthew",Green
4,"Arou, Toyan",Green
...,...,...
58,"el-Masih, Insaaf",Green
59,"el-Muhammad, Fateena",Green
60,"el-Saadeh, Marzooqa",Green
61,"el-Siddiqui, Thamra",Red


In [None]:
# finding the ones in `intersection` that are NOT in the original `sim_identifiable`
mask = intersection.apply(tuple, axis=1).isin(sim_identifiable.apply(tuple, axis=1))
not_in_df2 = intersection[~mask]
new_records = sim_identifiable.loc[~intersection.isin(sim_identifiable.to_dict(orient='list')).all(axis=1)].reset_index(drop=True)
new_records

Unnamed: 0,name,party
0,"Brodie, Mariah",Red
1,"Brown, Shnika",Red
2,"Bullard, Matthew",Green
3,"Cardoza, Spring",Green
4,"Garcia, Shamika",Green
5,"Ingah, Nalye",Green
6,"Kanherkar, Tue Yer",Red
7,"Lattimer, Danielle",Red
8,"Lopes, Jenna",Red
9,"Magana, Joseph",Green
