**Notebook for 5-anonymizing the Harvard Facebook dataset.**

In [None]:
# import the dataset as a pandas dataframe
import pandas as pd
import numpy as np

# Assumes the Harvard Facebook dataset is stored in 105facebook.csv (path is
# relative to the working directory), formatted and cleaned as described in
# the paper (the original version, before the concentrations were binned).
df_init = pd.read_csv("105facebook.csv")
# NOTE(review): this frame still has the 'Name' and 'Email' columns (they are
# only excluded later, by get_quasi_ids), so the rendered output shows them —
# consider clearing this output before sharing the notebook.
df_init

In [None]:
# Drop the last row since it is NaN.  The slice is positional and
# unconditional: it removes whatever row comes last, so it assumes the CSV
# ends with exactly one such row.
df = df_init.iloc[:-1 , :]
df

In [3]:
def get_quasi_ids(df):
    """
    Return the quasi-identifier column names of the given dataframe.

    Every column is treated as a quasi identifier except the direct
    identifiers 'Email' and 'Name'.

    Parameters:
    -----------
    df: pandas DataFrame
        dataframe whose columns are inspected (it is not modified)

    Returns:
    --------
    list
        column names in their original order, without 'Email' and 'Name'.
        A dataframe that lacks one of these two columns is accepted; the
        remaining columns are simply returned (no ValueError is raised).
    """
    direct_ids = {"Email", "Name"}
    # Filter instead of list.remove so that a missing identifier column is not an error
    return [col for col in df.columns if col not in direct_ids]

In [4]:
# Quasi identifiers of the original dataset (every column except the direct
# identifiers 'Email' and 'Name'); later cells reuse this list
quasi_ids = get_quasi_ids(df)
quasi_ids

['House', 'Year', 'Concentration']

In [5]:
# Function for determining the level of k-anonymity of a dataframe
def level_k_anon(df, quasi_ids=None):
    """
    Determines the level of k anonymity the given dataframe has
    for the given list of quasi identifier names.

    Parameters:
    -----------
    df: pandas DataFrame
        df to find the level of anonymity of

    quasi_ids: list, optional
        list of names of quasi identifiers (col names that correspond
        to quasi identifiers)
        NOTE: when omitted, the quasi identifiers are taken from df itself
        via get_quasi_ids(df), i.e. all column names except "Email" and "Name"

    Returns:
    --------
    The size of the smallest group of rows that share the same combination
    of quasi identifier values (= the level of k-anonymity).

    Raises:
    -------
    ValueError
        if there are no groups at all (e.g. df has no rows)
    """
    # Resolve the default at call time; a default bound to a notebook global
    # would be frozen when this cell is run and could go stale.
    if quasi_ids is None:
        quasi_ids = get_quasi_ids(df)
    # Number of rows for each combination of quasi id values; dropna=False
    # keeps rows with missing values as groups of their own
    group_sizes = df.groupby(quasi_ids, dropna=False).size()
    if group_sizes.empty:
        raise ValueError("cannot determine the level of k-anonymity: dataframe has no rows")
    # Min number of rows in a group = level of k-anonymity
    return group_sizes.min()
    
    

**1. Determine the level of k-anonymity in the file**

In [6]:
# Level of k-anonymity of the untouched dataset, over all of its quasi identifiers
original_level_k_anon = level_k_anon(df, quasi_ids=quasi_ids)
original_level_k_anon

1

1. **Result:** the dataframe is currently 1-anonymous

**2. Make file anonymous using record suppression.**  (Assuming records are rows)

**Question:** how many records need to be deleted to do this?

In [54]:
# Target level of anonymity: each combination of quasi-identifier values
# has to be shared by at least k rows
k = 5

In [55]:
# Record suppression: every row whose combination of quasi-identifier values
# is shared by fewer than k rows would have to be deleted.
df_grouped_by_quasi_ids = df.groupby(quasi_ids, dropna=False)
# Number of rows in each group of identical quasi-identifier values
group_sizes = df_grouped_by_quasi_ids.size()
# Total number of rows that sit in a group that is too small
num_records_dropped = int(group_sizes[group_sizes < k].sum())

print(f"total num of records dropped: {num_records_dropped}")

total num of records dropped: 2735


In [None]:
# Verify results
# Group by all the quasi ids
group_by_quasi_ids_df = df.groupby(quasi_ids, dropna=False)
# Get number of rows in each group
grouped_row_counts = group_by_quasi_ids_df.size()
print(grouped_row_counts)

# Get number of rows who belong to a group of quasi ids with less than k members
print(f"k: {k}")
num_rows_less_than_k = grouped_row_counts[grouped_row_counts < k ].sum(skipna=False)
print(f"num rows with less than k duplicates: {num_rows_less_than_k}")
# ... and the number of rows in groups with at least k members
num_rows_greater_than_k = grouped_row_counts[grouped_row_counts >= k ].sum(skipna=False)
print(f"num rows with > or = to k duplicates: {num_rows_greater_than_k}")

# Actually check the numbers instead of only printing them:
# every row belongs to exactly one group, so the two counts must add up to the dataset size
assert num_rows_less_than_k + num_rows_greater_than_k == len(df), "group sizes do not add up to the number of rows"
# and the rows in too-small groups are exactly the records counted for suppression
assert num_rows_less_than_k == num_records_dropped, "record suppression count does not match"

2. **Result**: 2735 records would have to be deleted to make the dataset 5-anonymous using record suppression... This is clearly not a viable solution.

**3. Make the file 5-anonymous using only column suppression**

**Question:** How many columns are needed to do this, and which ones are they? 

In [64]:
# Step 1: look at each quasi-identifier column on its own and suppress every
# column that contains a value shared by fewer than k rows (any grouping that
# includes such a column has a group with fewer than k rows)
cols_removed = []
for quasi_id in quasi_ids:
    # Count the number of occurrences of each unique value in the column.
    # dropna=False counts missing values as a value too, consistent with the
    # dropna=False groupings used for the k-anonymity checks.
    count = df[quasi_id].value_counts(dropna=False)
    # Number of occurrences of the rarest value (does not rely on the
    # ordering of value_counts)
    min_count = count.min()
    print(f"col for {quasi_id} has a min value count of {min_count}")
    # if this is < k, this column must be dropped
    if min_count < k:
        cols_removed.append(quasi_id)

print(f"cols removed: {cols_removed}")
print(f"num cols removed: {len(cols_removed)}")

col for House has a min value count of 123
col for Year has a min value count of 49
col for Concentration has a min value count of 1
cols removed: ['Concentration']
num cols removed: 1


In [None]:
# Step 2: suppress the columns found in step 1 and check whether k-anonymity is reached
init_cols_supressed_df = df.drop(columns=cols_removed)
# Show the frame to confirm that the suppressed columns are gone
print(f"init cols supressed df: \n{init_cols_supressed_df}")
# Level of anonymity over the quasi identifiers that are left
sup_quasi_ids = get_quasi_ids(init_cols_supressed_df)
print(f"Quasi ids of col supressed df: {sup_quasi_ids}")
init_col_sup_lvl_anon = level_k_anon(init_cols_supressed_df, quasi_ids=sup_quasi_ids)
print(f"Level of k anonymity of dataframe after step 1: {init_col_sup_lvl_anon}")

3. **Result:** 1 column has to be dropped.  This is: 'Concentration'.  The level of k-anonymity after dropping this column is 23

In [None]:
# Exploratory: what is the level of anonymity if "House" is suppressed on top
# of the columns removed in step 1?
# NOTE(review): the original comment called this the column with the
# 2nd-smallest level of duplicates, but the step 1 output reports a smaller
# min value count for Year (49) than for House (123) — confirm that "House"
# is the intended column.
explor_cols_supressed_df = df.drop(columns=cols_removed + ["House"])
# Show the frame to confirm that the suppressed columns are gone
print(f"explor cols supressed df: \n{explor_cols_supressed_df}")
# Level of anonymity over the quasi identifiers that are left
sup_quasi_ids2 = get_quasi_ids(explor_cols_supressed_df)
print(f"Quasi ids of col supressed df: {sup_quasi_ids2}")
explor_col_sup_lvl_anon = level_k_anon(explor_cols_supressed_df, quasi_ids=sup_quasi_ids2)
print(f"Level of k anonymity of dataframe after exploring deleting 'Concentration' and 'House': {explor_col_sup_lvl_anon}")

**4. Produce a 5-anonymous dataset using generalization**

This was done by generalizing "Concentration" into the following bins:


*Arts*:

* Art, Film, and Visual Studies
* Music
* Theater, Dance, & Media


*Engineering*:

* Biomedical Engineering
* Electrical Engineering
* Engineering Sciences
* Environmental Science and Engineering
* Mechanical Engineering


*History*:

* Anthropology
* Classics
* East Asian Studies
* History
* History and Literature
* History and Science
* History of Art and Architecture
* South Asian Studies


*Languages, Literatures, and Religion*:

* Comparative Literature
* Comparative Study of Religion
* English
* Folklore and Mythology
* Germanic Languages and Literature
* Linguistics
* Near Eastern Languages and Civilizations
* Romance Languages and Literature
* Slavic Literatures and Cultures


*Life Sciences*:

* Chemical and Physical Biology
* Human Developmental and Regenerative Biology
* Human Evolutionary Biology
* Integrative Biology
* Molecular and Cellular Biology
* Neuroscience
* Psychology


*Math and Computation*:

* Applied Math
* Computer Science
* Mathematics
* Statistics


*Physical Sciences*:

* Astrophysics
* Chemistry
* Chemistry and Physics
* Earth and Planetary Sciences
* Physics


*Qualitative Social Sciences*:

* African and African American Studies
* Government
* Philosophy
* Social Studies
* Studies of Women, Gender, and Sexuality


*Quantitative Social Sciences*:
* Economics
* Environmental Science and Public Policy
* Sociology
* Special Concentration



Note: this was done inside of Excel instead of in the code.

In [None]:
# Load the version of the dataset that has the binned concentrations,
# dropping its last row because it is NaN
binned_df = pd.read_csv("105facebookbins.csv").iloc[:-1, :]
binned_df

In [8]:
# Build the quasi identifiers of the binned dataset (the level of
# k-anonymity itself is computed in the next cell)
binned_quasi_ids = get_quasi_ids(binned_df)
# remove "Concentration" as quasi id, as now looking at binned quasi ids
# ("ConcentrationBin" stays); list.remove raises ValueError if it is missing
binned_quasi_ids.remove("Concentration")
binned_quasi_ids

['House', 'Year', 'ConcentrationBin']

In [9]:
# Level of k-anonymity with the concentrations generalized into bins
k_anon_binned = level_k_anon(binned_df, quasi_ids=binned_quasi_ids)
k_anon_binned

1

Result of binning concentrations: still 1-anonymous...

Checking where the issue was:

In [10]:
def get_grouped_row_counts(df, cols):
    """
    Count the rows of `df` in each group formed by the columns `cols`.

    Rows with missing values in `cols` are not discarded; they form
    groups of their own (dropna=False).

    Returns the pandas Series produced by GroupBy.size(): one entry per
    group, indexed by the group's values in `cols`.
    """
    return df.groupby(by=cols, dropna=False).size()

In [11]:
# Size of every (House, Year, ConcentrationBin) group, as a dict keyed by the group's values
groups_dict = get_grouped_row_counts(binned_df, binned_quasi_ids).to_dict()

In [None]:
# Display all group sizes to look for the groups with fewer than k rows
groups_dict

When examining by eye, it appears that joint/double concentrations in different bins are primarily causing the problem (there are a bunch of joint/double concentrations and pretty much all of them have < 5 people in their grouping, with many of them having only 1)

However, there are still some people without a joint/double concentration who have < 5-anonymity. Here are two examples of groups that were 1-anonymous:

('Adams', 'Senior', 'Arts'): 1,

('Dunster', 'Sophomore', 'Arts'): 1,

Two routes: first, bin by house in addition to concentration (necessary in any case, because of the Adams Senior above); if this doesn't work, bin by house in addition to concentration AND remove the allied joint/second double concentration.

Note: have to keep binning by concentration because concentration column alone leads to 1-anonymity

***4.1: Binning Houses in Addition to Concentrations***

In [13]:
# Create mappings between each house and the bin it falls into
# (written per bin; the resulting dict is house -> bin)
mappings = {
    **dict.fromkeys(["Dudley", "Currier", "Pforzheimer", "Cabot"], "Quad"),
    **dict.fromkeys(["Dunster", "Mather", "Leverett"], "River East"),
    **dict.fromkeys(["Lowell", "Adams", "Quincy"], "River Central"),
    **dict.fromkeys(["Winthrop", "Eliot", "Kirkland"], "River West"),
}

In [14]:
# Bin by house in addition to concentration: add a "HouseBin" column holding
# the bin of each row's house.  The result is a new frame, binned_df itself
# is left untouched.  A house that is not a key of `mappings` gets NaN.
binned2_df = binned_df.assign(HouseBin=binned_df["House"].map(mappings))

In [None]:
# Display the frame to check the new "HouseBin" column
binned2_df

In [16]:
# Build the quasi identifiers after binning both concentrations and houses
# (the level of k-anonymity itself is computed in the next cell)
binned2_quasi_ids = get_quasi_ids(binned2_df)
# remove "Concentration" and "House" as quasi ids, as now looking at their
# binned versions ("ConcentrationBin" and "HouseBin")
binned2_quasi_ids.remove("Concentration")
binned2_quasi_ids.remove("House")
binned2_quasi_ids

['Year', 'ConcentrationBin', 'HouseBin']

In [17]:
# Level of k-anonymity with both concentrations and houses generalized into bins
k_anon_binned2 = level_k_anon(binned2_df, quasi_ids=binned2_quasi_ids)
k_anon_binned2

1

Result of binning concentrations and houses: still 1-anonymous...

Checking where the issue was:

In [None]:
# Size of every (Year, ConcentrationBin, HouseBin) group, displayed to find
# the groups with fewer than k rows
groups_dict = get_grouped_row_counts(binned2_df, binned2_quasi_ids).to_dict()
groups_dict

Issue seems to be coming from the joint/double concentrations...  removing the allied joint/double concentration 

Example of issue (note that there are lots of them):

('Sophomore',
  'Quantitative Social Sciences; Arts',
  'River Central'): 1,

***4.2: Dropping Allied Joints/Second Double Concentrations on top of Binning Houses and Concentrations***

In [None]:
# splitting the house and concentration binned df on joint/double concentrations (separating
# them into two different columns): the first bin and the allied/second bin
binned3_df = binned2_df.copy()
# n=1 splits on the first '; ' only, so the result never has more than two
# columns — without it, a single entry listing three or more bins would make
# the two-column assignment fail with a column-count mismatch
binned3_df[['ConcentrationBin_1', 'ConcentrationBin_2']] = binned2_df["ConcentrationBin"].str.split('; ', n=1, expand=True)
binned3_df.head(6)

In [20]:
# Build the quasi identifiers when only the first concentration bin
# ("ConcentrationBin_1") is used (the level of k-anonymity itself is
# computed in the next cell)
binned3_quasi_ids = get_quasi_ids(binned3_df)
# remove "ConcentrationBin_2" and the unsplit "ConcentrationBin" (as well as
# "Concentration" and "House" bc already binned)
binned3_quasi_ids.remove("Concentration")
binned3_quasi_ids.remove("House")
binned3_quasi_ids.remove("ConcentrationBin_2")
binned3_quasi_ids.remove("ConcentrationBin")
binned3_quasi_ids

['Year', 'HouseBin', 'ConcentrationBin_1']

In [21]:
# Level of k-anonymity when only the first concentration bin is used
k_anon_binned3 = level_k_anon(binned3_df, quasi_ids=binned3_quasi_ids)
k_anon_binned3

1

Result of binning concentrations and houses and dropping the allied joint/second double concentrations: still 1-anonymous...

Checking where issue was:

In [None]:
# Size of every (Year, HouseBin, ConcentrationBin_1) group, displayed to find
# the groups with fewer than k rows
groups_dict = get_grouped_row_counts(binned3_df, binned3_quasi_ids).to_dict()
groups_dict

It appears that the issue was "Special Concentration" (several groups containing "Special Concentration" have only 1 entry; everything else looks fine)

So removing Special Concentration (this is viable because survey respondents can just group themselves into the concentration bin that matches them best)

***4.3 Removing Entries with Special Concentration***

In [24]:
# Same quasi identifiers as in 4.2.  Copy the list instead of aliasing it, so
# that a later change to one of the two lists cannot silently change the other
binned4_quasi_ids = list(binned3_quasi_ids)
binned4_quasi_ids

['Year', 'HouseBin', 'ConcentrationBin_1']

In [None]:
# drop all rows whose first concentration bin is 'Special Concentration'
is_special = binned3_df['ConcentrationBin_1'] == 'Special Concentration'
binned4_df = binned3_df.drop(index=binned3_df.index[is_special])
binned4_df

In [27]:
# Level of k-anonymity once the 'Special Concentration' rows are removed
k_anon_binned4 = level_k_anon(binned4_df, quasi_ids=binned4_quasi_ids)
k_anon_binned4

5

Result: the Harvard Facebook dataset is now 5-anonymous!