## Re-identification/De-identification 


Read a configuration file containing integers that correspond to columns in the
dataset. Each integer represents a quasi-identifier. The configuration file corresponds to one line for each relevant column in the dataset.


In [147]:
def read_config(config_file):
    with open(config_file) as file:
        identifiers = [(q) for q in file.read().split()]
    file.close()
    return sorted(identifiers)

### Import data

In [77]:
import pandas as pd

Many entries contain NaN where the user did not enter information. Fill these values with 0 in order to filter them in further analysis. 

In [149]:
NA_FILL_VALUE = 0

In [150]:
df_raw = pd.read_csv("mid_sample_set.csv", dtype='unicode')
df_raw.index = df_raw.user_id
df_raw = df_raw.drop('user_id', axis = 1)

In [151]:
# Remove NA columns
original_columns = set(df_raw.columns.values)
df = df_raw.dropna(axis = 1, how = 'all').fillna(NA_FILL_VALUE)
new_columns = set(df.columns.values)
print("Removed columns", original_columns - new_columns)

Removed columns {'roles_isLibrary', 'forumRoles_isCommunityTA', 'roles_isCCX'}


In [81]:
df.shape

(199999, 87)

Upon loading the dataset, there are 200,000 entries (users) and 87 fields for each row entry.

### Direct Identifiers
- Can uniquely identify an individual and should be removed
- This includes ip


In [82]:
df = df.drop('ip', axis = 1)

### Quasi-Identifiers
- Can uniquely identify an individual when linked to other datasets. 
- These include: 'course_id', 'user_id', 'countryLabel', 'continent', 'city', 'region', 'subdivision', 'postalCode', 'LoE', 'YoB', 'gender', 'nforum_posts', 'nforum_votes', 'nforum_endorsed', 'nforum_threads', 'nforum_comments', 'nforum_pinned', and 'nforum_events', and are listed as their corresponding column index in the configuration file.
- Redundant quasi-identifiers are not included 
- Create a version of the dataset that only contains the quasi-identifiers.

In [152]:
quasi_identifiers = read_config("config_file.txt")
df_quasi = df.loc[:,quasi_identifiers]
labels = list(df_quasi.columns.values)
# Remove user_id because it's not a quasi-identifier, just a key
quasi_identifier_labels = list(set(labels) - set(["user_id"]))

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


In [93]:
def getAnonCompletionRates(df, quasi_identifier_labels, k):
    if 'course_id' not in quasi_identifier_labels:
        quasi_identifier_labels.append('course_id')
    # Creates groupings of quasi identifiers and finds size of each set
    k_anonymous = df.groupby(quasi_identifier_labels)\
                  .size().reset_index(name = 'ct').set_index(quasi_identifier_labels)
    k_anonymous = k_anonymous[k_anonymous.ct >= k]
    
    completedCount = df.groupby(quasi_identifier_labels + ['completed'])\
                       .size()\
                       .reset_index(name = "completedCount")\
                       .set_index(quasi_identifier_labels)
    
    k_anonymous = k_anonymous.reset_index()\
                  .merge(completedCount[completedCount.completed == 'True'].reset_index(), how = "left")
        
    print("Size of %d-anonymous dataset: %d" % (k, k_anonymous.ct.sum()))
    
    k_AnonTotalCounts = k_anonymous.groupby('course_id')["ct"].sum()\
                                   .reset_index(name = "numStudents")\
                                   .set_index('course_id')
    
    completionRates = ((k_anonymous.set_index('course_id').completedCount.fillna(0)
                        / k_AnonTotalCounts.numStudents).fillna(0) * 100)\
                        .reset_index(name = "completionRate")\
                        .sort_values(by = "completionRate", ascending = False)
    
    if len(k_anonymous) == 0: 
        completionRates.completionRate = 0
    
    return completionRates 

When k = 3

In [94]:
completionRate3Anon = getAnonCompletionRates(df, quasi_identifier_labels, 3)

Size of 3-anonymous dataset: 18409


In [95]:
completionRate3Anon.head()

Unnamed: 0,course_id,completionRate
2015,HarvardX/MUS24.1x/3T2017,33.333333
1920,HarvardX/HLS4X/3T2017,9.52381
2972,HarvardX/SW12.3x/2016,8.695652
1930,HarvardX/HUM1.6x/3T2015,7.142857
2949,HarvardX/SPU29x/3T2016,4.761905


When k = 4

In [96]:
completionRate4Anon = getAnonCompletionRates(df, quasi_identifier_labels, 4)

Size of 4-anonymous dataset: 14125


In [97]:
completionRate4Anon.head()

Unnamed: 0,course_id,completionRate
1668,HarvardX/SW12.3x/2016,10.0
1090,HarvardX/HLS4X/3T2017,9.52381
1657,HarvardX/SPU29x/3T2016,6.25
1659,HarvardX/SPU29x/3T2016,4.166667
1611,HarvardX/PH559x/3T2017,3.636364


When k = 5

In [98]:
completionRate5Anon = getAnonCompletionRates(df, quasi_identifier_labels, 5)

Size of 5-anonymous dataset: 11665


In [99]:
completionRate5Anon.head()

Unnamed: 0,course_id,completionRate
1097,HarvardX/SW12.3x/2016,12.5
1089,HarvardX/SPU29x/3T2016,6.818182
1061,HarvardX/PH559x/3T2017,3.636364
1063,HarvardX/PH559x/3T2017,3.636364
1011,HarvardX/PH556/2015T3,2.040816


### Synthetic Records
- Make the data k-anonymous
- Find the number of synthetic records needed for each case
- Compute completion rates and compare to dataset without synthetic records

When k = 3

In [153]:
def addSyntheticRows(df, quasi_identifier_labels, k):
    synthetic_k_anon_df = df.reset_index()
    # Create groupings based on quasi identifiers and count number of students in each group
    not_k_anonymous = synthetic_k_anon_df.groupby(quasi_identifier_labels)\
                      .size().reset_index(name = 'studentCount')
        
    # Assign groupings where studentCount is less than desired level for k-anonymity 
    not_k_anonymous = not_k_anonymous[not_k_anonymous.studentCount < k]

    # Duplicate rows where groupings of quasi-identifiers are not k-anonymous
    for i in range(k):
        rowsToAdd = not_k_anonymous[not_k_anonymous.studentCount == i]
        if len(rowsToAdd) > 0:
            for j in range(k - i):
                synthetic_k_anon_df = synthetic_k_anon_df.append(rowsToAdd, ignore_index = True)
            
    # Calculate difference between dataset and k-anonymized dataset with synthetic rows
    rowsAdded = synthetic_k_anon_df.shape[0] - len(df)
    print("Size of synthetic dataset: %d\nSynthetic Rows Added: %d" % (len(synthetic_k_anon_df), rowsAdded))
    # Drop the studentCount column from the dataset we return because df_quasi does not
    # have it.  This column is an artifact of when we grouped synthetic_k_anon_df above
    synthetic_k_anon_df = synthetic_k_anon_df.drop("studentCount", axis = 1)
    return synthetic_k_anon_df
    

Add synthetic data when k = 3

In [154]:
synthetic3AnonDf = addSyntheticRows(df, quasi_identifier_labels, 3)

Size of synthetic dataset: 453071
Synthetic Rows Added: 253072


Add synthetic data when k = 4

In [155]:
synthetic4AnonDf = addSyntheticRows(df, quasi_identifier_labels, 4)

Size of synthetic dataset: 587798
Synthetic Rows Added: 387799


Add synthetic data when k = 5

In [156]:
synthetic5AnonDf = addSyntheticRows(df, quasi_identifier_labels, 5)

Size of synthetic dataset: 723851
Synthetic Rows Added: 523852


### K-Anonymity 
- Generalization
    - YoB, nform_* 
- Blurring
    - Last 3 digits of postal_code
- Suprression 
    - Remaining 


In [139]:
df_quasi_copy = df.copy()

In [157]:
def blurring(df, col, lenCol):
    df_quasi_blur = df
    # Blurr the last 3 elements of string with stars
    df_quasi_blur.loc[:,col] = df_quasi_blur.loc[:,col].astype(str).str[0:int(lenCol-3)] + "*" * 3;
    return df_quasi_blur

In [141]:
blurring(df_quasi_copy, "postalCode", 5);

In [158]:
def generalization(df, colChange, bucketSize, maxVal):
    col = colChange
    df_quasi_gen = df
    # Calculate number of categories with maximum value in column and size of each category interval 
    binNum = int(maxVal / bucketSize)
    # Create list of categories 
    bins = [bucketSize*i for i in range(NA_FILL_VALUE-1, binNum)]
    # Convert data to integers and assign each value to specified list of categories 
    df_quasi_gen.loc[:,col] = df_quasi_gen[col].astype(int)
    df_quasi_gen.loc[:,col] = pd.cut(df_quasi_gen[col], bins)
    return df_quasi_gen

In [159]:
r = generalization(df_quasi_copy, "YoB", 10, 2019)
r = generalization(r, "nforum_posts", 10, 470)
r = generalization(r, "nforum_votes", 10, 640)
r = generalization(r, "nforum_endorsed", 10, 50)
r = generalization(r, "nforum_threads", 10, 150)
r = generalization(r, "nforum_comments", 10, 450)
r = generalization(r, "nforum_pinned", 10, 20)

ValueError: cannot convert float NaN to integer

### L-Diversity
- Used to determine distinguishability among indistunghisable quasi-identifiers but distinguishable sensitive attributes
- Sensitive attributes include: grade
- Determine level of l-diversity in order to strengthen k-anonymity

In [160]:
lDiverse = r
# Add column of sensitive values to dataset that is k-anonymous
lDiverse["grade"] = df["grade"]

In [145]:
lDiverseLevels = lDiverse.groupby(quasi_identifier_labels)["grade"].nunique().reset_index(name = "l-diversity-grade")
lDiverseLevels

Unnamed: 0,continent,gender,nforum_comments,nforum_pinned,nforum_events,nforum_posts,nforum_threads,LoE,subdivision,cc_by_ip,city,countryLabel,region,postalCode,YoB,nforum_votes,nforum_endorsed,course_id,l-diversity-grade
0,0,0,"(-10, 0]","(-10, 0]",0,"(-10, 0]","(-10, 0]",0,0,0,0,0,0,0***,"(-10, 0]","(-10, 0]","(-10, 0]",HarvardX/1368.2x/2T2016,1
1,0,0,"(-10, 0]","(-10, 0]",0,"(-10, 0]","(-10, 0]",0,0,0,0,0,0,0***,"(-10, 0]","(-10, 0]","(-10, 0]",HarvardX/1368.4x/2T2015,2
2,0,0,"(-10, 0]","(-10, 0]",0,"(-10, 0]","(-10, 0]",0,0,0,0,0,0,0***,"(-10, 0]","(-10, 0]","(-10, 0]",HarvardX/AI12.1x/2013_SOND,1
3,0,0,"(-10, 0]","(-10, 0]",0,"(-10, 0]","(-10, 0]",0,0,0,0,0,0,0***,"(-10, 0]","(-10, 0]","(-10, 0]",HarvardX/AmPoX.3/3T2017,1
4,0,0,"(-10, 0]","(-10, 0]",0,"(-10, 0]","(-10, 0]",0,0,0,0,0,0,0***,"(-10, 0]","(-10, 0]","(-10, 0]",HarvardX/AmPoX.4/1T2015,2
5,0,0,"(-10, 0]","(-10, 0]",0,"(-10, 0]","(-10, 0]",0,0,0,0,0,0,0***,"(-10, 0]","(-10, 0]","(-10, 0]",HarvardX/AmPoX.4/2T2017,1
6,0,0,"(-10, 0]","(-10, 0]",0,"(-10, 0]","(-10, 0]",0,0,0,0,0,0,0***,"(-10, 0]","(-10, 0]","(-10, 0]",HarvardX/CHEM160/1T2017,3
7,0,0,"(-10, 0]","(-10, 0]",0,"(-10, 0]","(-10, 0]",0,0,0,0,0,0,0***,"(-10, 0]","(-10, 0]","(-10, 0]",HarvardX/CS50B/Business,1
8,0,0,"(-10, 0]","(-10, 0]",0,"(-10, 0]","(-10, 0]",0,0,0,0,0,0,0***,"(-10, 0]","(-10, 0]","(-10, 0]",HarvardX/CS50x/2014_T1,2
9,0,0,"(-10, 0]","(-10, 0]",0,"(-10, 0]","(-10, 0]",0,0,0,0,0,0,0***,"(-10, 0]","(-10, 0]","(-10, 0]",HarvardX/ER22.1x/1T2018,1


In [146]:
lDiverseLevels["l-diversity-grade"].unique()

array([1, 2, 3, 4, 5, 6])

In [None]:
df_quasi["completed"] = df["completed"]

In [None]:
completionRate4Anon_GenBlurSuppress = getAnonCompletionRates(df_quasi, quasi_identifier_labels, 4)

In [None]:
completionRate5Anon_GenBlurSuppress = getAnonCompletionRates(df_quasi, quasi_identifier_labels, 5) 