## Re-identification/De-identification 


Read a configuration file containing integers that correspond to column numbers in the
dataset. Each integer represents a quasi-identifier. 


In [1]:
def read_config(config_file):
    """Read quasi-identifier column numbers from a configuration file.

    The file content is split on whitespace and every token is converted
    with ``int``.

    Args:
        config_file: Path of the configuration file to read.

    Returns:
        The integers found in the file, as a list sorted in ascending order.
        An empty (or whitespace-only) file yields an empty list.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a token in the file is not a valid integer.
    """
    # The ``with`` block closes the file on exit, so no explicit close() is needed.
    with open(config_file) as file:
        identifiers = [int(q) for q in file.read().split()]
    return sorted(identifiers)

### Import data

In [2]:
import pandas as pd

In [3]:
# Load every column as text and key the rows by user_id; set_index moves the
# column into the index, so it no longer appears among the data columns.
df_raw = pd.read_csv("mid_sample_set.csv", dtype='unicode').set_index('user_id')

In [4]:
# Drop the columns that hold no data at all, then mark every remaining
# missing value with the placeholder -9.
original_columns = set(df_raw.columns.values)
df = df_raw.dropna(axis = 1, how = 'all')
df = df.fillna(-9)
new_columns = set(df.columns.values)
# Report which columns disappeared because they were entirely NA.
print("Removed columns", original_columns.difference(new_columns))

Removed columns {'forumRoles_isCommunityTA', 'roles_isCCX', 'roles_isLibrary'}


In [5]:
df.shape

(199999, 87)

### Direct Identifiers
- Can uniquely identify an individual and should be removed

### Quasi-Identifiers
- Can uniquely identify an individual when linked to other datasets. 
- These include 'course_id', 'user_id', 'countryLabel', 'continent', 'city', 'region', 'subdivision', 'postalCode', 'LoE', 'YoB', 'gender', 'nforum_posts', 'nforum_votes', 'nforum_endorsed', 'nforum_threads', 'nforum_comments', 'nforum_pinned', and 'nforum_events'. They are listed in the configuration file by their corresponding column indices.
- Create a version of the dataset that only contains the quasi-identifiers.

In [6]:
# Select the quasi-identifier columns by the positions listed in the config file.
quasi_identifiers = read_config("config_file.txt")
df_quasi = df.iloc[:,quasi_identifiers]
labels = list(df_quasi.columns.values)
# Remove user_id because it's not a quasi-identifier, just a key.
# dict.fromkeys de-duplicates the labels while keeping the column order, so the
# resulting list (and therefore the group-by key order) is the same on every
# run; a plain set difference would yield an arbitrary order.
quasi_identifier_labels = [label for label in dict.fromkeys(labels) if label != "user_id"]

### K-Anonymity
- What remains of the dataset if you make it k-anonymous using suppression?
- What is the completion rate for each class at each of these levels?

In [7]:
def getAnonCompletionRates(df, quasi_identifier_labels, k):
    """Make the data k-anonymous by suppression and compute completion rates.

    Rows of ``df`` are grouped by the quasi-identifier columns (``course_id``
    is always part of the grouping key).  Groups with fewer than ``k`` rows are
    suppressed, and the number of rows in the surviving groups is printed.

    Args:
        df: DataFrame containing the quasi-identifier columns, a ``course_id``
            column and a ``completed`` column in which completed rows hold the
            string ``'True'``.
        quasi_identifier_labels: List of column names to group by.  The list
            is not modified.
        k: Minimum number of rows a group needs in order to be kept.

    Returns:
        DataFrame with the columns ``course_id`` and ``completionRate``,
        sorted by ``completionRate`` in descending order.  There is one row
        per surviving group: the group's completed count expressed as a
        percentage of the surviving row total of its course.
        NOTE(review): a course therefore appears once per surviving group;
        summing the rows of a course gives the course-level rate.  Confirm
        whether a single row per course is what callers expect.
    """
    # Work on a copy so that adding course_id never mutates the caller's list.
    group_labels = list(quasi_identifier_labels)
    if 'course_id' not in group_labels:
        group_labels.append('course_id')

    # Size of every equivalence class; keep only classes with at least k rows.
    k_anonymous = df.groupby(group_labels).size().reset_index(name = 'ct')
    k_anonymous = k_anonymous[k_anonymous.ct >= k]

    # Number of rows per equivalence class and completion status; only the
    # rows flagged 'True' are needed below.
    completedCount = df.groupby(group_labels + ['completed'])\
                       .size()\
                       .reset_index(name = "completedCount")
    completedOnly = completedCount[completedCount.completed == 'True']

    # Left merge: classes without any completed row get NaN as completedCount.
    k_anonymous = k_anonymous.merge(completedOnly, on = group_labels, how = "left")

    print("Size of %d-anonymous dataset: %d" % (k, k_anonymous.ct.sum()))

    # Surviving rows per course, used as the denominator of the rate.
    k_AnonTotalCounts = k_anonymous.groupby('course_id')["ct"].sum()\
                                   .reset_index(name = "numStudents")\
                                   .set_index('course_id')

    # The division aligns both operands on course_id.
    completionRates = ((k_anonymous.set_index('course_id').completedCount.fillna(0)
                        / k_AnonTotalCounts.numStudents).fillna(0) * 100)\
                        .reset_index(name = "completionRate")\
                        .sort_values(by = "completionRate", ascending = False)

    if k_anonymous.empty:
        completionRates.completionRate = 0

    return completionRates

When k = 3

In [8]:
# Suppress groups with fewer than 3 rows; the call prints the size of the
# remaining dataset and returns the completion-rate table.
completionRate3Anon = getAnonCompletionRates(df, quasi_identifier_labels, 3)

Size of 3-anonymous dataset: 27889


In [9]:
completionRate3Anon.head()

Unnamed: 0,course_id,completionRate
2668,HarvardX/MCB80.2x/3T2014,1.388889
2342,HarvardX/HKS101A/2015T3,0.20202
502,HarvardX/CS50x/2014_T1,0.02641
1049,HarvardX/CS50x/2014_T1,0.013205
1426,HarvardX/CS50x/2014_T1,0.013205


When k = 4

In [10]:
# Suppress groups with fewer than 4 rows; the call prints the size of the
# remaining dataset and returns the completion-rate table.
completionRate4Anon = getAnonCompletionRates(df, quasi_identifier_labels, 4)

Size of 4-anonymous dataset: 22522


In [11]:
completionRate4Anon.head()

Unnamed: 0,course_id,completionRate
1388,HarvardX/HKS101A/2015T3,0.228311
315,HarvardX/CS50x/2014_T1,0.034112
333,HarvardX/CS50x/2014_T1,0.017056
109,HarvardX/CS50x/2014_T1,0.017056
375,HarvardX/CS50x/2014_T1,0.017056


When k = 5

In [12]:
# Suppress groups with fewer than 5 rows; the call prints the size of the
# remaining dataset and returns the completion-rate table.
completionRate5Anon = getAnonCompletionRates(df, quasi_identifier_labels, 5)

Size of 5-anonymous dataset: 19094


In [13]:
completionRate5Anon.head()

Unnamed: 0,course_id,completionRate
935,HarvardX/HKS101A/2015T3,0.26178
229,HarvardX/CS50x/2014_T1,0.041675
265,HarvardX/CS50x/2014_T1,0.020838
242,HarvardX/CS50x/2014_T1,0.020838
488,HarvardX/CS50x/2014_T1,0.020838


### Synthetic Records
- Make the data k-anonymous
- Find the number of synthetic records needed for each case
- Compute completion rates and compare to dataset without synthetic records

Define a helper that adds synthetic records until every group has at least k members

In [14]:
def addSyntheticRows(df, quasi_identifier_labels, k):
    """Make the data k-anonymous by adding synthetic rows.

    Rows are grouped by the quasi-identifier columns.  Every group with fewer
    than ``k`` rows receives enough synthetic copies of its quasi-identifier
    values to reach exactly ``k`` rows.  The sizes of the resulting dataset
    and of the synthetic part are printed.

    Args:
        df: DataFrame containing the quasi-identifier columns.  Its index is
            turned into a regular column in the result.
        quasi_identifier_labels: List of column names that define a group.
        k: Minimum number of rows every group must have afterwards.

    Returns:
        A new DataFrame with the original rows first (index reset) followed
        by the synthetic rows.  Synthetic rows carry values only in the
        quasi-identifier columns; every other column is NaN for them.
    """
    synthetic_k_anon_df = df.reset_index()
    groupSizes = synthetic_k_anon_df.groupby(quasi_identifier_labels)\
                 .size().reset_index(name = 'studentCount')
    not_k_anonymous = groupSizes[groupSizes.studentCount < k]

    # Collect all pieces first and concatenate once: DataFrame.append no longer
    # exists in pandas 2.x, and appending inside the loop re-copies the growing
    # frame on every iteration.
    frames = [synthetic_k_anon_df]
    for i in range(k):
        rowsToAdd = not_k_anonymous[not_k_anonymous.studentCount == i]
        if len(rowsToAdd) > 0:
            # studentCount is only a grouping artifact and must not end up in
            # the returned dataset, so drop it before the rows are added.
            rowsToAdd = rowsToAdd.drop("studentCount", axis = 1)
            # A group of size i needs k - i extra rows to reach size k.
            frames.extend([rowsToAdd] * (k - i))

    synthetic_k_anon_df = pd.concat(frames, ignore_index = True, sort = False)

    rowsAdded = synthetic_k_anon_df.shape[0] - len(df)
    print("Size of synthetic dataset: %d\nSynthetic Rows Added: %d" % (len(synthetic_k_anon_df), rowsAdded))
    return synthetic_k_anon_df
    

Add synthetic data when k = 3

In [15]:
# NOTE(review): this call passes the full `df`, whereas the k = 4 and k = 5
# calls pass `df_quasi` — confirm which frame is intended so that the three
# synthetic datasets end up with the same columns.
synthetic3AnonDf = addSyntheticRows(df, quasi_identifier_labels, 3)

Size of synthetic dataset: 524677
Synthetic Rows Added: 324678


Add synthetic data when k = 4

In [16]:
# Pad every group of df_quasi with fewer than 4 rows up to 4 rows.
synthetic4AnonDf = addSyntheticRows(df_quasi, quasi_identifier_labels, 4)

Size of synthetic dataset: 692062
Synthetic Rows Added: 492063


Add synthetic data when k = 5

In [17]:
# Pad every group of df_quasi with fewer than 5 rows up to 5 rows.
synthetic5AnonDf = addSyntheticRows(df_quasi, quasi_identifier_labels, 5)

Size of synthetic dataset: 860304
Synthetic Rows Added: 660305


### K-Anonymity 
- Generalization
- Blurring
- Suppression


Compare the number of students who complete and explore the course in the original and in the k-anonymous sets