## Re-identification/De-identification 


Read a configuration file containing integers that correspond to column numbers in the
dataset. Each integer represents a quasi-identifier. 


In [2]:
def read_config(config_file):
    with open(config_file) as file:
        identifiers = [int(q) for q in file.read().split()]
    file.close()
    return sorted(identifiers)


In [3]:
read_config("config_test.txt")

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]

### Import data

In [4]:
import pandas as pd

In [5]:
df = pd.read_csv("mid_sample_set.csv", dtype='unicode')

In [6]:
df.shape

(199999, 91)

In [7]:
df.head()

Unnamed: 0,course_id,user_id,registered,viewed,explored,certified,completed,ip,cc_by_ip,countryLabel,...,roles_isInstructor,roles_isStaff,roles_isCCX,roles_isFinance,roles_isLibrary,roles_isSales,forumRoles_isAdmin,forumRoles_isCommunityTA,forumRoles_isModerator,forumRoles_isStudent
0,HarvardX/PH525.1x/1T2018,29940,True,False,,False,False,198.214.249.125,US,United States,...,,,,,,,,,,1
1,HarvardX/PH525.1x/1T2018,37095,True,False,,False,False,119.30.45.190,BD,Bangladesh,...,,,,,,,,,,1
2,HarvardX/PH525.1x/1T2018,45634,True,False,,False,False,201.233.80.3,CO,Colombia,...,,,,,,,,,,1
3,HarvardX/PH525.1x/1T2018,52234,True,False,,False,False,213.114.41.34,SE,Sweden,...,,,,,,,,,,1
4,HarvardX/PH525.1x/1T2018,52238,True,True,False,False,False,177.226.237.217,MX,Mexico,...,,,,,,,,,,1


### Direct Identifiers
- Can uniquely identify an individual and should be removed

### Quasi-Identifiers
- Can uniquely identify an individual when linked to other datasets. 
- These include:<font color=red> [INSERT AFTER SUNDAY] </font> and are listed as their corresponding column index in the configuration file.
- Create a version of the dataset that only contains the quasi-identifiers.

In [197]:
quasi_identifiers = read_config("config_test.txt")
print((quasi_identifiers))
df_quasi = df.iloc[:,quasi_identifiers]
df_quasi.index = df.user_id
df_quasi = df_quasi.drop('user_id')

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]


### K-Anonymity
- What remains of the dataset if you make it k-anonymous using suppression
- What is the completion rate for each class at each of these levels.

In [160]:
def getAnonCompletionRates(df_quasi, quasi_identifiers_labels_test, k):
    k_anonymous = df_quasi.groupby(quasi_identifiers_labels_test)['user_id'].count().reset_index(name = 'ct')
    k_anonymous = k_anonymous[k_anonymous.ct >= k]
    print("Size of %d-anonymous dataset: %d" % (k, k_anonymous.ct.sum()))
    k_AnonCompletionCounts = k_anonymous[["course_id", "completed", "ct"]]\
    .groupby(['course_id', "completed"])["ct"].sum()\
    .reset_index(name = "numStudents").set_index("course_id")
    
    k_AnonTotalCounts = k_anonymous[["course_id", "ct"]]\
    .groupby(['course_id'])["ct"].sum()\
    .reset_index(name = "numStudents").set_index("course_id")
    
    completionRates = (k_AnonCompletionCounts[k_AnonCompletionCounts.completed == 'True'].numStudents 
    / k_AnonTotalCounts.numStudents).fillna(0) *100;
    
    return (completionRates.reset_index(name = "completionRate")\
    .sort_values(by = "completionRate", ascending = False))
    

When k = 3

In [161]:
completionRate3Anon = getAnonCompletionRates(df_quasi, quasi_identifiers_labels_test, 3)

Size of 3-anonymous dataset: 78951


In [162]:
completionRate3Anon.head()

Unnamed: 0,course_id,completionRate
32,HarvardX/Hum3.1x/1T2016,5.069124
45,HarvardX/PH207x/2012_Fall,4.679803
88,HarvardX/SW12.3x/2016,4.615385
38,HarvardX/MUS24.3x/1T2016,4.324324
28,HarvardX/HUM1.6x/3T2015,4.109589


When k = 4

In [163]:
completionRate4Anon = getAnonCompletionRates(df_quasi, quasi_identifiers_labels_test, 4)

Size of 4-anonymous dataset: 68973


In [164]:
completionRate4Anon.head()

Unnamed: 0,course_id,completionRate
36,HarvardX/MUS24.3x/1T2016,5.714286
31,HarvardX/Hum3.1x/1T2016,4.819277
43,HarvardX/PH207x/2012_Fall,3.787466
42,HarvardX/PH201x/2013_SOND,1.58831
94,Harvardx/HLS2X/2T2017,1.492537


When k = 5

In [165]:
completionRate5Anon = getAnonCompletionRates(df_quasi, quasi_identifiers_labels_test, 5)

Size of 5-anonymous dataset: 61745


In [166]:
completionRate5Anon.head()

Unnamed: 0,course_id,completionRate
34,HarvardX/MUS24.3x/1T2016,7.407407
30,HarvardX/Hum3.1x/1T2016,7.272727
40,HarvardX/PH207x/2012_Fall,3.301606
39,HarvardX/PH201x/2013_SOND,1.216023
67,HarvardX/PH555x/2014_T2,1.198257


### Synthetic Records
- Make the data k-anonymous
- Find the number of synthetic records needed for each case
- Compute completion rates and compare to dataset without synthetic records

When k = 3

In [176]:
def addSyntheticRows(df_quasi, k):
    synthetic_k_anon_df = df_quasi.reset_index()
    not_k_anonymous = synthetic_k_anon_df.groupby(quasi_identifiers_labels_test)['user_id'].count().reset_index(name = 'ct')
    not_k_anonymous = not_k_anonymous[not_k_anonymous.ct < k]
    [not_k_anonymous]*(k-not_k_anonymous.count())
    
#     for i in range(len(not_k_anonymous)):
        #synthetic_k_anon_df.append([not_k_anonymous]*(k-not_k_anonymous.count()))
    
    #return synthetic_k_anon_df
    

In [192]:
k = 3
synthetic_k_anon_df = df_quasi.reset_index()
not_k_anonymous = synthetic_k_anon_df.groupby(quasi_identifiers_labels_test + [user_id])['user_id'].count().reset_index(name = 'ct')
not_k_anonymous = not_k_anonymous[not_k_anonymous.ct < k]



ValueError: cannot insert user_id, already exists

In [190]:
synthetic_k_anon_df.head()

Unnamed: 0_level_0,course_id,user_id,registered,viewed,explored,certified,completed,ip,cc_by_ip,countryLabel,...,city,region,subdivision,postalCode,un_major_region,un_economic_group,un_developing_nation,un_special_region,latitude,longitude
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
29940,HarvardX/PH525.1x/1T2018,29940,True,False,,False,False,198.214.249.125,US,United States,...,Austin,TX,Texas,78713.0,Northern America,Developed regions,,,30.2672,-97.7431
37095,HarvardX/PH525.1x/1T2018,37095,True,False,,False,False,119.30.45.190,BD,Bangladesh,...,Dhaka,13,Dhaka,,Southern Asia,Developing_Nations,Least developed countries,,23.7231,90.4086
45634,HarvardX/PH525.1x/1T2018,45634,True,False,,False,False,201.233.80.3,CO,Colombia,...,Medellín,ANT,Antioquia,,South America,Developing_Nations,,Latin America and the Caribbean,6.2518,-75.5636
52234,HarvardX/PH525.1x/1T2018,52234,True,False,,False,False,213.114.41.34,SE,Sweden,...,Skanör,M,Skåne,,Northern Europe,Developed regions,,,55.4167,12.85
52238,HarvardX/PH525.1x/1T2018,52238,True,True,False,False,False,177.226.237.217,MX,Mexico,...,León,GUA,Guanajuato,,Central America,Developing_Nations,,Latin America and the Caribbean,21.0931,-101.645


In [189]:
not_k_anonymous.head()

Unnamed: 0,course_id,registered,completed,city,region,ct
0,HarvardX/1368.2x/2T2015,True,False,Abbottabad,NW,1
1,HarvardX/1368.2x/2T2015,True,False,Acton,MA,1
2,HarvardX/1368.2x/2T2015,True,False,Ahmedabad,GJ,1
3,HarvardX/1368.2x/2T2015,True,False,Al Ain,AZ,1
4,HarvardX/1368.2x/2T2015,True,False,Alameda,CA,1


In [184]:
synthetic_k_anon_df.append(not_k_anonymous.loc[[0]], ignore_index = True)

Unnamed: 0,course_id,registered,completed,city,region,ct
0,HarvardX/1368.2x/2T2015,True,False,Abbottabad,NW,1


In [177]:
addSyntheticRows(df_quasi, 3)

ValueError: cannot copy sequence with size 60072 to array axis with dimension 6

When k = 4

### K-Anonymity 
- Generalization
- Blurring
- Suprression 


Compare the number of students who complete and explore the course in the original and in the k-anonymous sets