# QIF Analyses (AOL reduced dataset - Citizen Lab Classification)

In [1]:
import numpy, pandas, random, qif
from collections import Counter
from bvmlib.bvm import BVM

## AOL reduced dataset with Citizen Lab Classification

In [2]:
%%time
# Loads AOL-reduced-Citizen-Lab-Classification dataset into aol_reduced_citizen_lab DataFrame.
aol_reduced_citizen_lab = pandas.read_csv('AOL-reduced-Citizen-Lab-Classification.csv', low_memory=False, on_bad_lines='warn', index_col=0)

CPU times: user 3.79 s, sys: 511 ms, total: 4.31 s
Wall time: 4.29 s


In [3]:
%%time
aol_reduced_citizen_lab['QueryTime'] = pandas.to_datetime(aol_reduced_citizen_lab['QueryTime'])

CPU times: user 1.04 s, sys: 24.9 ms, total: 1.06 s
Wall time: 1.04 s


In [4]:
display(aol_reduced_citizen_lab)

Unnamed: 0,RandID,QueryTime,Domain,topics
0,1,2006-03-01 11:54:19,kidshealth.org,'PUBH'
1,1,2006-03-01 11:54:19,kidshealth.org,'PUBH'
2,1,2006-03-01 11:54:19,kidshealth.org,'PUBH'
3,1,2006-03-01 12:10:38,harvard.edu,"'NEWS', 'CTRL', 'HUMR', 'PUBH'"
4,1,2006-03-08 21:16:04,harvard.edu,"'NEWS', 'CTRL', 'HUMR', 'PUBH'"
...,...,...,...,...
3135265,521689,2006-05-05 01:09:28,nih.gov,'GOVT'
3135266,521689,2006-05-05 02:00:33,typepad.com,"'HOST', 'NEWS'"
3135267,521691,2006-03-04 11:36:38,go.com,'NEWS'
3135268,521691,2006-03-15 19:24:17,google.com,"'COMT', 'MMED', 'CTRL', 'CULTR', 'NEWS', 'SRCH..."


## Drop singletons

In [5]:
%%time
# Lists rows for RandID individuals with only one domain in their browsing history, to be dropped.
temp = aol_reduced_citizen_lab[['RandID','Domain']].groupby(['RandID']).nunique()
singletons = temp[temp['Domain'] == 1].index.to_list()
print(len(singletons))

130689
CPU times: user 767 ms, sys: 56.4 ms, total: 823 ms
Wall time: 822 ms


In [6]:
aol_reduced_citizen_lab = aol_reduced_citizen_lab[~aol_reduced_citizen_lab.RandID.isin(singletons)]

In [7]:
aol_reduced_citizen_lab = aol_reduced_citizen_lab.reset_index(drop=True)

In [8]:
display(aol_reduced_citizen_lab)

Unnamed: 0,RandID,QueryTime,Domain,topics
0,1,2006-03-01 11:54:19,kidshealth.org,'PUBH'
1,1,2006-03-01 11:54:19,kidshealth.org,'PUBH'
2,1,2006-03-01 11:54:19,kidshealth.org,'PUBH'
3,1,2006-03-01 12:10:38,harvard.edu,"'NEWS', 'CTRL', 'HUMR', 'PUBH'"
4,1,2006-03-08 21:16:04,harvard.edu,"'NEWS', 'CTRL', 'HUMR', 'PUBH'"
...,...,...,...,...
2871078,521689,2006-05-05 01:09:28,nih.gov,'GOVT'
2871079,521689,2006-05-05 02:00:33,typepad.com,"'HOST', 'NEWS'"
2871080,521691,2006-03-04 11:36:38,go.com,'NEWS'
2871081,521691,2006-03-15 19:24:17,google.com,"'COMT', 'MMED', 'CTRL', 'CULTR', 'NEWS', 'SRCH..."


## Drop outlier

In [9]:
%%time
# One individual has too many domains in their browsing history and is most probably a bot, to be dropped.
display(aol_reduced_citizen_lab['RandID'].value_counts())

RandID
463921    23505
311717     2516
352479     1379
118679     1167
342660     1011
          ...  
317417        2
120680        2
448289        2
317341        2
193287        2
Name: count, Length: 211314, dtype: int64

CPU times: user 53.2 ms, sys: 16.5 ms, total: 69.7 ms
Wall time: 68.8 ms


In [10]:
aol_reduced_citizen_lab = aol_reduced_citizen_lab.drop(aol_reduced_citizen_lab[aol_reduced_citizen_lab['RandID'] == 463921].index.to_list())

In [11]:
aol_reduced_citizen_lab = aol_reduced_citizen_lab.reset_index(drop=True)

In [12]:
display(aol_reduced_citizen_lab)

Unnamed: 0,RandID,QueryTime,Domain,topics
0,1,2006-03-01 11:54:19,kidshealth.org,'PUBH'
1,1,2006-03-01 11:54:19,kidshealth.org,'PUBH'
2,1,2006-03-01 11:54:19,kidshealth.org,'PUBH'
3,1,2006-03-01 12:10:38,harvard.edu,"'NEWS', 'CTRL', 'HUMR', 'PUBH'"
4,1,2006-03-08 21:16:04,harvard.edu,"'NEWS', 'CTRL', 'HUMR', 'PUBH'"
...,...,...,...,...
2847573,521689,2006-05-05 01:09:28,nih.gov,'GOVT'
2847574,521689,2006-05-05 02:00:33,typepad.com,"'HOST', 'NEWS'"
2847575,521691,2006-03-04 11:36:38,go.com,'NEWS'
2847576,521691,2006-03-15 19:24:17,google.com,"'COMT', 'MMED', 'CTRL', 'CULTR', 'NEWS', 'SRCH..."


## Experiment 2: Third-party cookies on AOL reduced dataset with Citizen Lab Classification

### Define browsing histories

In [13]:
%%time
aol_reduced_citizen_lab['BrowsingHistory'] = list(zip(aol_reduced_citizen_lab.Domain, aol_reduced_citizen_lab.QueryTime))

CPU times: user 8.96 s, sys: 391 ms, total: 9.35 s
Wall time: 9.34 s


In [14]:
display(aol_reduced_citizen_lab)

Unnamed: 0,RandID,QueryTime,Domain,topics,BrowsingHistory
0,1,2006-03-01 11:54:19,kidshealth.org,'PUBH',"(kidshealth.org, 2006-03-01 11:54:19)"
1,1,2006-03-01 11:54:19,kidshealth.org,'PUBH',"(kidshealth.org, 2006-03-01 11:54:19)"
2,1,2006-03-01 11:54:19,kidshealth.org,'PUBH',"(kidshealth.org, 2006-03-01 11:54:19)"
3,1,2006-03-01 12:10:38,harvard.edu,"'NEWS', 'CTRL', 'HUMR', 'PUBH'","(harvard.edu, 2006-03-01 12:10:38)"
4,1,2006-03-08 21:16:04,harvard.edu,"'NEWS', 'CTRL', 'HUMR', 'PUBH'","(harvard.edu, 2006-03-08 21:16:04)"
...,...,...,...,...,...
2847573,521689,2006-05-05 01:09:28,nih.gov,'GOVT',"(nih.gov, 2006-05-05 01:09:28)"
2847574,521689,2006-05-05 02:00:33,typepad.com,"'HOST', 'NEWS'","(typepad.com, 2006-05-05 02:00:33)"
2847575,521691,2006-03-04 11:36:38,go.com,'NEWS',"(go.com, 2006-03-04 11:36:38)"
2847576,521691,2006-03-15 19:24:17,google.com,"'COMT', 'MMED', 'CTRL', 'CULTR', 'NEWS', 'SRCH...","(google.com, 2006-03-15 19:24:17)"


In [15]:
%%time
aol_reduced_citizen_lab_browsing_history = aol_reduced_citizen_lab[['RandID','BrowsingHistory']].groupby('RandID', sort=False)[['BrowsingHistory']].agg(lambda x : list(x)).reset_index()

CPU times: user 7.7 s, sys: 104 ms, total: 7.8 s
Wall time: 7.8 s


In [16]:
%%time
aol_reduced_citizen_lab_browsing_history['BrowsingHistory'] = aol_reduced_citizen_lab_browsing_history['BrowsingHistory'].map(lambda x : str(x))

CPU times: user 8.92 s, sys: 51.8 ms, total: 8.97 s
Wall time: 8.97 s


In [17]:
display(aol_reduced_citizen_lab_browsing_history)

Unnamed: 0,RandID,BrowsingHistory
0,1,"[('kidshealth.org', Timestamp('2006-03-01 11:5..."
1,3,"[('dogpile.com', Timestamp('2006-03-12 16:09:3..."
2,4,"[('foxnews.com', Timestamp('2006-03-12 20:33:2..."
3,5,"[('foxnews.com', Timestamp('2006-03-22 00:22:5..."
4,6,"[('technorati.com', Timestamp('2006-03-03 20:4..."
...,...,...
211308,521674,"[('yahoo.com', Timestamp('2006-03-06 10:04:29'..."
211309,521686,"[('flickr.com', Timestamp('2006-03-12 21:19:11..."
211310,521688,"[('ytmnd.com', Timestamp('2006-03-12 20:00:14'..."
211311,521689,"[('globalwitness.org', Timestamp('2006-03-04 0..."


### Privacy

In [18]:
temp = aol_reduced_citizen_lab_browsing_history.copy()
temp['UID'] = temp['RandID']
temp = temp.drop(columns=['BrowsingHistory'], inplace=False)
display(temp, temp.nunique())

Unnamed: 0,RandID,UID
0,1,1
1,3,3
2,4,4
3,5,5
4,6,6
...,...,...
211308,521674,521674
211309,521686,521686
211310,521688,521688
211311,521689,521689


RandID    211313
UID       211313
dtype: int64

In [19]:
%%time
E2P = BVM(temp)
E2P.qids(['UID'])
E2P.sensitive(['RandID'])
results = E2P.assess()

CPU times: user 1.81 s, sys: 7.91 ms, total: 1.82 s
Wall time: 1.82 s


In [20]:
display(results['re_id'], results['att_inf'])

Unnamed: 0,QID,dCR,pCR,Prior,Posterior,Histogram
0,['UID'],1.0,211313,5e-06,1.0,"{'0': 0.0, '1': 0.0, '2': 0.0, '3': 0.0, '4': ..."


Unnamed: 0,QID,Sensitive,dCA,pCA,Prior,Posterior,Histogram
0,['UID'],RandID,0.0,211313.0,5e-06,1.0,"{'0': 0.0, '1': 0.0, '2': 0.0, '3': 0.0, '4': ..."


In [21]:
# Prior Bayes vulnerability.
display(results['re_id']['Prior'][0], results['att_inf']['Prior'][0])

4.732316516257873e-06

4.732316516257873e-06

In [22]:
# Posterior Bayes vulnerability.
display(results['re_id']['Posterior'][0], results['att_inf']['Posterior'][0])

1.0

1.0

In [23]:
# Bayes leakage.
display(results['re_id']['Posterior'][0]/results['re_id']['Prior'][0], results['att_inf']['Posterior'][0]/results['att_inf']['Prior'][0])

211313.00000000003

211313.00000000003

### Utility

In [24]:
temp = aol_reduced_citizen_lab_browsing_history.copy()
temp = temp.rename(columns={'RandID':'UID'}, inplace=False)
display(temp, temp.nunique())

Unnamed: 0,UID,BrowsingHistory
0,1,"[('kidshealth.org', Timestamp('2006-03-01 11:5..."
1,3,"[('dogpile.com', Timestamp('2006-03-12 16:09:3..."
2,4,"[('foxnews.com', Timestamp('2006-03-12 20:33:2..."
3,5,"[('foxnews.com', Timestamp('2006-03-22 00:22:5..."
4,6,"[('technorati.com', Timestamp('2006-03-03 20:4..."
...,...,...
211308,521674,"[('yahoo.com', Timestamp('2006-03-06 10:04:29'..."
211309,521686,"[('flickr.com', Timestamp('2006-03-12 21:19:11..."
211310,521688,"[('ytmnd.com', Timestamp('2006-03-12 20:00:14'..."
211311,521689,"[('globalwitness.org', Timestamp('2006-03-04 0..."


UID                211313
BrowsingHistory    211313
dtype: int64

In [25]:
%%time
E2U = BVM(temp)
E2U.qids(['UID'])
E2U.sensitive(['BrowsingHistory'])
results = E2U.assess()

CPU times: user 2.89 s, sys: 4.23 ms, total: 2.9 s
Wall time: 2.89 s


In [26]:
display(results['re_id'], results['att_inf'])

Unnamed: 0,QID,dCR,pCR,Prior,Posterior,Histogram
0,['UID'],1.0,211313,5e-06,1.0,"{'0': 0.0, '1': 0.0, '2': 0.0, '3': 0.0, '4': ..."


Unnamed: 0,QID,Sensitive,dCA,pCA,Prior,Posterior,Histogram
0,['UID'],BrowsingHistory,0.0,211313.0,5e-06,1.0,"{'0': 0.0, '1': 0.0, '2': 0.0, '3': 0.0, '4': ..."


In [27]:
# Prior Bayes vulnerability.
display(results['re_id']['Prior'][0], results['att_inf']['Prior'][0])

4.732316516257873e-06

4.732316516257873e-06

In [28]:
# Posterior Bayes vulnerability.
display(results['re_id']['Posterior'][0], results['att_inf']['Posterior'][0])

1.0

1.0

In [29]:
# Bayes leakage.
display(results['re_id']['Posterior'][0]/results['re_id']['Prior'][0], results['att_inf']['Posterior'][0]/results['att_inf']['Prior'][0])

211313.00000000003

211313.00000000003

## Experiment 3: Topics with Generalization on AOL reduced dataset with Citizen Lab Classification

### Define browsing histories and lists of topics

In [30]:
%%time
aol_reduced_citizen_lab['topics'] = aol_reduced_citizen_lab['topics'].map(lambda x : x.replace(" ", "").split(","))

CPU times: user 5.15 s, sys: 697 ms, total: 5.84 s
Wall time: 5.82 s


In [31]:
display(aol_reduced_citizen_lab)

Unnamed: 0,RandID,QueryTime,Domain,topics,BrowsingHistory
0,1,2006-03-01 11:54:19,kidshealth.org,['PUBH'],"(kidshealth.org, 2006-03-01 11:54:19)"
1,1,2006-03-01 11:54:19,kidshealth.org,['PUBH'],"(kidshealth.org, 2006-03-01 11:54:19)"
2,1,2006-03-01 11:54:19,kidshealth.org,['PUBH'],"(kidshealth.org, 2006-03-01 11:54:19)"
3,1,2006-03-01 12:10:38,harvard.edu,"['NEWS', 'CTRL', 'HUMR', 'PUBH']","(harvard.edu, 2006-03-01 12:10:38)"
4,1,2006-03-08 21:16:04,harvard.edu,"['NEWS', 'CTRL', 'HUMR', 'PUBH']","(harvard.edu, 2006-03-08 21:16:04)"
...,...,...,...,...,...
2847573,521689,2006-05-05 01:09:28,nih.gov,['GOVT'],"(nih.gov, 2006-05-05 01:09:28)"
2847574,521689,2006-05-05 02:00:33,typepad.com,"['HOST', 'NEWS']","(typepad.com, 2006-05-05 02:00:33)"
2847575,521691,2006-03-04 11:36:38,go.com,['NEWS'],"(go.com, 2006-03-04 11:36:38)"
2847576,521691,2006-03-15 19:24:17,google.com,"['COMT', 'MMED', 'CTRL', 'CULTR', 'NEWS', 'SRC...","(google.com, 2006-03-15 19:24:17)"


In [32]:
%%time
aol_reduced_citizen_lab_generalization = aol_reduced_citizen_lab[['RandID','BrowsingHistory','topics']].groupby('RandID', sort=False)[['BrowsingHistory','topics']].agg(lambda x : list(x)).reset_index()

CPU times: user 18.3 s, sys: 240 ms, total: 18.6 s
Wall time: 18.5 s


In [33]:
display(aol_reduced_citizen_lab_generalization)

Unnamed: 0,RandID,BrowsingHistory,topics
0,1,"[(kidshealth.org, 2006-03-01 11:54:19), (kidsh...","[['PUBH'], ['PUBH'], ['PUBH'], ['NEWS', 'CTRL'..."
1,3,"[(dogpile.com, 2006-03-12 16:09:37), (dogpile....","[['SRCH'], ['SRCH'], ['HOST', 'NEWS'], ['PUBH'..."
2,4,"[(foxnews.com, 2006-03-12 20:33:21), (google.c...","[['NEWS', 'REL'], ['COMT', 'MMED', 'CTRL', 'CU..."
3,5,"[(foxnews.com, 2006-03-22 00:22:53), (senate.g...","[['NEWS', 'REL'], ['GOVT'], ['SRCH', 'POLR', '..."
4,6,"[(technorati.com, 2006-03-03 20:48:41), (googl...","[['HOST'], ['COMT', 'MMED', 'CTRL', 'CULTR', '..."
...,...,...,...
211308,521674,"[(yahoo.com, 2006-03-06 10:04:29), (yahoo.com,...","[['COMT', 'CULTR', 'COMM', 'NEWS', 'POLR', 'SR..."
211309,521686,"[(flickr.com, 2006-03-12 21:19:11), (partypoke...","[['LGBT', 'MMED'], ['GMB'], ['HOST', 'GRP', 'C..."
211310,521688,"[(ytmnd.com, 2006-03-12 20:00:14), (aol.com, 2...","[['REL'], ['SRCH', 'POLR', 'HUMR', 'REL'], ['C..."
211311,521689,"[(globalwitness.org, 2006-03-04 01:04:05), (ms...","[['ENV', 'POLR'], ['NEWS', 'SRCH', 'GRP', 'COM..."


In [34]:
%%time
aol_reduced_citizen_lab_generalization['topics'] = aol_reduced_citizen_lab_generalization['topics'].map(lambda x : [topic for t in x for topic in t])

CPU times: user 1.1 s, sys: 16.2 ms, total: 1.11 s
Wall time: 1.11 s


In [35]:
display(aol_reduced_citizen_lab_generalization)

Unnamed: 0,RandID,BrowsingHistory,topics
0,1,"[(kidshealth.org, 2006-03-01 11:54:19), (kidsh...","['PUBH', 'PUBH', 'PUBH', 'NEWS', 'CTRL', 'HUMR..."
1,3,"[(dogpile.com, 2006-03-12 16:09:37), (dogpile....","['SRCH', 'SRCH', 'HOST', 'NEWS', 'PUBH', 'COMT..."
2,4,"[(foxnews.com, 2006-03-12 20:33:21), (google.c...","['NEWS', 'REL', 'COMT', 'MMED', 'CTRL', 'CULTR..."
3,5,"[(foxnews.com, 2006-03-22 00:22:53), (senate.g...","['NEWS', 'REL', 'GOVT', 'SRCH', 'POLR', 'HUMR'..."
4,6,"[(technorati.com, 2006-03-03 20:48:41), (googl...","['HOST', 'COMT', 'MMED', 'CTRL', 'CULTR', 'NEW..."
...,...,...,...
211308,521674,"[(yahoo.com, 2006-03-06 10:04:29), (yahoo.com,...","['COMT', 'CULTR', 'COMM', 'NEWS', 'POLR', 'SRC..."
211309,521686,"[(flickr.com, 2006-03-12 21:19:11), (partypoke...","['LGBT', 'MMED', 'GMB', 'HOST', 'GRP', 'CULTR'..."
211310,521688,"[(ytmnd.com, 2006-03-12 20:00:14), (aol.com, 2...","['REL', 'SRCH', 'POLR', 'HUMR', 'REL', 'COMM']"
211311,521689,"[(globalwitness.org, 2006-03-04 01:04:05), (ms...","['ENV', 'POLR', 'NEWS', 'SRCH', 'GRP', 'COMT',..."


In [36]:
aol_reduced_citizen_lab_generalization = aol_reduced_citizen_lab_generalization.rename(columns={'topics' : 'AllTopics'})

In [37]:
%%time
# Computes the number of unique seen topics.
aol_reduced_citizen_lab_seen_topics = sorted(list(set([x for y in aol_reduced_citizen_lab_generalization['AllTopics'] for x in y])))
display(len(aol_reduced_citizen_lab_seen_topics))

31

CPU times: user 1 s, sys: 35.8 ms, total: 1.04 s
Wall time: 1.04 s


In [38]:
%%time
aol_reduced_citizen_lab_generalization['sTopics'] = aol_reduced_citizen_lab_generalization['AllTopics'].map(lambda x : dict(Counter(x)))

CPU times: user 1.44 s, sys: 32 ms, total: 1.47 s
Wall time: 1.46 s


In [39]:
display(aol_reduced_citizen_lab_generalization)

Unnamed: 0,RandID,BrowsingHistory,AllTopics,sTopics
0,1,"[(kidshealth.org, 2006-03-01 11:54:19), (kidsh...","['PUBH', 'PUBH', 'PUBH', 'NEWS', 'CTRL', 'HUMR...","{''PUBH'': 6, ''NEWS'': 4, ''CTRL'': 3, ''HUMR..."
1,3,"[(dogpile.com, 2006-03-12 16:09:37), (dogpile....","['SRCH', 'SRCH', 'HOST', 'NEWS', 'PUBH', 'COMT...","{''SRCH'': 8, ''HOST'': 5, ''NEWS'': 6, ''PUBH..."
2,4,"[(foxnews.com, 2006-03-12 20:33:21), (google.c...","['NEWS', 'REL', 'COMT', 'MMED', 'CTRL', 'CULTR...","{''NEWS'': 2, ''REL'': 2, ''COMT'': 1, ''MMED'..."
3,5,"[(foxnews.com, 2006-03-22 00:22:53), (senate.g...","['NEWS', 'REL', 'GOVT', 'SRCH', 'POLR', 'HUMR'...","{''NEWS'': 1, ''REL'': 2, ''GOVT'': 3, ''SRCH'..."
4,6,"[(technorati.com, 2006-03-03 20:48:41), (googl...","['HOST', 'COMT', 'MMED', 'CTRL', 'CULTR', 'NEW...","{''HOST'': 11, ''COMT'': 19, ''MMED'': 10, ''C..."
...,...,...,...,...
211308,521674,"[(yahoo.com, 2006-03-06 10:04:29), (yahoo.com,...","['COMT', 'CULTR', 'COMM', 'NEWS', 'POLR', 'SRC...","{''COMT'': 7, ''CULTR'': 4, ''COMM'': 7, ''NEW..."
211309,521686,"[(flickr.com, 2006-03-12 21:19:11), (partypoke...","['LGBT', 'MMED', 'GMB', 'HOST', 'GRP', 'CULTR'...","{''LGBT'': 3, ''MMED'': 2, ''GMB'': 1, ''HOST'..."
211310,521688,"[(ytmnd.com, 2006-03-12 20:00:14), (aol.com, 2...","['REL', 'SRCH', 'POLR', 'HUMR', 'REL', 'COMM']","{''REL'': 2, ''SRCH'': 1, ''POLR'': 1, ''HUMR'..."
211311,521689,"[(globalwitness.org, 2006-03-04 01:04:05), (ms...","['ENV', 'POLR', 'NEWS', 'SRCH', 'GRP', 'COMT',...","{''ENV'': 6, ''POLR'': 6, ''NEWS'': 12, ''SRCH..."


In [40]:
def s_top(topics, s, allTopics):
    # If RandID's AllTopics list has exactly S topics, this is already their sTopics list.
    if len(topics) == s:
        top_s = list(topics.keys())
    # If RandID's AllTopics list has less than S topics, selects the remaining topics at random from the taxonomy.
    elif len(topics) < s:
        top_s = list(topics.keys())
        top_s.extend(random.sample(list(set(allTopics) - set(top_s)), k=s-len(top_s)))
    # If RandID's AllTopics list has more than S topics, selects only S of them, in decreasing order of occurrence.
    elif len(topics) > s:
        # Creates a temporary copy of RandID's AllTopics list (here a dictionary mapping topic to count).
        temp = topics.copy()
        # Creates the sTopics list with RandID's most seen topic and deletes it from temporary list.
        top_s = [max(temp, key=temp.get)]
        val = temp[top_s[0]]
        del temp[top_s[0]]
        # Creates a temporary list, alternatives, to account for topics with the same number of occurrences.
        v = temp[max(temp, key=temp.get)]
        alternatives = [top_s[0]]
        # While the sTopics list has less than S topics OR the next topic has the same number of occurrences.
        while len(top_s) < s or v == val:
            t = max(temp, key=temp.get)
            # If the current topic has the same number of occurrences as the previous one,
            # append it to alternatives.
            if v == val:
                alternatives.append(t)
            # Otherwise, clear alternatives and append the current topic to it.
            else:
                alternatives.clear()
                alternatives.append(t)
            top_s.append(t)
            val = v
            del temp[t]
            # If the temporary list is empty, break the loop.
            if len(temp) == 0:
                break
            else:
                v = temp[max(temp, key=temp.get)]
        # If the sTopics list has more than S topics, randomly selects the appropriate number of topics
        # from alternatives, i.e. from the topics with the same number of occurrences.
        if len(top_s) > s:
            delta = s - (len(top_s) - len(alternatives))
            top_s = top_s[0:s-delta]
            top_s.extend(random.sample(alternatives, k=delta))
    
    return sorted(top_s)

In [41]:
%%time
# Selects S topics to be in sTopics list for each RandID.
random.SystemRandom()
S = 5
aol_reduced_citizen_lab_generalization['sTopics'] = aol_reduced_citizen_lab_generalization['sTopics'].map(lambda x : s_top(x, S, aol_reduced_citizen_lab_seen_topics))

CPU times: user 4.68 s, sys: 36 ms, total: 4.72 s
Wall time: 4.72 s


In [42]:
display(aol_reduced_citizen_lab_generalization)

Unnamed: 0,RandID,BrowsingHistory,AllTopics,sTopics
0,1,"[(kidshealth.org, 2006-03-01 11:54:19), (kidsh...","['PUBH', 'PUBH', 'PUBH', 'NEWS', 'CTRL', 'HUMR...","['COMM', 'CTRL', 'HUMR', 'NEWS', 'PUBH']"
1,3,"[(dogpile.com, 2006-03-12 16:09:37), (dogpile....","['SRCH', 'SRCH', 'HOST', 'NEWS', 'PUBH', 'COMT...","['COMT', 'CULTR', 'HOST', 'NEWS', 'SRCH']"
2,4,"[(foxnews.com, 2006-03-12 20:33:21), (google.c...","['NEWS', 'REL', 'COMT', 'MMED', 'CTRL', 'CULTR...","['GOVT', 'HATE', 'NEWS', 'REL', 'SRCH']"
3,5,"[(foxnews.com, 2006-03-22 00:22:53), (senate.g...","['NEWS', 'REL', 'GOVT', 'SRCH', 'POLR', 'HUMR'...","['GOVT', 'HUMR', 'NEWS', 'POLR', 'REL']"
4,6,"[(technorati.com, 2006-03-03 20:48:41), (googl...","['HOST', 'COMT', 'MMED', 'CTRL', 'CULTR', 'NEW...","['COMT', 'CULTR', 'NEWS', 'POLR', 'SRCH']"
...,...,...,...,...
211308,521674,"[(yahoo.com, 2006-03-06 10:04:29), (yahoo.com,...","['COMT', 'CULTR', 'COMM', 'NEWS', 'POLR', 'SRC...","['COMM', 'COMT', 'GOVT', 'NEWS', 'SRCH']"
211309,521686,"[(flickr.com, 2006-03-12 21:19:11), (partypoke...","['LGBT', 'MMED', 'GMB', 'HOST', 'GRP', 'CULTR'...","['GOVT', 'GRP', 'HOST', 'LGBT', 'NEWS']"
211310,521688,"[(ytmnd.com, 2006-03-12 20:00:14), (aol.com, 2...","['REL', 'SRCH', 'POLR', 'HUMR', 'REL', 'COMM']","['COMM', 'HUMR', 'POLR', 'REL', 'SRCH']"
211311,521689,"[(globalwitness.org, 2006-03-04 01:04:05), (ms...","['ENV', 'POLR', 'NEWS', 'SRCH', 'GRP', 'COMT',...","['CTRL', 'GOVT', 'HOST', 'NEWS', 'SRCH']"


In [43]:
# Checks if every sTopics list has exactly S topics.
display(aol_reduced_citizen_lab_generalization['sTopics'].map(lambda x : True if len(x) == S else False).all())

True

In [44]:
%%time
aol_reduced_citizen_lab_generalization['BrowsingHistory'] = aol_reduced_citizen_lab_generalization['BrowsingHistory'].map(lambda x : str(x))

CPU times: user 8.56 s, sys: 44.1 ms, total: 8.6 s
Wall time: 8.6 s


### Privacy

In [45]:
temp = aol_reduced_citizen_lab_generalization.copy()
temp = temp.drop(columns=['BrowsingHistory','AllTopics'], inplace=False)
display(temp)

Unnamed: 0,RandID,sTopics
0,1,"['COMM', 'CTRL', 'HUMR', 'NEWS', 'PUBH']"
1,3,"['COMT', 'CULTR', 'HOST', 'NEWS', 'SRCH']"
2,4,"['GOVT', 'HATE', 'NEWS', 'REL', 'SRCH']"
3,5,"['GOVT', 'HUMR', 'NEWS', 'POLR', 'REL']"
4,6,"['COMT', 'CULTR', 'NEWS', 'POLR', 'SRCH']"
...,...,...
211308,521674,"['COMM', 'COMT', 'GOVT', 'NEWS', 'SRCH']"
211309,521686,"['GOVT', 'GRP', 'HOST', 'LGBT', 'NEWS']"
211310,521688,"['COMM', 'HUMR', 'POLR', 'REL', 'SRCH']"
211311,521689,"['CTRL', 'GOVT', 'HOST', 'NEWS', 'SRCH']"


In [46]:
%%time
E3P = BVM(temp)
E3P.qids(['sTopics'])
E3P.sensitive(['RandID'])
results = E3P.assess()

CPU times: user 2.95 s, sys: 32.1 ms, total: 2.98 s
Wall time: 2.94 s


In [47]:
display(results['re_id'], results['att_inf'])

Unnamed: 0,QID,dCR,pCR,Prior,Posterior,Histogram
0,['sTopics'],0.079337,28066,5e-06,0.132817,"{'0': 0.5034664218481589, '1': 0.1051331437251..."


Unnamed: 0,QID,Sensitive,dCA,pCA,Prior,Posterior,Histogram
0,['sTopics'],RandID,0.079337,28066.0,5e-06,0.132817,"{'0': 0.5034664218481589, '1': 0.1051331437251..."


In [48]:
# Prior Bayes vulnerability.
display(results['re_id']['Prior'][0], results['att_inf']['Prior'][0])

4.732316516257873e-06

4.732316516257873e-06

In [49]:
# Posterior Bayes vulnerability.
display(results['re_id']['Posterior'][0], results['att_inf']['Posterior'][0])

0.1328171953452935

0.1328171953452935

In [50]:
# Bayes leakage.
display(results['re_id']['Posterior'][0]/results['re_id']['Prior'][0], results['att_inf']['Posterior'][0]/results['att_inf']['Prior'][0])

28066.000000000004

28066.000000000004

### Utility

In [51]:
temp = aol_reduced_citizen_lab_generalization.copy()
temp = temp.drop(columns=['RandID','AllTopics'], inplace=False)
display(temp)

Unnamed: 0,BrowsingHistory,sTopics
0,"[('kidshealth.org', Timestamp('2006-03-01 11:5...","['COMM', 'CTRL', 'HUMR', 'NEWS', 'PUBH']"
1,"[('dogpile.com', Timestamp('2006-03-12 16:09:3...","['COMT', 'CULTR', 'HOST', 'NEWS', 'SRCH']"
2,"[('foxnews.com', Timestamp('2006-03-12 20:33:2...","['GOVT', 'HATE', 'NEWS', 'REL', 'SRCH']"
3,"[('foxnews.com', Timestamp('2006-03-22 00:22:5...","['GOVT', 'HUMR', 'NEWS', 'POLR', 'REL']"
4,"[('technorati.com', Timestamp('2006-03-03 20:4...","['COMT', 'CULTR', 'NEWS', 'POLR', 'SRCH']"
...,...,...
211308,"[('yahoo.com', Timestamp('2006-03-06 10:04:29'...","['COMM', 'COMT', 'GOVT', 'NEWS', 'SRCH']"
211309,"[('flickr.com', Timestamp('2006-03-12 21:19:11...","['GOVT', 'GRP', 'HOST', 'LGBT', 'NEWS']"
211310,"[('ytmnd.com', Timestamp('2006-03-12 20:00:14'...","['COMM', 'HUMR', 'POLR', 'REL', 'SRCH']"
211311,"[('globalwitness.org', Timestamp('2006-03-04 0...","['CTRL', 'GOVT', 'HOST', 'NEWS', 'SRCH']"


In [52]:
%%time
E3U = BVM(temp)
E3U.qids(['sTopics'])
E3U.sensitive(['BrowsingHistory'])
results = E3U.assess()

CPU times: user 4.36 s, sys: 24.8 ms, total: 4.39 s
Wall time: 4.35 s


In [53]:
display(results['re_id'], results['att_inf'])

Unnamed: 0,QID,dCR,pCR,Prior,Posterior,Histogram
0,['sTopics'],0.079337,28066,5e-06,0.132817,"{'0': 0.5034664218481589, '1': 0.1051331437251..."


Unnamed: 0,QID,Sensitive,dCA,pCA,Prior,Posterior,Histogram
0,['sTopics'],BrowsingHistory,0.079337,28066.0,5e-06,0.132817,"{'0': 0.5034664218481589, '1': 0.1051331437251..."


In [54]:
# Prior Bayes vulnerability.
display(results['re_id']['Prior'][0], results['att_inf']['Prior'][0])

4.732316516257873e-06

4.732316516257873e-06

In [55]:
# Posterior Bayes vulnerability.
display(results['re_id']['Posterior'][0], results['att_inf']['Posterior'][0])

0.1328171953452935

0.1328171953452935

In [56]:
# Bayes leakage.
display(results['re_id']['Posterior'][0]/results['re_id']['Prior'][0], results['att_inf']['Posterior'][0]/results['att_inf']['Prior'][0])

28066.000000000004

28066.000000000004

### Unique sets

In [57]:
%%time
display(aol_reduced_citizen_lab_generalization['sTopics'].map(lambda x : str(x)).nunique())

28066

CPU times: user 372 ms, sys: 15.8 ms, total: 388 ms
Wall time: 386 ms


## Experiments 4 and 5: Topics with Generalization and Bounded Noise on AOL reduced dataset with Citizen Lab Classification; and Topics with Generalization, Bounded Noise, and Differential Privacy on AOL reduced dataset with Citizen Lab Classification

In [58]:
display(aol_reduced_citizen_lab_generalization)

Unnamed: 0,RandID,BrowsingHistory,AllTopics,sTopics
0,1,"[('kidshealth.org', Timestamp('2006-03-01 11:5...","['PUBH', 'PUBH', 'PUBH', 'NEWS', 'CTRL', 'HUMR...","['COMM', 'CTRL', 'HUMR', 'NEWS', 'PUBH']"
1,3,"[('dogpile.com', Timestamp('2006-03-12 16:09:3...","['SRCH', 'SRCH', 'HOST', 'NEWS', 'PUBH', 'COMT...","['COMT', 'CULTR', 'HOST', 'NEWS', 'SRCH']"
2,4,"[('foxnews.com', Timestamp('2006-03-12 20:33:2...","['NEWS', 'REL', 'COMT', 'MMED', 'CTRL', 'CULTR...","['GOVT', 'HATE', 'NEWS', 'REL', 'SRCH']"
3,5,"[('foxnews.com', Timestamp('2006-03-22 00:22:5...","['NEWS', 'REL', 'GOVT', 'SRCH', 'POLR', 'HUMR'...","['GOVT', 'HUMR', 'NEWS', 'POLR', 'REL']"
4,6,"[('technorati.com', Timestamp('2006-03-03 20:4...","['HOST', 'COMT', 'MMED', 'CTRL', 'CULTR', 'NEW...","['COMT', 'CULTR', 'NEWS', 'POLR', 'SRCH']"
...,...,...,...,...
211308,521674,"[('yahoo.com', Timestamp('2006-03-06 10:04:29'...","['COMT', 'CULTR', 'COMM', 'NEWS', 'POLR', 'SRC...","['COMM', 'COMT', 'GOVT', 'NEWS', 'SRCH']"
211309,521686,"[('flickr.com', Timestamp('2006-03-12 21:19:11...","['LGBT', 'MMED', 'GMB', 'HOST', 'GRP', 'CULTR'...","['GOVT', 'GRP', 'HOST', 'LGBT', 'NEWS']"
211310,521688,"[('ytmnd.com', Timestamp('2006-03-12 20:00:14'...","['REL', 'SRCH', 'POLR', 'HUMR', 'REL', 'COMM']","['COMM', 'HUMR', 'POLR', 'REL', 'SRCH']"
211311,521689,"[('globalwitness.org', Timestamp('2006-03-04 0...","['ENV', 'POLR', 'NEWS', 'SRCH', 'GRP', 'COMT',...","['CTRL', 'GOVT', 'HOST', 'NEWS', 'SRCH']"


### Privacy

In [59]:
%%time
temp_top_S = {tup.RandID : {t.replace('\'','') : 1 for t in tup.sTopics} for tup in aol_reduced_citizen_lab_generalization.itertuples()}

CPU times: user 689 ms, sys: 71.6 ms, total: 761 ms
Wall time: 759 ms


In [60]:
%%time
S = 5
M = len(aol_reduced_citizen_lab_seen_topics)
R = 0.05
# Defines BN (bounded noise) and DP (differential privacy) channels for AOL-reduced-Citizen-Lab-Classification data.
aol_reduced_citizen_lab_channel_bn = pandas.DataFrame.from_dict({k : [1/S if c in v else 0 for c in sorted([c.replace("'","") for c in aol_reduced_citizen_lab_seen_topics])] for k,v in temp_top_S.items()}, orient='index', columns=sorted([c.replace("'","") for c in aol_reduced_citizen_lab_seen_topics]))
aol_reduced_citizen_lab_channel_bn_dp = pandas.DataFrame.from_dict({k : [((1-R)/S) + (R/M) if c in v else (R/M) for c in sorted([c.replace("'","") for c in aol_reduced_citizen_lab_seen_topics])] for k,v in temp_top_S.items()}, orient='index', columns=sorted([c.replace("'","") for c in aol_reduced_citizen_lab_seen_topics]))

CPU times: user 8.23 s, sys: 368 ms, total: 8.6 s
Wall time: 8.6 s


In [61]:
# Cummulative sums of channels rows for sanity check.
display(aol_reduced_citizen_lab_channel_bn.cumsum(axis=1))
display(aol_reduced_citizen_lab_channel_bn_dp.cumsum(axis=1))

Unnamed: 0,ALDR,ANON,COMM,COMT,CTRL,CULTR,DATE,ECON,ENV,FILE,...,MISC,MMED,NEWS,POLR,PORN,PROV,PUBH,REL,SRCH,XED
1,0.0,0.0,0.2,0.2,0.4,0.4,0.4,0.4,0.4,0.4,...,0.6,0.6,0.8,0.8,0.8,0.8,1.0,1.0,1.0,1.0
3,0.0,0.0,0.0,0.2,0.2,0.4,0.4,0.4,0.4,0.4,...,0.6,0.6,0.8,0.8,0.8,0.8,0.8,0.8,1.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.4,0.4,0.6,0.6,0.6,0.6,0.6,0.8,1.0,1.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.4,0.4,0.6,0.8,0.8,0.8,0.8,1.0,1.0,1.0
6,0.0,0.0,0.0,0.2,0.2,0.4,0.4,0.4,0.4,0.4,...,0.4,0.4,0.6,0.8,0.8,0.8,0.8,0.8,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
521674,0.0,0.0,0.2,0.4,0.4,0.4,0.4,0.4,0.4,0.4,...,0.6,0.6,0.8,0.8,0.8,0.8,0.8,0.8,1.0,1.0
521686,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.8,0.8,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
521688,0.0,0.0,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,...,0.4,0.4,0.4,0.6,0.6,0.6,0.6,0.8,1.0,1.0
521689,0.0,0.0,0.0,0.0,0.2,0.2,0.2,0.2,0.2,0.2,...,0.6,0.6,0.8,0.8,0.8,0.8,0.8,0.8,1.0,1.0


Unnamed: 0,ALDR,ANON,COMM,COMT,CTRL,CULTR,DATE,ECON,ENV,FILE,...,MISC,MMED,NEWS,POLR,PORN,PROV,PUBH,REL,SRCH,XED
1,0.001613,0.003226,0.194839,0.196452,0.388065,0.389677,0.39129,0.392903,0.394516,0.396129,...,0.605484,0.607097,0.79871,0.800323,0.801935,0.803548,0.995161,0.996774,0.998387,1.0
3,0.001613,0.003226,0.004839,0.196452,0.198065,0.389677,0.39129,0.392903,0.394516,0.396129,...,0.605484,0.607097,0.79871,0.800323,0.801935,0.803548,0.805161,0.806774,0.998387,1.0
4,0.001613,0.003226,0.004839,0.006452,0.008065,0.009677,0.01129,0.012903,0.014516,0.016129,...,0.415484,0.417097,0.60871,0.610323,0.611935,0.613548,0.615161,0.806774,0.998387,1.0
5,0.001613,0.003226,0.004839,0.006452,0.008065,0.009677,0.01129,0.012903,0.014516,0.016129,...,0.415484,0.417097,0.60871,0.800323,0.801935,0.803548,0.805161,0.996774,0.998387,1.0
6,0.001613,0.003226,0.004839,0.196452,0.198065,0.389677,0.39129,0.392903,0.394516,0.396129,...,0.415484,0.417097,0.60871,0.800323,0.801935,0.803548,0.805161,0.806774,0.998387,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
521674,0.001613,0.003226,0.194839,0.386452,0.388065,0.389677,0.39129,0.392903,0.394516,0.396129,...,0.605484,0.607097,0.79871,0.800323,0.801935,0.803548,0.805161,0.806774,0.998387,1.0
521686,0.001613,0.003226,0.004839,0.006452,0.008065,0.009677,0.01129,0.012903,0.014516,0.016129,...,0.795484,0.797097,0.98871,0.990323,0.991935,0.993548,0.995161,0.996774,0.998387,1.0
521688,0.001613,0.003226,0.194839,0.196452,0.198065,0.199677,0.20129,0.202903,0.204516,0.206129,...,0.415484,0.417097,0.41871,0.610323,0.611935,0.613548,0.615161,0.806774,0.998387,1.0
521689,0.001613,0.003226,0.004839,0.006452,0.198065,0.199677,0.20129,0.202903,0.204516,0.206129,...,0.605484,0.607097,0.79871,0.800323,0.801935,0.803548,0.805161,0.806774,0.998387,1.0


In [62]:
%%time
# Defines uniform prior probability distribution on individuals, i.e. RandIDs.
N = len(aol_reduced_citizen_lab_channel_bn.index)
aol_reduced_citizen_lab_bn_dp_prior = numpy.array([1/N for row in range(N)])

CPU times: user 24.5 ms, sys: 29 µs, total: 24.5 ms
Wall time: 23.6 ms


In [63]:
%%time
# Prior and Posterior Bayes vulnerabilities.
display("Prior: " + str(qif.measure.bayes_vuln.prior(aol_reduced_citizen_lab_bn_dp_prior)))
display("BN Posterior: " + str(qif.measure.bayes_vuln.posterior(aol_reduced_citizen_lab_bn_dp_prior, aol_reduced_citizen_lab_channel_bn.to_numpy())))
display("DP Posterior: " + str(qif.measure.bayes_vuln.posterior(aol_reduced_citizen_lab_bn_dp_prior, aol_reduced_citizen_lab_channel_bn_dp.to_numpy())))

'Prior: 4.732316516257873e-06'

'BN Posterior: 2.934036240079882e-05'

'DP Posterior: 2.8109960106571762e-05'

CPU times: user 22.3 ms, sys: 130 µs, total: 22.4 ms
Wall time: 20.9 ms


In [64]:
%%time
# Bayes leakages.
display("BN Leakage: " + str(qif.measure.bayes_vuln.posterior(aol_reduced_citizen_lab_bn_dp_prior, aol_reduced_citizen_lab_channel_bn.to_numpy())/qif.measure.bayes_vuln.prior(aol_reduced_citizen_lab_bn_dp_prior)))
display("DP Leakage: " + str(qif.measure.bayes_vuln.posterior(aol_reduced_citizen_lab_bn_dp_prior, aol_reduced_citizen_lab_channel_bn_dp.to_numpy())/qif.measure.bayes_vuln.prior(aol_reduced_citizen_lab_bn_dp_prior)))

'BN Leakage: 6.200000000000002'

'DP Leakage: 5.9399999999999995'

CPU times: user 20.5 ms, sys: 28 µs, total: 20.5 ms
Wall time: 19.2 ms


### Utility

In [65]:
%%time
# Defines (non-uniform) prior probability distribution on lists of sTopics.

# Defines list of lists of topics.
aol_reduced_citizen_lab_top_S = [str(list(v.keys())).replace("\"","").replace("\'","") for k,v in temp_top_S.items()]

# Computes occurrences for each list of topics and total number of lists.
aol_reduced_citizen_lab_top_S_counts = dict(Counter(aol_reduced_citizen_lab_top_S))
total = sum(aol_reduced_citizen_lab_top_S_counts.values())

# Defines the prior probability distribution as a DataFrame.
d = {case : aol_reduced_citizen_lab_top_S_counts[case]/total for case in aol_reduced_citizen_lab_top_S_counts}
aol_reduced_citizen_lab_top_S_prior = pandas.DataFrame.from_dict(d, orient='index', columns=['prior'])
aol_reduced_citizen_lab_top_S_prior.index.names = ['topics']

CPU times: user 344 ms, sys: 231 µs, total: 344 ms
Wall time: 343 ms


In [66]:
display(aol_reduced_citizen_lab_top_S_prior)

Unnamed: 0_level_0,prior
topics,Unnamed: 1_level_1
"[COMM, CTRL, HUMR, NEWS, PUBH]",0.000535
"[COMT, CULTR, HOST, NEWS, SRCH]",0.000696
"[GOVT, HATE, NEWS, REL, SRCH]",0.000024
"[GOVT, HUMR, NEWS, POLR, REL]",0.000700
"[COMT, CULTR, NEWS, POLR, SRCH]",0.031011
...,...
"[COMM, FILE, GRP, LGBT, PUBH]",0.000005
"[COMM, GMB, IGO, PROV, REL]",0.000005
"[CTRL, CULTR, HACK, HOST, MMED]",0.000005
"[CTRL, FILE, GOVT, MMED, POLR]",0.000005


In [67]:
%%time
S = 5
M = len(aol_reduced_citizen_lab_seen_topics)
R = 0.05
# Defines BN (bounded noise) and DP (differential privacy) channels for AOL-reduced-Citizen-Lab-Classification data.
aol_reduced_citizen_lab_channel_bn = pandas.DataFrame.from_dict({case : [1/S if c in case.replace('[','').replace(']','').split(', ') else 0 for c in sorted([c.replace("'","") for c in aol_reduced_citizen_lab_seen_topics])] for case in sorted(d.keys())}, orient='index', columns=sorted([c.replace("'","") for c in aol_reduced_citizen_lab_seen_topics]))
aol_reduced_citizen_lab_channel_bn_dp = pandas.DataFrame.from_dict({case : [((1-R)/S) + (R/M) if c in case.replace('[','').replace(']','').split(', ') else (R/M) for c in sorted([c.replace("'","") for c in aol_reduced_citizen_lab_seen_topics])] for case in sorted(d.keys())}, orient='index', columns=sorted([c.replace("'","") for c in aol_reduced_citizen_lab_seen_topics]))

CPU times: user 2.04 s, sys: 16 ms, total: 2.05 s
Wall time: 2.05 s


In [68]:
# Cummulative sums of channels rows for sanity check.
display(aol_reduced_citizen_lab_channel_bn.cumsum(axis=1))
display(aol_reduced_citizen_lab_channel_bn_dp.cumsum(axis=1))

Unnamed: 0,ALDR,ANON,COMM,COMT,CTRL,CULTR,DATE,ECON,ENV,FILE,...,MISC,MMED,NEWS,POLR,PORN,PROV,PUBH,REL,SRCH,XED
"[ALDR, ANON, COMM, COMT, CTRL]",0.2,0.4,0.6,0.8,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
"[ALDR, ANON, COMM, COMT, FILE]",0.2,0.4,0.6,0.8,0.8,0.8,0.8,0.8,0.8,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
"[ALDR, ANON, COMM, COMT, POLR]",0.2,0.4,0.6,0.8,0.8,0.8,0.8,0.8,0.8,0.8,...,0.8,0.8,0.8,1.0,1.0,1.0,1.0,1.0,1.0,1.0
"[ALDR, ANON, COMM, COMT, PROV]",0.2,0.4,0.6,0.8,0.8,0.8,0.8,0.8,0.8,0.8,...,0.8,0.8,0.8,0.8,0.8,1.0,1.0,1.0,1.0,1.0
"[ALDR, ANON, COMM, COMT, PUBH]",0.2,0.4,0.6,0.8,0.8,0.8,0.8,0.8,0.8,0.8,...,0.8,0.8,0.8,0.8,0.8,0.8,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"[NEWS, PORN, PUBH, REL, SRCH]",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.2,0.2,0.4,0.4,0.6,0.8,1.0,1.0
"[NEWS, PORN, PUBH, SRCH, XED]",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.2,0.2,0.4,0.4,0.6,0.6,0.8,1.0
"[NEWS, PORN, REL, SRCH, XED]",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.2,0.2,0.4,0.4,0.4,0.6,0.8,1.0
"[NEWS, PROV, PUBH, REL, SRCH]",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.2,0.2,0.2,0.4,0.6,0.8,1.0,1.0


Unnamed: 0,ALDR,ANON,COMM,COMT,CTRL,CULTR,DATE,ECON,ENV,FILE,...,MISC,MMED,NEWS,POLR,PORN,PROV,PUBH,REL,SRCH,XED
"[ALDR, ANON, COMM, COMT, CTRL]",0.191613,0.383226,0.574839,0.766452,0.958065,0.959677,0.96129,0.962903,0.964516,0.966129,...,0.985484,0.987097,0.98871,0.990323,0.991935,0.993548,0.995161,0.996774,0.998387,1.0
"[ALDR, ANON, COMM, COMT, FILE]",0.191613,0.383226,0.574839,0.766452,0.768065,0.769677,0.77129,0.772903,0.774516,0.966129,...,0.985484,0.987097,0.98871,0.990323,0.991935,0.993548,0.995161,0.996774,0.998387,1.0
"[ALDR, ANON, COMM, COMT, POLR]",0.191613,0.383226,0.574839,0.766452,0.768065,0.769677,0.77129,0.772903,0.774516,0.776129,...,0.795484,0.797097,0.79871,0.990323,0.991935,0.993548,0.995161,0.996774,0.998387,1.0
"[ALDR, ANON, COMM, COMT, PROV]",0.191613,0.383226,0.574839,0.766452,0.768065,0.769677,0.77129,0.772903,0.774516,0.776129,...,0.795484,0.797097,0.79871,0.800323,0.801935,0.993548,0.995161,0.996774,0.998387,1.0
"[ALDR, ANON, COMM, COMT, PUBH]",0.191613,0.383226,0.574839,0.766452,0.768065,0.769677,0.77129,0.772903,0.774516,0.776129,...,0.795484,0.797097,0.79871,0.800323,0.801935,0.803548,0.995161,0.996774,0.998387,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"[NEWS, PORN, PUBH, REL, SRCH]",0.001613,0.003226,0.004839,0.006452,0.008065,0.009677,0.01129,0.012903,0.014516,0.016129,...,0.035484,0.037097,0.22871,0.230323,0.421935,0.423548,0.615161,0.806774,0.998387,1.0
"[NEWS, PORN, PUBH, SRCH, XED]",0.001613,0.003226,0.004839,0.006452,0.008065,0.009677,0.01129,0.012903,0.014516,0.016129,...,0.035484,0.037097,0.22871,0.230323,0.421935,0.423548,0.615161,0.616774,0.808387,1.0
"[NEWS, PORN, REL, SRCH, XED]",0.001613,0.003226,0.004839,0.006452,0.008065,0.009677,0.01129,0.012903,0.014516,0.016129,...,0.035484,0.037097,0.22871,0.230323,0.421935,0.423548,0.425161,0.616774,0.808387,1.0
"[NEWS, PROV, PUBH, REL, SRCH]",0.001613,0.003226,0.004839,0.006452,0.008065,0.009677,0.01129,0.012903,0.014516,0.016129,...,0.035484,0.037097,0.22871,0.230323,0.231935,0.423548,0.615161,0.806774,0.998387,1.0


In [69]:
%%time
# Prior and Posterior Bayes vulnerabilities.
display("Prior: " + str(qif.measure.bayes_vuln.prior(aol_reduced_citizen_lab_top_S_prior['prior'].to_numpy())))
display("BN Posterior: " + str(qif.measure.bayes_vuln.posterior(aol_reduced_citizen_lab_top_S_prior['prior'].to_numpy(), aol_reduced_citizen_lab_channel_bn.to_numpy())))
display("DP Posterior: " + str(qif.measure.bayes_vuln.posterior(aol_reduced_citizen_lab_top_S_prior['prior'].to_numpy(), aol_reduced_citizen_lab_channel_bn_dp.to_numpy())))

'Prior: 0.03208037366371212'

'BN Posterior: 0.10295817105431279'

'DP Posterior: 0.09864057033268031'

CPU times: user 9.76 ms, sys: 3.88 ms, total: 13.6 ms
Wall time: 12.1 ms


In [70]:
%%time
# Bayes leakages.
display("BN Leakage: " + str(qif.measure.bayes_vuln.posterior(aol_reduced_citizen_lab_top_S_prior['prior'].to_numpy(), aol_reduced_citizen_lab_channel_bn.to_numpy())/qif.measure.bayes_vuln.prior(aol_reduced_citizen_lab_top_S_prior['prior'].to_numpy())))
display("DP Leakage: " + str(qif.measure.bayes_vuln.posterior(aol_reduced_citizen_lab_top_S_prior['prior'].to_numpy(), aol_reduced_citizen_lab_channel_bn_dp.to_numpy())/qif.measure.bayes_vuln.prior(aol_reduced_citizen_lab_top_S_prior['prior'].to_numpy())))

'BN Leakage: 3.209381914736687'

'DP Leakage: 3.0747949312154708'

CPU times: user 10.7 ms, sys: 151 µs, total: 10.9 ms
Wall time: 9.76 ms


#### Bayes leakages over uniform prior

In [71]:
N = len(aol_reduced_citizen_lab_channel_bn.index)

In [72]:
%%time
display("BN Leakage: " + str(qif.measure.bayes_vuln.posterior(numpy.array([1/N for row in range(N)]), aol_reduced_citizen_lab_channel_bn.to_numpy())/qif.measure.bayes_vuln.prior(numpy.array([1/N for row in range(N)]))))
display("DP Leakage: " + str(qif.measure.bayes_vuln.posterior(numpy.array([1/N for row in range(N)]), aol_reduced_citizen_lab_channel_bn_dp.to_numpy())/qif.measure.bayes_vuln.prior(numpy.array([1/N for row in range(N)]))))

'BN Leakage: 6.200000000000001'

'DP Leakage: 5.939999999999999'

CPU times: user 19.1 ms, sys: 0 ns, total: 19.1 ms
Wall time: 17.9 ms


#### IBA gain

In [73]:
%%time
# Defines IBA gain function matrix for AOL-reduced-Citizen-Lab-Classification data.
G_IBA = pandas.DataFrame.from_dict({case : [1 if c in case.replace('[','').replace(']','').split(', ') else 0 for c in sorted([c.replace("'","") for c in aol_reduced_citizen_lab_seen_topics])] for case in sorted(d.keys())}, orient='index', columns=sorted([c.replace("'","") for c in aol_reduced_citizen_lab_seen_topics]))
display(G_IBA.T)

Unnamed: 0,"[ALDR, ANON, COMM, COMT, CTRL]","[ALDR, ANON, COMM, COMT, FILE]","[ALDR, ANON, COMM, COMT, POLR]","[ALDR, ANON, COMM, COMT, PROV]","[ALDR, ANON, COMM, COMT, PUBH]","[ALDR, ANON, COMM, CTRL, CULTR]","[ALDR, ANON, COMM, CTRL, LGBT]","[ALDR, ANON, COMM, CTRL, PUBH]","[ALDR, ANON, COMM, CULTR, DATE]","[ALDR, ANON, COMM, CULTR, FILE]",...,"[NEWS, POLR, PUBH, REL, SRCH]","[NEWS, POLR, PUBH, REL, XED]","[NEWS, POLR, REL, SRCH, XED]","[NEWS, PORN, PROV, PUBH, REL]","[NEWS, PORN, PROV, PUBH, SRCH]","[NEWS, PORN, PUBH, REL, SRCH]","[NEWS, PORN, PUBH, SRCH, XED]","[NEWS, PORN, REL, SRCH, XED]","[NEWS, PROV, PUBH, REL, SRCH]","[NEWS, PUBH, REL, SRCH, XED]"
ALDR,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
ANON,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
COMM,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
COMT,1,1,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CTRL,1,0,0,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
CULTR,0,0,0,0,0,1,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
DATE,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
ECON,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ENV,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
FILE,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


CPU times: user 1.03 s, sys: 8.12 ms, total: 1.04 s
Wall time: 1.04 s


In [74]:
%%time
# Prior and Posterior IBA gains.
display("Prior: " + str(qif.measure.g_vuln.prior(G_IBA.T.to_numpy(dtype=numpy.int32), aol_reduced_citizen_lab_top_S_prior['prior'].to_numpy())))
display("BN Posterior: " + str(qif.measure.g_vuln.posterior(G_IBA.T.to_numpy(dtype=numpy.int32), aol_reduced_citizen_lab_top_S_prior['prior'].to_numpy(), aol_reduced_citizen_lab_channel_bn.to_numpy())))
display("DP Posterior: " + str(qif.measure.g_vuln.posterior(G_IBA.T.to_numpy(dtype=numpy.int32), aol_reduced_citizen_lab_top_S_prior['prior'].to_numpy(), aol_reduced_citizen_lab_channel_bn_dp.to_numpy())))

'Prior: 0.7168844320983349'

'BN Posterior: 1.0'

'DP Posterior: 0.9580645161290323'

CPU times: user 96.9 ms, sys: 435 ms, total: 532 ms
Wall time: 93.8 ms


In [75]:
%%time
# IBA leakages.
display("BN Leakage: " + str(qif.measure.g_vuln.posterior(G_IBA.T.to_numpy(dtype=numpy.int32), aol_reduced_citizen_lab_top_S_prior['prior'].to_numpy(), aol_reduced_citizen_lab_channel_bn.to_numpy())/qif.measure.g_vuln.prior(G_IBA.T.to_numpy(dtype=numpy.int32), aol_reduced_citizen_lab_top_S_prior['prior'].to_numpy())))
display("DP Leakage: " + str(qif.measure.g_vuln.posterior(G_IBA.T.to_numpy(dtype=numpy.int32), aol_reduced_citizen_lab_top_S_prior['prior'].to_numpy(), aol_reduced_citizen_lab_channel_bn_dp.to_numpy())/qif.measure.g_vuln.prior(G_IBA.T.to_numpy(dtype=numpy.int32), aol_reduced_citizen_lab_top_S_prior['prior'].to_numpy())))

'BN Leakage: 1.39492497706078'

'DP Leakage: 1.3364281231840376'

CPU times: user 107 ms, sys: 225 ms, total: 332 ms
Wall time: 41.3 ms
