# QIF Analyses (AOL reduced dataset - Google Topics v1 Classification)

In [1]:
import numpy, pandas, random, qif
from collections import Counter
from bvmlib.bvm import BVM

## AOL reduced dataset with Google Topics Classification v1

In [2]:
%%time
# Reads the AOL-reduced-Google-Topics-Classification-v1 CSV into the aol_reduced_google DataFrame.
# The first CSV column becomes the index and malformed lines only raise a warning.
aol_reduced_google = pandas.read_csv(
    'AOL-reduced-Google-Topics-Classification-v1.csv',
    index_col=0,
    low_memory=False,
    on_bad_lines='warn',
)

CPU times: user 3.29 s, sys: 694 ms, total: 3.98 s
Wall time: 3.47 s


In [3]:
%%time
# Converts the QueryTime column to pandas datetime values.
aol_reduced_google['QueryTime'] = pandas.to_datetime(aol_reduced_google['QueryTime'])

CPU times: user 868 ms, sys: 14.1 ms, total: 882 ms
Wall time: 869 ms


In [4]:
# Shows the loaded dataset.
display(aol_reduced_google)

Unnamed: 0,RandID,QueryTime,Domain,topics
0,1,2006-03-01 11:54:19,kidshealth.org,"'258','276'"
1,1,2006-03-01 11:54:19,kidshealth.org,"'258','276'"
2,1,2006-03-01 11:54:19,kidshealth.org,"'258','276'"
3,1,2006-05-09 19:13:53,bankofamerica.com,'149'
4,1,2006-05-22 16:46:29,bankofamerica.com,'149'
...,...,...,...,...
2493890,521691,2006-03-07 21:29:17,ups.com,'103'
2493891,521691,2006-03-07 21:41:20,fedex.com,'103'
2493892,521691,2006-03-07 21:42:51,ups.com,'103'
2493893,521691,2006-03-15 19:24:17,google.com,'219'


## Drop singletons

In [5]:
%%time
# Collects the RandIDs of individuals with exactly one distinct domain in their browsing history
# (these individuals are dropped afterwards) and prints how many there are.
temp = (
    aol_reduced_google[['RandID', 'Domain']]
    .groupby(['RandID'])
    .nunique()
)
singletons = temp.loc[temp['Domain'].eq(1)].index.to_list()
print(len(singletons))

127359
CPU times: user 703 ms, sys: 66.6 ms, total: 769 ms
Wall time: 767 ms


In [6]:
# Keeps only the rows whose RandID is not in the singletons list.
keep_rows = ~aol_reduced_google['RandID'].isin(singletons)
aol_reduced_google = aol_reduced_google.loc[keep_rows]

In [7]:
# Renumbers the rows from 0 after the filtering above; the old index is discarded (drop=True).
aol_reduced_google = aol_reduced_google.reset_index(drop=True)

In [8]:
# Shows the dataset after the singleton individuals were removed.
display(aol_reduced_google)

Unnamed: 0,RandID,QueryTime,Domain,topics
0,1,2006-03-01 11:54:19,kidshealth.org,"'258','276'"
1,1,2006-03-01 11:54:19,kidshealth.org,"'258','276'"
2,1,2006-03-01 11:54:19,kidshealth.org,"'258','276'"
3,1,2006-05-09 19:13:53,bankofamerica.com,'149'
4,1,2006-05-22 16:46:29,bankofamerica.com,'149'
...,...,...,...,...
2262666,521691,2006-03-07 21:29:17,ups.com,'103'
2262667,521691,2006-03-07 21:41:20,fedex.com,'103'
2262668,521691,2006-03-07 21:42:51,ups.com,'103'
2262669,521691,2006-03-15 19:24:17,google.com,'219'


## Drop outlier

In [9]:
%%time
# Counts the rows per RandID, most frequent first.
# One individual has too many domains in their browsing history and is most probably a bot, to be dropped.
display(aol_reduced_google['RandID'].value_counts())

RandID
463921    19011
132847      933
342660      851
182883      739
196292      723
          ...  
89596         2
199364        2
199377        2
199380        2
261173        2
Name: count, Length: 198024, dtype: int64

CPU times: user 42.5 ms, sys: 24.9 ms, total: 67.4 ms
Wall time: 66.2 ms


In [10]:
# Removes every row of the outlier individual (RandID 463921) by keeping all the other rows.
is_outlier = aol_reduced_google['RandID'] == 463921
aol_reduced_google = aol_reduced_google[~is_outlier]

In [11]:
# Renumbers the rows from 0 after the outlier removal; the old index is discarded (drop=True).
aol_reduced_google = aol_reduced_google.reset_index(drop=True)

In [12]:
# Shows the dataset after the outlier individual was removed.
display(aol_reduced_google)

Unnamed: 0,RandID,QueryTime,Domain,topics
0,1,2006-03-01 11:54:19,kidshealth.org,"'258','276'"
1,1,2006-03-01 11:54:19,kidshealth.org,"'258','276'"
2,1,2006-03-01 11:54:19,kidshealth.org,"'258','276'"
3,1,2006-05-09 19:13:53,bankofamerica.com,'149'
4,1,2006-05-22 16:46:29,bankofamerica.com,'149'
...,...,...,...,...
2243655,521691,2006-03-07 21:29:17,ups.com,'103'
2243656,521691,2006-03-07 21:41:20,fedex.com,'103'
2243657,521691,2006-03-07 21:42:51,ups.com,'103'
2243658,521691,2006-03-15 19:24:17,google.com,'219'


## Experiment 6: Third-party cookies on AOL reduced dataset with Google Taxonomy Classification v1

### Define browsing histories

In [13]:
%%time
# Pairs each row's Domain with its QueryTime, giving one (Domain, QueryTime) tuple per row.
aol_reduced_google['BrowsingHistory'] = [
    (domain, query_time)
    for domain, query_time in zip(aol_reduced_google['Domain'], aol_reduced_google['QueryTime'])
]

CPU times: user 7.42 s, sys: 212 ms, total: 7.63 s
Wall time: 7.62 s


In [14]:
# Shows the dataset with the new BrowsingHistory column.
display(aol_reduced_google)

Unnamed: 0,RandID,QueryTime,Domain,topics,BrowsingHistory
0,1,2006-03-01 11:54:19,kidshealth.org,"'258','276'","(kidshealth.org, 2006-03-01 11:54:19)"
1,1,2006-03-01 11:54:19,kidshealth.org,"'258','276'","(kidshealth.org, 2006-03-01 11:54:19)"
2,1,2006-03-01 11:54:19,kidshealth.org,"'258','276'","(kidshealth.org, 2006-03-01 11:54:19)"
3,1,2006-05-09 19:13:53,bankofamerica.com,'149',"(bankofamerica.com, 2006-05-09 19:13:53)"
4,1,2006-05-22 16:46:29,bankofamerica.com,'149',"(bankofamerica.com, 2006-05-22 16:46:29)"
...,...,...,...,...,...
2243655,521691,2006-03-07 21:29:17,ups.com,'103',"(ups.com, 2006-03-07 21:29:17)"
2243656,521691,2006-03-07 21:41:20,fedex.com,'103',"(fedex.com, 2006-03-07 21:41:20)"
2243657,521691,2006-03-07 21:42:51,ups.com,'103',"(ups.com, 2006-03-07 21:42:51)"
2243658,521691,2006-03-15 19:24:17,google.com,'219',"(google.com, 2006-03-15 19:24:17)"


In [15]:
%%time
# Gathers, for each RandID (groups kept in order of first appearance), the list of that
# individual's (Domain, QueryTime) tuples; RandID is turned back into a regular column.
aol_reduced_google_browsing_history = (
    aol_reduced_google[['RandID', 'BrowsingHistory']]
    .groupby('RandID', sort=False)[['BrowsingHistory']]
    .agg(list)
    .reset_index()
)

CPU times: user 6.98 s, sys: 75.1 ms, total: 7.05 s
Wall time: 7.05 s


In [16]:
%%time
# Replaces each browsing-history list by its string representation.
aol_reduced_google_browsing_history['BrowsingHistory'] = (
    aol_reduced_google_browsing_history['BrowsingHistory'].map(str)
)

CPU times: user 6.97 s, sys: 11.2 ms, total: 6.98 s
Wall time: 6.98 s


In [17]:
# Shows the per-individual browsing histories (one row per RandID).
display(aol_reduced_google_browsing_history)

Unnamed: 0,RandID,BrowsingHistory
0,1,"[('kidshealth.org', Timestamp('2006-03-01 11:5..."
1,3,"[('google.com', Timestamp('2006-03-18 15:06:00..."
2,4,"[('foxnews.com', Timestamp('2006-03-12 20:33:2..."
3,5,"[('foxnews.com', Timestamp('2006-03-22 00:22:5..."
4,6,"[('google.com', Timestamp('2006-03-06 15:07:17..."
...,...,...
198018,521683,"[('imdb.com', Timestamp('2006-04-08 17:05:00')..."
198019,521686,"[('flickr.com', Timestamp('2006-03-12 21:19:11..."
198020,521688,"[('aol.com', Timestamp('2006-03-16 16:59:47'))..."
198021,521689,"[('msn.com', Timestamp('2006-03-04 01:07:45'))..."


### Privacy

In [18]:
# Builds a two-column frame (RandID, UID) in which UID is an exact copy of RandID;
# BrowsingHistory is left out. Also shows the number of distinct values per column.
temp = aol_reduced_google_browsing_history[['RandID']].assign(
    UID=aol_reduced_google_browsing_history['RandID']
)
display(temp, temp.nunique())

Unnamed: 0,RandID,UID
0,1,1
1,3,3
2,4,4
3,5,5
4,6,6
...,...,...
198018,521683,521683
198019,521686,521686
198020,521688,521688
198021,521689,521689


RandID    198023
UID       198023
dtype: int64

In [19]:
%%time
# Runs the BVM assessment on temp, passing UID to qids() and RandID to sensitive().
# The returned object is read under the keys 're_id' and 'att_inf' in the following cells.
E6P = BVM(temp)
E6P.qids(['UID'])
E6P.sensitive(['RandID'])
results = E6P.assess()

CPU times: user 1.69 s, sys: 3.99 ms, total: 1.69 s
Wall time: 1.69 s


In [20]:
# Shows the two result tables returned by assess().
display(results['re_id'], results['att_inf'])

Unnamed: 0,QID,dCR,pCR,Prior,Posterior,Histogram
0,['UID'],1.0,198023,5e-06,1.0,"{'0': 0.0, '1': 0.0, '2': 0.0, '3': 0.0, '4': ..."


Unnamed: 0,QID,Sensitive,dCA,pCA,Prior,Posterior,Histogram
0,['UID'],RandID,0.0,198023.0,5e-06,1.0,"{'0': 0.0, '1': 0.0, '2': 0.0, '3': 0.0, '4': ..."


In [21]:
# Prior Bayes vulnerability, read from row 0 of the 're_id' and 'att_inf' tables.
display(*(results[table]['Prior'][0] for table in ('re_id', 'att_inf')))

5.0499184438171325e-06

5.0499184438171325e-06

In [22]:
# Posterior Bayes vulnerability, read from row 0 of the 're_id' and 'att_inf' tables.
display(*(results[table]['Posterior'][0] for table in ('re_id', 'att_inf')))

1.0

1.0

In [23]:
# Bayes leakage: posterior divided by prior, for the 're_id' and 'att_inf' tables.
display(*(
    results[table]['Posterior'][0] / results[table]['Prior'][0]
    for table in ('re_id', 'att_inf')
))

198023.0

198023.0

### Utility

In [24]:
# Per-individual frame with RandID renamed to UID; rename() returns a new frame, so
# aol_reduced_google_browsing_history itself is not modified.
temp = aol_reduced_google_browsing_history.rename(columns={'RandID': 'UID'})
display(temp, temp.nunique())

Unnamed: 0,UID,BrowsingHistory
0,1,"[('kidshealth.org', Timestamp('2006-03-01 11:5..."
1,3,"[('google.com', Timestamp('2006-03-18 15:06:00..."
2,4,"[('foxnews.com', Timestamp('2006-03-12 20:33:2..."
3,5,"[('foxnews.com', Timestamp('2006-03-22 00:22:5..."
4,6,"[('google.com', Timestamp('2006-03-06 15:07:17..."
...,...,...
198018,521683,"[('imdb.com', Timestamp('2006-04-08 17:05:00')..."
198019,521686,"[('flickr.com', Timestamp('2006-03-12 21:19:11..."
198020,521688,"[('aol.com', Timestamp('2006-03-16 16:59:47'))..."
198021,521689,"[('msn.com', Timestamp('2006-03-04 01:07:45'))..."


UID                198023
BrowsingHistory    198023
dtype: int64

In [25]:
%%time
# Runs the BVM assessment on temp, passing UID to qids() and BrowsingHistory to sensitive().
# The returned object is read under the keys 're_id' and 'att_inf' in the following cells.
E6U = BVM(temp)
E6U.qids(['UID'])
E6U.sensitive(['BrowsingHistory'])
results = E6U.assess()

CPU times: user 2.68 s, sys: 0 ns, total: 2.68 s
Wall time: 2.68 s


In [26]:
# Shows the two result tables returned by assess().
display(results['re_id'], results['att_inf'])

Unnamed: 0,QID,dCR,pCR,Prior,Posterior,Histogram
0,['UID'],1.0,198023,5e-06,1.0,"{'0': 0.0, '1': 0.0, '2': 0.0, '3': 0.0, '4': ..."


Unnamed: 0,QID,Sensitive,dCA,pCA,Prior,Posterior,Histogram
0,['UID'],BrowsingHistory,0.0,198023.0,5e-06,1.0,"{'0': 0.0, '1': 0.0, '2': 0.0, '3': 0.0, '4': ..."


In [27]:
# Prior Bayes vulnerability, read from row 0 of the 're_id' and 'att_inf' tables.
display(*(results[table]['Prior'][0] for table in ('re_id', 'att_inf')))

5.0499184438171325e-06

5.0499184438171325e-06

In [28]:
# Posterior Bayes vulnerability, read from row 0 of the 're_id' and 'att_inf' tables.
display(*(results[table]['Posterior'][0] for table in ('re_id', 'att_inf')))

1.0

1.0

In [29]:
# Bayes leakage: posterior divided by prior, for the 're_id' and 'att_inf' tables.
display(*(
    results[table]['Posterior'][0] / results[table]['Prior'][0]
    for table in ('re_id', 'att_inf')
))

198023.0

198023.0

## Experiment 7: Topics with Generalization on AOL reduced dataset with Google Taxonomy Classification v1

### Define browsing histories and lists of topics

In [30]:
%%time
# Splits each comma-separated topics string into a list of topic strings.
# Values that are already lists are passed through unchanged, so running this cell a second
# time is harmless (an unconditional split raises AttributeError once the column holds lists).
aol_reduced_google['topics'] = aol_reduced_google['topics'].map(
    lambda x : x if isinstance(x, list) else x.split(",")
)

CPU times: user 3.58 s, sys: 293 ms, total: 3.87 s
Wall time: 3.85 s


In [31]:
# Shows the dataset with topics now stored as lists.
display(aol_reduced_google)

Unnamed: 0,RandID,QueryTime,Domain,topics,BrowsingHistory
0,1,2006-03-01 11:54:19,kidshealth.org,"['258', '276']","(kidshealth.org, 2006-03-01 11:54:19)"
1,1,2006-03-01 11:54:19,kidshealth.org,"['258', '276']","(kidshealth.org, 2006-03-01 11:54:19)"
2,1,2006-03-01 11:54:19,kidshealth.org,"['258', '276']","(kidshealth.org, 2006-03-01 11:54:19)"
3,1,2006-05-09 19:13:53,bankofamerica.com,['149'],"(bankofamerica.com, 2006-05-09 19:13:53)"
4,1,2006-05-22 16:46:29,bankofamerica.com,['149'],"(bankofamerica.com, 2006-05-22 16:46:29)"
...,...,...,...,...,...
2243655,521691,2006-03-07 21:29:17,ups.com,['103'],"(ups.com, 2006-03-07 21:29:17)"
2243656,521691,2006-03-07 21:41:20,fedex.com,['103'],"(fedex.com, 2006-03-07 21:41:20)"
2243657,521691,2006-03-07 21:42:51,ups.com,['103'],"(ups.com, 2006-03-07 21:42:51)"
2243658,521691,2006-03-15 19:24:17,google.com,['219'],"(google.com, 2006-03-15 19:24:17)"


In [32]:
%%time
# Gathers, for each RandID (groups kept in order of first appearance), the list of
# BrowsingHistory tuples and the list of per-row topic lists; RandID becomes a regular column again.
aol_reduced_google_generalization = (
    aol_reduced_google[['RandID', 'BrowsingHistory', 'topics']]
    .groupby('RandID', sort=False)[['BrowsingHistory', 'topics']]
    .agg(list)
    .reset_index()
)

CPU times: user 15.3 s, sys: 195 ms, total: 15.5 s
Wall time: 15.5 s


In [33]:
# Shows the per-individual browsing histories and nested topic lists.
display(aol_reduced_google_generalization)

Unnamed: 0,RandID,BrowsingHistory,topics
0,1,"[(kidshealth.org, 2006-03-01 11:54:19), (kidsh...","[['258', '276'], ['258', '276'], ['258', '276'..."
1,3,"[(google.com, 2006-03-18 15:06:00), (cdc.gov, ...","[['219'], ['239'], ['289'], ['258', '276'], ['..."
2,4,"[(foxnews.com, 2006-03-12 20:33:21), (google.c...","[['1', '243'], ['219']]"
3,5,"[(foxnews.com, 2006-03-22 00:22:53), (uhaul.co...","[['1', '243'], ['103'], ['1', '215', '243']]"
4,6,"[(google.com, 2006-03-06 15:07:17), (google.co...","[['219'], ['219'], ['219'], ['219'], ['219'], ..."
...,...,...,...
198018,521683,"[(imdb.com, 2006-04-08 17:05:00), (cbs.com, 20...","[['12', '45'], ['1', '243']]"
198019,521686,"[(flickr.com, 2006-03-12 21:19:11), (kbb.com, ...","[['42', '250', '253'], ['57', '62', '83'], ['2..."
198020,521688,"[(aol.com, 2006-03-16 16:59:47), (ebay.com, 20...","[['1', '215', '243'], ['289'], ['275']]"
198021,521689,"[(msn.com, 2006-03-04 01:07:45), (google.com, ...","[['1', '215', '243'], ['219'], ['219'], ['219'..."


In [34]:
%%time
# Flattens each individual's list of per-row topic lists into a single flat list of topics
# (duplicates are kept, order is preserved).
aol_reduced_google_generalization['topics'] = aol_reduced_google_generalization['topics'].map(
    lambda per_row : [topic for row_topics in per_row for topic in row_topics]
)

CPU times: user 470 ms, sys: 20.1 ms, total: 490 ms
Wall time: 488 ms


In [35]:
# Shows the frame with the flattened topics lists.
display(aol_reduced_google_generalization)

Unnamed: 0,RandID,BrowsingHistory,topics
0,1,"[(kidshealth.org, 2006-03-01 11:54:19), (kidsh...","['258', '276', '258', '276', '258', '276', '14..."
1,3,"[(google.com, 2006-03-18 15:06:00), (cdc.gov, ...","['219', '239', '289', '258', '276', '258', '27..."
2,4,"[(foxnews.com, 2006-03-12 20:33:21), (google.c...","['1', '243', '219']"
3,5,"[(foxnews.com, 2006-03-22 00:22:53), (uhaul.co...","['1', '243', '103', '1', '215', '243']"
4,6,"[(google.com, 2006-03-06 15:07:17), (google.co...","['219', '219', '219', '219', '219', '219', '21..."
...,...,...,...
198018,521683,"[(imdb.com, 2006-04-08 17:05:00), (cbs.com, 20...","['12', '45', '1', '243']"
198019,521686,"[(flickr.com, 2006-03-12 21:19:11), (kbb.com, ...","['42', '250', '253', '57', '62', '83', '248', ..."
198020,521688,"[(aol.com, 2006-03-16 16:59:47), (ebay.com, 20...","['1', '215', '243', '289', '275']"
198021,521689,"[(msn.com, 2006-03-04 01:07:45), (google.com, ...","['1', '215', '243', '219', '219', '219', '334'..."


In [36]:
# Renames the flattened 'topics' column to 'AllTopics'.
aol_reduced_google_generalization = aol_reduced_google_generalization.rename(columns={'topics' : 'AllTopics'})

In [37]:
%%time
# Builds the sorted list of distinct topics found in any individual's AllTopics list
# (topics are strings, so the order is lexicographic) and displays how many there are.
aol_reduced_google_seen_topics = sorted(
    {topic for topics in aol_reduced_google_generalization['AllTopics'] for topic in topics}
)
display(len(aol_reduced_google_seen_topics))

169

CPU times: user 272 ms, sys: 4.25 ms, total: 276 ms
Wall time: 272 ms


In [38]:
%%time
# Stores, per individual, a dict mapping each topic in AllTopics to its number of occurrences.
# The sTopics column holds these counts only temporarily: a later cell overwrites it with the selected topics.
aol_reduced_google_generalization['sTopics'] = aol_reduced_google_generalization['AllTopics'].map(lambda x : dict(Counter(x)))

CPU times: user 773 ms, sys: 27.6 ms, total: 801 ms
Wall time: 797 ms


In [39]:
# Shows the frame with the per-individual topic counts in sTopics.
display(aol_reduced_google_generalization)

Unnamed: 0,RandID,BrowsingHistory,AllTopics,sTopics
0,1,"[(kidshealth.org, 2006-03-01 11:54:19), (kidsh...","['258', '276', '258', '276', '258', '276', '14...","{''258'': 3, ''276'': 3, ''149'': 2}"
1,3,"[(google.com, 2006-03-18 15:06:00), (cdc.gov, ...","['219', '239', '289', '258', '276', '258', '27...","{''219'': 2, ''239'': 1, ''289'': 1, ''258'': ..."
2,4,"[(foxnews.com, 2006-03-12 20:33:21), (google.c...","['1', '243', '219']","{''1'': 1, ''243'': 1, ''219'': 1}"
3,5,"[(foxnews.com, 2006-03-22 00:22:53), (uhaul.co...","['1', '243', '103', '1', '215', '243']","{''1'': 2, ''243'': 2, ''103'': 1, ''215'': 1}"
4,6,"[(google.com, 2006-03-06 15:07:17), (google.co...","['219', '219', '219', '219', '219', '219', '21...","{''219'': 10, ''12'': 2, ''289'': 1, ''299'': ..."
...,...,...,...,...
198018,521683,"[(imdb.com, 2006-04-08 17:05:00), (cbs.com, 20...","['12', '45', '1', '243']","{''12'': 1, ''45'': 1, ''1'': 1, ''243'': 1}"
198019,521686,"[(flickr.com, 2006-03-12 21:19:11), (kbb.com, ...","['42', '250', '253', '57', '62', '83', '248', ...","{''42'': 1, ''250'': 1, ''253'': 1, ''57'': 1,..."
198020,521688,"[(aol.com, 2006-03-16 16:59:47), (ebay.com, 20...","['1', '215', '243', '289', '275']","{''1'': 1, ''215'': 1, ''243'': 1, ''289'': 1,..."
198021,521689,"[(msn.com, 2006-03-04 01:07:45), (google.com, ...","['1', '215', '243', '219', '219', '219', '334'...","{''1'': 1, ''215'': 1, ''243'': 2, ''219'': 5,..."


In [40]:
def s_top(topics, s, allTopics, rng=None):
    """Builds the sorted sTopics list (exactly ``s`` topics) for one individual.

    Args:
        topics: dict mapping each topic seen by the individual to its number of occurrences.
        s: number of topics to return (must not be negative).
        allTopics: iterable of candidate topics used for padding when the individual has
            fewer than ``s`` distinct topics.
        rng: optional object providing ``sample(population, k)`` (e.g. ``random.Random(seed)``).
            Defaults to the module-level ``random`` functions.

    Returns:
        A sorted list of ``s`` topics:
        - exactly ``s`` distinct topics seen: all of them;
        - fewer than ``s``: all of them plus randomly drawn topics from ``allTopics``;
        - more than ``s``: the ``s`` most frequent ones, where topics tied at the cut-off
          count are chosen at random among the tied ones.

    Raises:
        ValueError: if ``s`` is negative, or if ``allTopics`` does not hold enough
            unseen topics to pad the list up to ``s``.
    """
    if rng is None:
        rng = random
    if s < 0:
        raise ValueError("s must be non-negative")

    seen = list(topics)

    # Exactly S topics: nothing to choose. Fewer than S: pad with random unseen topics.
    if len(seen) <= s:
        missing = s - len(seen)
        if missing:
            # The pool keeps the caller's order (deduplicated) instead of a set's hash order,
            # so a seeded generator yields the same draw in every session.
            pool = [t for t in dict.fromkeys(allTopics) if t not in topics]
            seen.extend(rng.sample(pool, k=missing))
        return sorted(seen)

    # More than S topics seen but none requested.
    if s == 0:
        return []

    # Decreasing order of occurrence; the sort is stable, so topics with equal counts
    # stay in the dict's insertion order.
    ranked = sorted(topics.items(), key=lambda item: item[1], reverse=True)
    cutoff = ranked[s - 1][1]
    top_s = [topic for topic, count in ranked if count > cutoff]
    alternatives = [topic for topic, count in ranked if count == cutoff]

    # Topics strictly above the cut-off are always kept; the remaining slots are filled
    # from the topics tied at the cut-off, at random only when they do not all fit.
    delta = s - len(top_s)
    if delta < len(alternatives):
        top_s.extend(rng.sample(alternatives, k=delta))
    else:
        top_s.extend(alternatives)

    return sorted(top_s)

In [41]:
%%time
# Selects S topics to be in sTopics list for each RandID.
# The previous bare `random.SystemRandom()` only built and discarded a generator object; s_top
# draws through the module-level `random` functions, so those are seeded here instead.
SEED = 42
random.seed(SEED)
S = 5
# The counts are rebuilt from AllTopics instead of being read back from sTopics, so this cell
# can be re-run after sTopics has already been replaced by the selected lists.
aol_reduced_google_generalization['sTopics'] = aol_reduced_google_generalization['AllTopics'].map(
    lambda x : s_top(dict(Counter(x)), S, aol_reduced_google_seen_topics)
)

CPU times: user 3.44 s, sys: 27.8 ms, total: 3.47 s
Wall time: 3.47 s


In [42]:
# Shows the frame with the selected sTopics lists.
display(aol_reduced_google_generalization)

Unnamed: 0,RandID,BrowsingHistory,AllTopics,sTopics
0,1,"[(kidshealth.org, 2006-03-01 11:54:19), (kidsh...","['258', '276', '258', '276', '258', '276', '14...","['149', '188', '258', '276', '62']"
1,3,"[(google.com, 2006-03-18 15:06:00), (cdc.gov, ...","['219', '239', '289', '258', '276', '258', '27...","['151', '180', '183', '186', '258']"
2,4,"[(foxnews.com, 2006-03-12 20:33:21), (google.c...","['1', '243', '219']","['1', '148', '219', '243', '304']"
3,5,"[(foxnews.com, 2006-03-22 00:22:53), (uhaul.co...","['1', '243', '103', '1', '215', '243']","['1', '103', '215', '243', '6']"
4,6,"[(google.com, 2006-03-06 15:07:17), (google.co...","['219', '219', '219', '219', '219', '219', '21...","['12', '219', '289', '300', '314']"
...,...,...,...,...
198018,521683,"[(imdb.com, 2006-04-08 17:05:00), (cbs.com, 20...","['12', '45', '1', '243']","['1', '12', '243', '45', '97']"
198019,521686,"[(flickr.com, 2006-03-12 21:19:11), (kbb.com, ...","['42', '250', '253', '57', '62', '83', '248', ...","['248', '250', '253', '57', '62']"
198020,521688,"[(aol.com, 2006-03-16 16:59:47), (ebay.com, 20...","['1', '215', '243', '289', '275']","['1', '215', '243', '275', '289']"
198021,521689,"[(msn.com, 2006-03-04 01:07:45), (google.com, ...","['1', '215', '243', '219', '219', '219', '334'...","['1', '215', '219', '243', '334']"


In [43]:
# Displays True only when every sTopics list holds exactly S topics.
display(aol_reduced_google_generalization['sTopics'].map(len).eq(S).all())

True

In [44]:
%%time
# Replaces each browsing-history list by its string representation.
aol_reduced_google_generalization['BrowsingHistory'] = (
    aol_reduced_google_generalization['BrowsingHistory'].map(str)
)

CPU times: user 6.76 s, sys: 43.9 ms, total: 6.81 s
Wall time: 6.81 s


### Privacy

In [45]:
# Keeps only RandID and sTopics; drop() returns a new frame, so the source frame is not modified.
temp = aol_reduced_google_generalization.drop(columns=['BrowsingHistory', 'AllTopics'])
display(temp)

Unnamed: 0,RandID,sTopics
0,1,"['149', '188', '258', '276', '62']"
1,3,"['151', '180', '183', '186', '258']"
2,4,"['1', '148', '219', '243', '304']"
3,5,"['1', '103', '215', '243', '6']"
4,6,"['12', '219', '289', '300', '314']"
...,...,...
198018,521683,"['1', '12', '243', '45', '97']"
198019,521686,"['248', '250', '253', '57', '62']"
198020,521688,"['1', '215', '243', '275', '289']"
198021,521689,"['1', '215', '219', '243', '334']"


In [46]:
%%time
# Runs the BVM assessment on temp, passing sTopics to qids() and RandID to sensitive().
# The returned object is read under the keys 're_id' and 'att_inf' in the following cells.
E7P = BVM(temp)
E7P.qids(['sTopics'])
E7P.sensitive(['RandID'])
results = E7P.assess()

CPU times: user 2.43 s, sys: 16.2 ms, total: 2.44 s
Wall time: 2.42 s


In [47]:
# Shows the two result tables returned by assess().
display(results['re_id'], results['att_inf'])

Unnamed: 0,QID,dCR,pCR,Prior,Posterior,Histogram
0,['sTopics'],0.486605,110733,5e-06,0.559193,"{'0': 0.09218121127343794, '1': 0.065754988056..."


Unnamed: 0,QID,Sensitive,dCA,pCA,Prior,Posterior,Histogram
0,['sTopics'],RandID,0.486605,110733.0,5e-06,0.559193,"{'0': 0.09218121127343794, '1': 0.065754988056..."


In [48]:
# Prior Bayes vulnerability, read from row 0 of the 're_id' and 'att_inf' tables.
display(*(results[table]['Prior'][0] for table in ('re_id', 'att_inf')))

5.0499184438171325e-06

5.0499184438171325e-06

In [49]:
# Posterior Bayes vulnerability, read from row 0 of the 're_id' and 'att_inf' tables.
display(*(results[table]['Posterior'][0] for table in ('re_id', 'att_inf')))

0.5591926190392025

0.5591926190392025

In [50]:
# Bayes leakage: posterior divided by prior, for the 're_id' and 'att_inf' tables.
display(*(
    results[table]['Posterior'][0] / results[table]['Prior'][0]
    for table in ('re_id', 'att_inf')
))

110732.99999999999

110732.99999999999

### Utility

In [51]:
# Keeps only BrowsingHistory and sTopics; drop() returns a new frame, so the source frame is not modified.
temp = aol_reduced_google_generalization.drop(columns=['RandID', 'AllTopics'])
display(temp)

Unnamed: 0,BrowsingHistory,sTopics
0,"[('kidshealth.org', Timestamp('2006-03-01 11:5...","['149', '188', '258', '276', '62']"
1,"[('google.com', Timestamp('2006-03-18 15:06:00...","['151', '180', '183', '186', '258']"
2,"[('foxnews.com', Timestamp('2006-03-12 20:33:2...","['1', '148', '219', '243', '304']"
3,"[('foxnews.com', Timestamp('2006-03-22 00:22:5...","['1', '103', '215', '243', '6']"
4,"[('google.com', Timestamp('2006-03-06 15:07:17...","['12', '219', '289', '300', '314']"
...,...,...
198018,"[('imdb.com', Timestamp('2006-04-08 17:05:00')...","['1', '12', '243', '45', '97']"
198019,"[('flickr.com', Timestamp('2006-03-12 21:19:11...","['248', '250', '253', '57', '62']"
198020,"[('aol.com', Timestamp('2006-03-16 16:59:47'))...","['1', '215', '243', '275', '289']"
198021,"[('msn.com', Timestamp('2006-03-04 01:07:45'))...","['1', '215', '219', '243', '334']"


In [52]:
%%time
# Runs the BVM assessment on temp, passing sTopics to qids() and BrowsingHistory to sensitive().
# The returned object is read under the keys 're_id' and 'att_inf' in the following cells.
E7U = BVM(temp)
E7U.qids(['sTopics'])
E7U.sensitive(['BrowsingHistory'])
results = E7U.assess()

CPU times: user 3.16 s, sys: 19.4 ms, total: 3.18 s
Wall time: 3.16 s


In [53]:
# Shows the two result tables returned by assess().
display(results['re_id'], results['att_inf'])

Unnamed: 0,QID,dCR,pCR,Prior,Posterior,Histogram
0,['sTopics'],0.486605,110733,5e-06,0.559193,"{'0': 0.09218121127343794, '1': 0.065754988056..."


Unnamed: 0,QID,Sensitive,dCA,pCA,Prior,Posterior,Histogram
0,['sTopics'],BrowsingHistory,0.486605,110733.0,5e-06,0.559193,"{'0': 0.09218121127343794, '1': 0.065754988056..."


In [54]:
# Prior Bayes vulnerability, read from row 0 of the 're_id' and 'att_inf' tables.
display(*(results[table]['Prior'][0] for table in ('re_id', 'att_inf')))

5.0499184438171325e-06

5.0499184438171325e-06

In [55]:
# Posterior Bayes vulnerability, read from row 0 of the 're_id' and 'att_inf' tables.
display(*(results[table]['Posterior'][0] for table in ('re_id', 'att_inf')))

0.5591926190392025

0.5591926190392025

In [56]:
# Bayes leakage: posterior divided by prior, for the 're_id' and 'att_inf' tables.
display(*(
    results[table]['Posterior'][0] / results[table]['Prior'][0]
    for table in ('re_id', 'att_inf')
))

110732.99999999999

110732.99999999999

### Unique sets

In [57]:
%%time
# Counts the distinct sTopics lists (compared through their string representation).
display(aol_reduced_google_generalization['sTopics'].map(str).nunique())

110733

CPU times: user 295 ms, sys: 27.6 ms, total: 323 ms
Wall time: 321 ms


## Experiments 8 and 9: Topics with Generalization and Bounded Noise on AOL reduced dataset with Google Taxonomy Classification v1; and Topics with Generalization, Bounded Noise, and Differential Privacy on AOL reduced dataset with Google Taxonomy Classification v1

In [58]:
# Shows the generalization frame that the following channels are built from.
display(aol_reduced_google_generalization)

Unnamed: 0,RandID,BrowsingHistory,AllTopics,sTopics
0,1,"[('kidshealth.org', Timestamp('2006-03-01 11:5...","['258', '276', '258', '276', '258', '276', '14...","['149', '188', '258', '276', '62']"
1,3,"[('google.com', Timestamp('2006-03-18 15:06:00...","['219', '239', '289', '258', '276', '258', '27...","['151', '180', '183', '186', '258']"
2,4,"[('foxnews.com', Timestamp('2006-03-12 20:33:2...","['1', '243', '219']","['1', '148', '219', '243', '304']"
3,5,"[('foxnews.com', Timestamp('2006-03-22 00:22:5...","['1', '243', '103', '1', '215', '243']","['1', '103', '215', '243', '6']"
4,6,"[('google.com', Timestamp('2006-03-06 15:07:17...","['219', '219', '219', '219', '219', '219', '21...","['12', '219', '289', '300', '314']"
...,...,...,...,...
198018,521683,"[('imdb.com', Timestamp('2006-04-08 17:05:00')...","['12', '45', '1', '243']","['1', '12', '243', '45', '97']"
198019,521686,"[('flickr.com', Timestamp('2006-03-12 21:19:11...","['42', '250', '253', '57', '62', '83', '248', ...","['248', '250', '253', '57', '62']"
198020,521688,"[('aol.com', Timestamp('2006-03-16 16:59:47'))...","['1', '215', '243', '289', '275']","['1', '215', '243', '275', '289']"
198021,521689,"[('msn.com', Timestamp('2006-03-04 01:07:45'))...","['1', '215', '243', '219', '219', '219', '334'...","['1', '215', '219', '243', '334']"


### Privacy

In [59]:
%%time
# Maps each RandID to a dict whose keys are that individual's sTopics with the quote
# characters removed (every value is 1; only key membership is used afterwards).
temp_top_S = {
    rand_id : dict.fromkeys((topic.replace('\'','') for topic in s_topics), 1)
    for rand_id, s_topics in zip(
        aol_reduced_google_generalization['RandID'],
        aol_reduced_google_generalization['sTopics'],
    )
}

CPU times: user 633 ms, sys: 44.1 ms, total: 677 ms
Wall time: 675 ms


In [60]:
%%time
S = 5
M = len(aol_reduced_google_seen_topics)
R = 0.05
# Defines BN (bounded noise) channel for AOL-reduced-Google-Topics-Classification-v1 data.
# One row per RandID, one column per seen topic (quotes stripped, sorted as strings);
# each of the individual's topics gets probability 1/S and every other topic gets 0.
# The column list is computed once here instead of being re-sorted for every row.
bn_columns = sorted(c.replace("'","") for c in aol_reduced_google_seen_topics)
aol_reduced_google_channel_bn = pandas.DataFrame.from_dict(
    {k : [1/S if c in v else 0 for c in bn_columns] for k,v in temp_top_S.items()},
    orient='index',
    columns=bn_columns,
)

CPU times: user 19.9 s, sys: 580 ms, total: 20.5 s
Wall time: 20.5 s


In [61]:
%%time
# Full list of topic identifiers 1..349, quoted in the same way as the dataset's topics.
aol_reduced_google_all_topics = ["'" + str(i) + "'" for i in range(1,350)]
M = len(aol_reduced_google_all_topics)
# Defines DP (differential privacy) channel for AOL-reduced-Google-Topics-Classification-v1 data.
# One row per RandID, one column per topic identifier (quotes stripped, sorted as strings);
# each of the individual's topics gets (1-R)/S + R/M and every other topic gets R/M.
# The column list and the two probabilities are computed once instead of once per row/cell.
dp_columns = sorted(c.replace("'","") for c in aol_reduced_google_all_topics)
p_own_topic = ((1-R)/S) + (R/M)
p_other_topic = (R/M)
aol_reduced_google_channel_bn_dp = pandas.DataFrame.from_dict(
    {k : [p_own_topic if c in v else p_other_topic for c in dp_columns] for k,v in temp_top_S.items()},
    orient='index',
    columns=dp_columns,
)

CPU times: user 44.9 s, sys: 2.11 s, total: 47 s
Wall time: 47 s


In [62]:
# Cumulative sums of the channels' rows for sanity check: the last column of every row should be 1.
display(aol_reduced_google_channel_bn.cumsum(axis=1))
display(aol_reduced_google_channel_bn_dp.cumsum(axis=1))

Unnamed: 0,1,100,102,103,104,108,109,11,112,114,...,82,83,84,86,88,92,94,96,97,99
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
5,0.2,0.2,0.2,0.4,0.4,0.4,0.4,0.4,0.4,0.4,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
521683,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,...,0.8,0.8,0.8,0.8,0.8,0.8,0.8,0.8,1.0,1.0
521686,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
521688,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
521689,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


Unnamed: 0,1,10,100,101,102,103,104,105,106,107,...,90,91,92,93,94,95,96,97,98,99
1,0.000143,0.000287,0.00043,0.000573,0.000716,0.00086,0.001003,0.001146,0.001289,0.001433,...,0.998711,0.998854,0.998997,0.99914,0.999284,0.999427,0.99957,0.999713,0.999857,1.0
3,0.000143,0.000287,0.00043,0.000573,0.000716,0.00086,0.001003,0.001146,0.001289,0.001433,...,0.998711,0.998854,0.998997,0.99914,0.999284,0.999427,0.99957,0.999713,0.999857,1.0
4,0.190143,0.190287,0.19043,0.190573,0.190716,0.19086,0.191003,0.191146,0.191289,0.191433,...,0.998711,0.998854,0.998997,0.99914,0.999284,0.999427,0.99957,0.999713,0.999857,1.0
5,0.190143,0.190287,0.19043,0.190573,0.190716,0.38086,0.381003,0.381146,0.381289,0.381433,...,0.998711,0.998854,0.998997,0.99914,0.999284,0.999427,0.99957,0.999713,0.999857,1.0
6,0.000143,0.000287,0.00043,0.000573,0.000716,0.00086,0.001003,0.001146,0.001289,0.001433,...,0.998711,0.998854,0.998997,0.99914,0.999284,0.999427,0.99957,0.999713,0.999857,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
521683,0.190143,0.190287,0.19043,0.190573,0.190716,0.19086,0.191003,0.191146,0.191289,0.191433,...,0.808711,0.808854,0.808997,0.80914,0.809284,0.809427,0.80957,0.999713,0.999857,1.0
521686,0.000143,0.000287,0.00043,0.000573,0.000716,0.00086,0.001003,0.001146,0.001289,0.001433,...,0.998711,0.998854,0.998997,0.99914,0.999284,0.999427,0.99957,0.999713,0.999857,1.0
521688,0.190143,0.190287,0.19043,0.190573,0.190716,0.19086,0.191003,0.191146,0.191289,0.191433,...,0.998711,0.998854,0.998997,0.99914,0.999284,0.999427,0.99957,0.999713,0.999857,1.0
521689,0.190143,0.190287,0.19043,0.190573,0.190716,0.19086,0.191003,0.191146,0.191289,0.191433,...,0.998711,0.998854,0.998997,0.99914,0.999284,0.999427,0.99957,0.999713,0.999857,1.0


In [63]:
%%time
# Uniform prior over individuals (RandIDs): N entries, each equal to 1/N,
# where N is the number of rows of the BN channel.
N = aol_reduced_google_channel_bn.shape[0]
aol_reduced_google_bn_dp_prior = numpy.full(N, 1/N)

CPU times: user 22.5 ms, sys: 0 ns, total: 22.5 ms
Wall time: 22 ms


In [64]:
%%time
# Prior Bayes vulnerability of the uniform prior, then the posterior Bayes
# vulnerability under the BN channel and under the DP channel.
bayes_vuln = qif.measure.bayes_vuln
display("Prior: " + str(bayes_vuln.prior(aol_reduced_google_bn_dp_prior)))
display("BN Posterior: " + str(
    bayes_vuln.posterior(aol_reduced_google_bn_dp_prior, aol_reduced_google_channel_bn.to_numpy())
))
display("DP Posterior: " + str(
    bayes_vuln.posterior(aol_reduced_google_bn_dp_prior, aol_reduced_google_channel_bn_dp.to_numpy())
))

'Prior: 5.0499184438171325e-06'

'BN Posterior: 0.00017068724340101878'

'DP Posterior: 0.00016240537715315882'

CPU times: user 111 ms, sys: 15.4 ms, total: 126 ms
Wall time: 122 ms


In [65]:
%%time
# Bayes leakages: posterior Bayes vulnerability divided by the prior one, for the BN and DP channels.
bayes_vuln = qif.measure.bayes_vuln
display("BN Leakage: " + str(
    bayes_vuln.posterior(aol_reduced_google_bn_dp_prior, aol_reduced_google_channel_bn.to_numpy())
    / bayes_vuln.prior(aol_reduced_google_bn_dp_prior)
))
display("DP Leakage: " + str(
    bayes_vuln.posterior(aol_reduced_google_bn_dp_prior, aol_reduced_google_channel_bn_dp.to_numpy())
    / bayes_vuln.prior(aol_reduced_google_bn_dp_prior)
))

'BN Leakage: 33.79999999999994'

'DP Leakage: 32.15999999999997'

CPU times: user 117 ms, sys: 7.88 ms, total: 125 ms
Wall time: 122 ms


### Utility

In [66]:
%%time
# Non-uniform prior over lists of sTopics: each distinct list is weighted by the share
# of individuals whose sTopics list is exactly that one.

# One string per individual, e.g. "[1, 103, 215, 243, 6]" (all quote characters removed).
aol_reduced_google_top_S = [
    str(list(v)).replace("\"","").replace("\'","") for v in temp_top_S.values()
]

# Occurrences of each distinct list; the total equals the number of individuals.
aol_reduced_google_top_S_counts = dict(Counter(aol_reduced_google_top_S))
total = len(aol_reduced_google_top_S)

# Prior as a one-column DataFrame indexed by the list strings (index named 'topics').
d = {case : count/total for case, count in aol_reduced_google_top_S_counts.items()}
aol_reduced_google_top_S_prior = pandas.DataFrame.from_dict(
    d, orient='index', columns=['prior']
).rename_axis('topics')

CPU times: user 416 ms, sys: 3.98 ms, total: 420 ms
Wall time: 416 ms


In [67]:
# Shows the prior over lists of sTopics.
display(aol_reduced_google_top_S_prior)

Unnamed: 0_level_0,prior
topics,Unnamed: 1_level_1
"[149, 188, 258, 276, 62]",0.000005
"[151, 180, 183, 186, 258]",0.000005
"[1, 148, 219, 243, 304]",0.000005
"[1, 103, 215, 243, 6]",0.000015
"[12, 219, 289, 300, 314]",0.000010
...,...
"[129, 219, 289, 332, 349]",0.000005
"[179, 251, 275, 337, 99]",0.000005
"[166, 248, 284, 289, 332]",0.000005
"[248, 250, 253, 57, 62]",0.000005


In [68]:
%%time
S = 5     # probability mass 1/S is given to each topic contained in a case
M = len(aol_reduced_google_seen_topics)  # not used by the BN channel below
R = 0.05  # not used by the BN channel below
# Defines BN (bounded noise) channel for AOL-reduced-Google-Topics-Classification-v1 data.
# Rows: lists of topics (keys of d, sorted). Columns: seen topics (quotes stripped, sorted).
# Entry is 1/S when the column topic belongs to the row's list, 0 otherwise.

# Column labels are computed once instead of once per row.
bn_channel_columns = sorted(c.replace("'","") for c in aol_reduced_google_seen_topics)

# Each case label is parsed once into a set of topic strings, instead of being
# re-split for every column.
bn_channel_case_topics = {case : set(case.replace('[','').replace(']','').split(', ')) for case in sorted(d.keys())}

aol_reduced_google_channel_bn = pandas.DataFrame.from_dict(
    {case : [1/S if topic in topics else 0 for topic in bn_channel_columns] for case, topics in bn_channel_case_topics.items()},
    orient='index',
    columns=bn_channel_columns,
)

CPU times: user 21.4 s, sys: 55.5 ms, total: 21.4 s
Wall time: 21.4 s


In [69]:
%%time
M = len(aol_reduced_google_all_topics)
# Defines DP (differential privacy) channel for AOL-reduced-Google-Topics-Classification-v1 data.
# Rows: lists of topics (keys of d, sorted). Columns: all topics (quotes stripped, sorted).
# Entry is (1-R)/S + R/M when the column topic belongs to the row's list, R/M otherwise.

# Column labels and the two possible cell values are computed once instead of
# once per row / per cell.
dp_channel_columns = sorted(c.replace("'","") for c in aol_reduced_google_all_topics)
dp_channel_in_case = ((1-R)/S) + (R/M)
dp_channel_not_in_case = (R/M)

# Each case label is parsed once into a set of topic strings, instead of being
# re-split for every column.
dp_channel_case_topics = {case : set(case.replace('[','').replace(']','').split(', ')) for case in sorted(d.keys())}

aol_reduced_google_channel_bn_dp = pandas.DataFrame.from_dict(
    {case : [dp_channel_in_case if topic in topics else dp_channel_not_in_case for topic in dp_channel_columns] for case, topics in dp_channel_case_topics.items()},
    orient='index',
    columns=dp_channel_columns,
)

CPU times: user 44.3 s, sys: 976 ms, total: 45.2 s
Wall time: 45.2 s


In [70]:
# Cumulative sums along each channel row, as a sanity check: the right-most
# column of each table holds the total of the row.
display(
    aol_reduced_google_channel_bn.cumsum(axis=1),
    aol_reduced_google_channel_bn_dp.cumsum(axis=1),
)

Unnamed: 0,1,100,102,103,104,108,109,11,112,114,...,82,83,84,86,88,92,94,96,97,99
"[1, 100, 102, 23, 243]",0.2,0.4,0.6,0.6,0.6,0.6,0.6,0.6,0.6,0.6,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
"[1, 100, 103, 112, 243]",0.2,0.4,0.4,0.6,0.6,0.6,0.6,0.6,0.8,0.8,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
"[1, 100, 103, 12, 243]",0.2,0.4,0.4,0.6,0.6,0.6,0.6,0.6,0.6,0.6,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
"[1, 100, 103, 126, 289]",0.2,0.4,0.4,0.6,0.6,0.6,0.6,0.6,0.6,0.6,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
"[1, 100, 103, 149, 243]",0.2,0.4,0.4,0.6,0.6,0.6,0.6,0.6,0.6,0.6,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"[57, 62, 81, 82, 83]",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.8,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
"[57, 62, 81, 83, 96]",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.6,0.8,0.8,0.8,0.8,0.8,0.8,1.0,1.0,1.0
"[57, 62, 81, 83, 99]",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.6,0.8,0.8,0.8,0.8,0.8,0.8,0.8,0.8,1.0
"[57, 62, 83, 88, 92]",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.4,0.6,0.6,0.6,0.8,1.0,1.0,1.0,1.0,1.0


Unnamed: 0,1,10,100,101,102,103,104,105,106,107,...,90,91,92,93,94,95,96,97,98,99
"[1, 100, 102, 23, 243]",0.190143,0.190287,0.38043,0.380573,0.570716,0.57086,0.571003,0.571146,0.571289,0.571433,...,0.998711,0.998854,0.998997,0.99914,0.999284,0.999427,0.99957,0.999713,0.999857,1.0
"[1, 100, 103, 112, 243]",0.190143,0.190287,0.38043,0.380573,0.380716,0.57086,0.571003,0.571146,0.571289,0.571433,...,0.998711,0.998854,0.998997,0.99914,0.999284,0.999427,0.99957,0.999713,0.999857,1.0
"[1, 100, 103, 12, 243]",0.190143,0.190287,0.38043,0.380573,0.380716,0.57086,0.571003,0.571146,0.571289,0.571433,...,0.998711,0.998854,0.998997,0.99914,0.999284,0.999427,0.99957,0.999713,0.999857,1.0
"[1, 100, 103, 126, 289]",0.190143,0.190287,0.38043,0.380573,0.380716,0.57086,0.571003,0.571146,0.571289,0.571433,...,0.998711,0.998854,0.998997,0.99914,0.999284,0.999427,0.99957,0.999713,0.999857,1.0
"[1, 100, 103, 149, 243]",0.190143,0.190287,0.38043,0.380573,0.380716,0.57086,0.571003,0.571146,0.571289,0.571433,...,0.998711,0.998854,0.998997,0.99914,0.999284,0.999427,0.99957,0.999713,0.999857,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"[57, 62, 81, 82, 83]",0.000143,0.000287,0.00043,0.000573,0.000716,0.00086,0.001003,0.001146,0.001289,0.001433,...,0.998711,0.998854,0.998997,0.99914,0.999284,0.999427,0.99957,0.999713,0.999857,1.0
"[57, 62, 81, 83, 96]",0.000143,0.000287,0.00043,0.000573,0.000716,0.00086,0.001003,0.001146,0.001289,0.001433,...,0.808711,0.808854,0.808997,0.80914,0.809284,0.809427,0.99957,0.999713,0.999857,1.0
"[57, 62, 81, 83, 99]",0.000143,0.000287,0.00043,0.000573,0.000716,0.00086,0.001003,0.001146,0.001289,0.001433,...,0.808711,0.808854,0.808997,0.80914,0.809284,0.809427,0.80957,0.809713,0.809857,1.0
"[57, 62, 83, 88, 92]",0.000143,0.000287,0.00043,0.000573,0.000716,0.00086,0.001003,0.001146,0.001289,0.001433,...,0.808711,0.808854,0.998997,0.99914,0.999284,0.999427,0.99957,0.999713,0.999857,1.0


In [71]:
%%time
# Bayes vulnerability of the lists-of-topics prior, before (prior) and after
# (posterior) observing the output of the BN and DP channels.
utility_prior_vector = aol_reduced_google_top_S_prior['prior'].to_numpy()
display("Prior: " + str(qif.measure.bayes_vuln.prior(utility_prior_vector)))
display("BN Posterior: " + str(qif.measure.bayes_vuln.posterior(
    utility_prior_vector,
    aol_reduced_google_channel_bn.to_numpy(),
)))
display("DP Posterior: " + str(qif.measure.bayes_vuln.posterior(
    utility_prior_vector,
    aol_reduced_google_channel_bn_dp.to_numpy(),
)))

'Prior: 0.014917459083035808'

'BN Posterior: 0.04486953535700399'

'DP Posterior: 0.04304289101203809'

CPU times: user 146 ms, sys: 88.1 ms, total: 234 ms
Wall time: 228 ms


In [72]:
%%time
# Bayes leakage of each channel under the lists-of-topics prior:
# posterior Bayes vulnerability divided by prior Bayes vulnerability.
utility_leak_prior_vector = aol_reduced_google_top_S_prior['prior'].to_numpy()
utility_leak_prior_vuln = qif.measure.bayes_vuln.prior(utility_leak_prior_vector)
utility_leak_bn_post = qif.measure.bayes_vuln.posterior(
    utility_leak_prior_vector,
    aol_reduced_google_channel_bn.to_numpy(),
)
utility_leak_dp_post = qif.measure.bayes_vuln.posterior(
    utility_leak_prior_vector,
    aol_reduced_google_channel_bn_dp.to_numpy(),
)
display("BN Leakage: " + str(utility_leak_bn_post / utility_leak_prior_vuln))
display("DP Leakage: " + str(utility_leak_dp_post / utility_leak_prior_vuln))

'BN Leakage: 3.0078537576167914'

'DP Leakage: 2.885403658387549'

CPU times: user 198 ms, sys: 67.9 ms, total: 266 ms
Wall time: 260 ms


#### Bayes over uniform prior

In [73]:
# Number of rows of the BN channel; used as the size of the uniform prior.
N = len(aol_reduced_google_channel_bn)

In [74]:
%%time
# Bayes leakage of each channel under the uniform prior that puts 1/N on each
# of the N channel rows (posterior divided by prior Bayes vulnerability).
uniform_prior_vector = numpy.array([1/N] * N)
uniform_prior_vuln = qif.measure.bayes_vuln.prior(uniform_prior_vector)
uniform_bn_post_vuln = qif.measure.bayes_vuln.posterior(
    uniform_prior_vector,
    aol_reduced_google_channel_bn.to_numpy(),
)
uniform_dp_post_vuln = qif.measure.bayes_vuln.posterior(
    uniform_prior_vector,
    aol_reduced_google_channel_bn_dp.to_numpy(),
)
display("BN Leakage: " + str(uniform_bn_post_vuln / uniform_prior_vuln))
display("DP Leakage: " + str(uniform_dp_post_vuln / uniform_prior_vuln))

'BN Leakage: 33.79999999999994'

'DP Leakage: 32.15999999999997'

CPU times: user 116 ms, sys: 133 µs, total: 116 ms
Wall time: 112 ms


#### IBA gain

In [75]:
%%time
# Defines IBA gain function matrix for BN (bounded noise) channel for AOL-reduced-Google-Topics-Classification-v1 data.
# Rows: lists of topics (keys of d, sorted). Columns: seen topics (quotes stripped, sorted).
# Entry is 1 when the column topic belongs to the row's list, 0 otherwise.

# Column labels are computed once instead of once per row.
iba_bn_columns = sorted(c.replace("'","") for c in aol_reduced_google_seen_topics)

# Each case label is parsed once into a set of topic strings, instead of being
# re-split for every column.
iba_bn_case_topics = {case : set(case.replace('[','').replace(']','').split(', ')) for case in sorted(d.keys())}

G_IBA_bn = pandas.DataFrame.from_dict(
    {case : [1 if topic in topics else 0 for topic in iba_bn_columns] for case, topics in iba_bn_case_topics.items()},
    orient='index',
    columns=iba_bn_columns,
)
# Displayed transposed: one row per topic, one column per list of topics.
display(G_IBA_bn.T)

Unnamed: 0,"[1, 100, 102, 23, 243]","[1, 100, 103, 112, 243]","[1, 100, 103, 12, 243]","[1, 100, 103, 126, 289]","[1, 100, 103, 149, 243]","[1, 100, 103, 152, 243]","[1, 100, 103, 173, 209]","[1, 100, 103, 173, 243]","[1, 100, 103, 194, 243]","[1, 100, 103, 207, 243]",...,"[42, 56, 57, 62, 83]","[42, 57, 62, 81, 83]","[42, 57, 62, 83, 92]","[45, 57, 62, 81, 83]","[53, 57, 62, 81, 83]","[57, 62, 81, 82, 83]","[57, 62, 81, 83, 96]","[57, 62, 81, 83, 99]","[57, 62, 83, 88, 92]","[57, 62, 83, 96, 99]"
1,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
100,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
102,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
103,0,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
104,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
94,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
96,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
97,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


CPU times: user 20.7 s, sys: 7.81 ms, total: 20.7 s
Wall time: 20.7 s


In [76]:
%%time
# Defines IBA gain function matrix for DP (differential privacy) channel for AOL-reduced-Google-Topics-Classification-v1 data.
# Rows: lists of topics (keys of d, sorted). Columns: all topics (quotes stripped, sorted).
# Entry is 1 when the column topic belongs to the row's list, 0 otherwise.

# Column labels are computed once instead of once per row.
iba_dp_columns = sorted(c.replace("'","") for c in aol_reduced_google_all_topics)

# Each case label is parsed once into a set of topic strings, instead of being
# re-split for every column.
iba_dp_case_topics = {case : set(case.replace('[','').replace(']','').split(', ')) for case in sorted(d.keys())}

G_IBA_bn_dp = pandas.DataFrame.from_dict(
    {case : [1 if topic in topics else 0 for topic in iba_dp_columns] for case, topics in iba_dp_case_topics.items()},
    orient='index',
    columns=iba_dp_columns,
)
# Displayed transposed: one row per topic, one column per list of topics.
display(G_IBA_bn_dp.T)

Unnamed: 0,"[1, 100, 102, 23, 243]","[1, 100, 103, 112, 243]","[1, 100, 103, 12, 243]","[1, 100, 103, 126, 289]","[1, 100, 103, 149, 243]","[1, 100, 103, 152, 243]","[1, 100, 103, 173, 209]","[1, 100, 103, 173, 243]","[1, 100, 103, 194, 243]","[1, 100, 103, 207, 243]",...,"[42, 56, 57, 62, 83]","[42, 57, 62, 81, 83]","[42, 57, 62, 83, 92]","[45, 57, 62, 81, 83]","[53, 57, 62, 81, 83]","[57, 62, 81, 82, 83]","[57, 62, 81, 83, 96]","[57, 62, 81, 83, 99]","[57, 62, 83, 88, 92]","[57, 62, 83, 96, 99]"
1,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
100,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
101,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
102,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
96,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
97,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
98,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


CPU times: user 44.2 s, sys: 259 ms, total: 44.4 s
Wall time: 44.4 s


In [77]:
%%time
# IBA g-vulnerability for the BN (bounded noise) channel, before (prior) and
# after (posterior) observing the channel output.
# The gain matrix is passed transposed and cast to int32.
iba_bn_gain_matrix = G_IBA_bn.T.to_numpy(dtype=numpy.int32)
iba_bn_prior_vector = aol_reduced_google_top_S_prior['prior'].to_numpy()
display("BN Prior: " + str(qif.measure.g_vuln.prior(iba_bn_gain_matrix, iba_bn_prior_vector)))
display("BN Posterior: " + str(qif.measure.g_vuln.posterior(
    iba_bn_gain_matrix,
    iba_bn_prior_vector,
    aol_reduced_google_channel_bn.to_numpy(),
)))

'BN Prior: 0.4187947864641014'

'BN Posterior: 1.0000000000000002'

CPU times: user 1.25 s, sys: 748 ms, total: 2 s
Wall time: 465 ms


In [78]:
%%time
# IBA leakage for the BN (bounded noise) channel: posterior g-vulnerability
# divided by prior g-vulnerability, with the gain matrix passed transposed.
iba_bn_leak_gain = G_IBA_bn.T.to_numpy(dtype=numpy.int32)
iba_bn_leak_prior = aol_reduced_google_top_S_prior['prior'].to_numpy()
iba_bn_leak_post_vuln = qif.measure.g_vuln.posterior(
    iba_bn_leak_gain,
    iba_bn_leak_prior,
    aol_reduced_google_channel_bn.to_numpy(),
)
iba_bn_leak_prior_vuln = qif.measure.g_vuln.prior(iba_bn_leak_gain, iba_bn_leak_prior)
display("BN Leakage: " + str(iba_bn_leak_post_vuln / iba_bn_leak_prior_vuln))

'BN Leakage: 2.38780431925391'

CPU times: user 1.34 s, sys: 1.15 s, total: 2.49 s
Wall time: 445 ms


In [79]:
%%time
# IBA g-vulnerability for the DP (differential privacy) channel, before (prior)
# and after (posterior) observing the channel output.
# The gain matrix is passed transposed and cast to int32.
iba_dp_gain_matrix = G_IBA_bn_dp.T.to_numpy(dtype=numpy.int32)
iba_dp_prior_vector = aol_reduced_google_top_S_prior['prior'].to_numpy()
display("DP Prior: " + str(qif.measure.g_vuln.prior(iba_dp_gain_matrix, iba_dp_prior_vector)))
display("DP Posterior: " + str(qif.measure.g_vuln.posterior(
    iba_dp_gain_matrix,
    iba_dp_prior_vector,
    aol_reduced_google_channel_bn_dp.to_numpy(),
)))

'DP Prior: 0.4187947864641014'

'DP Posterior: 0.9615161979317409'

CPU times: user 3.3 s, sys: 1.87 s, total: 5.17 s
Wall time: 1.13 s


In [80]:
%%time
# IBA leakage for the DP (differential privacy) channel: posterior
# g-vulnerability divided by prior g-vulnerability, with the gain matrix
# passed transposed.
iba_dp_leak_gain = G_IBA_bn_dp.T.to_numpy(dtype=numpy.int32)
iba_dp_leak_prior = aol_reduced_google_top_S_prior['prior'].to_numpy()
iba_dp_leak_post_vuln = qif.measure.g_vuln.posterior(
    iba_dp_leak_gain,
    iba_dp_leak_prior,
    aol_reduced_google_channel_bn_dp.to_numpy(),
)
iba_dp_leak_prior_vuln = qif.measure.g_vuln.prior(iba_dp_leak_gain, iba_dp_leak_prior)
display("DP Leakage: " + str(iba_dp_leak_post_vuln / iba_dp_leak_prior_vuln))

'DP Leakage: 2.295912530454008'

CPU times: user 3.5 s, sys: 1.69 s, total: 5.2 s
Wall time: 1.16 s
