# QIF Analyses (AOL reduced dataset - Google Topics v1 Classification)

In [1]:
import numpy, pandas, random, qif
from bvmlib.bvm import BVM

## AOL reduced dataset with Google Topics Classification v1

In [2]:
%%time
aol_reduced_google = pandas.read_csv('AOL-reduced-Google-Topics-Classification-v1.csv', low_memory=False, on_bad_lines='warn', index_col=0)

CPU times: user 2.52 s, sys: 673 ms, total: 3.19 s
Wall time: 2.63 s


In [3]:
%%time
aol_reduced_google['QueryTime'] = pandas.to_datetime(aol_reduced_google['QueryTime'])

CPU times: user 733 ms, sys: 5.83 ms, total: 739 ms
Wall time: 733 ms


In [4]:
display(aol_reduced_google)

Unnamed: 0,RandID,QueryTime,Domain,topics
0,0,2006-04-20 17:37:26,imdb.com,"'12','45'"
1,13,2006-03-10 11:51:57,imdb.com,"'12','45'"
2,13,2006-03-25 05:10:47,imdb.com,"'12','45'"
3,20,2006-04-07 21:13:02,imdb.com,"'12','45'"
4,21,2006-05-15 21:18:53,imdb.com,"'12','45'"
...,...,...,...,...
2495271,515243,2006-05-24 21:46:51,computrabajo.com.pe,'236'
2495272,515761,2006-04-18 19:49:49,atv.com.tr,"'1','48'"
2495273,515761,2006-04-18 19:49:49,atv.com.tr,"'1','48'"
2495274,515761,2006-04-18 19:54:33,atv.com.tr,"'1','48'"


## Drop singletons

In [5]:
%%time
singletons = (aol_reduced_google[['RandID','Domain']].groupby('RandID').count() == 1).index[(aol_reduced_google[['RandID','Domain']].groupby('RandID').count() == 1)['Domain'].to_list()]

CPU times: user 466 ms, sys: 43 ms, total: 509 ms
Wall time: 508 ms


In [6]:
%%time
rows_to_drop = []
for i in singletons:
    rows_to_drop.append(aol_reduced_google[aol_reduced_google['RandID'] == i].index.to_list()[0])

CPU times: user 4min 32s, sys: 28.8 ms, total: 4min 32s
Wall time: 4min 32s


In [7]:
display(len(rows_to_drop))

94047

In [8]:
aol_reduced_google = aol_reduced_google.drop(rows_to_drop)

In [9]:
aol_reduced_google = aol_reduced_google.reset_index(drop=True)

In [10]:
display(aol_reduced_google)

Unnamed: 0,RandID,QueryTime,Domain,topics
0,13,2006-03-10 11:51:57,imdb.com,"'12','45'"
1,13,2006-03-25 05:10:47,imdb.com,"'12','45'"
2,20,2006-04-07 21:13:02,imdb.com,"'12','45'"
3,21,2006-05-15 21:18:53,imdb.com,"'12','45'"
4,33,2006-05-20 02:17:44,imdb.com,"'12','45'"
...,...,...,...,...
2401224,515243,2006-05-24 21:46:51,computrabajo.com.pe,'236'
2401225,515761,2006-04-18 19:49:49,atv.com.tr,"'1','48'"
2401226,515761,2006-04-18 19:49:49,atv.com.tr,"'1','48'"
2401227,515761,2006-04-18 19:54:33,atv.com.tr,"'1','48'"


## Drop outlier

In [11]:
%%time
display(aol_reduced_google['RandID'].value_counts())

RandID
83937     18938
211028      930
443325      851
97578       771
146607      736
          ...  
280676        2
280813        2
280985        2
281645        2
468776        2
Name: count, Length: 230927, dtype: int64

CPU times: user 72.7 ms, sys: 4.03 ms, total: 76.7 ms
Wall time: 75 ms


In [12]:
aol_reduced_google = aol_reduced_google.drop(aol_reduced_google[aol_reduced_google['RandID'] == 83937].index.to_list())

In [13]:
aol_reduced_google = aol_reduced_google.reset_index(drop=True)

In [14]:
display(aol_reduced_google)

Unnamed: 0,RandID,QueryTime,Domain,topics
0,13,2006-03-10 11:51:57,imdb.com,"'12','45'"
1,13,2006-03-25 05:10:47,imdb.com,"'12','45'"
2,20,2006-04-07 21:13:02,imdb.com,"'12','45'"
3,21,2006-05-15 21:18:53,imdb.com,"'12','45'"
4,33,2006-05-20 02:17:44,imdb.com,"'12','45'"
...,...,...,...,...
2382286,515243,2006-05-24 21:46:51,computrabajo.com.pe,'236'
2382287,515761,2006-04-18 19:49:49,atv.com.tr,"'1','48'"
2382288,515761,2006-04-18 19:49:49,atv.com.tr,"'1','48'"
2382289,515761,2006-04-18 19:54:33,atv.com.tr,"'1','48'"


## Experiment 6: Third-party cookies on AOL reduced dataset with Google Taxonomy Classification v1

### Define browsing histories

In [15]:
%%time
d = {'RandID': [], 'BrowsingHistory': []}
for k, v in aol_reduced_google.groupby('RandID').__iter__():
    temp = []
    for tup in v.itertuples():
        temp.append(tuple([tup.Domain, tup.QueryTime]))
    d['BrowsingHistory'].append(str(temp))
    d['RandID'].append(k)

CPU times: user 2min 3s, sys: 2.11 s, total: 2min 5s
Wall time: 2min 3s


In [16]:
aol_reduced_google_browsing_history = pandas.DataFrame(data=d, columns=['RandID', 'BrowsingHistory'])

In [17]:
display(aol_reduced_google_browsing_history)

Unnamed: 0,RandID,BrowsingHistory
0,3,"[('amazon.com', Timestamp('2006-05-23 15:31:14..."
1,4,"[('microsoft.com', Timestamp('2006-03-29 22:32..."
2,7,"[('wikipedia.org', Timestamp('2006-03-20 10:15..."
3,10,"[('wikipedia.org', Timestamp('2006-03-06 21:22..."
4,12,"[('priceline.com', Timestamp('2006-04-04 21:17..."
...,...,...
230921,521685,"[('bbc.co.uk', Timestamp('2006-03-15 10:43:00'..."
230922,521686,"[('google.com', Timestamp('2006-03-04 10:02:30..."
230923,521687,"[('1001freefonts.com', Timestamp('2006-04-08 1..."
230924,521689,"[('imdb.com', Timestamp('2006-03-29 00:35:14')..."


### Privacy

In [18]:
temp = aol_reduced_google_browsing_history.copy()
temp['UID'] = temp['RandID']
temp = temp.drop(columns=['BrowsingHistory'], inplace=False)
display(temp, temp.nunique())

Unnamed: 0,RandID,UID
0,3,3
1,4,4
2,7,7
3,10,10
4,12,12
...,...,...
230921,521685,521685
230922,521686,521686
230923,521687,521687
230924,521689,521689


RandID    230926
UID       230926
dtype: int64

In [19]:
%%time
E6P = BVM(temp)
E6P.qids(['UID'])
E6P.sensitive(['RandID'])
results = E6P.assess()

CPU times: user 1.45 s, sys: 15 µs, total: 1.45 s
Wall time: 1.45 s


In [20]:
display(results['re_id'], results['att_inf'])

Unnamed: 0,QID,dCR,pCR,Prior,Posterior,Histogram
0,['UID'],1.0,230926,4e-06,1.0,"{'0': 0.0, '1': 0.0, '2': 0.0, '3': 0.0, '4': ..."


Unnamed: 0,QID,Sensitive,dCA,pCA,Prior,Posterior,Histogram
0,['UID'],RandID,0.0,230926.0,4e-06,1.0,"{'0': 0.0, '1': 0.0, '2': 0.0, '3': 0.0, '4': ..."


In [21]:
display(results['re_id']['Prior'][0], results['att_inf']['Prior'][0])

4.330391554004313e-06

4.330391554004313e-06

In [22]:
display(results['re_id']['Posterior'][0], results['att_inf']['Posterior'][0])

1.0

1.0

In [23]:
display(results['re_id']['Posterior'][0]/results['re_id']['Prior'][0], results['att_inf']['Posterior'][0]/results['att_inf']['Prior'][0])

230926.0

230926.0

### Utility

In [24]:
temp = aol_reduced_google_browsing_history.copy()
temp = temp.rename(columns={'RandID':'UID'}, inplace=False)
display(temp, temp.nunique())

Unnamed: 0,UID,BrowsingHistory
0,3,"[('amazon.com', Timestamp('2006-05-23 15:31:14..."
1,4,"[('microsoft.com', Timestamp('2006-03-29 22:32..."
2,7,"[('wikipedia.org', Timestamp('2006-03-20 10:15..."
3,10,"[('wikipedia.org', Timestamp('2006-03-06 21:22..."
4,12,"[('priceline.com', Timestamp('2006-04-04 21:17..."
...,...,...
230921,521685,"[('bbc.co.uk', Timestamp('2006-03-15 10:43:00'..."
230922,521686,"[('google.com', Timestamp('2006-03-04 10:02:30..."
230923,521687,"[('1001freefonts.com', Timestamp('2006-04-08 1..."
230924,521689,"[('imdb.com', Timestamp('2006-03-29 00:35:14')..."


UID                230926
BrowsingHistory    230926
dtype: int64

In [25]:
%%time
E6U = BVM(temp)
E6U.qids(['UID'])
E6U.sensitive(['BrowsingHistory'])
results = E6U.assess()

CPU times: user 2.28 s, sys: 3.87 ms, total: 2.29 s
Wall time: 2.28 s


In [26]:
display(results['re_id'], results['att_inf'])

Unnamed: 0,QID,dCR,pCR,Prior,Posterior,Histogram
0,['UID'],1.0,230926,4e-06,1.0,"{'0': 0.0, '1': 0.0, '2': 0.0, '3': 0.0, '4': ..."


Unnamed: 0,QID,Sensitive,dCA,pCA,Prior,Posterior,Histogram
0,['UID'],BrowsingHistory,0.0,230926.0,4e-06,1.0,"{'0': 0.0, '1': 0.0, '2': 0.0, '3': 0.0, '4': ..."


In [27]:
display(results['re_id']['Prior'][0], results['att_inf']['Prior'][0])

4.330391554004313e-06

4.330391554004313e-06

In [28]:
display(results['re_id']['Posterior'][0], results['att_inf']['Posterior'][0])

1.0

1.0

In [29]:
display(results['re_id']['Posterior'][0]/results['re_id']['Prior'][0], results['att_inf']['Posterior'][0]/results['att_inf']['Prior'][0])

230926.0

230926.0

## Experiment 7: Topics with Generalization on AOL reduced dataset with Google Taxonomy Classification v1

### Define browsing histories and lists of topics

In [30]:
%%time
d = {'RandID': [], 'BrowsingHistory': [], 'AllTopics': []}
for k, v in aol_reduced_google.groupby('RandID').__iter__():
    temp_history = []
    temp_topics = []
    for tup in v.itertuples():
        temp_history.append(tuple([tup.Domain, tup.QueryTime]))
        temp_topics.extend(tup.topics.split(","))
    d['BrowsingHistory'].append(str(temp_history))
    d['AllTopics'].append(temp_topics)
    d['RandID'].append(k)

CPU times: user 2min 1s, sys: 2.22 s, total: 2min 3s
Wall time: 2min 1s


In [31]:
%%time
aol_reduced_google_seen_topics = []
for case in d['AllTopics']:
    aol_reduced_google_seen_topics.extend(case)
aol_reduced_google_seen_topics = list(set(aol_reduced_google_seen_topics))
display(len(aol_reduced_google_seen_topics))

169

CPU times: user 220 ms, sys: 4.01 ms, total: 224 ms
Wall time: 222 ms


In [32]:
aol_reduced_google_generalization = pandas.DataFrame(data=d, columns=['RandID', 'BrowsingHistory', 'AllTopics'])

In [33]:
display(aol_reduced_google_generalization)

Unnamed: 0,RandID,BrowsingHistory,AllTopics
0,3,"[('amazon.com', Timestamp('2006-05-23 15:31:14...","['289', '179', '289']"
1,4,"[('microsoft.com', Timestamp('2006-03-29 22:32...","['129', '140', '145', '275', '129', '140', '14..."
2,7,"[('wikipedia.org', Timestamp('2006-03-20 10:15...","['275', '100', '254', '275', '100', '254', '275']"
3,10,"[('wikipedia.org', Timestamp('2006-03-06 21:22...","['275', '275', '289', '289', '275']"
4,12,"[('priceline.com', Timestamp('2006-04-04 21:17...","['332', '344', '219', '219', '219', '219', '21..."
...,...,...,...
230921,521685,"[('bbc.co.uk', Timestamp('2006-03-15 10:43:00'...","['243', '23', '289', '23', '289', '129', '183']"
230922,521686,"[('google.com', Timestamp('2006-03-04 10:02:30...","['219', '219', '219', '219', '219', '219', '21..."
230923,521687,"[('1001freefonts.com', Timestamp('2006-04-08 1...","['142', '142']"
230924,521689,"[('imdb.com', Timestamp('2006-03-29 00:35:14')...","['12', '45', '275', '219', '215', '217']"


In [34]:
aol_reduced_google_generalization['TopicsSet'] = aol_reduced_google_generalization['AllTopics']

In [35]:
%%time
for row in aol_reduced_google_generalization.itertuples():
    aol_reduced_google_generalization.at[row.Index, 'TopicsSet'] = sorted(set(aol_reduced_google_generalization.at[row.Index, 'TopicsSet']))

CPU times: user 15.1 s, sys: 71.8 ms, total: 15.1 s
Wall time: 15.1 s


In [36]:
display(aol_reduced_google_generalization)

Unnamed: 0,RandID,BrowsingHistory,AllTopics,TopicsSet
0,3,"[('amazon.com', Timestamp('2006-05-23 15:31:14...","['289', '179', '289']","['179', '289']"
1,4,"[('microsoft.com', Timestamp('2006-03-29 22:32...","['129', '140', '145', '275', '129', '140', '14...","['129', '140', '145', '275']"
2,7,"[('wikipedia.org', Timestamp('2006-03-20 10:15...","['275', '100', '254', '275', '100', '254', '275']","['100', '254', '275']"
3,10,"[('wikipedia.org', Timestamp('2006-03-06 21:22...","['275', '275', '289', '289', '275']","['275', '289']"
4,12,"[('priceline.com', Timestamp('2006-04-04 21:17...","['332', '344', '219', '219', '219', '219', '21...","['219', '248', '332', '344']"
...,...,...,...,...
230921,521685,"[('bbc.co.uk', Timestamp('2006-03-15 10:43:00'...","['243', '23', '289', '23', '289', '129', '183']","['129', '183', '23', '243', '289']"
230922,521686,"[('google.com', Timestamp('2006-03-04 10:02:30...","['219', '219', '219', '219', '219', '219', '21...","['219', '275']"
230923,521687,"[('1001freefonts.com', Timestamp('2006-04-08 1...","['142', '142']",['142']
230924,521689,"[('imdb.com', Timestamp('2006-03-29 00:35:14')...","['12', '45', '275', '219', '215', '217']","['12', '215', '217', '219', '275', '45']"


In [37]:
aol_reduced_google_generalization['sTopics'] = aol_reduced_google_generalization['AllTopics']

In [38]:
%%time
random.SystemRandom()
S = 5

aol_reduced_google_topics_by_users = {i : {t : 0 for t in aol_reduced_google_seen_topics} for i in aol_reduced_google_generalization['RandID'].unique()}

for row in aol_reduced_google_generalization.itertuples():
    for topic in row.AllTopics:
        aol_reduced_google_topics_by_users[row.RandID][topic] = aol_reduced_google_topics_by_users[row.RandID][topic] + 1

temp_top_S = {i : dict() for i in aol_reduced_google_generalization['RandID'].unique()}

for k,v in aol_reduced_google_topics_by_users.items():
    temp_dict = dict()
    for t,c in v.items():
        if c != 0:
            temp_dict[t] = c
    temp_dict = dict(sorted(temp_dict.items(), key=lambda item: item[1]))
    
    if len(temp_dict) <= S:
        temp_set = list({t for t in aol_reduced_google_seen_topics} - {k for k in temp_dict.keys()})
        
        while len(temp_dict) < S:
            topic = random.choice(temp_set)
            temp_set.remove(topic)
            temp_dict[topic] = 0
        
        temp_top_S[k].update(dict(sorted(temp_dict.items())))
    
    else:
        t,c = temp_dict.popitem()
        max_c = c
        final = dict()
        temp = {t : c}
        
        while len(final) + len(temp) < S:
            t,c = temp_dict.popitem()
            max_c = max(max_c, c)
            
            if max_c == c:
                temp[t] = c
            else:
                final.update(temp)
                temp.clear()
                temp[t] = c
        
        t,c = temp_dict.popitem()
        while (max_c == c) and (len(temp_dict) > 0):
            temp[t] = c
            t,c = temp_dict.popitem()
        
        temp_set = list({k for k in temp.keys()})
        
        while len(final) < S:
            topic = random.choice(temp_set)
            temp_set.remove(topic)
            final[topic] = temp[topic]
        
        temp_top_S[k].update(dict(sorted(final.items())))

CPU times: user 10.2 s, sys: 397 ms, total: 10.6 s
Wall time: 10.6 s


In [39]:
%%time
for row in aol_reduced_google_generalization.itertuples():
    aol_reduced_google_generalization.at[row.Index, 'sTopics'] = temp_top_S[row.RandID]

CPU times: user 5.22 s, sys: 0 ns, total: 5.22 s
Wall time: 5.22 s


In [40]:
display(aol_reduced_google_generalization)

Unnamed: 0,RandID,BrowsingHistory,AllTopics,TopicsSet,sTopics
0,3,"[('amazon.com', Timestamp('2006-05-23 15:31:14...","['289', '179', '289']","['179', '289']","{''157'': 0, ''179'': 1, ''198'': 0, ''228'': ..."
1,4,"[('microsoft.com', Timestamp('2006-03-29 22:32...","['129', '140', '145', '275', '129', '140', '14...","['129', '140', '145', '275']","{''129'': 2, ''140'': 2, ''145'': 2, ''268'': ..."
2,7,"[('wikipedia.org', Timestamp('2006-03-20 10:15...","['275', '100', '254', '275', '100', '254', '275']","['100', '254', '275']","{''100'': 2, ''254'': 2, ''262'': 0, ''275'': ..."
3,10,"[('wikipedia.org', Timestamp('2006-03-06 21:22...","['275', '275', '289', '289', '275']","['275', '289']","{''190'': 0, ''266'': 0, ''275'': 3, ''289'': ..."
4,12,"[('priceline.com', Timestamp('2006-04-04 21:17...","['332', '344', '219', '219', '219', '219', '21...","['219', '248', '332', '344']","{''150'': 0, ''219'': 6, ''248'': 1, ''332'': ..."
...,...,...,...,...,...
230921,521685,"[('bbc.co.uk', Timestamp('2006-03-15 10:43:00'...","['243', '23', '289', '23', '289', '129', '183']","['129', '183', '23', '243', '289']","{''129'': 1, ''183'': 1, ''23'': 2, ''243'': 1..."
230922,521686,"[('google.com', Timestamp('2006-03-04 10:02:30...","['219', '219', '219', '219', '219', '219', '21...","['219', '275']","{''139'': 0, ''219'': 10, ''266'': 0, ''275'':..."
230923,521687,"[('1001freefonts.com', Timestamp('2006-04-08 1...","['142', '142']",['142'],"{''142'': 2, ''190'': 0, ''207'': 0, ''263'': ..."
230924,521689,"[('imdb.com', Timestamp('2006-03-29 00:35:14')...","['12', '45', '275', '219', '215', '217']","['12', '215', '217', '219', '275', '45']","{''12'': 1, ''215'': 1, ''217'': 1, ''275'': 1..."


In [41]:
%%time
for row in aol_reduced_google_generalization.itertuples():
    for k,v in row.sTopics.items():
        if ((v == 0) and (k in row.AllTopics)) or ((v > 0) and (k not in row.AllTopics)):
            print("Error!")

CPU times: user 651 ms, sys: 9 µs, total: 651 ms
Wall time: 649 ms


In [42]:
%%time
for row in aol_reduced_google_generalization.itertuples():
    aol_reduced_google_generalization.at[row.Index, 'sTopics'] = list(temp_top_S[row.RandID].keys())

CPU times: user 5.44 s, sys: 3.94 ms, total: 5.45 s
Wall time: 5.44 s


In [43]:
display(aol_reduced_google_generalization)

Unnamed: 0,RandID,BrowsingHistory,AllTopics,TopicsSet,sTopics
0,3,"[('amazon.com', Timestamp('2006-05-23 15:31:14...","['289', '179', '289']","['179', '289']","['157', '179', '198', '228', '289']"
1,4,"[('microsoft.com', Timestamp('2006-03-29 22:32...","['129', '140', '145', '275', '129', '140', '14...","['129', '140', '145', '275']","['129', '140', '145', '268', '275']"
2,7,"[('wikipedia.org', Timestamp('2006-03-20 10:15...","['275', '100', '254', '275', '100', '254', '275']","['100', '254', '275']","['100', '254', '262', '275', '7']"
3,10,"[('wikipedia.org', Timestamp('2006-03-06 21:22...","['275', '275', '289', '289', '275']","['275', '289']","['190', '266', '275', '289', '297']"
4,12,"[('priceline.com', Timestamp('2006-04-04 21:17...","['332', '344', '219', '219', '219', '219', '21...","['219', '248', '332', '344']","['150', '219', '248', '332', '344']"
...,...,...,...,...,...
230921,521685,"[('bbc.co.uk', Timestamp('2006-03-15 10:43:00'...","['243', '23', '289', '23', '289', '129', '183']","['129', '183', '23', '243', '289']","['129', '183', '23', '243', '289']"
230922,521686,"[('google.com', Timestamp('2006-03-04 10:02:30...","['219', '219', '219', '219', '219', '219', '21...","['219', '275']","['139', '219', '266', '275', '293']"
230923,521687,"[('1001freefonts.com', Timestamp('2006-04-08 1...","['142', '142']",['142'],"['142', '190', '207', '263', '344']"
230924,521689,"[('imdb.com', Timestamp('2006-03-29 00:35:14')...","['12', '45', '275', '219', '215', '217']","['12', '215', '217', '219', '275', '45']","['12', '215', '217', '275', '45']"


### Privacy

In [44]:
temp = aol_reduced_google_generalization.copy()
temp = temp.drop(columns=['BrowsingHistory','AllTopics'], inplace=False)
display(temp)

Unnamed: 0,RandID,TopicsSet,sTopics
0,3,"['179', '289']","['157', '179', '198', '228', '289']"
1,4,"['129', '140', '145', '275']","['129', '140', '145', '268', '275']"
2,7,"['100', '254', '275']","['100', '254', '262', '275', '7']"
3,10,"['275', '289']","['190', '266', '275', '289', '297']"
4,12,"['219', '248', '332', '344']","['150', '219', '248', '332', '344']"
...,...,...,...
230921,521685,"['129', '183', '23', '243', '289']","['129', '183', '23', '243', '289']"
230922,521686,"['219', '275']","['139', '219', '266', '275', '293']"
230923,521687,['142'],"['142', '190', '207', '263', '344']"
230924,521689,"['12', '215', '217', '219', '275', '45']","['12', '215', '217', '275', '45']"


In [45]:
%%time
E7P = BVM(temp)
E7P.qids(['sTopics'])
E7P.sensitive(['RandID'])
results = E7P.assess()

CPU times: user 1.73 s, sys: 2 µs, total: 1.73 s
Wall time: 1.72 s


In [46]:
display(results['re_id'], results['att_inf'])

Unnamed: 0,QID,dCR,pCR,Prior,Posterior,Histogram
0,['sTopics'],0.54213,139937,4e-06,0.605982,"{'0': 0.08451625195950217, '1': 0.059257078024..."


Unnamed: 0,QID,Sensitive,dCA,pCA,Prior,Posterior,Histogram
0,['sTopics'],RandID,0.54213,139937.0,4e-06,0.605982,"{'0': 0.08451625195950217, '1': 0.059257078024..."


In [47]:
display(results['re_id']['Prior'][0], results['att_inf']['Prior'][0])

4.330391554004313e-06

4.330391554004313e-06

In [48]:
display(results['re_id']['Posterior'][0], results['att_inf']['Posterior'][0])

0.6059820028927015

0.6059820028927015

In [49]:
display(results['re_id']['Posterior'][0]/results['re_id']['Prior'][0], results['att_inf']['Posterior'][0]/results['att_inf']['Prior'][0])

139937.0

139937.0

### Utility

In [50]:
temp = aol_reduced_google_generalization.copy()
temp = temp.drop(columns=['RandID','AllTopics'], inplace=False)
display(temp)

Unnamed: 0,BrowsingHistory,TopicsSet,sTopics
0,"[('amazon.com', Timestamp('2006-05-23 15:31:14...","['179', '289']","['157', '179', '198', '228', '289']"
1,"[('microsoft.com', Timestamp('2006-03-29 22:32...","['129', '140', '145', '275']","['129', '140', '145', '268', '275']"
2,"[('wikipedia.org', Timestamp('2006-03-20 10:15...","['100', '254', '275']","['100', '254', '262', '275', '7']"
3,"[('wikipedia.org', Timestamp('2006-03-06 21:22...","['275', '289']","['190', '266', '275', '289', '297']"
4,"[('priceline.com', Timestamp('2006-04-04 21:17...","['219', '248', '332', '344']","['150', '219', '248', '332', '344']"
...,...,...,...
230921,"[('bbc.co.uk', Timestamp('2006-03-15 10:43:00'...","['129', '183', '23', '243', '289']","['129', '183', '23', '243', '289']"
230922,"[('google.com', Timestamp('2006-03-04 10:02:30...","['219', '275']","['139', '219', '266', '275', '293']"
230923,"[('1001freefonts.com', Timestamp('2006-04-08 1...",['142'],"['142', '190', '207', '263', '344']"
230924,"[('imdb.com', Timestamp('2006-03-29 00:35:14')...","['12', '215', '217', '219', '275', '45']","['12', '215', '217', '275', '45']"


In [51]:
%%time
E7U = BVM(temp)
E7U.qids(['sTopics'])
E7U.sensitive(['BrowsingHistory'])
results = E7U.assess()

CPU times: user 2.5 s, sys: 0 ns, total: 2.5 s
Wall time: 2.5 s


In [52]:
display(results['re_id'], results['att_inf'])

Unnamed: 0,QID,dCR,pCR,Prior,Posterior,Histogram
0,['sTopics'],0.54213,139937,4e-06,0.605982,"{'0': 0.08451625195950217, '1': 0.059257078024..."


Unnamed: 0,QID,Sensitive,dCA,pCA,Prior,Posterior,Histogram
0,['sTopics'],BrowsingHistory,0.54213,139937.0,4e-06,0.605982,"{'0': 0.08451625195950217, '1': 0.059257078024..."


In [53]:
display(results['re_id']['Prior'][0], results['att_inf']['Prior'][0])

4.330391554004313e-06

4.330391554004313e-06

In [54]:
display(results['re_id']['Posterior'][0], results['att_inf']['Posterior'][0])

0.6059820028927015

0.6059820028927015

In [55]:
display(results['re_id']['Posterior'][0]/results['re_id']['Prior'][0], results['att_inf']['Posterior'][0]/results['att_inf']['Prior'][0])

139937.0

139937.0

### Unique sets

In [56]:
%%time
for row in aol_reduced_google_generalization.itertuples():
    aol_reduced_google_generalization.at[row.Index, 'sTopics'] = str(aol_reduced_google_generalization.at[row.Index, 'sTopics'])

CPU times: user 14.5 s, sys: 27.7 ms, total: 14.5 s
Wall time: 14.5 s


In [57]:
%%time
display(aol_reduced_google_generalization['sTopics'].nunique())

139937

CPU times: user 61.4 ms, sys: 0 ns, total: 61.4 ms
Wall time: 59.7 ms


## Experiments 8 and 9: Topics with Generalization and Bounded Noise on AOL reduced dataset with Google Taxonomy Classification v1; and Topics with Generalization, Bounded Noise, and Differential Privacy on AOL reduced dataset with Google Taxonomy Classification v1

In [58]:
display(aol_reduced_google_generalization)

Unnamed: 0,RandID,BrowsingHistory,AllTopics,TopicsSet,sTopics
0,3,"[('amazon.com', Timestamp('2006-05-23 15:31:14...","['289', '179', '289']","['179', '289']","[""'157'"", ""'179'"", ""'198'"", ""'228'"", ""'289'""]"
1,4,"[('microsoft.com', Timestamp('2006-03-29 22:32...","['129', '140', '145', '275', '129', '140', '14...","['129', '140', '145', '275']","[""'129'"", ""'140'"", ""'145'"", ""'268'"", ""'275'""]"
2,7,"[('wikipedia.org', Timestamp('2006-03-20 10:15...","['275', '100', '254', '275', '100', '254', '275']","['100', '254', '275']","[""'100'"", ""'254'"", ""'262'"", ""'275'"", ""'7'""]"
3,10,"[('wikipedia.org', Timestamp('2006-03-06 21:22...","['275', '275', '289', '289', '275']","['275', '289']","[""'190'"", ""'266'"", ""'275'"", ""'289'"", ""'297'""]"
4,12,"[('priceline.com', Timestamp('2006-04-04 21:17...","['332', '344', '219', '219', '219', '219', '21...","['219', '248', '332', '344']","[""'150'"", ""'219'"", ""'248'"", ""'332'"", ""'344'""]"
...,...,...,...,...,...
230921,521685,"[('bbc.co.uk', Timestamp('2006-03-15 10:43:00'...","['243', '23', '289', '23', '289', '129', '183']","['129', '183', '23', '243', '289']","[""'129'"", ""'183'"", ""'23'"", ""'243'"", ""'289'""]"
230922,521686,"[('google.com', Timestamp('2006-03-04 10:02:30...","['219', '219', '219', '219', '219', '219', '21...","['219', '275']","[""'139'"", ""'219'"", ""'266'"", ""'275'"", ""'293'""]"
230923,521687,"[('1001freefonts.com', Timestamp('2006-04-08 1...","['142', '142']",['142'],"[""'142'"", ""'190'"", ""'207'"", ""'263'"", ""'344'""]"
230924,521689,"[('imdb.com', Timestamp('2006-03-29 00:35:14')...","['12', '45', '275', '219', '215', '217']","['12', '215', '217', '219', '275', '45']","[""'12'"", ""'215'"", ""'217'"", ""'275'"", ""'45'""]"


### Privacy

In [59]:
%%time
S = 5
M = len(aol_reduced_google_seen_topics)
R = 0.05

aol_reduced_google_channel_bn = pandas.DataFrame(data=temp_top_S).T.applymap(lambda x: 1/S, na_action='ignore').fillna(0)
aol_reduced_google_channel_bn = aol_reduced_google_channel_bn[sorted(aol_reduced_google_channel_bn.columns.to_list())]
aol_reduced_google_channel_bn = aol_reduced_google_channel_bn.sort_index()

aol_reduced_google_unseen_topics = []
for i in range(1,350):
    if "'" + str(i) + "'" not in aol_reduced_google_seen_topics:
        aol_reduced_google_unseen_topics.append("'" + str(i) + "'")
aol_reduced_google_all_topics = aol_reduced_google_seen_topics + aol_reduced_google_unseen_topics

M = len(aol_reduced_google_all_topics)

aol_reduced_google_channel_bn_dp = pandas.DataFrame(data=temp_top_S).T.applymap(lambda x: ((1-R)/S) + (R/M), na_action='ignore').fillna(R/M)
aol_reduced_google_channel_bn_dp = pandas.concat(axis=1, objs=[aol_reduced_google_channel_bn_dp, pandas.DataFrame(columns=aol_reduced_google_unseen_topics,index=aol_reduced_google_channel_bn_dp.index).fillna(R/M)])
aol_reduced_google_channel_bn_dp = aol_reduced_google_channel_bn_dp[sorted(aol_reduced_google_channel_bn_dp.columns.to_list())]
aol_reduced_google_channel_bn_dp = aol_reduced_google_channel_bn_dp.sort_index()

CPU times: user 38 s, sys: 3.2 s, total: 41.2 s
Wall time: 40.5 s


In [60]:
display(aol_reduced_google_channel_bn.cumsum(axis=1))
display(aol_reduced_google_channel_bn_dp.cumsum(axis=1))

Unnamed: 0,'1','100','102','103','104','108','109','11','112','114',...,'82','83','84','86','88','92','94','96','97','99'
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
7,0.0,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
521685,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
521686,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
521687,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
521689,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


Unnamed: 0,'1','10','100','101','102','103','104','105','106','107',...,'90','91','92','93','94','95','96','97','98','99'
3,0.000143,0.000287,0.00043,0.000573,0.000716,0.00086,0.001003,0.001146,0.001289,0.001433,...,0.998711,0.998854,0.998997,0.99914,0.999284,0.999427,0.99957,0.999713,0.999857,1.0
4,0.000143,0.000287,0.00043,0.000573,0.000716,0.00086,0.001003,0.001146,0.001289,0.001433,...,0.998711,0.998854,0.998997,0.99914,0.999284,0.999427,0.99957,0.999713,0.999857,1.0
7,0.000143,0.000287,0.19043,0.190573,0.190716,0.19086,0.191003,0.191146,0.191289,0.191433,...,0.998711,0.998854,0.998997,0.99914,0.999284,0.999427,0.99957,0.999713,0.999857,1.0
10,0.000143,0.000287,0.00043,0.000573,0.000716,0.00086,0.001003,0.001146,0.001289,0.001433,...,0.998711,0.998854,0.998997,0.99914,0.999284,0.999427,0.99957,0.999713,0.999857,1.0
12,0.000143,0.000287,0.00043,0.000573,0.000716,0.00086,0.001003,0.001146,0.001289,0.001433,...,0.998711,0.998854,0.998997,0.99914,0.999284,0.999427,0.99957,0.999713,0.999857,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
521685,0.000143,0.000287,0.00043,0.000573,0.000716,0.00086,0.001003,0.001146,0.001289,0.001433,...,0.998711,0.998854,0.998997,0.99914,0.999284,0.999427,0.99957,0.999713,0.999857,1.0
521686,0.000143,0.000287,0.00043,0.000573,0.000716,0.00086,0.001003,0.001146,0.001289,0.001433,...,0.998711,0.998854,0.998997,0.99914,0.999284,0.999427,0.99957,0.999713,0.999857,1.0
521687,0.000143,0.000287,0.00043,0.000573,0.000716,0.00086,0.001003,0.001146,0.001289,0.001433,...,0.998711,0.998854,0.998997,0.99914,0.999284,0.999427,0.99957,0.999713,0.999857,1.0
521689,0.000143,0.000287,0.00043,0.000573,0.000716,0.00086,0.001003,0.001146,0.001289,0.001433,...,0.998711,0.998854,0.998997,0.99914,0.999284,0.999427,0.99957,0.999713,0.999857,1.0


In [61]:
%%time
N = len(aol_reduced_google_channel_bn.index)
aol_reduced_google_bn_dp_prior = numpy.array([1/N for row in range(N)])

CPU times: user 21.9 ms, sys: 33 µs, total: 21.9 ms
Wall time: 21.4 ms


In [62]:
%%time
display("Prior: " + str(qif.measure.bayes_vuln.prior(aol_reduced_google_bn_dp_prior)))
display("BN Posterior: " + str(qif.measure.bayes_vuln.posterior(aol_reduced_google_bn_dp_prior, aol_reduced_google_channel_bn.to_numpy())))
display("DP Posterior: " + str(qif.measure.bayes_vuln.posterior(aol_reduced_google_bn_dp_prior, aol_reduced_google_channel_bn_dp.to_numpy())))

'Prior: 4.330391554004313e-06'

'BN Posterior: 0.00014636723452534552'

'DP Posterior: 0.00013926539237677857'

CPU times: user 121 ms, sys: 0 ns, total: 121 ms
Wall time: 119 ms


In [63]:
%%time
display("BN Leakage: " + str(qif.measure.bayes_vuln.posterior(aol_reduced_google_bn_dp_prior, aol_reduced_google_channel_bn.to_numpy())/qif.measure.bayes_vuln.prior(aol_reduced_google_bn_dp_prior)))
display("DP Leakage: " + str(qif.measure.bayes_vuln.posterior(aol_reduced_google_bn_dp_prior, aol_reduced_google_channel_bn_dp.to_numpy())/qif.measure.bayes_vuln.prior(aol_reduced_google_bn_dp_prior)))

'BN Leakage: 33.79999999999994'

'DP Leakage: 32.15999999999997'

CPU times: user 120 ms, sys: 0 ns, total: 120 ms
Wall time: 118 ms


### Utility

In [64]:
%%time
aol_reduced_google_top_S = []
for k,v in temp_top_S.items():
    temp = str(list(v.keys()))
    temp = temp.replace("\"","")
    temp = temp.replace("\'","")
    aol_reduced_google_top_S.append(temp)

aol_reduced_google_top_S_count = 0
aol_reduced_google_top_S_counts = dict()
for case in aol_reduced_google_top_S:
    if case in aol_reduced_google_top_S_counts:
        aol_reduced_google_top_S_counts[case] = aol_reduced_google_top_S_counts[case] + 1
    else:
        aol_reduced_google_top_S_counts[case] = 1
    aol_reduced_google_top_S_count += 1

aol_reduced_google_top_S_prior = aol_reduced_google_top_S_counts
for case in aol_reduced_google_top_S_prior:
    aol_reduced_google_top_S_prior[case] = aol_reduced_google_top_S_prior[case] / aol_reduced_google_top_S_count

d = {'topics' : [], 'prior' : []}
for k,v in aol_reduced_google_top_S_prior.items():
    d['topics'].append(k)
    d['prior'].append(v)
aol_reduced_google_top_S_prior = pandas.DataFrame(data=d)
aol_reduced_google_top_S_prior = aol_reduced_google_top_S_prior.set_index('topics').sort_index()

CPU times: user 753 ms, sys: 7.93 ms, total: 761 ms
Wall time: 759 ms


In [65]:
display(aol_reduced_google_top_S_prior)

Unnamed: 0_level_0,prior
topics,Unnamed: 1_level_1
"[1, 100, 102, 103, 243]",0.000004
"[1, 100, 102, 215, 219]",0.000004
"[1, 100, 102, 215, 243]",0.000004
"[1, 100, 102, 215, 289]",0.000004
"[1, 100, 102, 215, 83]",0.000004
...,...
"[53, 57, 62, 81, 83]",0.000004
"[57, 62, 81, 82, 83]",0.000004
"[57, 62, 83, 84, 97]",0.000004
"[57, 62, 83, 88, 92]",0.000004


In [66]:
%%time
S = 5
M = len(aol_reduced_google_seen_topics)
R = 0.05

aol_reduced_google_channel_bn = pandas.DataFrame(columns=[c.replace("'","") for c in aol_reduced_google_seen_topics], index=d['topics'])
for tup in aol_reduced_google_channel_bn.itertuples():
    temp = tup.Index.replace('[','').replace(']','').split(', ')
    for col in aol_reduced_google_channel_bn.columns:
        aol_reduced_google_channel_bn.at[tup.Index, col] = 1/S if col in temp else 0

aol_reduced_google_channel_bn = aol_reduced_google_channel_bn[sorted(aol_reduced_google_channel_bn.columns.to_list())]
aol_reduced_google_channel_bn = aol_reduced_google_channel_bn.sort_index()

CPU times: user 7min 28s, sys: 200 ms, total: 7min 28s
Wall time: 7min 28s


In [67]:
%%time
M = len(aol_reduced_google_all_topics)

aol_reduced_google_channel_bn_dp = pandas.DataFrame(columns=[c.replace("'","") for c in aol_reduced_google_all_topics], index=d['topics'])
for tup in aol_reduced_google_channel_bn_dp.itertuples():
    temp = tup.Index.replace('[','').replace(']','').split(', ')
    for col in aol_reduced_google_channel_bn_dp.columns:
        aol_reduced_google_channel_bn_dp.at[tup.Index, col] = ((1-R)/S) + (R/M) if col in temp else (R/M)

aol_reduced_google_channel_bn_dp = aol_reduced_google_channel_bn_dp[sorted(aol_reduced_google_channel_bn_dp.columns.to_list())]
aol_reduced_google_channel_bn_dp = aol_reduced_google_channel_bn_dp.sort_index()

CPU times: user 14min 49s, sys: 2.2 s, total: 14min 51s
Wall time: 14min 51s


In [68]:
display(aol_reduced_google_channel_bn.cumsum(axis=1))
display(aol_reduced_google_channel_bn_dp.cumsum(axis=1))

Unnamed: 0,1,100,102,103,104,108,109,11,112,114,...,82,83,84,86,88,92,94,96,97,99
"[1, 100, 102, 103, 243]",0.2,0.4,0.6,0.8,0.8,0.8,0.8,0.8,0.8,0.8,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
"[1, 100, 102, 215, 219]",0.2,0.4,0.6,0.6,0.6,0.6,0.6,0.6,0.6,0.6,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
"[1, 100, 102, 215, 243]",0.2,0.4,0.6,0.6,0.6,0.6,0.6,0.6,0.6,0.6,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
"[1, 100, 102, 215, 289]",0.2,0.4,0.6,0.6,0.6,0.6,0.6,0.6,0.6,0.6,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
"[1, 100, 102, 215, 83]",0.2,0.4,0.6,0.6,0.6,0.6,0.6,0.6,0.6,0.6,...,0.8,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"[53, 57, 62, 81, 83]",0,0,0,0,0,0,0,0,0,0,...,0.8,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
"[57, 62, 81, 82, 83]",0,0,0,0,0,0,0,0,0,0,...,0.8,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
"[57, 62, 83, 84, 97]",0,0,0,0,0,0,0,0,0,0,...,0.4,0.6,0.8,0.8,0.8,0.8,0.8,0.8,1.0,1.0
"[57, 62, 83, 88, 92]",0,0,0,0,0,0,0,0,0,0,...,0.4,0.6,0.6,0.6,0.8,1.0,1.0,1.0,1.0,1.0


Unnamed: 0,1,10,100,101,102,103,104,105,106,107,...,90,91,92,93,94,95,96,97,98,99
"[1, 100, 102, 103, 243]",0.190143,0.190287,0.38043,0.380573,0.570716,0.76086,0.761003,0.761146,0.761289,0.761433,...,0.998711,0.998854,0.998997,0.99914,0.999284,0.999427,0.99957,0.999713,0.999857,1.0
"[1, 100, 102, 215, 219]",0.190143,0.190287,0.38043,0.380573,0.570716,0.57086,0.571003,0.571146,0.571289,0.571433,...,0.998711,0.998854,0.998997,0.99914,0.999284,0.999427,0.99957,0.999713,0.999857,1.0
"[1, 100, 102, 215, 243]",0.190143,0.190287,0.38043,0.380573,0.570716,0.57086,0.571003,0.571146,0.571289,0.571433,...,0.998711,0.998854,0.998997,0.99914,0.999284,0.999427,0.99957,0.999713,0.999857,1.0
"[1, 100, 102, 215, 289]",0.190143,0.190287,0.38043,0.380573,0.570716,0.57086,0.571003,0.571146,0.571289,0.571433,...,0.998711,0.998854,0.998997,0.99914,0.999284,0.999427,0.99957,0.999713,0.999857,1.0
"[1, 100, 102, 215, 83]",0.190143,0.190287,0.38043,0.380573,0.570716,0.57086,0.571003,0.571146,0.571289,0.571433,...,0.998711,0.998854,0.998997,0.99914,0.999284,0.999427,0.99957,0.999713,0.999857,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"[53, 57, 62, 81, 83]",0.000143,0.000287,0.00043,0.000573,0.000716,0.00086,0.001003,0.001146,0.001289,0.001433,...,0.998711,0.998854,0.998997,0.99914,0.999284,0.999427,0.99957,0.999713,0.999857,1.0
"[57, 62, 81, 82, 83]",0.000143,0.000287,0.00043,0.000573,0.000716,0.00086,0.001003,0.001146,0.001289,0.001433,...,0.998711,0.998854,0.998997,0.99914,0.999284,0.999427,0.99957,0.999713,0.999857,1.0
"[57, 62, 83, 84, 97]",0.000143,0.000287,0.00043,0.000573,0.000716,0.00086,0.001003,0.001146,0.001289,0.001433,...,0.808711,0.808854,0.808997,0.80914,0.809284,0.809427,0.80957,0.999713,0.999857,1.0
"[57, 62, 83, 88, 92]",0.000143,0.000287,0.00043,0.000573,0.000716,0.00086,0.001003,0.001146,0.001289,0.001433,...,0.808711,0.808854,0.998997,0.99914,0.999284,0.999427,0.99957,0.999713,0.999857,1.0


In [69]:
%%time
display("Prior: " + str(qif.measure.bayes_vuln.prior(aol_reduced_google_top_S_prior['prior'].to_numpy())))
display("BN Posterior: " + str(qif.measure.bayes_vuln.posterior(aol_reduced_google_top_S_prior['prior'].to_numpy(), aol_reduced_google_channel_bn.to_numpy())))
display("DP Posterior: " + str(qif.measure.bayes_vuln.posterior(aol_reduced_google_top_S_prior['prior'].to_numpy(), aol_reduced_google_channel_bn_dp.to_numpy())))

'Prior: 0.012817958999852767'

'BN Posterior: 0.027601049686912682'

'DP Posterior: 0.026571317814086774'

CPU times: user 6.16 s, sys: 296 ms, total: 6.45 s
Wall time: 6.42 s


In [70]:
%%time
display("BN Leakage: " + str(qif.measure.bayes_vuln.posterior(aol_reduced_google_top_S_prior['prior'].to_numpy(), aol_reduced_google_channel_bn.to_numpy())/qif.measure.bayes_vuln.prior(aol_reduced_google_top_S_prior['prior'].to_numpy())))
display("DP Leakage: " + str(qif.measure.bayes_vuln.posterior(aol_reduced_google_top_S_prior['prior'].to_numpy(), aol_reduced_google_channel_bn_dp.to_numpy())/qif.measure.bayes_vuln.prior(aol_reduced_google_top_S_prior['prior'].to_numpy())))

'BN Leakage: 2.15331081081081'

'DP Leakage: 2.0729757221404737'

CPU times: user 6.87 s, sys: 320 ms, total: 7.19 s
Wall time: 7.13 s


#### Bayes over uniform prior

In [71]:
N = len(aol_reduced_google_channel_bn.index)

In [72]:
%%time
display("BN Leakage: " + str(qif.measure.bayes_vuln.posterior(numpy.array([1/N for row in range(N)]), aol_reduced_google_channel_bn.to_numpy())/qif.measure.bayes_vuln.prior(numpy.array([1/N for row in range(N)]))))
display("DP Leakage: " + str(qif.measure.bayes_vuln.posterior(numpy.array([1/N for row in range(N)]), aol_reduced_google_channel_bn_dp.to_numpy())/qif.measure.bayes_vuln.prior(numpy.array([1/N for row in range(N)]))))

'BN Leakage: 33.79999999999994'

'DP Leakage: 32.15999999999997'

CPU times: user 6.14 s, sys: 148 ms, total: 6.29 s
Wall time: 6.26 s


#### IBA gain

In [73]:
%%time
G_IBA_bn = pandas.DataFrame(columns=[c.replace("'","") for c in aol_reduced_google_seen_topics], index=d['topics'])
for tup in G_IBA_bn.itertuples():
    temp = tup.Index.replace('[','').replace(']','').split(', ')
    for col in G_IBA_bn.columns:
        G_IBA_bn.at[tup.Index, col] = 1 if col in temp else 0
G_IBA_bn = G_IBA_bn[sorted(G_IBA_bn.columns.to_list())]
G_IBA_bn = G_IBA_bn.sort_index()
display(G_IBA_bn.T)

Unnamed: 0,"[1, 100, 102, 103, 243]","[1, 100, 102, 215, 219]","[1, 100, 102, 215, 243]","[1, 100, 102, 215, 289]","[1, 100, 102, 215, 83]","[1, 100, 103, 12, 215]","[1, 100, 103, 12, 219]","[1, 100, 103, 12, 243]","[1, 100, 103, 126, 289]","[1, 100, 103, 126, 45]",...,"[39, 45, 57, 62, 83]","[43, 83, 88, 94, 99]","[45, 57, 62, 83, 84]","[48, 57, 62, 81, 83]","[48, 57, 62, 83, 84]","[53, 57, 62, 81, 83]","[57, 62, 81, 82, 83]","[57, 62, 83, 84, 97]","[57, 62, 83, 88, 92]","[57, 62, 83, 88, 96]"
1,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
100,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
102,1,1,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
103,1,0,0,0,0,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
104,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
94,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
96,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
97,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


CPU times: user 7min 1s, sys: 215 ms, total: 7min 1s
Wall time: 7min 1s


In [201]:
%%time
G_IBA_bn_dp = pandas.DataFrame(columns=[c.replace("'","") for c in aol_reduced_google_all_topics], index=d['topics'])
for tup in G_IBA_bn_dp.itertuples():
    temp = tup.Index.replace('[','').replace(']','').split(', ')
    for col in G_IBA_bn_dp.columns:
        G_IBA_bn_dp.at[tup.Index, col] = 1 if col in temp else 0
G_IBA_bn_dp = G_IBA_bn_dp[sorted(G_IBA_bn_dp.columns.to_list())]
G_IBA_bn_dp = G_IBA_bn_dp.sort_index()
display(G_IBA_bn_dp.T)

Unnamed: 0,"[1, 100, 102, 263, 277]","[1, 100, 103, 12, 243]","[1, 100, 103, 126, 243]","[1, 100, 103, 126, 289]","[1, 100, 103, 140, 215]","[1, 100, 103, 157, 243]","[1, 100, 103, 173, 243]","[1, 100, 103, 173, 289]","[1, 100, 103, 198, 243]","[1, 100, 103, 207, 243]",...,"[346, 45, 62, 82, 83]","[346, 57, 62, 81, 83]","[346, 57, 62, 83, 97]","[4, 57, 62, 81, 83]","[4, 57, 62, 83, 88]","[43, 57, 62, 83, 84]","[53, 57, 62, 81, 83]","[57, 62, 81, 82, 83]","[57, 62, 81, 83, 94]","[57, 62, 83, 88, 92]"
1,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
100,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
101,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
102,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
96,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
97,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
98,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


CPU times: user 19min 18s, sys: 340 ms, total: 19min 19s
Wall time: 19min 19s


In [202]:
%%time
display("BN Prior: " + str(qif.measure.g_vuln.prior(G_IBA_bn.T.to_numpy(dtype=numpy.int32), aol_reduced_google_top_S_prior['prior'].to_numpy())))
display("BN Posterior: " + str(qif.measure.g_vuln.posterior(G_IBA_bn.T.to_numpy(dtype=numpy.int32), aol_reduced_google_top_S_prior['prior'].to_numpy(), aol_reduced_google_channel_bn.to_numpy())))

'BN Prior: 0.4405359408630445'

'BN Posterior: 0.9999999999999989'

CPU times: user 8.27 s, sys: 1.12 s, total: 9.4 s
Wall time: 7.18 s


In [203]:
%%time
display("BN Leakage: " + str(qif.measure.g_vuln.posterior(G_IBA_bn.T.to_numpy(dtype=numpy.int32), aol_reduced_google_top_S_prior['prior'].to_numpy(), aol_reduced_google_channel_bn.to_numpy())/qif.measure.g_vuln.prior(G_IBA_bn.T.to_numpy(dtype=numpy.int32), aol_reduced_google_top_S_prior['prior'].to_numpy())))

'BN Leakage: 2.2699623509512534'

CPU times: user 8.56 s, sys: 1.66 s, total: 10.2 s
Wall time: 7.24 s


In [204]:
%%time
display("DP Prior: " + str(qif.measure.g_vuln.prior(G_IBA_bn_dp.T.to_numpy(dtype=numpy.int32), aol_reduced_google_top_S_prior['prior'].to_numpy())))
display("DP Posterior: " + str(qif.measure.g_vuln.posterior(G_IBA_bn_dp.T.to_numpy(dtype=numpy.int32), aol_reduced_google_top_S_prior['prior'].to_numpy(), aol_reduced_google_channel_bn_dp.to_numpy())))

'DP Prior: 0.4405359408630445'

'DP Posterior: 0.9620768580738301'

CPU times: user 18.1 s, sys: 2.71 s, total: 20.8 s
Wall time: 14.8 s


In [205]:
%%time
display("DP Leakage: " + str(qif.measure.g_vuln.posterior(G_IBA_bn_dp.T.to_numpy(dtype=numpy.int32), aol_reduced_google_top_S_prior['prior'].to_numpy(), aol_reduced_google_channel_bn_dp.to_numpy())/qif.measure.g_vuln.prior(G_IBA_bn_dp.T.to_numpy(dtype=numpy.int32), aol_reduced_google_top_S_prior['prior'].to_numpy())))

'DP Leakage: 2.1838782465490696'

CPU times: user 18.1 s, sys: 2.6 s, total: 20.7 s
Wall time: 14.8 s
