# QIF Analyses (AOL reduced dataset - Citizen Lab Classification)

In [1]:
import numpy, pandas, random, qif
from bvmlib.bvm import BVM

## AOL reduced dataset with Citizen Lab Classification

In [2]:
%%time
# Load the classified AOL query log; the first CSV column is used as the row index
# and malformed lines are reported as warnings instead of aborting the load.
aol_reduced_citizen_lab = pandas.read_csv(
    'AOL-reduced-Citizen-Lab-Classification.csv',
    index_col=0,
    low_memory=False,
    on_bad_lines='warn',
)

CPU times: user 2.96 s, sys: 721 ms, total: 3.68 s
Wall time: 3.12 s


In [3]:
%%time
# Convert the QueryTime column to pandas datetimes (the histories built later embed them as Timestamp objects).
aol_reduced_citizen_lab['QueryTime'] = pandas.to_datetime(aol_reduced_citizen_lab['QueryTime'])

CPU times: user 916 ms, sys: 9.96 ms, total: 926 ms
Wall time: 917 ms


In [4]:
# Preview the freshly loaded frame: one row per (RandID, QueryTime, Domain, topics) record.
display(aol_reduced_citizen_lab)

Unnamed: 0,RandID,QueryTime,Domain,topics
0,0,2006-04-20 17:37:26,imdb.com,'CULTR'
1,13,2006-03-10 11:51:57,imdb.com,'CULTR'
2,13,2006-03-25 05:10:47,imdb.com,'CULTR'
3,20,2006-04-07 21:13:02,imdb.com,'CULTR'
4,21,2006-05-15 21:18:53,imdb.com,'CULTR'
...,...,...,...,...
3128280,519201,2006-03-21 18:18:21,eloccidental.com.mx,'NEWS'
3128281,520267,2006-03-26 18:55:26,chuavietnam.com,'REL'
3128282,520460,2006-05-10 00:35:54,onlybingo.com,'GMB'
3128283,521343,2006-04-04 02:56:55,republika.co.id,'NEWS'


## Drop singletons

In [5]:
%%time
# RandIDs that have exactly one (non-null) Domain record. The per-user count is
# computed once and reused for both the index and the boolean mask.
domain_count_per_user = aol_reduced_citizen_lab.groupby('RandID')['Domain'].count()
singletons = domain_count_per_user.index[(domain_count_per_user == 1).to_numpy()]

CPU times: user 490 ms, sys: 28.2 ms, total: 518 ms
Wall time: 517 ms


In [6]:
%%time
# Index label of the first row of every singleton user, listed in ascending RandID
# order. A single `isin` pass replaces one full-frame boolean scan per singleton.
singleton_rand_ids = aol_reduced_citizen_lab.loc[
    aol_reduced_citizen_lab['RandID'].isin(singletons), 'RandID'
]
# keep='first' retains only the first row of each user; sorting by RandID (unique
# after de-duplication) reproduces the order in which `singletons` is iterated.
rows_to_drop = singleton_rand_ids.drop_duplicates(keep='first').sort_values().index.to_list()

CPU times: user 3min 58s, sys: 35.7 ms, total: 3min 58s
Wall time: 3min 58s


In [7]:
# Number of rows that will be removed (one per singleton user).
display(len(rows_to_drop))

93659

In [8]:
# Remove the singleton rows by index label.
aol_reduced_citizen_lab = aol_reduced_citizen_lab.drop(index=rows_to_drop)

In [9]:
# Renumber the rows 0..n-1 so the index has no gaps left by the dropped rows.
aol_reduced_citizen_lab = aol_reduced_citizen_lab.reset_index(drop=True)

In [10]:
# Frame after removing the singleton users.
display(aol_reduced_citizen_lab)

Unnamed: 0,RandID,QueryTime,Domain,topics
0,0,2006-04-20 17:37:26,imdb.com,'CULTR'
1,13,2006-03-10 11:51:57,imdb.com,'CULTR'
2,13,2006-03-25 05:10:47,imdb.com,'CULTR'
3,20,2006-04-07 21:13:02,imdb.com,'CULTR'
4,21,2006-05-15 21:18:53,imdb.com,'CULTR'
...,...,...,...,...
3034621,519201,2006-03-21 18:18:21,eloccidental.com.mx,'NEWS'
3034622,520267,2006-03-26 18:55:26,chuavietnam.com,'REL'
3034623,520460,2006-05-10 00:35:54,onlybingo.com,'GMB'
3034624,521343,2006-04-04 02:56:55,republika.co.id,'NEWS'


## Drop outlier

In [11]:
%%time
# Records per user, most active first; used to spot the extreme outlier removed in the next step.
display(aol_reduced_citizen_lab['RandID'].value_counts())

RandID
83937     23437
75099      2516
25702      1379
43358      1167
443325     1011
          ...  
63719         2
63685         2
63490         2
63485         2
133897        2
Name: count, Length: 247987, dtype: int64

CPU times: user 42.2 ms, sys: 12 ms, total: 54.1 ms
Wall time: 52.8 ms


In [12]:
# RandID with by far the most records in the value_counts listing above.
OUTLIER_RAND_ID = 83937
outlier_rows = aol_reduced_citizen_lab.index[aol_reduced_citizen_lab['RandID'] == OUTLIER_RAND_ID]
aol_reduced_citizen_lab = aol_reduced_citizen_lab.drop(index=outlier_rows)

In [13]:
# Renumber the rows 0..n-1 after the outlier's rows were removed.
aol_reduced_citizen_lab = aol_reduced_citizen_lab.reset_index(drop=True)

In [14]:
# Frame after removing the outlier user; this is the dataset used by the experiments below.
display(aol_reduced_citizen_lab)

Unnamed: 0,RandID,QueryTime,Domain,topics
0,0,2006-04-20 17:37:26,imdb.com,'CULTR'
1,13,2006-03-10 11:51:57,imdb.com,'CULTR'
2,13,2006-03-25 05:10:47,imdb.com,'CULTR'
3,20,2006-04-07 21:13:02,imdb.com,'CULTR'
4,21,2006-05-15 21:18:53,imdb.com,'CULTR'
...,...,...,...,...
3011184,519201,2006-03-21 18:18:21,eloccidental.com.mx,'NEWS'
3011185,520267,2006-03-26 18:55:26,chuavietnam.com,'REL'
3011186,520460,2006-05-10 00:35:54,onlybingo.com,'GMB'
3011187,521343,2006-04-04 02:56:55,republika.co.id,'NEWS'


## Experiment 2: Third-party cookies on AOL reduced dataset with Citizen Lab Classification

### Define browsing histories

In [15]:
%%time
# One entry per user: the RandID and the string form of its list of
# (Domain, QueryTime) pairs, in the order the rows appear in the frame.
d = {'RandID': [], 'BrowsingHistory': []}
for rand_id, visits in aol_reduced_citizen_lab.groupby('RandID'):
    history = list(zip(visits['Domain'], visits['QueryTime']))
    d['RandID'].append(rand_id)
    d['BrowsingHistory'].append(str(history))

CPU times: user 2min 2s, sys: 2.77 s, total: 2min 4s
Wall time: 2min 2s


In [16]:
# Materialise the collected lists as a frame with one row per user.
aol_reduced_citizen_lab_browsing_history = pandas.DataFrame(data=d, columns=['RandID', 'BrowsingHistory'])

In [17]:
# Preview the per-user browsing histories (stored as strings).
display(aol_reduced_citizen_lab_browsing_history)

Unnamed: 0,RandID,BrowsingHistory
0,0,"[('imdb.com', Timestamp('2006-04-20 17:37:26')..."
1,3,"[('myspace.com', Timestamp('2006-05-20 00:35:5..."
2,4,"[('microsoft.com', Timestamp('2006-03-29 22:32..."
3,10,"[('wikipedia.org', Timestamp('2006-03-06 21:22..."
4,12,"[('yahoo.com', Timestamp('2006-04-19 17:14:24'..."
...,...,...
247981,521683,"[('wellsfargo.com', Timestamp('2006-05-05 19:4..."
247982,521685,"[('bbc.co.uk', Timestamp('2006-03-15 10:43:00'..."
247983,521686,"[('yahoo.com', Timestamp('2006-03-09 18:44:13'..."
247984,521689,"[('imdb.com', Timestamp('2006-03-29 00:35:14')..."


### Privacy

In [18]:
# Experiment 2 privacy input: the identifier observed by the third party (UID) is the
# RandID itself, so the frame holds the same values under both column names.
temp = (
    aol_reduced_citizen_lab_browsing_history
    .copy()
    .assign(UID=lambda frame: frame['RandID'])
    .drop(columns=['BrowsingHistory'])
)
display(temp, temp.nunique())

Unnamed: 0,RandID,UID
0,0,0
1,3,3
2,4,4
3,10,10
4,12,12
...,...,...
247981,521683,521683
247982,521685,521685
247983,521686,521686
247984,521689,521689


RandID    247986
UID       247986
dtype: int64

In [19]:
%%time
# Experiment 2, privacy: register UID via qids() and RandID via sensitive(), then run
# the assessment. The returned object is indexed with 're_id' and 'att_inf' below.
E2P = BVM(temp)
E2P.qids(['UID'])
E2P.sensitive(['RandID'])
results = E2P.assess()

CPU times: user 1.52 s, sys: 0 ns, total: 1.52 s
Wall time: 1.52 s


In [20]:
# Show the 're_id' and 'att_inf' result tables returned by assess().
display(results['re_id'], results['att_inf'])

Unnamed: 0,QID,dCR,pCR,Prior,Posterior,Histogram
0,['UID'],1.0,247986,4e-06,1.0,"{'0': 0.0, '1': 0.0, '2': 0.0, '3': 0.0, '4': ..."


Unnamed: 0,QID,Sensitive,dCA,pCA,Prior,Posterior,Histogram
0,['UID'],RandID,0.0,247986.0,4e-06,1.0,"{'0': 0.0, '1': 0.0, '2': 0.0, '3': 0.0, '4': ..."


In [21]:
# Full-precision Prior value from each table (the tables above show it rounded).
display(results['re_id']['Prior'][0], results['att_inf']['Prior'][0])

4.032485704838176e-06

4.032485704838176e-06

In [22]:
# Full-precision Posterior value from each table.
display(results['re_id']['Posterior'][0], results['att_inf']['Posterior'][0])

1.0

1.0

In [23]:
# Posterior-to-prior ratio for each table, i.e. by what factor the posterior exceeds the prior.
display(results['re_id']['Posterior'][0]/results['re_id']['Prior'][0], results['att_inf']['Posterior'][0]/results['att_inf']['Prior'][0])

247986.0

247986.0

### Utility

In [24]:
# Experiment 2 utility input: the same frame with RandID relabelled as the observed UID,
# paired with each user's browsing history.
temp = aol_reduced_citizen_lab_browsing_history.copy().rename(columns={'RandID': 'UID'})
display(temp, temp.nunique())

Unnamed: 0,UID,BrowsingHistory
0,0,"[('imdb.com', Timestamp('2006-04-20 17:37:26')..."
1,3,"[('myspace.com', Timestamp('2006-05-20 00:35:5..."
2,4,"[('microsoft.com', Timestamp('2006-03-29 22:32..."
3,10,"[('wikipedia.org', Timestamp('2006-03-06 21:22..."
4,12,"[('yahoo.com', Timestamp('2006-04-19 17:14:24'..."
...,...,...
247981,521683,"[('wellsfargo.com', Timestamp('2006-05-05 19:4..."
247982,521685,"[('bbc.co.uk', Timestamp('2006-03-15 10:43:00'..."
247983,521686,"[('yahoo.com', Timestamp('2006-03-09 18:44:13'..."
247984,521689,"[('imdb.com', Timestamp('2006-03-29 00:35:14')..."


UID                247986
BrowsingHistory    247986
dtype: int64

In [25]:
%%time
# Experiment 2, utility: register UID via qids() and BrowsingHistory via sensitive(),
# then run the assessment.
E2U = BVM(temp)
E2U.qids(['UID'])
E2U.sensitive(['BrowsingHistory'])
results = E2U.assess()

CPU times: user 2.2 s, sys: 6 µs, total: 2.2 s
Wall time: 2.2 s


In [26]:
# Show the 're_id' and 'att_inf' result tables returned by assess().
display(results['re_id'], results['att_inf'])

Unnamed: 0,QID,dCR,pCR,Prior,Posterior,Histogram
0,['UID'],1.0,247986,4e-06,1.0,"{'0': 0.0, '1': 0.0, '2': 0.0, '3': 0.0, '4': ..."


Unnamed: 0,QID,Sensitive,dCA,pCA,Prior,Posterior,Histogram
0,['UID'],BrowsingHistory,0.0,247986.0,4e-06,1.0,"{'0': 0.0, '1': 0.0, '2': 0.0, '3': 0.0, '4': ..."


In [27]:
# Full-precision Prior value from each table.
display(results['re_id']['Prior'][0], results['att_inf']['Prior'][0])

4.032485704838176e-06

4.032485704838176e-06

In [28]:
# Full-precision Posterior value from each table.
display(results['re_id']['Posterior'][0], results['att_inf']['Posterior'][0])

1.0

1.0

In [29]:
# Posterior-to-prior ratio for each table.
display(results['re_id']['Posterior'][0]/results['re_id']['Prior'][0], results['att_inf']['Posterior'][0]/results['att_inf']['Prior'][0])

247986.0

247986.0

## Experiment 3: Topics with Generalization on AOL reduced dataset with Citizen Lab Classification

### Define browsing histories and lists of topics

In [30]:
%%time
# One entry per user: RandID, the string form of its (Domain, QueryTime) list, and the
# flat list of every topic label attached to its records (duplicates kept).
d = {'RandID': [], 'BrowsingHistory': [], 'AllTopics': []}
for rand_id, visits in aol_reduced_citizen_lab.groupby('RandID'):
    history = list(zip(visits['Domain'], visits['QueryTime']))
    # A record's `topics` cell is a comma-separated label list; spaces are stripped before splitting.
    all_topics = [
        label
        for labels in visits['topics']
        for label in labels.replace(" ", "").split(",")
    ]
    d['RandID'].append(rand_id)
    d['BrowsingHistory'].append(str(history))
    d['AllTopics'].append(all_topics)

CPU times: user 2min 10s, sys: 2.59 s, total: 2min 13s
Wall time: 2min 10s


In [31]:
%%time
# Every distinct topic label that occurs for at least one user. The list is sorted so
# that its order is the same on every kernel restart (plain set iteration order for
# strings varies between interpreter sessions).
aol_reduced_citizen_lab_seen_topics = sorted({topic for case in d['AllTopics'] for topic in case})
display(len(aol_reduced_citizen_lab_seen_topics))

31

CPU times: user 705 ms, sys: 40.1 ms, total: 745 ms
Wall time: 741 ms


In [32]:
# Materialise the collected lists as a frame with one row per user.
aol_reduced_citizen_lab_generalization = pandas.DataFrame(data=d, columns=['RandID', 'BrowsingHistory', 'AllTopics'])

In [33]:
# Preview: browsing history and the full (duplicated) topic list per user.
display(aol_reduced_citizen_lab_generalization)

Unnamed: 0,RandID,BrowsingHistory,AllTopics
0,0,"[('imdb.com', Timestamp('2006-04-20 17:37:26')...","['CULTR', 'CULTR', 'GOVT', 'POLR', 'SRCH', 'CO..."
1,3,"[('myspace.com', Timestamp('2006-05-20 00:35:5...","['GRP', 'HOST', 'CULTR', 'COMM', 'COMM', 'ALDR']"
2,4,"[('microsoft.com', Timestamp('2006-03-29 22:32...","['COMT', 'FILE', 'COMM', 'COMT', 'FILE', 'COMM']"
3,10,"[('wikipedia.org', Timestamp('2006-03-06 21:22...","['CULTR', 'HUMR', 'POLR', 'SRCH', 'PUBH', 'MIL..."
4,12,"[('yahoo.com', Timestamp('2006-04-19 17:14:24'...","['CULTR', 'GOVT', 'POLR', 'SRCH', 'COMT', 'COM..."
...,...,...,...
247981,521683,"[('wellsfargo.com', Timestamp('2006-05-05 19:4...","['COMM', 'COMT', 'GRP', 'SRCH', 'NEWS']"
247982,521685,"[('bbc.co.uk', Timestamp('2006-03-15 10:43:00'...","['POLR', 'MILX', 'HUMR', 'NEWS', 'CULTR', 'CUL..."
247983,521686,"[('yahoo.com', Timestamp('2006-03-09 18:44:13'...","['CULTR', 'GOVT', 'POLR', 'SRCH', 'COMT', 'COM..."
247984,521689,"[('imdb.com', Timestamp('2006-03-29 00:35:14')...","['CULTR', 'CULTR', 'HUMR', 'POLR', 'SRCH', 'PU..."


In [34]:
# Start TopicsSet as AllTopics; the following cell replaces each entry with its sorted, de-duplicated form.
aol_reduced_citizen_lab_generalization['TopicsSet'] = aol_reduced_citizen_lab_generalization['AllTopics']

In [35]:
%%time
# Replace every user's topic list by its sorted list of distinct topics. A single
# column assignment avoids one `.at` read and write per row.
aol_reduced_citizen_lab_generalization['TopicsSet'] = (
    aol_reduced_citizen_lab_generalization['TopicsSet'].map(lambda topics: sorted(set(topics)))
)

CPU times: user 16.8 s, sys: 71.7 ms, total: 16.9 s
Wall time: 16.9 s


In [36]:
# Preview with the new TopicsSet column.
display(aol_reduced_citizen_lab_generalization)

Unnamed: 0,RandID,BrowsingHistory,AllTopics,TopicsSet
0,0,"[('imdb.com', Timestamp('2006-04-20 17:37:26')...","['CULTR', 'CULTR', 'GOVT', 'POLR', 'SRCH', 'CO...","['COMM', 'COMT', 'CULTR', 'GOVT', 'NEWS', 'POL..."
1,3,"[('myspace.com', Timestamp('2006-05-20 00:35:5...","['GRP', 'HOST', 'CULTR', 'COMM', 'COMM', 'ALDR']","['ALDR', 'COMM', 'CULTR', 'GRP', 'HOST']"
2,4,"[('microsoft.com', Timestamp('2006-03-29 22:32...","['COMT', 'FILE', 'COMM', 'COMT', 'FILE', 'COMM']","['COMM', 'COMT', 'FILE']"
3,10,"[('wikipedia.org', Timestamp('2006-03-06 21:22...","['CULTR', 'HUMR', 'POLR', 'SRCH', 'PUBH', 'MIL...","['COMM', 'CULTR', 'HUMR', 'LGBT', 'MILX', 'NEW..."
4,12,"[('yahoo.com', Timestamp('2006-04-19 17:14:24'...","['CULTR', 'GOVT', 'POLR', 'SRCH', 'COMT', 'COM...","['COMM', 'COMT', 'CTRL', 'CULTR', 'DATE', 'ENV..."
...,...,...,...,...
247981,521683,"[('wellsfargo.com', Timestamp('2006-05-05 19:4...","['COMM', 'COMT', 'GRP', 'SRCH', 'NEWS']","['COMM', 'COMT', 'GRP', 'NEWS', 'SRCH']"
247982,521685,"[('bbc.co.uk', Timestamp('2006-03-15 10:43:00'...","['POLR', 'MILX', 'HUMR', 'NEWS', 'CULTR', 'CUL...","['CULTR', 'GAME', 'HUMR', 'MILX', 'NEWS', 'POLR']"
247983,521686,"[('yahoo.com', Timestamp('2006-03-09 18:44:13'...","['CULTR', 'GOVT', 'POLR', 'SRCH', 'COMT', 'COM...","['COMM', 'COMT', 'CTRL', 'CULTR', 'DATE', 'ENV..."
247984,521689,"[('imdb.com', Timestamp('2006-03-29 00:35:14')...","['CULTR', 'CULTR', 'HUMR', 'POLR', 'SRCH', 'PU...","['COMT', 'CTRL', 'CULTR', 'DATE', 'ENV', 'FILE..."


In [37]:
# Placeholder: sTopics starts as AllTopics and is overwritten once the top-S selection has been computed.
aol_reduced_citizen_lab_generalization['sTopics'] = aol_reduced_citizen_lab_generalization['AllTopics']

In [38]:
%%time
def _select_top_s(topic_counts, candidate_topics, s, rng):
    """Pick one user's `s` most frequent topics, breaking boundary ties at random.

    Parameters
    ----------
    topic_counts : dict
        Maps each topic to the number of times it occurs for the user
        (zero for topics that never occur).
    candidate_topics : list
        All topics that may be used as padding when the user has fewer than
        `s` topics with a non-zero count.
    s : int
        Number of topics to return.
    rng : random.Random
        Generator used for padding and for tie-breaking.

    Returns
    -------
    dict
        Exactly `s` entries ``{topic: count}``, ordered by topic name.
        Padding topics carry a count of 0.

    Raises
    ------
    ValueError
        If fewer than `s` topics are available in total (raised by `rng.sample`).
    """
    observed = {topic: count for topic, count in topic_counts.items() if count != 0}

    if len(observed) <= s:
        # Not enough observed topics: pad with distinct random unobserved ones.
        unobserved = [topic for topic in candidate_topics if topic not in observed]
        selected = dict(observed)
        for topic in rng.sample(unobserved, s - len(observed)):
            selected[topic] = 0
    else:
        ranked = sorted(observed.items(), key=lambda item: item[1], reverse=True)
        # Count of the s-th most frequent topic: everything strictly above it is
        # certainly selected, everything equal to it competes for the open slots.
        cutoff = ranked[s - 1][1]
        selected = {topic: count for topic, count in ranked if count > cutoff}
        tied = [topic for topic, count in ranked if count == cutoff]
        # Every tied topic (including the least-ranked one) has the same chance.
        for topic in rng.sample(tied, s - len(selected)):
            selected[topic] = cutoff

    return dict(sorted(selected.items()))


# Keep a reference to the OS-backed generator. A bare `random.SystemRandom()` call
# builds an instance and throws it away, leaving `random.choice` on the default generator.
rng = random.SystemRandom()
S = 5

# Per-user occurrence count of every seen topic (zero when the topic never occurs).
aol_reduced_citizen_lab_topics_by_users = {i : {t : 0 for t in aol_reduced_citizen_lab_seen_topics} for i in aol_reduced_citizen_lab_generalization['RandID'].unique()}

for row in aol_reduced_citizen_lab_generalization.itertuples():
    user_counts = aol_reduced_citizen_lab_topics_by_users[row.RandID]
    for topic in row.AllTopics:
        user_counts[topic] += 1

# RandID -> {topic: count} for the S selected topics, ordered by topic name.
temp_top_S = {
    rand_id: _select_top_s(counts, aol_reduced_citizen_lab_seen_topics, S, rng)
    for rand_id, counts in aol_reduced_citizen_lab_topics_by_users.items()
}

CPU times: user 8.85 s, sys: 60 ms, total: 8.91 s
Wall time: 8.91 s


In [39]:
%%time
# Attach each user's {topic: count} top-S dict with a single column assignment
# instead of one `.at` write per row.
aol_reduced_citizen_lab_generalization['sTopics'] = (
    aol_reduced_citizen_lab_generalization['RandID'].map(lambda rand_id: temp_top_S[rand_id])
)

CPU times: user 5.59 s, sys: 3.96 ms, total: 5.6 s
Wall time: 5.59 s


In [40]:
# Preview: sTopics now holds the selected topics together with their counts.
display(aol_reduced_citizen_lab_generalization)

Unnamed: 0,RandID,BrowsingHistory,AllTopics,TopicsSet,sTopics
0,0,"[('imdb.com', Timestamp('2006-04-20 17:37:26')...","['CULTR', 'CULTR', 'GOVT', 'POLR', 'SRCH', 'CO...","['COMM', 'COMT', 'CULTR', 'GOVT', 'NEWS', 'POL...","{''COMM'': 4, ''COMT'': 4, ''CULTR'': 5, ''GOV..."
1,3,"[('myspace.com', Timestamp('2006-05-20 00:35:5...","['GRP', 'HOST', 'CULTR', 'COMM', 'COMM', 'ALDR']","['ALDR', 'COMM', 'CULTR', 'GRP', 'HOST']","{''ALDR'': 1, ''COMM'': 2, ''CULTR'': 1, ''GRP..."
2,4,"[('microsoft.com', Timestamp('2006-03-29 22:32...","['COMT', 'FILE', 'COMM', 'COMT', 'FILE', 'COMM']","['COMM', 'COMT', 'FILE']","{''COMM'': 2, ''COMT'': 2, ''FILE'': 2, ''PUBH..."
3,10,"[('wikipedia.org', Timestamp('2006-03-06 21:22...","['CULTR', 'HUMR', 'POLR', 'SRCH', 'PUBH', 'MIL...","['COMM', 'CULTR', 'HUMR', 'LGBT', 'MILX', 'NEW...","{''COMM'': 2, ''CULTR'': 3, ''MILX'': 2, ''NEW..."
4,12,"[('yahoo.com', Timestamp('2006-04-19 17:14:24'...","['CULTR', 'GOVT', 'POLR', 'SRCH', 'COMT', 'COM...","['COMM', 'COMT', 'CTRL', 'CULTR', 'DATE', 'ENV...","{''CULTR'': 17, ''GRP'': 13, ''HOST'': 16, ''N..."
...,...,...,...,...,...
247981,521683,"[('wellsfargo.com', Timestamp('2006-05-05 19:4...","['COMM', 'COMT', 'GRP', 'SRCH', 'NEWS']","['COMM', 'COMT', 'GRP', 'NEWS', 'SRCH']","{''COMM'': 1, ''COMT'': 1, ''GRP'': 1, ''NEWS'..."
247982,521685,"[('bbc.co.uk', Timestamp('2006-03-15 10:43:00'...","['POLR', 'MILX', 'HUMR', 'NEWS', 'CULTR', 'CUL...","['CULTR', 'GAME', 'HUMR', 'MILX', 'NEWS', 'POLR']","{''CULTR'': 2, ''GAME'': 1, ''MILX'': 1, ''NEW..."
247983,521686,"[('yahoo.com', Timestamp('2006-03-09 18:44:13'...","['CULTR', 'GOVT', 'POLR', 'SRCH', 'COMT', 'COM...","['COMM', 'COMT', 'CTRL', 'CULTR', 'DATE', 'ENV...","{''COMT'': 14, ''CULTR'': 14, ''GOVT'': 14, ''..."
247984,521689,"[('imdb.com', Timestamp('2006-03-29 00:35:14')...","['CULTR', 'CULTR', 'HUMR', 'POLR', 'SRCH', 'PU...","['COMT', 'CTRL', 'CULTR', 'DATE', 'ENV', 'FILE...","{''CULTR'': 5, ''DATE'': 3, ''NEWS'': 3, ''POL..."


In [41]:
%%time
# Sanity check of the selection: a zero count must belong to a padding topic (absent
# from the user's AllTopics) and a positive count to a topic present in AllTopics.
for row in aol_reduced_citizen_lab_generalization.itertuples():
    for topic, count in row.sTopics.items():
        seen = topic in row.AllTopics
        zero_but_seen = (count == 0) and seen
        positive_but_unseen = (count > 0) and (not seen)
        if zero_but_seen or positive_but_unseen:
            print("Error!")

CPU times: user 699 ms, sys: 20 µs, total: 699 ms
Wall time: 697 ms


In [42]:
%%time
# Keep only the selected topic names (the counts are dropped), again with a single
# column assignment instead of one `.at` write per row.
aol_reduced_citizen_lab_generalization['sTopics'] = (
    aol_reduced_citizen_lab_generalization['RandID'].map(lambda rand_id: list(temp_top_S[rand_id].keys()))
)

CPU times: user 5.71 s, sys: 7.95 ms, total: 5.72 s
Wall time: 5.72 s


In [43]:
# Preview: sTopics is now a plain list of S topic names per user.
display(aol_reduced_citizen_lab_generalization)

Unnamed: 0,RandID,BrowsingHistory,AllTopics,TopicsSet,sTopics
0,0,"[('imdb.com', Timestamp('2006-04-20 17:37:26')...","['CULTR', 'CULTR', 'GOVT', 'POLR', 'SRCH', 'CO...","['COMM', 'COMT', 'CULTR', 'GOVT', 'NEWS', 'POL...","['COMM', 'COMT', 'CULTR', 'GOVT', 'POLR']"
1,3,"[('myspace.com', Timestamp('2006-05-20 00:35:5...","['GRP', 'HOST', 'CULTR', 'COMM', 'COMM', 'ALDR']","['ALDR', 'COMM', 'CULTR', 'GRP', 'HOST']","['ALDR', 'COMM', 'CULTR', 'GRP', 'HOST']"
2,4,"[('microsoft.com', Timestamp('2006-03-29 22:32...","['COMT', 'FILE', 'COMM', 'COMT', 'FILE', 'COMM']","['COMM', 'COMT', 'FILE']","['COMM', 'COMT', 'FILE', 'PUBH', 'REL']"
3,10,"[('wikipedia.org', Timestamp('2006-03-06 21:22...","['CULTR', 'HUMR', 'POLR', 'SRCH', 'PUBH', 'MIL...","['COMM', 'CULTR', 'HUMR', 'LGBT', 'MILX', 'NEW...","['COMM', 'CULTR', 'MILX', 'NEWS', 'POLR']"
4,12,"[('yahoo.com', Timestamp('2006-04-19 17:14:24'...","['CULTR', 'GOVT', 'POLR', 'SRCH', 'COMT', 'COM...","['COMM', 'COMT', 'CTRL', 'CULTR', 'DATE', 'ENV...","['CULTR', 'GRP', 'HOST', 'NEWS', 'SRCH']"
...,...,...,...,...,...
247981,521683,"[('wellsfargo.com', Timestamp('2006-05-05 19:4...","['COMM', 'COMT', 'GRP', 'SRCH', 'NEWS']","['COMM', 'COMT', 'GRP', 'NEWS', 'SRCH']","['COMM', 'COMT', 'GRP', 'NEWS', 'SRCH']"
247982,521685,"[('bbc.co.uk', Timestamp('2006-03-15 10:43:00'...","['POLR', 'MILX', 'HUMR', 'NEWS', 'CULTR', 'CUL...","['CULTR', 'GAME', 'HUMR', 'MILX', 'NEWS', 'POLR']","['CULTR', 'GAME', 'MILX', 'NEWS', 'POLR']"
247983,521686,"[('yahoo.com', Timestamp('2006-03-09 18:44:13'...","['CULTR', 'GOVT', 'POLR', 'SRCH', 'COMT', 'COM...","['COMM', 'COMT', 'CTRL', 'CULTR', 'DATE', 'ENV...","['COMT', 'CULTR', 'GOVT', 'NEWS', 'SRCH']"
247984,521689,"[('imdb.com', Timestamp('2006-03-29 00:35:14')...","['CULTR', 'CULTR', 'HUMR', 'POLR', 'SRCH', 'PU...","['COMT', 'CTRL', 'CULTR', 'DATE', 'ENV', 'FILE...","['CULTR', 'DATE', 'NEWS', 'POLR', 'REL']"


### Privacy

In [44]:
# Experiment 3 privacy input: only the user id and the reported top-S topics.
temp = (
    aol_reduced_citizen_lab_generalization
    .copy()
    .drop(columns=['BrowsingHistory', 'AllTopics', 'TopicsSet'])
)
display(temp)

Unnamed: 0,RandID,sTopics
0,0,"['COMM', 'COMT', 'CULTR', 'GOVT', 'POLR']"
1,3,"['ALDR', 'COMM', 'CULTR', 'GRP', 'HOST']"
2,4,"['COMM', 'COMT', 'FILE', 'PUBH', 'REL']"
3,10,"['COMM', 'CULTR', 'MILX', 'NEWS', 'POLR']"
4,12,"['CULTR', 'GRP', 'HOST', 'NEWS', 'SRCH']"
...,...,...
247981,521683,"['COMM', 'COMT', 'GRP', 'NEWS', 'SRCH']"
247982,521685,"['CULTR', 'GAME', 'MILX', 'NEWS', 'POLR']"
247983,521686,"['COMT', 'CULTR', 'GOVT', 'NEWS', 'SRCH']"
247984,521689,"['CULTR', 'DATE', 'NEWS', 'POLR', 'REL']"


In [45]:
%%time
# Experiment 3, privacy: register sTopics via qids() and RandID via sensitive(),
# then run the assessment.
E3P = BVM(temp)
E3P.qids(['sTopics'])
E3P.sensitive(['RandID'])
results = E3P.assess()

CPU times: user 967 ms, sys: 22 µs, total: 967 ms
Wall time: 965 ms


In [46]:
# Show the 're_id' and 'att_inf' result tables returned by assess().
display(results['re_id'], results['att_inf'])

Unnamed: 0,QID,dCR,pCR,Prior,Posterior,Histogram
0,['sTopics'],0.087146,34217,4e-06,0.13798,"{'0': 0.5461356689490535, '1': 0.0976627712854..."


Unnamed: 0,QID,Sensitive,dCA,pCA,Prior,Posterior,Histogram
0,['sTopics'],RandID,0.087146,34217.0,4e-06,0.13798,"{'0': 0.5461356689490535, '1': 0.0976627712854..."


In [47]:
# Full-precision Prior value from each table.
display(results['re_id']['Prior'][0], results['att_inf']['Prior'][0])

4.032485704838176e-06

4.032485704838176e-06

In [48]:
# Full-precision Posterior value from each table.
display(results['re_id']['Posterior'][0], results['att_inf']['Posterior'][0])

0.13797956336244788

0.13797956336244788

In [49]:
# Posterior-to-prior ratio for each table.
display(results['re_id']['Posterior'][0]/results['re_id']['Prior'][0], results['att_inf']['Posterior'][0]/results['att_inf']['Prior'][0])

34217.0

34217.0

### Utility

In [50]:
# Experiment 3 utility input: the reported top-S topics paired with the browsing history.
temp = (
    aol_reduced_citizen_lab_generalization
    .copy()
    .drop(columns=['RandID', 'AllTopics', 'TopicsSet'])
)
display(temp)

Unnamed: 0,BrowsingHistory,sTopics
0,"[('imdb.com', Timestamp('2006-04-20 17:37:26')...","['COMM', 'COMT', 'CULTR', 'GOVT', 'POLR']"
1,"[('myspace.com', Timestamp('2006-05-20 00:35:5...","['ALDR', 'COMM', 'CULTR', 'GRP', 'HOST']"
2,"[('microsoft.com', Timestamp('2006-03-29 22:32...","['COMM', 'COMT', 'FILE', 'PUBH', 'REL']"
3,"[('wikipedia.org', Timestamp('2006-03-06 21:22...","['COMM', 'CULTR', 'MILX', 'NEWS', 'POLR']"
4,"[('yahoo.com', Timestamp('2006-04-19 17:14:24'...","['CULTR', 'GRP', 'HOST', 'NEWS', 'SRCH']"
...,...,...
247981,"[('wellsfargo.com', Timestamp('2006-05-05 19:4...","['COMM', 'COMT', 'GRP', 'NEWS', 'SRCH']"
247982,"[('bbc.co.uk', Timestamp('2006-03-15 10:43:00'...","['CULTR', 'GAME', 'MILX', 'NEWS', 'POLR']"
247983,"[('yahoo.com', Timestamp('2006-03-09 18:44:13'...","['COMT', 'CULTR', 'GOVT', 'NEWS', 'SRCH']"
247984,"[('imdb.com', Timestamp('2006-03-29 00:35:14')...","['CULTR', 'DATE', 'NEWS', 'POLR', 'REL']"


In [51]:
%%time
# Experiment 3, utility: register sTopics via qids() and BrowsingHistory via sensitive(),
# then run the assessment.
E3U = BVM(temp)
E3U.qids(['sTopics'])
E3U.sensitive(['BrowsingHistory'])
results = E3U.assess()

CPU times: user 1.73 s, sys: 25 µs, total: 1.73 s
Wall time: 1.73 s


In [52]:
# Show the 're_id' and 'att_inf' result tables returned by assess().
display(results['re_id'], results['att_inf'])

Unnamed: 0,QID,dCR,pCR,Prior,Posterior,Histogram
0,['sTopics'],0.087146,34217,4e-06,0.13798,"{'0': 0.5461356689490535, '1': 0.0976627712854..."


Unnamed: 0,QID,Sensitive,dCA,pCA,Prior,Posterior,Histogram
0,['sTopics'],BrowsingHistory,0.087146,34217.0,4e-06,0.13798,"{'0': 0.5461356689490535, '1': 0.0976627712854..."


In [53]:
# Full-precision Prior value from each table.
display(results['re_id']['Prior'][0], results['att_inf']['Prior'][0])

4.032485704838176e-06

4.032485704838176e-06

In [54]:
# Full-precision Posterior value from each table.
display(results['re_id']['Posterior'][0], results['att_inf']['Posterior'][0])

0.13797956336244788

0.13797956336244788

In [55]:
# Posterior-to-prior ratio for each table.
display(results['re_id']['Posterior'][0]/results['re_id']['Prior'][0], results['att_inf']['Posterior'][0]/results['att_inf']['Prior'][0])

34217.0

34217.0

### Unique sets

In [56]:
%%time
# Turn every topic list into its string form so the column is hashable and distinct
# lists can be counted. One column assignment replaces a `.at` read and write per row.
aol_reduced_citizen_lab_generalization['sTopics'] = aol_reduced_citizen_lab_generalization['sTopics'].map(str)

CPU times: user 15.8 s, sys: 23.9 ms, total: 15.8 s
Wall time: 15.8 s


In [57]:
%%time
# Number of distinct sTopics strings across all users.
display(aol_reduced_citizen_lab_generalization['sTopics'].nunique())

34217

CPU times: user 32.3 ms, sys: 3.85 ms, total: 36.1 ms
Wall time: 34.4 ms


## Experiments 4 and 5: Topics with Generalization and Bounded Noise on AOL reduced dataset with Citizen Lab Classification; and Topics with Generalization, Bounded Noise, and Differential Privacy on AOL reduced dataset with Citizen Lab Classification

In [58]:
# Starting point for Experiments 4 and 5 (sTopics is the string form produced above).
display(aol_reduced_citizen_lab_generalization)

Unnamed: 0,RandID,BrowsingHistory,AllTopics,TopicsSet,sTopics
0,0,"[('imdb.com', Timestamp('2006-04-20 17:37:26')...","['CULTR', 'CULTR', 'GOVT', 'POLR', 'SRCH', 'CO...","['COMM', 'COMT', 'CULTR', 'GOVT', 'NEWS', 'POL...","[""'COMM'"", ""'COMT'"", ""'CULTR'"", ""'GOVT'"", ""'PO..."
1,3,"[('myspace.com', Timestamp('2006-05-20 00:35:5...","['GRP', 'HOST', 'CULTR', 'COMM', 'COMM', 'ALDR']","['ALDR', 'COMM', 'CULTR', 'GRP', 'HOST']","[""'ALDR'"", ""'COMM'"", ""'CULTR'"", ""'GRP'"", ""'HOS..."
2,4,"[('microsoft.com', Timestamp('2006-03-29 22:32...","['COMT', 'FILE', 'COMM', 'COMT', 'FILE', 'COMM']","['COMM', 'COMT', 'FILE']","[""'COMM'"", ""'COMT'"", ""'FILE'"", ""'PUBH'"", ""'REL'""]"
3,10,"[('wikipedia.org', Timestamp('2006-03-06 21:22...","['CULTR', 'HUMR', 'POLR', 'SRCH', 'PUBH', 'MIL...","['COMM', 'CULTR', 'HUMR', 'LGBT', 'MILX', 'NEW...","[""'COMM'"", ""'CULTR'"", ""'MILX'"", ""'NEWS'"", ""'PO..."
4,12,"[('yahoo.com', Timestamp('2006-04-19 17:14:24'...","['CULTR', 'GOVT', 'POLR', 'SRCH', 'COMT', 'COM...","['COMM', 'COMT', 'CTRL', 'CULTR', 'DATE', 'ENV...","[""'CULTR'"", ""'GRP'"", ""'HOST'"", ""'NEWS'"", ""'SRC..."
...,...,...,...,...,...
247981,521683,"[('wellsfargo.com', Timestamp('2006-05-05 19:4...","['COMM', 'COMT', 'GRP', 'SRCH', 'NEWS']","['COMM', 'COMT', 'GRP', 'NEWS', 'SRCH']","[""'COMM'"", ""'COMT'"", ""'GRP'"", ""'NEWS'"", ""'SRCH'""]"
247982,521685,"[('bbc.co.uk', Timestamp('2006-03-15 10:43:00'...","['POLR', 'MILX', 'HUMR', 'NEWS', 'CULTR', 'CUL...","['CULTR', 'GAME', 'HUMR', 'MILX', 'NEWS', 'POLR']","[""'CULTR'"", ""'GAME'"", ""'MILX'"", ""'NEWS'"", ""'PO..."
247983,521686,"[('yahoo.com', Timestamp('2006-03-09 18:44:13'...","['CULTR', 'GOVT', 'POLR', 'SRCH', 'COMT', 'COM...","['COMM', 'COMT', 'CTRL', 'CULTR', 'DATE', 'ENV...","[""'COMT'"", ""'CULTR'"", ""'GOVT'"", ""'NEWS'"", ""'SR..."
247984,521689,"[('imdb.com', Timestamp('2006-03-29 00:35:14')...","['CULTR', 'CULTR', 'HUMR', 'POLR', 'SRCH', 'PU...","['COMT', 'CTRL', 'CULTR', 'DATE', 'ENV', 'FILE...","[""'CULTR'"", ""'DATE'"", ""'NEWS'"", ""'POLR'"", ""'RE..."


### Privacy

In [59]:
%%time
S = 5
M = len(aol_reduced_citizen_lab_seen_topics)
R = 0.05

# Boolean user x topic matrix: True where the topic is one of the user's S selected
# topics (padding topics with count 0 included). It is built once and shared by both
# channels. Columns are reindexed to every seen topic so each row always has M
# columns, even if some topic was never selected for any user.
top_S_membership = (
    pandas.DataFrame(data=temp_top_S).T
    .notna()
    .reindex(columns=sorted(aol_reduced_citizen_lab_seen_topics), fill_value=False)
    .sort_index()
    .astype(float)
)

# Bounded noise: each selected topic is reported with probability 1/S, others never.
aol_reduced_citizen_lab_channel_bn = top_S_membership * (1/S)

# Bounded noise mixed with a uniform term: with weight (1-R) report as above, with
# weight R report any of the M topics uniformly.
aol_reduced_citizen_lab_channel_bn_dp = top_S_membership * ((1-R)/S) + (R/M)

CPU times: user 19.1 s, sys: 208 ms, total: 19.3 s
Wall time: 19.3 s


In [60]:
# Row-wise running sums of both channels: the last column reads 1.0 when a row sums to one.
display(aol_reduced_citizen_lab_channel_bn.cumsum(axis=1))
display(aol_reduced_citizen_lab_channel_bn_dp.cumsum(axis=1))

Unnamed: 0,'ALDR','ANON','COMM','COMT','CTRL','CULTR','DATE','ECON','ENV','FILE',...,'MISC','MMED','NEWS','POLR','PORN','PROV','PUBH','REL','SRCH','XED'
0,0.0,0.0,0.2,0.4,0.4,0.6,0.6,0.6,0.6,0.6,...,0.8,0.8,0.8,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,0.2,0.2,0.4,0.4,0.4,0.6,0.6,0.6,0.6,0.6,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,0.0,0.0,0.2,0.4,0.4,0.4,0.4,0.4,0.4,0.6,...,0.6,0.6,0.6,0.6,0.6,0.6,0.8,1.0,1.0,1.0
10,0.0,0.0,0.2,0.2,0.2,0.4,0.4,0.4,0.4,0.4,...,0.6,0.6,0.8,1.0,1.0,1.0,1.0,1.0,1.0,1.0
12,0.0,0.0,0.0,0.0,0.0,0.2,0.2,0.2,0.2,0.2,...,0.6,0.6,0.8,0.8,0.8,0.8,0.8,0.8,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
521683,0.0,0.0,0.2,0.4,0.4,0.4,0.4,0.4,0.4,0.4,...,0.6,0.6,0.8,0.8,0.8,0.8,0.8,0.8,1.0,1.0
521685,0.0,0.0,0.0,0.0,0.0,0.2,0.2,0.2,0.2,0.2,...,0.6,0.6,0.8,1.0,1.0,1.0,1.0,1.0,1.0,1.0
521686,0.0,0.0,0.0,0.2,0.2,0.4,0.4,0.4,0.4,0.4,...,0.6,0.6,0.8,0.8,0.8,0.8,0.8,0.8,1.0,1.0
521689,0.0,0.0,0.0,0.0,0.0,0.2,0.4,0.4,0.4,0.4,...,0.4,0.4,0.6,0.8,0.8,0.8,0.8,1.0,1.0,1.0


Unnamed: 0,'ALDR','ANON','COMM','COMT','CTRL','CULTR','DATE','ECON','ENV','FILE',...,'MISC','MMED','NEWS','POLR','PORN','PROV','PUBH','REL','SRCH','XED'
0,0.001613,0.003226,0.194839,0.386452,0.388065,0.579677,0.58129,0.582903,0.584516,0.586129,...,0.795484,0.797097,0.79871,0.990323,0.991935,0.993548,0.995161,0.996774,0.998387,1.0
3,0.191613,0.193226,0.384839,0.386452,0.388065,0.579677,0.58129,0.582903,0.584516,0.586129,...,0.985484,0.987097,0.98871,0.990323,0.991935,0.993548,0.995161,0.996774,0.998387,1.0
4,0.001613,0.003226,0.194839,0.386452,0.388065,0.389677,0.39129,0.392903,0.394516,0.586129,...,0.605484,0.607097,0.60871,0.610323,0.611935,0.613548,0.805161,0.996774,0.998387,1.0
10,0.001613,0.003226,0.194839,0.196452,0.198065,0.389677,0.39129,0.392903,0.394516,0.396129,...,0.605484,0.607097,0.79871,0.990323,0.991935,0.993548,0.995161,0.996774,0.998387,1.0
12,0.001613,0.003226,0.004839,0.006452,0.008065,0.199677,0.20129,0.202903,0.204516,0.206129,...,0.605484,0.607097,0.79871,0.800323,0.801935,0.803548,0.805161,0.806774,0.998387,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
521683,0.001613,0.003226,0.194839,0.386452,0.388065,0.389677,0.39129,0.392903,0.394516,0.396129,...,0.605484,0.607097,0.79871,0.800323,0.801935,0.803548,0.805161,0.806774,0.998387,1.0
521685,0.001613,0.003226,0.004839,0.006452,0.008065,0.199677,0.20129,0.202903,0.204516,0.206129,...,0.605484,0.607097,0.79871,0.990323,0.991935,0.993548,0.995161,0.996774,0.998387,1.0
521686,0.001613,0.003226,0.004839,0.196452,0.198065,0.389677,0.39129,0.392903,0.394516,0.396129,...,0.605484,0.607097,0.79871,0.800323,0.801935,0.803548,0.805161,0.806774,0.998387,1.0
521689,0.001613,0.003226,0.004839,0.006452,0.008065,0.199677,0.39129,0.392903,0.394516,0.396129,...,0.415484,0.417097,0.60871,0.800323,0.801935,0.803548,0.805161,0.996774,0.998387,1.0


In [61]:
%%time
# Uniform prior over the N users (one channel row per user).
N = aol_reduced_citizen_lab_channel_bn.shape[0]
aol_reduced_citizen_lab_bn_dp_prior = numpy.full(N, 1/N)

CPU times: user 22.6 ms, sys: 37 µs, total: 22.6 ms
Wall time: 22.3 ms


In [62]:
%%time
# Prior Bayes vulnerability, and posterior Bayes vulnerability under each channel.
bn_channel_matrix = aol_reduced_citizen_lab_channel_bn.to_numpy()
dp_channel_matrix = aol_reduced_citizen_lab_channel_bn_dp.to_numpy()

prior_vulnerability = qif.measure.bayes_vuln.prior(aol_reduced_citizen_lab_bn_dp_prior)
bn_posterior = qif.measure.bayes_vuln.posterior(aol_reduced_citizen_lab_bn_dp_prior, bn_channel_matrix)
dp_posterior = qif.measure.bayes_vuln.posterior(aol_reduced_citizen_lab_bn_dp_prior, dp_channel_matrix)

display("Prior: " + str(prior_vulnerability))
display("BN Posterior: " + str(bn_posterior))
display("DP Posterior: " + str(dp_posterior))

'Prior: 4.032485704838176e-06'

'BN Posterior: 2.5001411369996698e-05'

'DP Posterior: 2.3952965086738762e-05'

CPU times: user 21.4 ms, sys: 3 µs, total: 21.4 ms
Wall time: 20.2 ms


In [63]:
%%time
# Leakage of each channel: posterior Bayes vulnerability divided by the prior one.
prior_vulnerability = qif.measure.bayes_vuln.prior(aol_reduced_citizen_lab_bn_dp_prior)
bn_posterior = qif.measure.bayes_vuln.posterior(
    aol_reduced_citizen_lab_bn_dp_prior, aol_reduced_citizen_lab_channel_bn.to_numpy()
)
dp_posterior = qif.measure.bayes_vuln.posterior(
    aol_reduced_citizen_lab_bn_dp_prior, aol_reduced_citizen_lab_channel_bn_dp.to_numpy()
)

display("BN Leakage: " + str(bn_posterior/prior_vulnerability))
display("DP Leakage: " + str(dp_posterior/prior_vulnerability))

'BN Leakage: 6.200000000000001'

'DP Leakage: 5.9399999999999995'

CPU times: user 19.3 ms, sys: 0 ns, total: 19.3 ms
Wall time: 18.6 ms


### Utility

In [64]:
%%time
# Render every user's selected topics as a bracketed label with all quote characters
# removed, e.g. "[ALDR, COMM, CULTR, GRP, HOST]".
aol_reduced_citizen_lab_top_S = [
    str(list(top_topics.keys())).replace("\"", "").replace("\'", "")
    for top_topics in temp_top_S.values()
]

# Occurrences of each label and the total number of users.
aol_reduced_citizen_lab_top_S_count = len(aol_reduced_citizen_lab_top_S)
aol_reduced_citizen_lab_top_S_counts = dict()
for case in aol_reduced_citizen_lab_top_S:
    aol_reduced_citizen_lab_top_S_counts[case] = aol_reduced_citizen_lab_top_S_counts.get(case, 0) + 1

# NOTE(review): the prior dict is the SAME object as the counts dict, so after this
# loop `aol_reduced_citizen_lab_top_S_counts` holds relative frequencies rather than
# raw counts. Kept as-is; confirm that nothing later expects integer counts.
aol_reduced_citizen_lab_top_S_prior = aol_reduced_citizen_lab_top_S_counts
for case, occurrences in aol_reduced_citizen_lab_top_S_prior.items():
    aol_reduced_citizen_lab_top_S_prior[case] = occurrences / aol_reduced_citizen_lab_top_S_count

# Label/probability columns in first-occurrence order, then a frame indexed by label.
d = {
    'topics': list(aol_reduced_citizen_lab_top_S_prior.keys()),
    'prior': list(aol_reduced_citizen_lab_top_S_prior.values()),
}
aol_reduced_citizen_lab_top_S_prior = pandas.DataFrame(data=d).set_index('topics').sort_index()

CPU times: user 536 ms, sys: 0 ns, total: 536 ms
Wall time: 534 ms


In [65]:
# Show the prior over top-S topic-set labels (index: label, column: 'prior').
display(aol_reduced_citizen_lab_top_S_prior)

Unnamed: 0_level_0,prior
topics,Unnamed: 1_level_1
"[ALDR, ANON, COMM, COMT, CULTR]",0.000012
"[ALDR, ANON, COMM, COMT, ENV]",0.000008
"[ALDR, ANON, COMM, COMT, FILE]",0.000008
"[ALDR, ANON, COMM, COMT, GMB]",0.000004
"[ALDR, ANON, COMM, COMT, HACK]",0.000008
...,...
"[POLR, PROV, REL, SRCH, XED]",0.000012
"[POLR, PUBH, REL, SRCH, XED]",0.000008
"[PORN, PROV, PUBH, REL, SRCH]",0.000008
"[PORN, PROV, PUBH, SRCH, XED]",0.000004


In [66]:
%%time
# Build the two channel matrices over the top-S secrets.
#   rows    : the labels in d['topics'] (e.g. "[ALDR, ANON, COMM, COMT, CULTR]")
#   columns : the seen topics with their quote characters stripped
# Both frames end up with columns and rows sorted lexicographically.
S = 5      # 1/S is the per-topic weight; assumes every label lists exactly S topics -- TODO confirm
M = len(aol_reduced_citizen_lab_seen_topics)
R = 0.05   # weight given to the uniform-over-all-columns component in the `_dp` channel

channel_topics = [c.replace("'","") for c in aol_reduced_citizen_lab_seen_topics]
channel_secrets = list(d['topics'])

# membership[i, j] is True when column topic j appears in row label i.
# It is built once and shared by both channels, instead of writing every cell
# individually through `.at` into an object-dtype frame.
membership = numpy.array(
    [
        [topic in label_topics for topic in channel_topics]
        for label_topics in (
            label.replace('[','').replace(']','').split(', ')
            for label in channel_secrets
        )
    ],
    dtype=bool,
).reshape(len(channel_secrets), len(channel_topics))

# BN channel: 1/S on the label's own topics, 0 elsewhere.
aol_reduced_citizen_lab_channel_bn = pandas.DataFrame(
    numpy.where(membership, 1/S, 0.0),
    index=channel_secrets,
    columns=channel_topics,
)

# `_dp` channel: (1-R) times the BN entry plus R/M on every column.
uniform_share = (R/M) if M else 0.0  # M == 0 means there are no columns to fill
aol_reduced_citizen_lab_channel_bn_dp = pandas.DataFrame(
    numpy.where(membership, ((1-R)/S) + uniform_share, uniform_share),
    index=channel_secrets,
    columns=channel_topics,
)

aol_reduced_citizen_lab_channel_bn = aol_reduced_citizen_lab_channel_bn[sorted(aol_reduced_citizen_lab_channel_bn.columns.to_list())]
aol_reduced_citizen_lab_channel_bn = aol_reduced_citizen_lab_channel_bn.sort_index()

aol_reduced_citizen_lab_channel_bn_dp = aol_reduced_citizen_lab_channel_bn_dp[sorted(aol_reduced_citizen_lab_channel_bn_dp.columns.to_list())]
aol_reduced_citizen_lab_channel_bn_dp = aol_reduced_citizen_lab_channel_bn_dp.sort_index()

CPU times: user 39.9 s, sys: 3.7 ms, total: 39.9 s
Wall time: 39.9 s


In [67]:
# Running totals along each row of both channels; a row that is a probability
# distribution ends at 1.0 in its last column.
bn_running_row_totals = aol_reduced_citizen_lab_channel_bn.cumsum(axis=1)
dp_running_row_totals = aol_reduced_citizen_lab_channel_bn_dp.cumsum(axis=1)
display(bn_running_row_totals, dp_running_row_totals)

Unnamed: 0,ALDR,ANON,COMM,COMT,CTRL,CULTR,DATE,ECON,ENV,FILE,...,MISC,MMED,NEWS,POLR,PORN,PROV,PUBH,REL,SRCH,XED
"[ALDR, ANON, COMM, COMT, CULTR]",0.2,0.4,0.6,0.8,0.8,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
"[ALDR, ANON, COMM, COMT, ENV]",0.2,0.4,0.6,0.8,0.8,0.8,0.8,0.8,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
"[ALDR, ANON, COMM, COMT, FILE]",0.2,0.4,0.6,0.8,0.8,0.8,0.8,0.8,0.8,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
"[ALDR, ANON, COMM, COMT, GMB]",0.2,0.4,0.6,0.8,0.8,0.8,0.8,0.8,0.8,0.8,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
"[ALDR, ANON, COMM, COMT, HACK]",0.2,0.4,0.6,0.8,0.8,0.8,0.8,0.8,0.8,0.8,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"[POLR, PROV, REL, SRCH, XED]",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0.2,0.2,0.4,0.4,0.6,0.8,1.0
"[POLR, PUBH, REL, SRCH, XED]",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0.2,0.2,0.2,0.4,0.6,0.8,1.0
"[PORN, PROV, PUBH, REL, SRCH]",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0.2,0.4,0.6,0.8,1.0,1.0
"[PORN, PROV, PUBH, SRCH, XED]",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0.2,0.4,0.6,0.6,0.8,1.0


Unnamed: 0,ALDR,ANON,COMM,COMT,CTRL,CULTR,DATE,ECON,ENV,FILE,...,MISC,MMED,NEWS,POLR,PORN,PROV,PUBH,REL,SRCH,XED
"[ALDR, ANON, COMM, COMT, CULTR]",0.191613,0.383226,0.574839,0.766452,0.768065,0.959677,0.96129,0.962903,0.964516,0.966129,...,0.985484,0.987097,0.98871,0.990323,0.991935,0.993548,0.995161,0.996774,0.998387,1.0
"[ALDR, ANON, COMM, COMT, ENV]",0.191613,0.383226,0.574839,0.766452,0.768065,0.769677,0.77129,0.772903,0.964516,0.966129,...,0.985484,0.987097,0.98871,0.990323,0.991935,0.993548,0.995161,0.996774,0.998387,1.0
"[ALDR, ANON, COMM, COMT, FILE]",0.191613,0.383226,0.574839,0.766452,0.768065,0.769677,0.77129,0.772903,0.774516,0.966129,...,0.985484,0.987097,0.98871,0.990323,0.991935,0.993548,0.995161,0.996774,0.998387,1.0
"[ALDR, ANON, COMM, COMT, GMB]",0.191613,0.383226,0.574839,0.766452,0.768065,0.769677,0.77129,0.772903,0.774516,0.776129,...,0.985484,0.987097,0.98871,0.990323,0.991935,0.993548,0.995161,0.996774,0.998387,1.0
"[ALDR, ANON, COMM, COMT, HACK]",0.191613,0.383226,0.574839,0.766452,0.768065,0.769677,0.77129,0.772903,0.774516,0.776129,...,0.985484,0.987097,0.98871,0.990323,0.991935,0.993548,0.995161,0.996774,0.998387,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"[POLR, PROV, REL, SRCH, XED]",0.001613,0.003226,0.004839,0.006452,0.008065,0.009677,0.01129,0.012903,0.014516,0.016129,...,0.035484,0.037097,0.03871,0.230323,0.231935,0.423548,0.425161,0.616774,0.808387,1.0
"[POLR, PUBH, REL, SRCH, XED]",0.001613,0.003226,0.004839,0.006452,0.008065,0.009677,0.01129,0.012903,0.014516,0.016129,...,0.035484,0.037097,0.03871,0.230323,0.231935,0.233548,0.425161,0.616774,0.808387,1.0
"[PORN, PROV, PUBH, REL, SRCH]",0.001613,0.003226,0.004839,0.006452,0.008065,0.009677,0.01129,0.012903,0.014516,0.016129,...,0.035484,0.037097,0.03871,0.040323,0.231935,0.423548,0.615161,0.806774,0.998387,1.0
"[PORN, PROV, PUBH, SRCH, XED]",0.001613,0.003226,0.004839,0.006452,0.008065,0.009677,0.01129,0.012903,0.014516,0.016129,...,0.035484,0.037097,0.03871,0.040323,0.231935,0.423548,0.615161,0.616774,0.808387,1.0


In [68]:
%%time
# Bayes vulnerability under the top-S prior: the prior value first, then the
# posterior value for each of the two channel matrices.
top_S_prior_vector = aol_reduced_citizen_lab_top_S_prior['prior'].to_numpy()

top_S_prior_vuln = qif.measure.bayes_vuln.prior(top_S_prior_vector)
display("Prior: " + str(top_S_prior_vuln))

top_S_posterior_vuln_bn = qif.measure.bayes_vuln.posterior(
    top_S_prior_vector,
    aol_reduced_citizen_lab_channel_bn.to_numpy(),
)
display("BN Posterior: " + str(top_S_posterior_vuln_bn))

top_S_posterior_vuln_dp = qif.measure.bayes_vuln.posterior(
    top_S_prior_vector,
    aol_reduced_citizen_lab_channel_bn_dp.to_numpy(),
)
display("DP Posterior: " + str(top_S_posterior_vuln_dp))

'Prior: 0.05405547087335576'

'BN Posterior: 0.08808803722790803'

'DP Posterior: 0.08461180691001549'

CPU times: user 161 ms, sys: 0 ns, total: 161 ms
Wall time: 158 ms


In [69]:
%%time
# Leakage under the top-S prior, reported as posterior / prior Bayes vulnerability.
top_S_pi = aol_reduced_citizen_lab_top_S_prior['prior'].to_numpy()
top_S_leak_denominator = qif.measure.bayes_vuln.prior(top_S_pi)

top_S_leak_bn = qif.measure.bayes_vuln.posterior(
    top_S_pi,
    aol_reduced_citizen_lab_channel_bn.to_numpy(),
) / top_S_leak_denominator
display("BN Leakage: " + str(top_S_leak_bn))

top_S_leak_dp = qif.measure.bayes_vuln.posterior(
    top_S_pi,
    aol_reduced_citizen_lab_channel_bn_dp.to_numpy(),
) / top_S_leak_denominator
display("DP Leakage: " + str(top_S_leak_dp))

'BN Leakage: 1.62958597538232'

'DP Leakage: 1.5652774001034764'

CPU times: user 160 ms, sys: 0 ns, total: 160 ms
Wall time: 157 ms


#### Bayes over uniform prior

In [70]:
# Number of rows in the BN channel; used as the size of the uniform prior.
N = aol_reduced_citizen_lab_channel_bn.shape[0]

In [71]:
%%time
# Same leakage ratio as before, but with a uniform prior (1/N on each of the N rows).
uniform_pi = numpy.array([1/N for _ in range(N)])
uniform_leak_denominator = qif.measure.bayes_vuln.prior(uniform_pi)

uniform_leak_bn = qif.measure.bayes_vuln.posterior(
    uniform_pi,
    aol_reduced_citizen_lab_channel_bn.to_numpy(),
) / uniform_leak_denominator
display("BN Leakage: " + str(uniform_leak_bn))

uniform_leak_dp = qif.measure.bayes_vuln.posterior(
    uniform_pi,
    aol_reduced_citizen_lab_channel_bn_dp.to_numpy(),
) / uniform_leak_denominator
display("DP Leakage: " + str(uniform_leak_dp))

'BN Leakage: 6.200000000000002'

'DP Leakage: 5.9399999999999995'

CPU times: user 167 ms, sys: 0 ns, total: 167 ms
Wall time: 166 ms


#### IBA gain

In [72]:
%%time
# Build the 0/1 matrix G_IBA.
#   rows    : the labels in d['topics'] (e.g. "[ALDR, ANON, COMM, COMT, CULTR]")
#   columns : the seen topics with their quote characters stripped
#   entry   : 1 when the column topic appears in the row label, otherwise 0
# The transpose (topics x labels) is what gets displayed here.
gain_topics = [c.replace("'","") for c in aol_reduced_citizen_lab_seen_topics]
gain_secrets = list(d['topics'])

# The whole matrix is filled in one pass as an integer array, rather than
# writing each cell through `.at` into an object-dtype frame.
gain_entries = numpy.array(
    [
        [topic in label_topics for topic in gain_topics]
        for label_topics in (
            label.replace('[','').replace(']','').split(', ')
            for label in gain_secrets
        )
    ],
    dtype=bool,
).reshape(len(gain_secrets), len(gain_topics)).astype(int)

G_IBA = pandas.DataFrame(gain_entries, index=gain_secrets, columns=gain_topics)
G_IBA = G_IBA[sorted(G_IBA.columns.to_list())]
G_IBA = G_IBA.sort_index()
display(G_IBA.T)

Unnamed: 0,"[ALDR, ANON, COMM, COMT, CULTR]","[ALDR, ANON, COMM, COMT, ENV]","[ALDR, ANON, COMM, COMT, FILE]","[ALDR, ANON, COMM, COMT, GMB]","[ALDR, ANON, COMM, COMT, HACK]","[ALDR, ANON, COMM, COMT, NEWS]","[ALDR, ANON, COMM, COMT, PORN]","[ALDR, ANON, COMM, COMT, PROV]","[ALDR, ANON, COMM, CTRL, CULTR]","[ALDR, ANON, COMM, CTRL, ECON]",...,"[NEWS, PORN, PUBH, REL, XED]","[NEWS, PORN, PUBH, SRCH, XED]","[NEWS, PROV, REL, SRCH, XED]","[POLR, PORN, PROV, PUBH, SRCH]","[POLR, PORN, PROV, PUBH, XED]","[POLR, PROV, REL, SRCH, XED]","[POLR, PUBH, REL, SRCH, XED]","[PORN, PROV, PUBH, REL, SRCH]","[PORN, PROV, PUBH, SRCH, XED]","[PROV, PUBH, REL, SRCH, XED]"
ALDR,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
ANON,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
COMM,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
COMT,1,1,1,1,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
CTRL,0,0,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
CULTR,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
DATE,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ECON,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
ENV,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
FILE,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


CPU times: user 20.2 s, sys: 15.7 ms, total: 20.2 s
Wall time: 20.2 s


In [73]:
%%time
# g-vulnerability with the transposed G_IBA matrix under the top-S prior:
# the prior value first, then the posterior value for each channel matrix.
iba_gain_matrix = G_IBA.T.to_numpy(dtype=numpy.int32)
iba_pi = aol_reduced_citizen_lab_top_S_prior['prior'].to_numpy()

iba_prior_vuln = qif.measure.g_vuln.prior(iba_gain_matrix, iba_pi)
display("Prior: " + str(iba_prior_vuln))

iba_posterior_vuln_bn = qif.measure.g_vuln.posterior(
    iba_gain_matrix,
    iba_pi,
    aol_reduced_citizen_lab_channel_bn.to_numpy(),
)
display("BN Posterior: " + str(iba_posterior_vuln_bn))

iba_posterior_vuln_dp = qif.measure.g_vuln.posterior(
    iba_gain_matrix,
    iba_pi,
    aol_reduced_citizen_lab_channel_bn_dp.to_numpy(),
)
display("DP Posterior: " + str(iba_posterior_vuln_dp))

'Prior: 0.6122926294226185'

'BN Posterior: 1.0000000000000002'

'DP Posterior: 0.9580645161290327'

CPU times: user 1.7 s, sys: 885 ms, total: 2.59 s
Wall time: 1.28 s


In [74]:
%%time
# Leakage for the G_IBA matrix, reported as posterior / prior g-vulnerability.
iba_gain = G_IBA.T.to_numpy(dtype=numpy.int32)
iba_prior_vector = aol_reduced_citizen_lab_top_S_prior['prior'].to_numpy()
iba_leak_denominator = qif.measure.g_vuln.prior(iba_gain, iba_prior_vector)

iba_leak_bn = qif.measure.g_vuln.posterior(
    iba_gain,
    iba_prior_vector,
    aol_reduced_citizen_lab_channel_bn.to_numpy(),
) / iba_leak_denominator
display("BN Leakage: " + str(iba_leak_bn))

iba_leak_dp = qif.measure.g_vuln.posterior(
    iba_gain,
    iba_prior_vector,
    aol_reduced_citizen_lab_channel_bn_dp.to_numpy(),
) / iba_leak_denominator
display("DP Leakage: " + str(iba_leak_dp))

'BN Leakage: 1.6332060063224723'

'DP Leakage: 1.5647167221863687'

CPU times: user 2.2 s, sys: 1.37 s, total: 3.57 s
Wall time: 1.64 s
