# Quantitative Information Flow (QIF) Analyses (AOL treated dataset)

In [1]:
import pandas, random
from bvmlib.bvm import BVM

## AOL treated dataset

In [2]:
%%time
# Read the treated AOL dataset from CSV into the aol_treated DataFrame.
# The first CSV column is used as the index; malformed lines are skipped
# with a warning instead of aborting the load.
aol_treated = pandas.read_csv(
    'AOL-treated.csv',
    index_col=0,
    low_memory=False,
    on_bad_lines='warn',
)

CPU times: user 22.7 s, sys: 1.68 s, total: 24.4 s
Wall time: 24.2 s


In [3]:
%%time
# Convert the QueryTime column to pandas datetime values, replacing the column.
aol_treated['QueryTime'] = aol_treated['QueryTime'].pipe(pandas.to_datetime)

CPU times: user 24 s, sys: 371 ms, total: 24.3 s
Wall time: 24.2 s


In [4]:
# Preview the loaded dataset after the QueryTime conversion.
display(aol_treated)

Unnamed: 0,RandID,QueryTime,Domain
0,0,2006-05-13 00:12:16,edzapp.com
1,0,2006-05-13 00:15:28,riverdale.k12.or.us
2,0,2006-05-13 00:15:28,riverdale.k12.or.us
3,0,2006-05-13 00:22:39,salkeiz.k12.or.us
4,0,2006-05-13 00:22:39,greatschools.net
...,...,...,...
19426288,521691,2006-03-07 21:42:51,ups.com
19426289,521691,2006-03-15 19:24:17,google.com
19426290,521691,2006-04-01 18:45:41,google.com
19426291,521691,2006-04-02 01:33:52,supercross.com


## Drop singletons

In [5]:
%%time
# Collect the RandIDs of individuals whose browsing history contains exactly
# one distinct domain; these "singletons" are removed in the next step.
temp = aol_treated.groupby('RandID')[['Domain']].nunique()
singletons = temp.index[temp['Domain'].eq(1).to_numpy()].to_list()
# Report how many individuals will be dropped.
print(len(singletons))

85601
CPU times: user 13.2 s, sys: 552 ms, total: 13.8 s
Wall time: 13.8 s


In [6]:
# Keep only the rows whose RandID is not one of the singleton individuals.
aol_treated = aol_treated.loc[~aol_treated['RandID'].isin(singletons)]

In [7]:
# Renumber the index after filtering; drop=True discards the old index
# instead of keeping it as a column.
aol_treated = aol_treated.reset_index(drop=True)

In [8]:
# Inspect the dataset after removing the singleton individuals.
display(aol_treated)

Unnamed: 0,RandID,QueryTime,Domain
0,0,2006-05-13 00:12:16,edzapp.com
1,0,2006-05-13 00:15:28,riverdale.k12.or.us
2,0,2006-05-13 00:15:28,riverdale.k12.or.us
3,0,2006-05-13 00:22:39,salkeiz.k12.or.us
4,0,2006-05-13 00:22:39,greatschools.net
...,...,...,...
19305776,521691,2006-03-07 21:42:51,ups.com
19305777,521691,2006-03-15 19:24:17,google.com
19305778,521691,2006-04-01 18:45:41,google.com
19305779,521691,2006-04-02 01:33:52,supercross.com


## Drop outlier

In [9]:
%%time
# Count rows per RandID. In the recorded output one individual (463921) has
# 150802 rows against 6227 for the runner-up; it is most probably a bot and
# is dropped in the next step.
display(aol_treated['RandID'].value_counts())

RandID
463921    150802
362554      6227
304342      5741
501741      4272
342660      3775
           ...  
54859          2
418762         2
288640         2
483619         2
360861         2
Name: count, Length: 436006, dtype: int64

CPU times: user 273 ms, sys: 136 ms, total: 410 ms
Wall time: 405 ms


In [10]:
# RandID of the outlier found via value_counts() above: it alone accounts for
# 150802 rows and is most probably a bot.
OUTLIER_RAND_ID = 463921
# Filter with a boolean mask rather than materialising a list of index labels
# to drop: the surviving rows are the same, and the result no longer depends
# on the index labels being unique.
aol_treated = aol_treated[aol_treated['RandID'] != OUTLIER_RAND_ID]

In [11]:
# Renumber the index after removing the outlier; drop=True discards the old
# index instead of keeping it as a column.
aol_treated = aol_treated.reset_index(drop=True)

In [12]:
# Inspect the dataset after removing the outlier individual.
display(aol_treated)

Unnamed: 0,RandID,QueryTime,Domain
0,0,2006-05-13 00:12:16,edzapp.com
1,0,2006-05-13 00:15:28,riverdale.k12.or.us
2,0,2006-05-13 00:15:28,riverdale.k12.or.us
3,0,2006-05-13 00:22:39,salkeiz.k12.or.us
4,0,2006-05-13 00:22:39,greatschools.net
...,...,...,...
19154974,521691,2006-03-07 21:42:51,ups.com
19154975,521691,2006-03-15 19:24:17,google.com
19154976,521691,2006-04-01 18:45:41,google.com
19154977,521691,2006-04-02 01:33:52,supercross.com


## Experiment 1: Third-party cookies on AOL treated dataset

### Define browsing histories

In [13]:
%%time
# Pair every visited domain with its query timestamp: each row receives a
# (Domain, QueryTime) tuple in the new BrowsingHistory column.
aol_treated['BrowsingHistory'] = list(
    aol_treated[['Domain', 'QueryTime']].itertuples(index=False, name=None)
)

CPU times: user 1min 2s, sys: 2.05 s, total: 1min 4s
Wall time: 1min 4s


In [14]:
# Inspect the dataset with the new per-row BrowsingHistory tuples.
display(aol_treated)

Unnamed: 0,RandID,QueryTime,Domain,BrowsingHistory
0,0,2006-05-13 00:12:16,edzapp.com,"(edzapp.com, 2006-05-13 00:12:16)"
1,0,2006-05-13 00:15:28,riverdale.k12.or.us,"(riverdale.k12.or.us, 2006-05-13 00:15:28)"
2,0,2006-05-13 00:15:28,riverdale.k12.or.us,"(riverdale.k12.or.us, 2006-05-13 00:15:28)"
3,0,2006-05-13 00:22:39,salkeiz.k12.or.us,"(salkeiz.k12.or.us, 2006-05-13 00:22:39)"
4,0,2006-05-13 00:22:39,greatschools.net,"(greatschools.net, 2006-05-13 00:22:39)"
...,...,...,...,...
19154974,521691,2006-03-07 21:42:51,ups.com,"(ups.com, 2006-03-07 21:42:51)"
19154975,521691,2006-03-15 19:24:17,google.com,"(google.com, 2006-03-15 19:24:17)"
19154976,521691,2006-04-01 18:45:41,google.com,"(google.com, 2006-04-01 18:45:41)"
19154977,521691,2006-04-02 01:33:52,supercross.com,"(supercross.com, 2006-04-02 01:33:52)"


In [15]:
%%time
# Collapse the per-row (Domain, QueryTime) tuples into one list per RandID.
# sort=False keeps the individuals in order of first appearance, and
# reset_index() turns RandID back into a regular column.
aol_treated_browsing_history = (
    aol_treated
    .groupby('RandID', sort=False)[['BrowsingHistory']]
    .agg(list)
    .reset_index()
)

CPU times: user 18.8 s, sys: 519 ms, total: 19.3 s
Wall time: 19.3 s


In [16]:
%%time
# Replace each list of tuples with its string representation.
# NOTE(review): presumably needed so BVM can treat each history as a single
# hashable value — confirm against bvmlib.
aol_treated_browsing_history['BrowsingHistory'] = aol_treated_browsing_history['BrowsingHistory'].map(str)

CPU times: user 1min 1s, sys: 865 ms, total: 1min 2s
Wall time: 1min 2s


In [17]:
# Inspect the per-individual browsing histories (one row per RandID).
display(aol_treated_browsing_history)

Unnamed: 0,RandID,BrowsingHistory
0,0,"[('edzapp.com', Timestamp('2006-05-13 00:12:16..."
1,1,"[('kidshealth.org', Timestamp('2006-03-01 11:5..."
2,3,"[('lawyers.com', Timestamp('2006-03-02 17:38:4..."
3,4,"[('foxnews.com', Timestamp('2006-03-12 20:33:2..."
4,5,"[('tvsquad.com', Timestamp('2006-03-06 23:15:2..."
...,...,...
436000,521687,"[('christianspeakers.com', Timestamp('2006-03-..."
436001,521688,"[('winonamanufacturing.com', Timestamp('2006-0..."
436002,521689,"[('dilithiumnetworks.com', Timestamp('2006-03-..."
436003,521690,"[('dfas.mil', Timestamp('2006-03-01 22:06:56')..."


### Privacy

In [18]:
# Build the privacy-analysis frame: one row per individual, holding RandID and
# a UID column that is an exact copy of it. BrowsingHistory is left out.
temp = aol_treated_browsing_history[['RandID']].copy()
temp['UID'] = temp['RandID']
# Show the frame together with the number of distinct values per column.
display(temp, temp.nunique())

Unnamed: 0,RandID,UID
0,0,0
1,1,1
2,3,3
3,4,4
4,5,5
...,...,...
436000,521687,521687
436001,521688,521688
436002,521689,521689
436003,521690,521690


RandID    436005
UID       436005
dtype: int64

In [19]:
%%time
# Experiment 1, privacy: run the BVM assessment with UID as the
# quasi-identifier and RandID as the sensitive attribute.
E1P = BVM(temp)
E1P.qids(['UID'])
E1P.sensitive(['RandID'])
# The cells below read the 're_id' and 'att_inf' entries of this result.
results = E1P.assess()

CPU times: user 3.61 s, sys: 0 ns, total: 3.61 s
Wall time: 3.6 s


In [20]:
# Show the 're_id' and 'att_inf' result tables of the privacy assessment.
display(results['re_id'], results['att_inf'])

Unnamed: 0,QID,dCR,pCR,Prior,Posterior,Histogram
0,['UID'],1.0,436005,2e-06,1.0,"{'0': 0.0, '1': 0.0, '2': 0.0, '3': 0.0, '4': ..."


Unnamed: 0,QID,Sensitive,dCA,pCA,Prior,Posterior,Histogram
0,['UID'],RandID,0.0,436005.0,2e-06,1.0,"{'0': 0.0, '1': 0.0, '2': 0.0, '3': 0.0, '4': ..."


In [21]:
# Prior Bayes vulnerability, first from the 're_id' table and then from 'att_inf'.
display(
    results['re_id'].loc[0, 'Prior'],
    results['att_inf'].loc[0, 'Prior'],
)

2.2935516794532174e-06

2.2935516794532174e-06

In [22]:
# Posterior Bayes vulnerability, first from the 're_id' table and then from 'att_inf'.
display(
    results['re_id'].loc[0, 'Posterior'],
    results['att_inf'].loc[0, 'Posterior'],
)

1.0

1.0

In [23]:
# Bayes leakage: posterior vulnerability divided by prior vulnerability,
# first for the 're_id' table and then for 'att_inf'.
display(
    results['re_id'].loc[0, 'Posterior'] / results['re_id'].loc[0, 'Prior'],
    results['att_inf'].loc[0, 'Posterior'] / results['att_inf'].loc[0, 'Prior'],
)

436005.0

436005.0

### Utility

In [24]:
# Build the utility-analysis frame: the per-individual browsing histories with
# the RandID column renamed to UID (rename returns a new frame, so the source
# frame is left untouched).
temp = aol_treated_browsing_history.rename(columns={'RandID': 'UID'})
# Show the frame together with the number of distinct values per column.
display(temp, temp.nunique())

Unnamed: 0,UID,BrowsingHistory
0,0,"[('edzapp.com', Timestamp('2006-05-13 00:12:16..."
1,1,"[('kidshealth.org', Timestamp('2006-03-01 11:5..."
2,3,"[('lawyers.com', Timestamp('2006-03-02 17:38:4..."
3,4,"[('foxnews.com', Timestamp('2006-03-12 20:33:2..."
4,5,"[('tvsquad.com', Timestamp('2006-03-06 23:15:2..."
...,...,...
436000,521687,"[('christianspeakers.com', Timestamp('2006-03-..."
436001,521688,"[('winonamanufacturing.com', Timestamp('2006-0..."
436002,521689,"[('dilithiumnetworks.com', Timestamp('2006-03-..."
436003,521690,"[('dfas.mil', Timestamp('2006-03-01 22:06:56')..."


UID                436005
BrowsingHistory    436005
dtype: int64

In [25]:
%%time
# Experiment 1, utility: run the BVM assessment with UID as the
# quasi-identifier and BrowsingHistory as the sensitive attribute.
E1U = BVM(temp)
E1U.qids(['UID'])
E1U.sensitive(['BrowsingHistory'])
# Rebinds `results`; the cells below read its 're_id' and 'att_inf' entries.
results = E1U.assess()

CPU times: user 7.68 s, sys: 4.23 ms, total: 7.68 s
Wall time: 7.67 s


In [26]:
# Show the 're_id' and 'att_inf' result tables of the utility assessment.
display(results['re_id'], results['att_inf'])

Unnamed: 0,QID,dCR,pCR,Prior,Posterior,Histogram
0,['UID'],1.0,436005,2e-06,1.0,"{'0': 0.0, '1': 0.0, '2': 0.0, '3': 0.0, '4': ..."


Unnamed: 0,QID,Sensitive,dCA,pCA,Prior,Posterior,Histogram
0,['UID'],BrowsingHistory,0.0,436005.0,2e-06,1.0,"{'0': 0.0, '1': 0.0, '2': 0.0, '3': 0.0, '4': ..."


In [27]:
# Prior Bayes vulnerability, first from the 're_id' table and then from 'att_inf'.
display(
    results['re_id'].loc[0, 'Prior'],
    results['att_inf'].loc[0, 'Prior'],
)

2.2935516794532174e-06

2.2935516794532174e-06

In [28]:
# Posterior Bayes vulnerability, first from the 're_id' table and then from 'att_inf'.
display(
    results['re_id'].loc[0, 'Posterior'],
    results['att_inf'].loc[0, 'Posterior'],
)

1.0

1.0

In [29]:
# Bayes leakage: posterior vulnerability divided by prior vulnerability,
# first for the 're_id' table and then for 'att_inf'.
display(
    results['re_id'].loc[0, 'Posterior'] / results['re_id'].loc[0, 'Prior'],
    results['att_inf'].loc[0, 'Posterior'] / results['att_inf'].loc[0, 'Prior'],
)

436005.0

436005.0