# QIF Analyses (AOL treated dataset)

In [1]:
import pandas, random
from bvmlib.bvm import BVM

## AOL treated dataset

In [2]:
%%time
aol_treated = pandas.read_csv('AOL-treated.csv', low_memory=False, on_bad_lines='warn', index_col=0)

CPU times: user 14.3 s, sys: 1.37 s, total: 15.7 s
Wall time: 15.7 s


In [3]:
%%time
aol_treated['QueryTime'] = pandas.to_datetime(aol_treated['QueryTime'])

CPU times: user 12.6 s, sys: 321 ms, total: 12.9 s
Wall time: 12.8 s


In [4]:
display(aol_treated)

Unnamed: 0,RandID,QueryTime,Domain
0,0,2006-03-06 13:59:48,about.com
1,0,2006-03-09 17:02:21,unclefed.com
2,0,2006-03-09 17:07:54,swanklaw.com
3,0,2006-03-09 17:09:53,state.or.us
4,0,2006-03-21 18:54:26,city-data.com
...,...,...,...
19426288,521691,2006-05-29 19:49:16,sing365.com
19426289,521691,2006-05-29 20:03:50,azlyrics.com
19426290,521691,2006-05-29 20:07:23,azlyrics.com
19426291,521691,2006-05-29 20:35:09,sing365.com


## Drop singletons

In [5]:
%%time
singletons = (aol_treated[['RandID','Domain']].groupby('RandID').count() == 1).index[(aol_treated[['RandID','Domain']].groupby('RandID').count() == 1)['Domain'].to_list()]

CPU times: user 3.35 s, sys: 312 ms, total: 3.66 s
Wall time: 3.65 s


In [6]:
%%time
rows_to_drop = []
for i in singletons:
    rows_to_drop.append(aol_treated[aol_treated['RandID'] == i].index.to_list()[0])

CPU times: user 20min 34s, sys: 512 ms, total: 20min 35s
Wall time: 20min 34s


In [7]:
display(len(rows_to_drop))

70406

In [8]:
aol_treated = aol_treated.drop(rows_to_drop)

In [9]:
aol_treated = aol_treated.reset_index(drop=True)

In [10]:
display(aol_treated)

Unnamed: 0,RandID,QueryTime,Domain
0,0,2006-03-06 13:59:48,about.com
1,0,2006-03-09 17:02:21,unclefed.com
2,0,2006-03-09 17:07:54,swanklaw.com
3,0,2006-03-09 17:09:53,state.or.us
4,0,2006-03-21 18:54:26,city-data.com
...,...,...,...
19355882,521691,2006-05-29 19:49:16,sing365.com
19355883,521691,2006-05-29 20:03:50,azlyrics.com
19355884,521691,2006-05-29 20:07:23,azlyrics.com
19355885,521691,2006-05-29 20:35:09,sing365.com


## Drop outlier

In [11]:
%%time
display(aol_treated['RandID'].value_counts())

RandID
83937     150802
183239      6227
125488      5741
475224      4272
443325      3775
           ...  
134002         2
235997         2
478236         2
320889         2
385208         2
Name: count, Length: 451201, dtype: int64

CPU times: user 173 ms, sys: 116 ms, total: 289 ms
Wall time: 288 ms


In [12]:
aol_treated = aol_treated.drop(aol_treated[aol_treated['RandID'] == 83937].index.to_list())

In [13]:
aol_treated = aol_treated.reset_index(drop=True)

In [14]:
display(aol_treated)

Unnamed: 0,RandID,QueryTime,Domain
0,0,2006-03-06 13:59:48,about.com
1,0,2006-03-09 17:02:21,unclefed.com
2,0,2006-03-09 17:07:54,swanklaw.com
3,0,2006-03-09 17:09:53,state.or.us
4,0,2006-03-21 18:54:26,city-data.com
...,...,...,...
19205080,521691,2006-05-29 19:49:16,sing365.com
19205081,521691,2006-05-29 20:03:50,azlyrics.com
19205082,521691,2006-05-29 20:07:23,azlyrics.com
19205083,521691,2006-05-29 20:35:09,sing365.com


## Experiment 1: Third-party cookies on AOL treated dataset

### Define browsing histories

In [15]:
%%time
d = {'RandID': [], 'BrowsingHistory': []}
for k, v in aol_treated.groupby('RandID').__iter__():
    temp = []
    for tup in v.itertuples():
        temp.append(tuple([tup.Domain, tup.QueryTime]))
    d['BrowsingHistory'].append(str(temp))
    d['RandID'].append(k)

CPU times: user 4min 13s, sys: 3.9 s, total: 4min 17s
Wall time: 4min 14s


In [16]:
aol_treated_browsing_history = pandas.DataFrame(data=d, columns=['RandID', 'BrowsingHistory'])

In [17]:
display(aol_treated_browsing_history)

Unnamed: 0,RandID,BrowsingHistory
0,0,"[('about.com', Timestamp('2006-03-06 13:59:48'..."
1,2,"[('jr.com', Timestamp('2006-03-01 21:59:34')),..."
2,3,"[('webdate.com', Timestamp('2006-05-19 00:39:4..."
3,4,"[('microsoft.com', Timestamp('2006-03-29 22:32..."
4,5,"[('about.com', Timestamp('2006-05-30 21:58:47'..."
...,...,...
451195,521687,"[('1001freefonts.com', Timestamp('2006-04-08 1..."
451196,521688,"[('pluggedincleveland.com', Timestamp('2006-04..."
451197,521689,"[('arab2.com', Timestamp('2006-03-08 03:17:38'..."
451198,521690,"[('washingtonpost.com', Timestamp('2006-05-11 ..."


### Privacy

In [18]:
temp = aol_treated_browsing_history.copy()
temp['UID'] = temp['RandID']
temp = temp.drop(columns=['BrowsingHistory'], inplace=False)
display(temp, temp.nunique())

Unnamed: 0,RandID,UID
0,0,0
1,2,2
2,3,3
3,4,4
4,5,5
...,...,...
451195,521687,521687
451196,521688,521688
451197,521689,521689
451198,521690,521690


RandID    451200
UID       451200
dtype: int64

In [19]:
%%time
E1P = BVM(temp)
E1P.qids(['UID'])
E1P.sensitive(['RandID'])
results = E1P.assess()

CPU times: user 2.81 s, sys: 3.99 ms, total: 2.81 s
Wall time: 2.81 s


In [20]:
display(results['re_id'], results['att_inf'])

Unnamed: 0,QID,dCR,pCR,Prior,Posterior,Histogram
0,['UID'],1.0,451200,2e-06,1.0,"{'0': 0.0, '1': 0.0, '2': 0.0, '3': 0.0, '4': ..."


Unnamed: 0,QID,Sensitive,dCA,pCA,Prior,Posterior,Histogram
0,['UID'],RandID,0.0,451200.0,2e-06,1.0,"{'0': 0.0, '1': 0.0, '2': 0.0, '3': 0.0, '4': ..."


In [21]:
display(results['re_id']['Prior'][0], results['att_inf']['Prior'][0])

2.2163120567375886e-06

2.2163120567375886e-06

In [22]:
display(results['re_id']['Posterior'][0], results['att_inf']['Posterior'][0])

1.0

1.0

In [23]:
display(results['re_id']['Posterior'][0]/results['re_id']['Prior'][0], results['att_inf']['Posterior'][0]/results['att_inf']['Prior'][0])

451200.0

451200.0

### Utility

In [24]:
temp = aol_treated_browsing_history.copy()
temp = temp.rename(columns={'RandID':'UID'}, inplace=False)
display(temp, temp.nunique())

Unnamed: 0,UID,BrowsingHistory
0,0,"[('about.com', Timestamp('2006-03-06 13:59:48'..."
1,2,"[('jr.com', Timestamp('2006-03-01 21:59:34')),..."
2,3,"[('webdate.com', Timestamp('2006-05-19 00:39:4..."
3,4,"[('microsoft.com', Timestamp('2006-03-29 22:32..."
4,5,"[('about.com', Timestamp('2006-05-30 21:58:47'..."
...,...,...
451195,521687,"[('1001freefonts.com', Timestamp('2006-04-08 1..."
451196,521688,"[('pluggedincleveland.com', Timestamp('2006-04..."
451197,521689,"[('arab2.com', Timestamp('2006-03-08 03:17:38'..."
451198,521690,"[('washingtonpost.com', Timestamp('2006-05-11 ..."


UID                451200
BrowsingHistory    451200
dtype: int64

In [25]:
%%time
E1U = BVM(temp)
E1U.qids(['UID'])
E1U.sensitive(['BrowsingHistory'])
results = E1U.assess()

CPU times: user 5.63 s, sys: 4 ms, total: 5.63 s
Wall time: 5.63 s


In [26]:
display(results['re_id'], results['att_inf'])

Unnamed: 0,QID,dCR,pCR,Prior,Posterior,Histogram
0,['UID'],1.0,451200,2e-06,1.0,"{'0': 0.0, '1': 0.0, '2': 0.0, '3': 0.0, '4': ..."


Unnamed: 0,QID,Sensitive,dCA,pCA,Prior,Posterior,Histogram
0,['UID'],BrowsingHistory,0.0,451200.0,2e-06,1.0,"{'0': 0.0, '1': 0.0, '2': 0.0, '3': 0.0, '4': ..."


In [27]:
display(results['re_id']['Prior'][0], results['att_inf']['Prior'][0])

2.2163120567375886e-06

2.2163120567375886e-06

In [28]:
display(results['re_id']['Posterior'][0], results['att_inf']['Posterior'][0])

1.0

1.0

In [29]:
display(results['re_id']['Posterior'][0]/results['re_id']['Prior'][0], results['att_inf']['Posterior'][0]/results['att_inf']['Prior'][0])

451200.0

451200.0