# AOL Experimental Dataset

In [1]:
import pandas, random

## AOL treated dataset

In [2]:
%%time
aol_treated = pandas.read_csv('AOL-treated.csv', low_memory=False, on_bad_lines='warn', index_col=0)

CPU times: user 32.6 s, sys: 1.67 s, total: 34.3 s
Wall time: 34.3 s


In [3]:
%%time
aol_treated['QueryTime'] = pandas.to_datetime(aol_treated['QueryTime'])

CPU times: user 26.7 s, sys: 257 ms, total: 27 s
Wall time: 26.9 s


In [4]:
display(aol_treated)

Unnamed: 0,RandID,QueryTime,Domain
0,0,2006-03-06 13:59:48,about.com
1,0,2006-03-09 17:02:21,unclefed.com
2,0,2006-03-09 17:07:54,swanklaw.com
3,0,2006-03-09 17:09:53,state.or.us
4,0,2006-03-21 18:54:26,city-data.com
...,...,...,...
19426288,521691,2006-05-29 19:49:16,sing365.com
19426289,521691,2006-05-29 20:03:50,azlyrics.com
19426290,521691,2006-05-29 20:07:23,azlyrics.com
19426291,521691,2006-05-29 20:35:09,sing365.com


### Drop singletons

In [5]:
%%time
singletons = (aol_treated[['RandID','Domain']].groupby('RandID').count() == 1).index[(aol_treated[['RandID','Domain']].groupby('RandID').count() == 1)['Domain'].to_list()]

CPU times: user 6.3 s, sys: 284 ms, total: 6.58 s
Wall time: 6.56 s


In [6]:
%%time
rows_to_drop = []
for i in singletons:
    rows_to_drop.append(aol_treated[aol_treated['RandID'] == i].index.to_list()[0])

CPU times: user 32min 40s, sys: 895 ms, total: 32min 41s
Wall time: 32min 41s


In [7]:
display(len(rows_to_drop))

70406

In [8]:
aol_treated = aol_treated.drop(rows_to_drop)

In [9]:
aol_treated = aol_treated.reset_index(drop=True)

In [10]:
display(aol_treated)

Unnamed: 0,RandID,QueryTime,Domain
0,0,2006-03-06 13:59:48,about.com
1,0,2006-03-09 17:02:21,unclefed.com
2,0,2006-03-09 17:07:54,swanklaw.com
3,0,2006-03-09 17:09:53,state.or.us
4,0,2006-03-21 18:54:26,city-data.com
...,...,...,...
19355882,521691,2006-05-29 19:49:16,sing365.com
19355883,521691,2006-05-29 20:03:50,azlyrics.com
19355884,521691,2006-05-29 20:07:23,azlyrics.com
19355885,521691,2006-05-29 20:35:09,sing365.com


### Drop outlier

In [11]:
%%time
display(aol_treated['RandID'].value_counts())

RandID
83937     150802
183239      6227
125488      5741
475224      4272
443325      3775
           ...  
134002         2
235997         2
478236         2
320889         2
385208         2
Name: count, Length: 451201, dtype: int64

CPU times: user 392 ms, sys: 100 ms, total: 492 ms
Wall time: 506 ms


In [12]:
aol_treated = aol_treated.drop(aol_treated[aol_treated['RandID'] == 83937].index.to_list())

In [13]:
aol_treated = aol_treated.reset_index(drop=True)

In [14]:
display(aol_treated)

Unnamed: 0,RandID,QueryTime,Domain
0,0,2006-03-06 13:59:48,about.com
1,0,2006-03-09 17:02:21,unclefed.com
2,0,2006-03-09 17:07:54,swanklaw.com
3,0,2006-03-09 17:09:53,state.or.us
4,0,2006-03-21 18:54:26,city-data.com
...,...,...,...
19205080,521691,2006-05-29 19:49:16,sing365.com
19205081,521691,2006-05-29 20:03:50,azlyrics.com
19205082,521691,2006-05-29 20:07:23,azlyrics.com
19205083,521691,2006-05-29 20:35:09,sing365.com


### Define browsing histories

In [15]:
%%time
d = {'RandID': [], 'BrowsingHistory': []}
for k, v in aol_treated.groupby('RandID').__iter__():
    temp = []
    for tup in v.itertuples():
        temp.append(tuple([tup.Domain, tup.QueryTime]))
    d['BrowsingHistory'].append(str(temp))
    d['RandID'].append(k)

CPU times: user 9min 43s, sys: 3.63 s, total: 9min 46s
Wall time: 9min 47s


In [16]:
aol_treated_browsing_history = pandas.DataFrame(data=d, columns=['RandID', 'BrowsingHistory'])

In [17]:
display(aol_treated_browsing_history)

Unnamed: 0,RandID,BrowsingHistory
0,0,"[('about.com', Timestamp('2006-03-06 13:59:48'..."
1,2,"[('jr.com', Timestamp('2006-03-01 21:59:34')),..."
2,3,"[('webdate.com', Timestamp('2006-05-19 00:39:4..."
3,4,"[('microsoft.com', Timestamp('2006-03-29 22:32..."
4,5,"[('about.com', Timestamp('2006-05-30 21:58:47'..."
...,...,...
451195,521687,"[('1001freefonts.com', Timestamp('2006-04-08 1..."
451196,521688,"[('pluggedincleveland.com', Timestamp('2006-04..."
451197,521689,"[('arab2.com', Timestamp('2006-03-08 03:17:38'..."
451198,521690,"[('washingtonpost.com', Timestamp('2006-05-11 ..."


### Statistics

`RandID` statistics.

In [18]:
display(aol_treated_browsing_history['RandID'].value_counts().describe([.25,.50,.75,.80,.85,.90,.95,.96,.97,.98,.99]))

count    451200.0
mean          1.0
std           0.0
min           1.0
25%           1.0
50%           1.0
75%           1.0
80%           1.0
85%           1.0
90%           1.0
95%           1.0
96%           1.0
97%           1.0
98%           1.0
99%           1.0
max           1.0
Name: count, dtype: float64

Number of `RandID`s per count of records, e.g. 1 `RandID` has 150802 records and 70406 `RandID`s have 1 record.

### Save to file

In [19]:
aol_treated_browsing_history.to_csv('AOL-experimental.csv')