# AOL Experimental Dataset

In [1]:
import pandas
from collections import Counter

## AOL treated dataset

In [2]:
%%time
# Loads AOL-treated dataset into data DataFrame.
aol_treated = pandas.read_csv('AOL-treated.csv', low_memory=False, on_bad_lines='warn', index_col=0)

CPU times: user 23.8 s, sys: 1.93 s, total: 25.8 s
Wall time: 25.6 s


In [3]:
%%time
# Convert the QueryTime column to pandas datetimes so the date range (min/max) can be computed later.
aol_treated['QueryTime'] = pandas.to_datetime(aol_treated['QueryTime'])

CPU times: user 22.9 s, sys: 352 ms, total: 23.3 s
Wall time: 23.1 s


In [4]:
# Preview the loaded frame (first and last rows).
display(aol_treated)

Unnamed: 0,RandID,QueryTime,Domain
0,0,2006-05-13 00:12:16,edzapp.com
1,0,2006-05-13 00:15:28,riverdale.k12.or.us
2,0,2006-05-13 00:15:28,riverdale.k12.or.us
3,0,2006-05-13 00:22:39,salkeiz.k12.or.us
4,0,2006-05-13 00:22:39,greatschools.net
...,...,...,...
19426288,521691,2006-03-07 21:42:51,ups.com
19426289,521691,2006-03-15 19:24:17,google.com
19426290,521691,2006-04-01 18:45:41,google.com
19426291,521691,2006-04-02 01:33:52,supercross.com


### Drop singletons

In [5]:
%%time
# Lists rows for RandID individuals with only one domain in their browsing history, to be dropped.
temp = aol_treated[['RandID','Domain']].groupby(['RandID']).nunique()
singletons = temp[temp['Domain'] == 1].index.to_list()
print(len(singletons))

85601
CPU times: user 13.6 s, sys: 491 ms, total: 14.1 s
Wall time: 14.1 s


In [6]:
# Keep only the rows whose RandID is not one of the singleton individuals.
aol_treated = aol_treated.loc[~aol_treated['RandID'].isin(singletons)]

In [7]:
# Renumber the rows from 0 after the filtering; the previous index labels are discarded.
aol_treated = aol_treated.reset_index(drop=True)

In [8]:
# Preview the frame after the singleton individuals were removed.
display(aol_treated)

Unnamed: 0,RandID,QueryTime,Domain
0,0,2006-05-13 00:12:16,edzapp.com
1,0,2006-05-13 00:15:28,riverdale.k12.or.us
2,0,2006-05-13 00:15:28,riverdale.k12.or.us
3,0,2006-05-13 00:22:39,salkeiz.k12.or.us
4,0,2006-05-13 00:22:39,greatschools.net
...,...,...,...
19305776,521691,2006-03-07 21:42:51,ups.com
19305777,521691,2006-03-15 19:24:17,google.com
19305778,521691,2006-04-01 18:45:41,google.com
19305779,521691,2006-04-02 01:33:52,supercross.com


### Drop outlier

In [9]:
%%time
# Number of records per RandID, most active first. One individual has far more records than
# any other and is most probably a bot; it is dropped in the next cell.
display(aol_treated['RandID'].value_counts())

RandID
463921    150802
362554      6227
304342      5741
501741      4272
342660      3775
           ...  
54859          2
418762         2
288640         2
483619         2
360861         2
Name: count, Length: 436006, dtype: int64

CPU times: user 227 ms, sys: 132 ms, total: 360 ms
Wall time: 357 ms


In [10]:
# RandID 463921 has 150802 records, far beyond every other individual, and is treated as a bot.
# Filter with a boolean mask (as done for the singletons) instead of materialising a list of
# index labels for drop(); the mask also does not rely on index labels being unique.
outlier_rand_id = 463921
aol_treated = aol_treated[aol_treated['RandID'] != outlier_rand_id]

In [11]:
# Renumber the rows from 0 after removing the outlier; the previous index labels are discarded.
aol_treated = aol_treated.reset_index(drop=True)

In [12]:
# Preview the frame after the outlier individual was removed.
display(aol_treated)

Unnamed: 0,RandID,QueryTime,Domain
0,0,2006-05-13 00:12:16,edzapp.com
1,0,2006-05-13 00:15:28,riverdale.k12.or.us
2,0,2006-05-13 00:15:28,riverdale.k12.or.us
3,0,2006-05-13 00:22:39,salkeiz.k12.or.us
4,0,2006-05-13 00:22:39,greatschools.net
...,...,...,...
19154974,521691,2006-03-07 21:42:51,ups.com
19154975,521691,2006-03-15 19:24:17,google.com
19154976,521691,2006-04-01 18:45:41,google.com
19154977,521691,2006-04-02 01:33:52,supercross.com


### Statistics

In [13]:
%%time
# Per-column dtypes and non-null counts, plus the frame's memory footprint
# (memory_usage='deep' also accounts for the contents of object columns).
aol_treated.info(verbose=True, memory_usage='deep', show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19154979 entries, 0 to 19154978
Data columns (total 3 columns):
 #   Column     Non-Null Count     Dtype         
---  ------     --------------     -----         
 0   RandID     19154979 non-null  int64         
 1   QueryTime  19154979 non-null  datetime64[ns]
 2   Domain     19154979 non-null  object        
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 1.5 GB
CPU times: user 5.98 s, sys: 40 ms, total: 6.02 s
Wall time: 5.97 s


Number of unique values per attribute.

In [14]:
# Distinct values per column; dropna=False makes a missing value count as a value of its own.
display(aol_treated.nunique(dropna=False))

RandID        436005
QueryTime    5431718
Domain       1291534
dtype: int64

`RandID` statistics.

In [15]:
# Summary statistics of the number of records per RandID, with extra upper-tail percentiles.
display(
    aol_treated['RandID']
    .value_counts()
    .describe(percentiles=[.25, .50, .75, .80, .85, .90, .95, .96, .97, .98, .99])
)

count    436005.000000
mean         43.932934
std          95.040286
min           2.000000
25%           5.000000
50%          14.000000
75%          42.000000
80%          55.000000
85%          74.000000
90%         107.000000
95%         181.000000
96%         210.000000
97%         249.000000
98%         314.000000
99%         439.000000
max        6227.000000
Name: count, dtype: float64

Number of `RandID`s per count of records, e.g. 1 `RandID` has 6227 records and 35113 `RandID`s have 2 records.

In [16]:
# Show the whole table (row truncation disabled), from the largest record count downwards.
with pandas.option_context('display.max_rows', None):
    display(
        aol_treated['RandID']
        .value_counts()                # records per RandID
        .value_counts()                # number of RandIDs sharing each record count
        .sort_index(ascending=False)   # a single descending sort is sufficient
    )

count
6227        1
5741        1
4272        1
3775        1
3516        1
3445        1
3265        1
3238        1
3074        1
2954        1
2887        1
2879        1
2874        1
2791        1
2786        1
2752        1
2712        1
2697        1
2696        1
2645        1
2579        1
2571        1
2508        1
2466        1
2464        1
2410        1
2354        1
2325        1
2311        1
2303        1
2298        1
2297        1
2265        1
2237        1
2236        1
2212        1
2192        1
2189        1
2187        1
2180        1
2165        1
2145        1
2129        1
2089        1
2075        1
2072        1
2036        1
2026        1
2025        1
2021        1
2002        1
2000        1
1995        1
1994        1
1981        1
1975        1
1925        1
1877        1
1874        1
1839        1
1827        1
1819        1
1815        1
1808        1
1807        1
1789        1
1783        1
1772        1
1763        1
1757        1
1756        1


Date and time range.

In [17]:
# Earliest query timestamp in the dataset.
display(aol_treated['QueryTime'].min())

Timestamp('2006-03-01 00:01:04')

In [18]:
# Latest query timestamp in the dataset.
display(aol_treated['QueryTime'].max())

Timestamp('2006-05-31 23:59:59')

Number of unique `Domain` values.

In [19]:
%%time
# Tally how many records reference each domain (domain -> record count) and show the number
# of distinct domains. The resulting dict is reused below to list the most frequent domains.
aol_treated_domains_counts = dict(Counter(aol_treated['Domain'].to_list()))
display(len(aol_treated_domains_counts))

1291534

CPU times: user 5.84 s, sys: 93.1 ms, total: 5.93 s
Wall time: 5.84 s


Top `Domain` values by number of records.

In [20]:
# Number of most frequent domains to list; also used as the display row limits so that
# the listing is not truncated by pandas.
top = 100
with pandas.option_context('display.max_rows', top, 'display.min_rows', top):
    # Order the domain -> count mapping by count, highest first, and keep the first `top` entries.
    display(pandas.Series(aol_treated_domains_counts).sort_values(ascending=False).head(top))

yahoo.com                428998
google.com               389798
myspace.com              221460
about.com                209238
ebay.com                 160581
wikipedia.org            124046
amazon.com               105542
imdb.com                 102630
msn.com                  102600
mapquest.com              99474
go.com                    83026
craigslist.org            64031
aol.com                   63899
nih.gov                   49256
bankofamerica.com         48169
geocities.com             41270
citysearch.com            40903
ask.com                   39173
hotmail.com               37634
bizrate.com               34360
tripadvisor.com           32256
tripod.com                32024
superpages.com            31080
nextag.com                27695
answers.com               26719
southwest.com             25500
azlyrics.com              24620
microsoft.com             24422
ca.gov                    24357
tv.com                    23234
irs.gov                   22898
cnn.com 

### Define browsing histories

In [21]:
%%time
# Pair each row's Domain with its QueryTime as a (Domain, QueryTime) tuple, stored in a new
# BrowsingHistory column; these tuples are grouped per RandID further below.
aol_treated['BrowsingHistory'] = list(zip(aol_treated.Domain, aol_treated.QueryTime))

CPU times: user 1min 4s, sys: 2.1 s, total: 1min 6s
Wall time: 1min 5s


In [22]:
# Preview the frame with the new BrowsingHistory column.
display(aol_treated)

Unnamed: 0,RandID,QueryTime,Domain,BrowsingHistory
0,0,2006-05-13 00:12:16,edzapp.com,"(edzapp.com, 2006-05-13 00:12:16)"
1,0,2006-05-13 00:15:28,riverdale.k12.or.us,"(riverdale.k12.or.us, 2006-05-13 00:15:28)"
2,0,2006-05-13 00:15:28,riverdale.k12.or.us,"(riverdale.k12.or.us, 2006-05-13 00:15:28)"
3,0,2006-05-13 00:22:39,salkeiz.k12.or.us,"(salkeiz.k12.or.us, 2006-05-13 00:22:39)"
4,0,2006-05-13 00:22:39,greatschools.net,"(greatschools.net, 2006-05-13 00:22:39)"
...,...,...,...,...
19154974,521691,2006-03-07 21:42:51,ups.com,"(ups.com, 2006-03-07 21:42:51)"
19154975,521691,2006-03-15 19:24:17,google.com,"(google.com, 2006-03-15 19:24:17)"
19154976,521691,2006-04-01 18:45:41,google.com,"(google.com, 2006-04-01 18:45:41)"
19154977,521691,2006-04-02 01:33:52,supercross.com,"(supercross.com, 2006-04-02 01:33:52)"


In [23]:
%%time
aol_treated_browsing_history = aol_treated[['RandID','BrowsingHistory']].groupby('RandID', sort=False)[['BrowsingHistory']].agg(lambda x : list(x)).reset_index()

CPU times: user 18.5 s, sys: 671 ms, total: 19.2 s
Wall time: 19.2 s


In [24]:
# Preview the per-RandID browsing histories.
display(aol_treated_browsing_history)

Unnamed: 0,RandID,BrowsingHistory
0,0,"[(edzapp.com, 2006-05-13 00:12:16), (riverdale..."
1,1,"[(kidshealth.org, 2006-03-01 11:54:19), (kidsh..."
2,3,"[(lawyers.com, 2006-03-02 17:38:43), (cpted-wa..."
3,4,"[(foxnews.com, 2006-03-12 20:33:21), (google.c..."
4,5,"[(tvsquad.com, 2006-03-06 23:15:25), (sou.edu,..."
...,...,...
436000,521687,"[(christianspeakers.com, 2006-03-03 10:21:42),..."
436001,521688,"[(winonamanufacturing.com, 2006-03-02 20:39:12..."
436002,521689,"[(dilithiumnetworks.com, 2006-03-04 00:57:31),..."
436003,521690,"[(dfas.mil, 2006-03-01 22:06:56), (dfas.mil, 2..."


### Save to file

In [25]:
# Write one row per RandID, with its browsing-history list, to the experimental dataset file.
# NOTE(review): the BrowsingHistory cells hold Python lists of tuples, which a CSV can only
# store as text — confirm that whatever reads this file parses them back as intended.
aol_treated_browsing_history.to_csv('AOL-experimental.csv')