# AOL Reduced Dataset (Google Topics v1)

In [1]:
import pandas
from collections import Counter

## Treated datasets

In [2]:
%%time
# Loads AOL-treated dataset into data DataFrame.
data = pandas.read_csv('AOL-treated.csv', low_memory=False, on_bad_lines='warn', index_col=0)

CPU times: user 21.2 s, sys: 1.61 s, total: 22.8 s
Wall time: 22.7 s


In [3]:
%%time
# Loads domain match for AOL-treated dataset and Google Topics v1 classification into topics DataFrame.
topics = pandas.read_csv('AOL-treated-Google-Topics-Classification-v1-domain-match.csv', low_memory=False, on_bad_lines='warn', index_col=0)

CPU times: user 6.7 ms, sys: 408 µs, total: 7.11 ms
Wall time: 5.76 ms


In [4]:
# Preview of the raw AOL records (columns: RandID, QueryTime, Domain).
display(data)

Unnamed: 0,RandID,QueryTime,Domain
0,0,2006-05-13 00:12:16,edzapp.com
1,0,2006-05-13 00:15:28,riverdale.k12.or.us
2,0,2006-05-13 00:15:28,riverdale.k12.or.us
3,0,2006-05-13 00:22:39,salkeiz.k12.or.us
4,0,2006-05-13 00:22:39,greatschools.net
...,...,...,...
19426288,521691,2006-03-07 21:42:51,ups.com
19426289,521691,2006-03-15 19:24:17,google.com
19426290,521691,2006-04-01 18:45:41,google.com
19426291,521691,2006-04-02 01:33:52,supercross.com


In [5]:
# Preview of the classification table: AOL domain, the classifier domain it
# was matched to, and the comma-separated quoted topic IDs.
display(topics)

Unnamed: 0,domain,match,topics
0,officedepot.com.mx,officedepot.com.mx,"'103','289'"
1,google.com.br,google.com.br,'219'
2,google.co.nz,google.co.nz,'219'
3,independent.co.uk,independent.co.uk,'243'
4,unionbankonline.co.in,unionbankonline.co.in,'149'
...,...,...,...
2713,metro.sp.gov.br,gov.br,"'243','299'"
2714,seplan.go.gov.br,gov.br,"'243','299'"
2715,polmil.sp.gov.br,gov.br,"'243','299'"
2716,hyde.tameside.sch.uk,t.me,"'243','299'"


### Statistics

In [6]:
%%time
data.info(verbose=True, memory_usage='deep', show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Index: 19426293 entries, 0 to 19426292
Data columns (total 3 columns):
 #   Column     Non-Null Count     Dtype 
---  ------     --------------     ----- 
 0   RandID     19426293 non-null  int64 
 1   QueryTime  19426293 non-null  object
 2   Domain     19426293 non-null  object
dtypes: int64(1), object(2)
memory usage: 2.9 GB
CPU times: user 12.6 s, sys: 115 ms, total: 12.8 s
Wall time: 12.6 s


Number of unique values per attribute.

In [7]:
%%time
# Number of distinct values in each column of the raw dataset.
display(data.nunique())

RandID        521607
QueryTime    5469196
Domain       1300484
dtype: int64

CPU times: user 14.3 s, sys: 207 ms, total: 14.5 s
Wall time: 14.5 s


`RandID` statistics.

In [8]:
%%time
display(data['RandID'].value_counts().describe([.25,.50,.75,.80,.85,.90,.95,.96,.97,.98,.99]))

count    521607.000000
mean         37.243160
std         226.665095
min           1.000000
25%           3.000000
50%          10.000000
75%          33.000000
80%          44.000000
85%          62.000000
90%          92.000000
95%         160.000000
96%         186.000000
97%         224.000000
98%         284.000000
99%         404.000000
max      150802.000000
Name: count, dtype: float64

CPU times: user 305 ms, sys: 144 ms, total: 449 ms
Wall time: 438 ms


Distribution of records per `RandID`: the index is a record count and the value is the number of `RandID`s with exactly that many records, e.g. 1 `RandID` has 150802 records and 70406 `RandID`s have 1 record.

In [9]:
%%time
with pandas.option_context('display.max_rows', None):
    display(data['RandID'].value_counts().value_counts().sort_index(ascending=False).sort_index(ascending=False))

count
150802        1
6227          1
5741          1
4272          1
3775          1
3516          1
3445          1
3265          1
3238          1
3074          1
2954          1
2887          1
2879          1
2874          1
2791          1
2786          1
2752          1
2712          1
2697          1
2696          1
2645          1
2579          1
2571          1
2508          1
2466          1
2464          1
2410          1
2354          1
2325          1
2311          1
2303          1
2298          1
2297          1
2265          1
2237          1
2236          1
2212          1
2192          1
2189          1
2187          1
2180          1
2165          1
2145          1
2129          1
2089          1
2075          1
2072          1
2036          1
2026          1
2025          1
2021          1
2002          1
2000          1
1995          1
1994          1
1981          1
1975          1
1925          1
1877          1
1874          1
1839          1
1827          1
18

CPU times: user 290 ms, sys: 132 ms, total: 422 ms
Wall time: 411 ms


In [10]:
%%time
display(data[['RandID','Domain']].groupby(['RandID','Domain']).value_counts().max())

2997

CPU times: user 17 s, sys: 1.68 s, total: 18.7 s
Wall time: 18.6 s


Date and time range.

In [11]:
# Earliest timestamp. QueryTime is an object column of 'YYYY-MM-DD HH:MM:SS'
# strings, so the lexicographic minimum is also the chronological one.
display(data['QueryTime'].min())

'2006-03-01 00:01:04'

In [12]:
# Latest timestamp. QueryTime is an object column of 'YYYY-MM-DD HH:MM:SS'
# strings, so the lexicographic maximum is also the chronological one.
display(data['QueryTime'].max())

'2006-05-31 23:59:59'

Number of unique `Domain` values.

In [13]:
%%time
data_domains_counts = dict(Counter(data['Domain'].to_list()))
display(len(data_domains_counts))

1300484

CPU times: user 6.05 s, sys: 179 ms, total: 6.23 s
Wall time: 6.05 s


Top `Domain` values by number of records.

In [14]:
# Number of most-visited domains to list.
top = 100
# Raise both row limits so all `top` rows are printed without truncation.
with pandas.option_context('display.max_rows', top, 'display.min_rows', top):
    display(
        pandas.Series(data_domains_counts)
        .sort_values(ascending=False)
        .iloc[:top]
    )

yahoo.com                438816
google.com               404663
myspace.com              228687
about.com                211395
ebay.com                 162841
wikipedia.org            125343
amazon.com               106647
msn.com                  104408
imdb.com                 103844
mapquest.com             101887
go.com                    84490
aol.com                   66069
craigslist.org            65153
nih.gov                   49781
bankofamerica.com         49362
geocities.com             41663
citysearch.com            41372
ask.com                   39935
hotmail.com               38392
bizrate.com               34706
tripadvisor.com           32624
tripod.com                32334
superpages.com            31453
nextag.com                27970
answers.com               26997
southwest.com             26096
microsoft.com             25028
azlyrics.com              24987
ca.gov                    24694
tv.com                    23474
irs.gov                   23378
cnn.com 

In [15]:
%%time
# Number of distinct values in each column of the classification table
# (distinct AOL domains, distinct matched domains, distinct topic strings).
display(topics.nunique())

domain    2718
match     1452
topics     406
dtype: int64

CPU times: user 2.46 ms, sys: 3.89 ms, total: 6.35 ms
Wall time: 4.98 ms


In [16]:
%%time
all_topics = []
for tup in topics.itertuples():
    all_topics.extend(tup.topics.split(","))

CPU times: user 3.68 ms, sys: 0 ns, total: 3.68 ms
Wall time: 3.71 ms


In [17]:
# Computes the number of unique seen topics.
# The set is built once and reused for both the count and the listing. No
# pandas display option is needed: the value is a plain Python set, which
# pandas' row limits do not affect.
unique_topics = set(all_topics)
display(len(unique_topics))
display(unique_topics)

171

{"'1'",
 "'100'",
 "'102'",
 "'103'",
 "'104'",
 "'108'",
 "'109'",
 "'11'",
 "'112'",
 "'114'",
 "'119'",
 "'12'",
 "'123'",
 "'126'",
 "'129'",
 "'137'",
 "'139'",
 "'140'",
 "'142'",
 "'144'",
 "'145'",
 "'148'",
 "'149'",
 "'150'",
 "'151'",
 "'152'",
 "'153'",
 "'154'",
 "'157'",
 "'158'",
 "'164'",
 "'165'",
 "'166'",
 "'169'",
 "'172'",
 "'173'",
 "'175'",
 "'178'",
 "'179'",
 "'180'",
 "'182'",
 "'183'",
 "'186'",
 "'188'",
 "'190'",
 "'194'",
 "'196'",
 "'198'",
 "'200'",
 "'201'",
 "'202'",
 "'203'",
 "'204'",
 "'206'",
 "'207'",
 "'209'",
 "'210'",
 "'211'",
 "'215'",
 "'216'",
 "'217'",
 "'218'",
 "'219'",
 "'220'",
 "'223'",
 "'224'",
 "'226'",
 "'227'",
 "'228'",
 "'229'",
 "'23'",
 "'230'",
 "'234'",
 "'236'",
 "'237'",
 "'238'",
 "'239'",
 "'240'",
 "'241'",
 "'243'",
 "'244'",
 "'245'",
 "'247'",
 "'248'",
 "'249'",
 "'250'",
 "'251'",
 "'252'",
 "'253'",
 "'254'",
 "'255'",
 "'256'",
 "'258'",
 "'260'",
 "'262'",
 "'263'",
 "'264'",
 "'265'",
 "'266'",
 "'268'",
 "'27

## Reduction

In [18]:
%%time
# Inner merge of data and topics DataFrames.
df = pandas.merge(data, topics, how='inner', left_on='Domain', right_on='domain', suffixes=['_AOL', '_Google'])

CPU times: user 6.42 s, sys: 325 ms, total: 6.75 s
Wall time: 6.67 s


In [19]:
# Preview of the merged frame: the AOL columns plus domain, match and topics
# from the classification table.
display(df)

Unnamed: 0,RandID,QueryTime,Domain,domain,match,topics
0,1,2006-03-01 11:54:19,kidshealth.org,kidshealth.org,kidshealth.org,"'258','276'"
1,1,2006-03-01 11:54:19,kidshealth.org,kidshealth.org,kidshealth.org,"'258','276'"
2,1,2006-03-01 11:54:19,kidshealth.org,kidshealth.org,kidshealth.org,"'258','276'"
3,3,2006-05-08 18:24:29,kidshealth.org,kidshealth.org,kidshealth.org,"'258','276'"
4,3,2006-05-08 18:39:34,kidshealth.org,kidshealth.org,kidshealth.org,"'258','276'"
...,...,...,...,...,...,...
2493890,521468,2006-03-02 16:10:17,staffsamb.nhs.uk,staffsamb.nhs.uk,nhs.uk,"'243','299'"
2493891,521630,2006-03-19 07:20:42,ibama.gov.br,ibama.gov.br,gov.br,"'243','299'"
2493892,521630,2006-03-19 07:20:42,meioambiente.mg.gov.br,meioambiente.mg.gov.br,gov.br,"'243','299'"
2493893,521630,2006-04-07 00:47:52,woking.gov.uk,woking.gov.uk,gov.uk,"'243','299'"


In [20]:
%%time
df = df.drop(columns=['domain','match'])

CPU times: user 195 ms, sys: 15.7 ms, total: 211 ms
Wall time: 208 ms


In [21]:
# Order records by user and then by timestamp string, and renumber rows from 0.
df = df.sort_values(['RandID', 'QueryTime']).reset_index(drop=True)

In [22]:
# Preview of the reduced dataset after dropping the helper columns and sorting.
display(df)

Unnamed: 0,RandID,QueryTime,Domain,topics
0,1,2006-03-01 11:54:19,kidshealth.org,"'258','276'"
1,1,2006-03-01 11:54:19,kidshealth.org,"'258','276'"
2,1,2006-03-01 11:54:19,kidshealth.org,"'258','276'"
3,1,2006-05-09 19:13:53,bankofamerica.com,'149'
4,1,2006-05-22 16:46:29,bankofamerica.com,'149'
...,...,...,...,...
2493890,521691,2006-03-07 21:29:17,ups.com,'103'
2493891,521691,2006-03-07 21:41:20,fedex.com,'103'
2493892,521691,2006-03-07 21:42:51,ups.com,'103'
2493893,521691,2006-03-15 19:24:17,google.com,'219'


### Statistics

In [23]:
%%time
df.info(verbose=True, memory_usage='deep', show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2493895 entries, 0 to 2493894
Data columns (total 4 columns):
 #   Column     Non-Null Count    Dtype 
---  ------     --------------    ----- 
 0   RandID     2493895 non-null  int64 
 1   QueryTime  2493895 non-null  object
 2   Domain     2493895 non-null  object
 3   topics     2493895 non-null  object
dtypes: int64(1), object(3)
memory usage: 514.7 MB
CPU times: user 1.91 s, sys: 19.2 ms, total: 1.93 s
Wall time: 1.92 s


Number of unique values per attribute.

In [24]:
%%time
# Number of distinct values in each column of the reduced dataset.
display(df.nunique())

RandID        325383
QueryTime    1765193
Domain          2718
topics           406
dtype: int64

CPU times: user 2.68 s, sys: 7.92 ms, total: 2.69 s
Wall time: 2.68 s


`RandID` statistics.

In [25]:
%%time
display(df['RandID'].value_counts().describe([.25,.50,.75,.80,.85,.90,.95,.96,.97,.98,.99]))

count    325383.000000
mean          7.664491
std          37.291912
min           1.000000
25%           1.000000
50%           3.000000
75%           7.000000
80%           9.000000
85%          12.000000
90%          17.000000
95%          28.000000
96%          33.000000
97%          39.000000
98%          49.000000
99%          71.000000
max       19011.000000
Name: count, dtype: float64

CPU times: user 72 ms, sys: 183 µs, total: 72.2 ms
Wall time: 70.5 ms


Distribution of records per `RandID`: the index is a record count and the value is the number of `RandID`s with exactly that many records, e.g. 1 `RandID` has 19011 records and 94118 `RandID`s have 1 record.

In [26]:
%%time
with pandas.option_context('display.max_rows', None):
    display(df['RandID'].value_counts().value_counts().sort_index(ascending=False).sort_index(ascending=False))

count
19011        1
933          1
851          1
739          1
723          1
717          1
657          1
640          1
626          1
622          1
581          1
577          1
565          1
561          2
555          1
548          1
547          1
542          1
541          1
530          1
527          1
523          1
517          1
516          1
502          1
500          1
498          1
497          1
495          1
488          1
487          1
486          1
475          2
474          1
473          1
454          1
450          1
448          1
446          1
445          1
431          1
424          1
423          1
418          1
412          3
410          1
406          1
392          1
389          2
388          1
387          1
383          2
380          2
379          1
374          2
372          1
370          2
367          2
365          1
363          2
361          1
356          1
354          1
350          1
348          2
347          1
345 

CPU times: user 61.8 ms, sys: 0 ns, total: 61.8 ms
Wall time: 60.8 ms


In [27]:
%%time
display(df[['RandID','Domain']].groupby(['RandID','Domain']).value_counts().max())

2893

CPU times: user 823 ms, sys: 15.8 ms, total: 839 ms
Wall time: 837 ms


Date and time range.

In [28]:
# Earliest timestamp in the reduced dataset. QueryTime is an object column of
# 'YYYY-MM-DD HH:MM:SS' strings, so the lexicographic minimum is also the
# chronological one.
display(df['QueryTime'].min())

'2006-03-01 00:01:04'

In [29]:
# Latest timestamp in the reduced dataset. QueryTime is an object column of
# 'YYYY-MM-DD HH:MM:SS' strings, so the lexicographic maximum is also the
# chronological one.
display(df['QueryTime'].max())

'2006-05-31 23:59:58'

Number of unique `Domain` values.

In [30]:
%%time
df_domains_counts = dict(Counter(df['Domain'].to_list()))
display(len(df_domains_counts))

2718

CPU times: user 185 ms, sys: 262 µs, total: 185 ms
Wall time: 180 ms


Top `Domain` values by number of records.

In [31]:
# Number of most-visited domains to list.
top = 100
# Raise both row limits so all `top` rows are printed without truncation.
with pandas.option_context('display.max_rows', top, 'display.min_rows', top):
    display(
        pandas.Series(df_domains_counts)
        .sort_values(ascending=False)
        .iloc[:top]
    )

google.com                404663
ebay.com                  162841
wikipedia.org             125343
amazon.com                106647
msn.com                   104408
imdb.com                  103844
mapquest.com              101887
aol.com                    66069
bankofamerica.com          49362
tripadvisor.com            32624
answers.com                26997
southwest.com              26096
microsoft.com              25028
azlyrics.com               24987
irs.gov                    23378
cnn.com                    23025
walmart.com                22537
nytimes.com                20145
mlb.com                    19382
pogo.com                   17137
target.com                 16259
wellsfargo.com             15878
bbc.co.uk                  15617
fidelity.com               14576
weather.com                14552
ign.com                    13987
usps.com                   13970
allrecipes.com             12494
qvc.com                    12114
kbb.com                    11425
sapo.pt   

In [32]:
# Flatten the comma-separated topic IDs of every record in the reduced dataset
# into one list (duplicates kept). Each token keeps the single quotes it has
# in the CSV, e.g. "'149'". Iterating the `topics` column directly avoids
# building a whole-row namedtuple for each of the ~2.5M records.
all_topics = [
    token
    for cell in df['topics']
    for token in cell.split(",")
]

In [33]:
# Computes the number of unique seen topics.
# The set is built once from the (long) token list and reused for both the
# count and the listing.
unique_topics = set(all_topics)
display(len(unique_topics))
display(unique_topics)

171

{"'1'",
 "'100'",
 "'102'",
 "'103'",
 "'104'",
 "'108'",
 "'109'",
 "'11'",
 "'112'",
 "'114'",
 "'119'",
 "'12'",
 "'123'",
 "'126'",
 "'129'",
 "'137'",
 "'139'",
 "'140'",
 "'142'",
 "'144'",
 "'145'",
 "'148'",
 "'149'",
 "'150'",
 "'151'",
 "'152'",
 "'153'",
 "'154'",
 "'157'",
 "'158'",
 "'164'",
 "'165'",
 "'166'",
 "'169'",
 "'172'",
 "'173'",
 "'175'",
 "'178'",
 "'179'",
 "'180'",
 "'182'",
 "'183'",
 "'186'",
 "'188'",
 "'190'",
 "'194'",
 "'196'",
 "'198'",
 "'200'",
 "'201'",
 "'202'",
 "'203'",
 "'204'",
 "'206'",
 "'207'",
 "'209'",
 "'210'",
 "'211'",
 "'215'",
 "'216'",
 "'217'",
 "'218'",
 "'219'",
 "'220'",
 "'223'",
 "'224'",
 "'226'",
 "'227'",
 "'228'",
 "'229'",
 "'23'",
 "'230'",
 "'234'",
 "'236'",
 "'237'",
 "'238'",
 "'239'",
 "'240'",
 "'241'",
 "'243'",
 "'244'",
 "'245'",
 "'247'",
 "'248'",
 "'249'",
 "'250'",
 "'251'",
 "'252'",
 "'253'",
 "'254'",
 "'255'",
 "'256'",
 "'258'",
 "'260'",
 "'262'",
 "'263'",
 "'264'",
 "'265'",
 "'266'",
 "'268'",
 "'27

## Save to file

In [34]:
# Persist the reduced dataset. The row index is written as the first column,
# matching how the input CSVs are read above with index_col=0.
df.to_csv(
    'AOL-reduced-Google-Topics-Classification-v1.csv',
    index=True,
)