# AOL Reduced Dataset (Google Topics v1)

In [1]:
import pandas

## Treated datasets

In [2]:
%%time
data = pandas.read_csv('AOL-treated.csv', low_memory=False, on_bad_lines='warn', index_col=0)

CPU times: user 29.1 s, sys: 1.58 s, total: 30.7 s
Wall time: 30.7 s


In [3]:
%%time
topics = pandas.read_csv('AOL-treated-Google-Topics-Classification-v1-domain-match.csv', low_memory=False, on_bad_lines='warn', index_col=0)

CPU times: user 6.67 ms, sys: 3.92 ms, total: 10.6 ms
Wall time: 16.9 ms


In [4]:
display(data)

Unnamed: 0,RandID,QueryTime,Domain
0,0,2006-03-06 13:59:48,about.com
1,0,2006-03-09 17:02:21,unclefed.com
2,0,2006-03-09 17:07:54,swanklaw.com
3,0,2006-03-09 17:09:53,state.or.us
4,0,2006-03-21 18:54:26,city-data.com
...,...,...,...
19426288,521691,2006-05-29 19:49:16,sing365.com
19426289,521691,2006-05-29 20:03:50,azlyrics.com
19426290,521691,2006-05-29 20:07:23,azlyrics.com
19426291,521691,2006-05-29 20:35:09,sing365.com


In [5]:
display(topics)

Unnamed: 0,domain,match,topics
0,computrabajo.com.mx,computrabajo.com.mx,'238'
1,walla.co.il,walla.co.il,"'1','215','219','243'"
2,indiapost.gov.in,indiapost.gov.in,'103'
3,makro.co.za,makro.co.za,'289'
4,lagaceta.com.ar,lagaceta.com.ar,'243'
...,...,...,...
2713,prefeitura.sp.gov.br,gov.br,'243'
2714,sespa.pa.gov.br,gov.br,'243'
2715,camaranh.rs.gov.br,gov.br,'243'
2716,sjc.sp.gov.br,gov.br,'243'


### Statistics

In [6]:
%%time
display(data.nunique())

RandID        521607
QueryTime    5469196
Domain       1300484
dtype: int64

CPU times: user 15.5 s, sys: 173 ms, total: 15.7 s
Wall time: 15.7 s


In [7]:
display(data['QueryTime'].min())

'2006-03-01 00:01:04'

In [8]:
display(data['QueryTime'].max())

'2006-05-31 23:59:59'

In [9]:
data_domains = data['Domain'].to_list()

data_domains_counts = {}
for domain in data_domains:
    if domain not in data_domains_counts:
        data_domains_counts[domain] = 1
    else:
        data_domains_counts[domain] = data_domains_counts[domain] + 1
display(len(data_domains_counts))

1300484

In [10]:
top = 100
with pandas.option_context('display.max_rows', top, 'display.min_rows', top):
    display(pandas.Series(data_domains_counts).sort_values(ascending=False).head(top))

yahoo.com                438816
google.com               404663
myspace.com              228687
about.com                211395
ebay.com                 162841
wikipedia.org            125343
amazon.com               106647
msn.com                  104408
imdb.com                 103844
mapquest.com             101887
go.com                    84490
aol.com                   66069
craigslist.org            65153
nih.gov                   49781
bankofamerica.com         49362
geocities.com             41663
citysearch.com            41372
ask.com                   39935
hotmail.com               38392
bizrate.com               34706
tripadvisor.com           32624
tripod.com                32334
superpages.com            31453
nextag.com                27970
answers.com               26997
southwest.com             26096
microsoft.com             25028
azlyrics.com              24987
ca.gov                    24694
tv.com                    23474
irs.gov                   23378
cnn.com 

In [11]:
%%time
display(topics.nunique())

domain    2718
match     1452
topics     406
dtype: int64

CPU times: user 8.51 ms, sys: 18 µs, total: 8.53 ms
Wall time: 14 ms


In [12]:
%%time
all_topics = []
for tup in topics.itertuples():
    all_topics.extend(tup.topics.split(","))

CPU times: user 6.95 ms, sys: 0 ns, total: 6.95 ms
Wall time: 6.91 ms


In [13]:
display(len(set(all_topics)))
with pandas.option_context('display.max_rows', None):
    display(set(all_topics))

171

{"'1'",
 "'100'",
 "'102'",
 "'103'",
 "'104'",
 "'108'",
 "'109'",
 "'11'",
 "'112'",
 "'114'",
 "'119'",
 "'12'",
 "'123'",
 "'126'",
 "'129'",
 "'137'",
 "'139'",
 "'140'",
 "'142'",
 "'144'",
 "'145'",
 "'148'",
 "'149'",
 "'150'",
 "'151'",
 "'152'",
 "'153'",
 "'154'",
 "'157'",
 "'158'",
 "'164'",
 "'165'",
 "'166'",
 "'169'",
 "'172'",
 "'173'",
 "'175'",
 "'178'",
 "'179'",
 "'180'",
 "'182'",
 "'183'",
 "'186'",
 "'188'",
 "'190'",
 "'194'",
 "'196'",
 "'198'",
 "'200'",
 "'201'",
 "'202'",
 "'203'",
 "'204'",
 "'206'",
 "'207'",
 "'209'",
 "'210'",
 "'211'",
 "'215'",
 "'216'",
 "'217'",
 "'218'",
 "'219'",
 "'220'",
 "'223'",
 "'224'",
 "'226'",
 "'227'",
 "'228'",
 "'229'",
 "'23'",
 "'230'",
 "'234'",
 "'236'",
 "'237'",
 "'238'",
 "'239'",
 "'240'",
 "'241'",
 "'243'",
 "'244'",
 "'245'",
 "'247'",
 "'248'",
 "'249'",
 "'250'",
 "'251'",
 "'252'",
 "'253'",
 "'254'",
 "'255'",
 "'256'",
 "'258'",
 "'260'",
 "'262'",
 "'263'",
 "'264'",
 "'265'",
 "'266'",
 "'268'",
 "'27

In [14]:
%%time
display(data['RandID'].value_counts().describe([.25,.50,.75,.80,.85,.90,.95,.96,.97,.98,.99]))

count    521607.000000
mean         37.243160
std         226.665095
min           1.000000
25%           3.000000
50%          10.000000
75%          33.000000
80%          44.000000
85%          62.000000
90%          92.000000
95%         160.000000
96%         186.000000
97%         224.000000
98%         284.000000
99%         404.000000
max      150802.000000
Name: count, dtype: float64

CPU times: user 451 ms, sys: 124 ms, total: 576 ms
Wall time: 585 ms


In [15]:
%%time
with pandas.option_context('display.max_rows', None):
    display(data['RandID'].value_counts().value_counts().sort_index(ascending=False).sort_index(ascending=False))

count
150802        1
6227          1
5741          1
4272          1
3775          1
3516          1
3445          1
3265          1
3238          1
3074          1
2954          1
2887          1
2879          1
2874          1
2791          1
2786          1
2752          1
2712          1
2697          1
2696          1
2645          1
2579          1
2571          1
2508          1
2466          1
2464          1
2410          1
2354          1
2325          1
2311          1
2303          1
2298          1
2297          1
2265          1
2237          1
2236          1
2212          1
2192          1
2189          1
2187          1
2180          1
2165          1
2145          1
2129          1
2089          1
2075          1
2072          1
2036          1
2026          1
2025          1
2021          1
2002          1
2000          1
1995          1
1994          1
1981          1
1975          1
1925          1
1877          1
1874          1
1839          1
1827          1
18

CPU times: user 430 ms, sys: 108 ms, total: 539 ms
Wall time: 548 ms


In [16]:
%%time
display(data[['RandID','Domain']].groupby(['RandID','Domain']).value_counts().max())

2997

CPU times: user 22.1 s, sys: 1.32 s, total: 23.4 s
Wall time: 23.4 s


## Reduction

In [17]:
%%time
df = pandas.merge(data, topics, how='inner', left_on='Domain', right_on='match', suffixes=['_AOL', '_Google'])

CPU times: user 6.06 s, sys: 182 ms, total: 6.25 s
Wall time: 6.2 s


In [18]:
display(df)

Unnamed: 0,RandID,QueryTime,Domain,domain,match,topics
0,0,2006-04-20 17:37:26,imdb.com,imdb.com,imdb.com,"'12','45'"
1,13,2006-03-10 11:51:57,imdb.com,imdb.com,imdb.com,"'12','45'"
2,13,2006-03-25 05:10:47,imdb.com,imdb.com,imdb.com,"'12','45'"
3,20,2006-04-07 21:13:02,imdb.com,imdb.com,imdb.com,"'12','45'"
4,21,2006-05-15 21:18:53,imdb.com,imdb.com,imdb.com,"'12','45'"
...,...,...,...,...,...,...
2495271,515243,2006-05-24 21:46:51,computrabajo.com.pe,computrabajo.com.pe,computrabajo.com.pe,'236'
2495272,515761,2006-04-18 19:49:49,atv.com.tr,atv.com.tr,atv.com.tr,"'1','48'"
2495273,515761,2006-04-18 19:49:49,atv.com.tr,atv.com.tr,atv.com.tr,"'1','48'"
2495274,515761,2006-04-18 19:54:33,atv.com.tr,atv.com.tr,atv.com.tr,"'1','48'"


In [19]:
%%time
df = df.drop(columns=['domain','match'])

CPU times: user 254 ms, sys: 16.1 ms, total: 271 ms
Wall time: 281 ms


In [20]:
display(df)

Unnamed: 0,RandID,QueryTime,Domain,topics
0,0,2006-04-20 17:37:26,imdb.com,"'12','45'"
1,13,2006-03-10 11:51:57,imdb.com,"'12','45'"
2,13,2006-03-25 05:10:47,imdb.com,"'12','45'"
3,20,2006-04-07 21:13:02,imdb.com,"'12','45'"
4,21,2006-05-15 21:18:53,imdb.com,"'12','45'"
...,...,...,...,...
2495271,515243,2006-05-24 21:46:51,computrabajo.com.pe,'236'
2495272,515761,2006-04-18 19:49:49,atv.com.tr,"'1','48'"
2495273,515761,2006-04-18 19:49:49,atv.com.tr,"'1','48'"
2495274,515761,2006-04-18 19:54:33,atv.com.tr,"'1','48'"


### Statistics

In [21]:
%%time
display(df.nunique())

RandID        324974
QueryTime    1760462
Domain          1445
topics           406
dtype: int64

CPU times: user 3.14 s, sys: 56 ms, total: 3.19 s
Wall time: 3.2 s


In [22]:
display(data['QueryTime'].min())

'2006-03-01 00:01:04'

In [23]:
display(data['QueryTime'].max())

'2006-05-31 23:59:59'

In [24]:
data_domains = data['Domain'].to_list()

data_domains_counts = {}
for domain in data_domains:
    if domain not in data_domains_counts:
        data_domains_counts[domain] = 1
    else:
        data_domains_counts[domain] = data_domains_counts[domain] + 1
display(len(data_domains_counts))

1300484

In [25]:
top = 100
with pandas.option_context('display.max_rows', top, 'display.min_rows', top):
    display(pandas.Series(data_domains_counts).sort_values(ascending=False).head(top))

yahoo.com                438816
google.com               404663
myspace.com              228687
about.com                211395
ebay.com                 162841
wikipedia.org            125343
amazon.com               106647
msn.com                  104408
imdb.com                 103844
mapquest.com             101887
go.com                    84490
aol.com                   66069
craigslist.org            65153
nih.gov                   49781
bankofamerica.com         49362
geocities.com             41663
citysearch.com            41372
ask.com                   39935
hotmail.com               38392
bizrate.com               34706
tripadvisor.com           32624
tripod.com                32334
superpages.com            31453
nextag.com                27970
answers.com               26997
southwest.com             26096
microsoft.com             25028
azlyrics.com              24987
ca.gov                    24694
tv.com                    23474
irs.gov                   23378
cnn.com 

In [26]:
all_topics = []
for tup in df.itertuples():
    all_topics.extend(tup.topics.split(","))

In [27]:
display(len(set(all_topics)))
display(set(all_topics))

171

{"'1'",
 "'100'",
 "'102'",
 "'103'",
 "'104'",
 "'108'",
 "'109'",
 "'11'",
 "'112'",
 "'114'",
 "'119'",
 "'12'",
 "'123'",
 "'126'",
 "'129'",
 "'137'",
 "'139'",
 "'140'",
 "'142'",
 "'144'",
 "'145'",
 "'148'",
 "'149'",
 "'150'",
 "'151'",
 "'152'",
 "'153'",
 "'154'",
 "'157'",
 "'158'",
 "'164'",
 "'165'",
 "'166'",
 "'169'",
 "'172'",
 "'173'",
 "'175'",
 "'178'",
 "'179'",
 "'180'",
 "'182'",
 "'183'",
 "'186'",
 "'188'",
 "'190'",
 "'194'",
 "'196'",
 "'198'",
 "'200'",
 "'201'",
 "'202'",
 "'203'",
 "'204'",
 "'206'",
 "'207'",
 "'209'",
 "'210'",
 "'211'",
 "'215'",
 "'216'",
 "'217'",
 "'218'",
 "'219'",
 "'220'",
 "'223'",
 "'224'",
 "'226'",
 "'227'",
 "'228'",
 "'229'",
 "'23'",
 "'230'",
 "'234'",
 "'236'",
 "'237'",
 "'238'",
 "'239'",
 "'240'",
 "'241'",
 "'243'",
 "'244'",
 "'245'",
 "'247'",
 "'248'",
 "'249'",
 "'250'",
 "'251'",
 "'252'",
 "'253'",
 "'254'",
 "'255'",
 "'256'",
 "'258'",
 "'260'",
 "'262'",
 "'263'",
 "'264'",
 "'265'",
 "'266'",
 "'268'",
 "'27

In [28]:
%%time
display(data['RandID'].value_counts().describe([.25,.50,.75,.80,.85,.90,.95,.96,.97,.98,.99]))

count    521607.000000
mean         37.243160
std         226.665095
min           1.000000
25%           3.000000
50%          10.000000
75%          33.000000
80%          44.000000
85%          62.000000
90%          92.000000
95%         160.000000
96%         186.000000
97%         224.000000
98%         284.000000
99%         404.000000
max      150802.000000
Name: count, dtype: float64

CPU times: user 462 ms, sys: 101 ms, total: 563 ms
Wall time: 572 ms


In [29]:
%%time
with pandas.option_context('display.max_rows', None):
    display(data['RandID'].value_counts().value_counts().sort_index(ascending=False).sort_index(ascending=False))

count
150802        1
6227          1
5741          1
4272          1
3775          1
3516          1
3445          1
3265          1
3238          1
3074          1
2954          1
2887          1
2879          1
2874          1
2791          1
2786          1
2752          1
2712          1
2697          1
2696          1
2645          1
2579          1
2571          1
2508          1
2466          1
2464          1
2410          1
2354          1
2325          1
2311          1
2303          1
2298          1
2297          1
2265          1
2237          1
2236          1
2212          1
2192          1
2189          1
2187          1
2180          1
2165          1
2145          1
2129          1
2089          1
2075          1
2072          1
2036          1
2026          1
2025          1
2021          1
2002          1
2000          1
1995          1
1994          1
1981          1
1975          1
1925          1
1877          1
1874          1
1839          1
1827          1
18

CPU times: user 430 ms, sys: 105 ms, total: 535 ms
Wall time: 536 ms


In [30]:
%%time
display(data[['RandID','Domain']].groupby(['RandID','Domain']).value_counts().max())

2997

CPU times: user 21.7 s, sys: 1.36 s, total: 23.1 s
Wall time: 23.1 s


## Save to file

In [31]:
df.to_csv('AOL-reduced-Google-Topics-Classification-v1.csv')