# AOL Data Treatment

In [1]:
import numpy, pandas, re, random, tldextract
from urllib.parse import urlparse

In [2]:
ascii_control = [r'\x00',r'\x01',r'\x02',r'\x03',r'\x04',r'\x05',r'\x06',r'\x07',r'\x08',r'\x09',
                 r'\x0a',r'\x0b',r'\x0c',r'\x0d',r'\x0e',r'\x0f',
                 r'\x10',r'\x11',r'\x12',r'\x13',r'\x14',r'\x15',r'\x16',r'\x17',r'\x18',r'\x19',
                 r'\x1a',r'\x1b',r'\x1c',r'\x1d',r'\x1e',r'\x1f',
                 r'\x7f']

In [3]:
ascii_special = [r'\x20',r'\x21',r'\x22',r'\x23',r'\x24',r'\x25',r'\x26',r'\x27',r'\x28',r'\x29',
                 r'\x2a',r'\x2b',r'\x2c',r'\x2d',r'\x2e',r'\x2f',
                 r'\x3a',r'\x3b',r'\x3c',r'\x3d',r'\x3e',r'\x3f',
                 r'\x40',
                 r'\x5b',r'\x5c',r'\x5d',r'\x5e',r'\x5f',
                 r'\x60',
                 r'\x7b',r'\x7c',r'\x7d',r'\x7e']

## Original data

In [4]:
%%time
data = pandas.read_csv('AOL-01.txt', low_memory=False, on_bad_lines='warn', sep='\t')
for i in range(2,10):
    temp = pandas.read_csv('AOL-0'+str(i)+'.txt', low_memory=False, on_bad_lines='warn', sep='\t')
    data = pandas.concat([data,temp])
temp = pandas.read_csv('AOL-10.txt', low_memory=False, on_bad_lines='warn', sep='\t')
data = pandas.concat([data,temp])

CPU times: user 1min 43s, sys: 12.8 s, total: 1min 56s
Wall time: 1min 56s


Number of unique `AnonID` values.

In [5]:
display(data['AnonID'].nunique(dropna=True), data['AnonID'].nunique(dropna=False))

657427

657427

Number of unique values per attribute.

In [6]:
display(data.nunique(dropna=False))

AnonID         657427
Query        10154741
QueryTime     6929081
ItemRank         1004
ClickURL      1632789
dtype: int64

### Fix data and types

Replace `AnonID` value prefixed with ASCII control character `\x19`.

In [7]:
data['AnonID'] = data['AnonID'].replace({'\x19403684' : 403684})

Update data types and drop unnecessary columns.

In [8]:
data = data.astype({'AnonID': 'int64'})

In [9]:
data['QueryTime'] = pandas.to_datetime(data['QueryTime'])

In [10]:
data = data.drop(columns=['Query', 'ItemRank'], inplace=False)

### Original data statistics

Number of unique `AnonID` values.

In [11]:
display(data['AnonID'].nunique(dropna=True), data['AnonID'].nunique(dropna=False))

657426

657426

Number of unique values per attribute.

In [12]:
display(data.nunique(dropna=False))

AnonID        657426
QueryTime    6929081
ClickURL     1632789
dtype: int64

`AnonID` statistics.

In [13]:
display(data['AnonID'].value_counts().describe([.25,.50,.75,.80,.85,.90,.95,.96,.97,.98,.99]))

count    657426.000000
mean         55.351579
std         367.217866
min           1.000000
25%           5.000000
50%          17.000000
75%          52.000000
80%          68.000000
85%          93.000000
90%         135.000000
95%         228.000000
96%         265.000000
97%         316.000000
98%         397.000000
99%         566.000000
max      279430.000000
Name: count, dtype: float64

Number of `AnonID`s per count of records, e.g. 1 `AnonID` has 279430 records and 56959 `AnonID`s have 1 record.

In [14]:
with pandas.option_context('display.max_rows', None):
    display(data['AnonID'].value_counts().value_counts().sort_index(ascending=False).sort_index(ascending=False))

count
279430        1
8695          1
8274          1
7545          1
7313          1
6925          1
5981          1
5847          1
5845          1
5165          1
5161          1
5095          1
4763          1
4663          1
4598          1
4508          1
4393          1
4363          1
4277          1
4234          1
4198          1
4195          1
4176          1
4113          1
4106          1
4080          1
4054          1
3893          1
3870          1
3781          1
3766          1
3755          1
3749          1
3737          1
3718          1
3646          1
3630          1
3600          1
3584          1
3561          1
3549          1
3456          1
3429          1
3417          1
3375          2
3363          1
3327          1
3306          1
3285          1
3282          1
3261          1
3258          1
3233          1
3224          1
3211          1
3199          1
3191          1
3178          1
3173          1
3159          1
3114          1
3113          1
30

Date and time range.

In [15]:
display(data['QueryTime'].min())

Timestamp('2006-03-01 00:01:03')

In [16]:
display(data['QueryTime'].max())

Timestamp('2006-05-31 23:59:59')

Number of unique `ClickURL` values.

In [17]:
data_domains = data['ClickURL'].to_list()

data_domains_counts = {}
for domain in data_domains:
    if domain not in data_domains_counts:
        data_domains_counts[domain] = 1
    else:
        data_domains_counts[domain] = data_domains_counts[domain] + 1
display(len(data_domains_counts))

1632789

Top `ClickURL` values by number of records.

In [18]:
top = 100
with pandas.option_context('display.max_rows', top, 'display.min_rows', top):
    display(pandas.Series(data_domains_counts).sort_values(ascending=False).head(top))

NaN                                16946938
http://www.google.com                366623
http://www.myspace.com               167070
http://www.yahoo.com                 161082
http://en.wikipedia.org              122539
http://www.amazon.com                106119
http://www.imdb.com                   98549
http://www.mapquest.com               96136
http://www.ebay.com                   77947
http://mail.yahoo.com                 53856
http://www.bankofamerica.com          48534
http://www.geocities.com              40547
http://www.hotmail.com                38391
http://www.ask.com                    37752
http://www.bizrate.com                32868
http://profile.myspace.com            31083
http://www.tripadvisor.com            31027
http://www.msn.com                    29781
http://www.craigslist.org             27769
http://cgi.ebay.com                   27643
http://www.nextag.com                 27475
http://www.answers.com                26892
http://disney.go.com            

## Drop rows with an empty `ClickURL`

In [19]:
data = data.dropna(subset=['ClickURL'], ignore_index=True)

### New data statistics

Number of unique values per attribute.

In [20]:
display(data.nunique(dropna=False))

AnonID        521692
QueryTime    5470517
ClickURL     1632788
dtype: int64

`AnonID` statistics.

In [21]:
display(data['AnonID'].value_counts().describe([.25,.50,.75,.80,.85,.90,.95,.96,.97,.98,.99]))

count    521692.000000
mean         37.268405
std         226.845215
min           1.000000
25%           3.000000
50%          10.000000
75%          33.000000
80%          44.000000
85%          62.000000
90%          92.000000
95%         160.000000
96%         186.000000
97%         224.000000
98%         284.000000
99%         404.000000
max      150936.000000
Name: count, dtype: float64

Number of `AnonID`s per count of records, e.g. 1 `AnonID` has 150936 records and 70376 `AnonID`s have 1 record.

In [22]:
with pandas.option_context('display.max_rows', None):
    display(data['AnonID'].value_counts().value_counts().sort_index(ascending=False).sort_index(ascending=False))

count
150936        1
6233          1
5744          1
4273          1
3775          1
3516          1
3445          1
3265          1
3239          1
3080          1
2961          1
2887          1
2879          1
2875          1
2796          1
2791          1
2761          1
2713          1
2701          1
2698          1
2651          1
2581          1
2580          1
2508          1
2473          1
2466          1
2410          1
2364          1
2327          1
2311          1
2304          1
2301          1
2300          1
2267          1
2246          1
2237          1
2217          1
2196          1
2191          1
2187          1
2181          1
2165          1
2145          1
2132          1
2092          1
2079          1
2072          1
2038          1
2031          1
2026          1
2023          1
2007          1
2001          1
1995          2
1982          1
1975          1
1926          1
1878          1
1874          1
1839          1
1828          1
1820          1
18

Date and time range.

In [23]:
display(data['QueryTime'].min())

Timestamp('2006-03-01 00:01:04')

In [24]:
display(data['QueryTime'].max())

Timestamp('2006-05-31 23:59:59')

Number of unique `ClickURL` values.

In [25]:
data_domains = data['ClickURL'].to_list()

data_domains_counts = {}
for domain in data_domains:
    if domain not in data_domains_counts:
        data_domains_counts[domain] = 1
    else:
        data_domains_counts[domain] = data_domains_counts[domain] + 1
display(len(data_domains_counts))

1632788

Top `ClickURL` values by number of records.

In [26]:
top = 100
with pandas.option_context('display.max_rows', top, 'display.min_rows', top):
    display(pandas.Series(data_domains_counts).sort_values(ascending=False).head(top))

http://www.google.com              366623
http://www.myspace.com             167070
http://www.yahoo.com               161082
http://en.wikipedia.org            122539
http://www.amazon.com              106119
http://www.imdb.com                 98549
http://www.mapquest.com             96136
http://www.ebay.com                 77947
http://mail.yahoo.com               53856
http://www.bankofamerica.com        48534
http://www.geocities.com            40547
http://www.hotmail.com              38391
http://www.ask.com                  37752
http://www.bizrate.com              32868
http://profile.myspace.com          31083
http://www.tripadvisor.com          31027
http://www.msn.com                  29781
http://www.craigslist.org           27769
http://cgi.ebay.com                 27643
http://www.nextag.com               27475
http://www.answers.com              26892
http://disney.go.com                26798
http://www.southwest.com            26084
http://www.superpages.com         

## Randomly remap `AnonID` to `RandID`

In [27]:
original_id = numpy.unique(data['AnonID'].to_numpy(copy=True)).tolist()

In [28]:
%%time
random.SystemRandom()
random_id = {}
for i in range(len(original_id)):
    temp = random.choice(original_id)
    random_id[temp] = i
    original_id.remove(temp)

CPU times: user 36min 23s, sys: 413 ms, total: 36min 23s
Wall time: 36min 28s


In [29]:
data['RandID'] = data['AnonID']

In [30]:
%%time
for row in data.itertuples():
    data.at[row.Index,'RandID'] = random_id[row.AnonID]

CPU times: user 27min 7s, sys: 353 ms, total: 27min 7s
Wall time: 27min 8s


In [31]:
data = data.drop(columns=['AnonID'], inplace=False)

Reorder columns.

In [32]:
data = data[['RandID','QueryTime','ClickURL']]

Sort records by `RandID` and `QueryTime`.

In [33]:
data = data.sort_values(by=['RandID','QueryTime'], ignore_index=True)

In [34]:
display(data)

Unnamed: 0,RandID,QueryTime,ClickURL
0,0,2006-03-06 13:59:48,http://chinesefood.about.com
1,0,2006-03-09 17:02:21,http://www.unclefed.com
2,0,2006-03-09 17:07:54,http://www.swanklaw.com
3,0,2006-03-09 17:09:53,http://www.ojd.state.or.us
4,0,2006-03-21 18:54:26,http://www.city-data.com
...,...,...,...
19442624,521691,2006-05-29 19:49:16,http://www.sing365.com
19442625,521691,2006-05-29 20:03:50,http://www.azlyrics.com
19442626,521691,2006-05-29 20:07:23,http://www.azlyrics.com
19442627,521691,2006-05-29 20:35:09,http://www.sing365.com


Number of unique values per attribute.

In [35]:
display(data.nunique(dropna=False))

RandID        521692
QueryTime    5470517
ClickURL     1632788
dtype: int64

## Remove additional ASCII control characters

In [36]:
count = 0
for row in data['ClickURL'].str.contains(r'\x0e|\x0f'):
    if row == True:
        count += 1
print(count)

2


In [37]:
data['ClickURL'] = data['ClickURL'].apply(lambda x: re.sub(r'\x0e|\x0f','',x))

Check for ASCII control characters.

In [38]:
%%time
for case in ascii_control:
    count = 0
    for row in data['ClickURL'].str.contains(case):
        if row == True:
            count += 1
    print(case + ' : ' + str(count) + '\n')

\x00 : 0

\x01 : 0

\x02 : 0

\x03 : 0

\x04 : 0

\x05 : 0

\x06 : 0

\x07 : 0

\x08 : 0

\x09 : 0

\x0a : 0

\x0b : 0

\x0c : 0

\x0d : 0

\x0e : 0

\x0f : 0

\x10 : 0

\x11 : 0

\x12 : 0

\x13 : 0

\x14 : 0

\x15 : 0

\x16 : 0

\x17 : 0

\x18 : 0

\x19 : 0

\x1a : 0

\x1b : 0

\x1c : 0

\x1d : 0

\x1e : 0

\x1f : 0

\x7f : 0

CPU times: user 13min 51s, sys: 1.42 s, total: 13min 53s
Wall time: 13min 53s


## Remove open and close brackets

Both `\x5b` (`[`) and `\x5d` (`]`) cause issues with `urllib.parse`'s `urlparse`.

In [39]:
count = 0
for row in data['ClickURL'].str.contains(r'\x5b|\x5d'):
    if row == True:
        count += 1
print(count)

11


In [40]:
data['ClickURL'] = data['ClickURL'].apply(lambda x: re.sub(r'\x5b|\x5d','',x))

In [41]:
count = 0
for row in data['ClickURL'].str.contains(r'\x5b|\x5d'):
    if row == True:
        count += 1
print(count)

0


## Domain definition

In [42]:
data['Domain'] = data['ClickURL']

In [43]:
%%time
for row in data.itertuples():
    temp = urlparse(row.ClickURL)
    temp_netloc = temp.netloc
    if temp_netloc != '':
        data.at[row.Index,'Domain'] = temp_netloc
    else:
        data.at[row.Index,'Domain'] = temp.path

CPU times: user 26min 44s, sys: 2.47 s, total: 26min 47s
Wall time: 26min 48s


In [44]:
display(data)

Unnamed: 0,RandID,QueryTime,ClickURL,Domain
0,0,2006-03-06 13:59:48,http://chinesefood.about.com,chinesefood.about.com
1,0,2006-03-09 17:02:21,http://www.unclefed.com,www.unclefed.com
2,0,2006-03-09 17:07:54,http://www.swanklaw.com,www.swanklaw.com
3,0,2006-03-09 17:09:53,http://www.ojd.state.or.us,www.ojd.state.or.us
4,0,2006-03-21 18:54:26,http://www.city-data.com,www.city-data.com
...,...,...,...,...
19442624,521691,2006-05-29 19:49:16,http://www.sing365.com,www.sing365.com
19442625,521691,2006-05-29 20:03:50,http://www.azlyrics.com,www.azlyrics.com
19442626,521691,2006-05-29 20:07:23,http://www.azlyrics.com,www.azlyrics.com
19442627,521691,2006-05-29 20:35:09,http://www.sing365.com,www.sing365.com


## Filtering by eTLD

Based on `tldextract` and on Mozilla's Public Suffix List.

In [45]:
extra_suffixes = ['bg.ac.yu','ac.yu','cg.yu','co.yu','edu.yu','gov.yu','net.yu','org.yu','yu','or.tp','tp','an']
extract = tldextract.TLDExtract(suffix_list_urls=["https://raw.githubusercontent.com/publicsuffix/list/5e6ac3a082505ac4cf08858bdb38382d9a912833/public_suffix_list.dat"],
                                cache_dir=None,fallback_to_snapshot=False,
                                extra_suffixes=extra_suffixes,include_psl_private_domains=True)

In [46]:
data['Treated'] = data['Domain']

In [47]:
%%time
check = []

for tup in data.itertuples():
    temp = extract(tup.Domain)
    if temp.suffix == '':
        check.append(tup.Index)
    elif (temp.domain == '') and (temp.suffix.count('.') == 0):
        check.append(tup.Index)
    elif (temp.domain == '') and (temp.suffix.count('.') != 0):
        data.at[tup.Index,'Treated'] = temp.suffix
    elif (re.search(r'^[w]+$',temp.domain) != None) and (temp.suffix.count('.') != 0):
        data.at[tup.Index,'Treated'] = temp.suffix
    elif re.search(r'^[w]+$',temp.domain) == None:
        if re.search(r'[^a-zA-Z0-9-]',temp.domain) != None:
            check.append(tup.Index)
        else:
            data.at[tup.Index,'Treated'] = temp.domain.strip('-') + '.' + temp.suffix
    else:
        check.append(tup.Index)

CPU times: user 30min 27s, sys: 2.77 s, total: 30min 30s
Wall time: 30min 32s


In [48]:
display(data)

Unnamed: 0,RandID,QueryTime,ClickURL,Domain,Treated
0,0,2006-03-06 13:59:48,http://chinesefood.about.com,chinesefood.about.com,about.com
1,0,2006-03-09 17:02:21,http://www.unclefed.com,www.unclefed.com,unclefed.com
2,0,2006-03-09 17:07:54,http://www.swanklaw.com,www.swanklaw.com,swanklaw.com
3,0,2006-03-09 17:09:53,http://www.ojd.state.or.us,www.ojd.state.or.us,state.or.us
4,0,2006-03-21 18:54:26,http://www.city-data.com,www.city-data.com,city-data.com
...,...,...,...,...,...
19442624,521691,2006-05-29 19:49:16,http://www.sing365.com,www.sing365.com,sing365.com
19442625,521691,2006-05-29 20:03:50,http://www.azlyrics.com,www.azlyrics.com,azlyrics.com
19442626,521691,2006-05-29 20:07:23,http://www.azlyrics.com,www.azlyrics.com,azlyrics.com
19442627,521691,2006-05-29 20:35:09,http://www.sing365.com,www.sing365.com,sing365.com


In [49]:
data = data.drop(check)

In [50]:
data = data.reset_index(drop=True)

In [51]:
display(data)

Unnamed: 0,RandID,QueryTime,ClickURL,Domain,Treated
0,0,2006-03-06 13:59:48,http://chinesefood.about.com,chinesefood.about.com,about.com
1,0,2006-03-09 17:02:21,http://www.unclefed.com,www.unclefed.com,unclefed.com
2,0,2006-03-09 17:07:54,http://www.swanklaw.com,www.swanklaw.com,swanklaw.com
3,0,2006-03-09 17:09:53,http://www.ojd.state.or.us,www.ojd.state.or.us,state.or.us
4,0,2006-03-21 18:54:26,http://www.city-data.com,www.city-data.com,city-data.com
...,...,...,...,...,...
19426288,521691,2006-05-29 19:49:16,http://www.sing365.com,www.sing365.com,sing365.com
19426289,521691,2006-05-29 20:03:50,http://www.azlyrics.com,www.azlyrics.com,azlyrics.com
19426290,521691,2006-05-29 20:07:23,http://www.azlyrics.com,www.azlyrics.com,azlyrics.com
19426291,521691,2006-05-29 20:35:09,http://www.sing365.com,www.sing365.com,sing365.com


## Final data

Check for ASCII special characters.

In [52]:
%%time
ascii_special_cases = {}
for case in ascii_special:
    i = 0
    temp = []
    count = 0
    for row in data['Treated'].str.contains(case):
        if row == True:
            temp.append(row)
            count += 1
        i += 1
    print(case + ' : ' + str(count) + '\n')
    if len(temp) > 0:
        ascii_special_cases[case] = temp

\x20 : 0

\x21 : 0

\x22 : 0

\x23 : 0

\x24 : 0

\x25 : 0

\x26 : 0

\x27 : 0

\x28 : 0

\x29 : 0

\x2a : 0

\x2b : 0

\x2c : 0

\x2d : 1027509

\x2e : 19426293

\x2f : 0

\x3a : 0

\x3b : 0

\x3c : 0

\x3d : 0

\x3e : 0

\x3f : 0

\x40 : 0

\x5b : 0

\x5c : 0

\x5d : 0

\x5e : 0

\x5f : 0

\x60 : 0

\x7b : 0

\x7c : 0

\x7d : 0

\x7e : 0

CPU times: user 13min 45s, sys: 1.3 s, total: 13min 46s
Wall time: 13min 47s


Update data types and drop unnecessary columns.

In [53]:
display(data)

Unnamed: 0,RandID,QueryTime,ClickURL,Domain,Treated
0,0,2006-03-06 13:59:48,http://chinesefood.about.com,chinesefood.about.com,about.com
1,0,2006-03-09 17:02:21,http://www.unclefed.com,www.unclefed.com,unclefed.com
2,0,2006-03-09 17:07:54,http://www.swanklaw.com,www.swanklaw.com,swanklaw.com
3,0,2006-03-09 17:09:53,http://www.ojd.state.or.us,www.ojd.state.or.us,state.or.us
4,0,2006-03-21 18:54:26,http://www.city-data.com,www.city-data.com,city-data.com
...,...,...,...,...,...
19426288,521691,2006-05-29 19:49:16,http://www.sing365.com,www.sing365.com,sing365.com
19426289,521691,2006-05-29 20:03:50,http://www.azlyrics.com,www.azlyrics.com,azlyrics.com
19426290,521691,2006-05-29 20:07:23,http://www.azlyrics.com,www.azlyrics.com,azlyrics.com
19426291,521691,2006-05-29 20:35:09,http://www.sing365.com,www.sing365.com,sing365.com


In [54]:
data = data.drop(columns=['ClickURL', 'Domain'], inplace=False)

In [55]:
data = data.rename(columns={"Treated": "Domain"})

In [56]:
display(data)

Unnamed: 0,RandID,QueryTime,Domain
0,0,2006-03-06 13:59:48,about.com
1,0,2006-03-09 17:02:21,unclefed.com
2,0,2006-03-09 17:07:54,swanklaw.com
3,0,2006-03-09 17:09:53,state.or.us
4,0,2006-03-21 18:54:26,city-data.com
...,...,...,...
19426288,521691,2006-05-29 19:49:16,sing365.com
19426289,521691,2006-05-29 20:03:50,azlyrics.com
19426290,521691,2006-05-29 20:07:23,azlyrics.com
19426291,521691,2006-05-29 20:35:09,sing365.com


### Final data statistics

Number of unique values per attribute.

In [57]:
display(data.nunique(dropna=False))

RandID        521607
QueryTime    5469196
Domain       1300484
dtype: int64

`RandID` statistics.

In [58]:
display(data['RandID'].value_counts().describe([.25,.50,.75,.80,.85,.90,.95,.96,.97,.98,.99]))

count    521607.000000
mean         37.243160
std         226.665095
min           1.000000
25%           3.000000
50%          10.000000
75%          33.000000
80%          44.000000
85%          62.000000
90%          92.000000
95%         160.000000
96%         186.000000
97%         224.000000
98%         284.000000
99%         404.000000
max      150802.000000
Name: count, dtype: float64

Number of `RandID`s per count of records, e.g. 1 `RandID` has 150802 records and 70406 `RandID`s have 1 record.

In [59]:
with pandas.option_context('display.max_rows', None):
    display(data['RandID'].value_counts().value_counts().sort_index(ascending=False).sort_index(ascending=False))

count
150802        1
6227          1
5741          1
4272          1
3775          1
3516          1
3445          1
3265          1
3238          1
3074          1
2954          1
2887          1
2879          1
2874          1
2791          1
2786          1
2752          1
2712          1
2697          1
2696          1
2645          1
2579          1
2571          1
2508          1
2466          1
2464          1
2410          1
2354          1
2325          1
2311          1
2303          1
2298          1
2297          1
2265          1
2237          1
2236          1
2212          1
2192          1
2189          1
2187          1
2180          1
2165          1
2145          1
2129          1
2089          1
2075          1
2072          1
2036          1
2026          1
2025          1
2021          1
2002          1
2000          1
1995          1
1994          1
1981          1
1975          1
1925          1
1877          1
1874          1
1839          1
1827          1
18

Date and time range.

In [60]:
display(data['QueryTime'].min())

Timestamp('2006-03-01 00:01:04')

In [61]:
display(data['QueryTime'].max())

Timestamp('2006-05-31 23:59:59')

Number of unique `Domain` values.

In [62]:
data_domains = data['Domain'].to_list()

data_domains_counts = {}
for domain in data_domains:
    if domain not in data_domains_counts:
        data_domains_counts[domain] = 1
    else:
        data_domains_counts[domain] = data_domains_counts[domain] + 1
display(len(data_domains_counts))

1300484

Top `Domain` values by number of records.

In [63]:
top = 100
with pandas.option_context('display.max_rows', top, 'display.min_rows', top):
    display(pandas.Series(data_domains_counts).sort_values(ascending=False).head(top))

yahoo.com                438816
google.com               404663
myspace.com              228687
about.com                211395
ebay.com                 162841
wikipedia.org            125343
amazon.com               106647
msn.com                  104408
imdb.com                 103844
mapquest.com             101887
go.com                    84490
aol.com                   66069
craigslist.org            65153
nih.gov                   49781
bankofamerica.com         49362
geocities.com             41663
citysearch.com            41372
ask.com                   39935
hotmail.com               38392
bizrate.com               34706
tripadvisor.com           32624
tripod.com                32334
superpages.com            31453
nextag.com                27970
answers.com               26997
southwest.com             26096
microsoft.com             25028
azlyrics.com              24987
ca.gov                    24694
tv.com                    23474
irs.gov                   23378
cnn.com 

### Save data to file

In [64]:
display(data)

Unnamed: 0,RandID,QueryTime,Domain
0,0,2006-03-06 13:59:48,about.com
1,0,2006-03-09 17:02:21,unclefed.com
2,0,2006-03-09 17:07:54,swanklaw.com
3,0,2006-03-09 17:09:53,state.or.us
4,0,2006-03-21 18:54:26,city-data.com
...,...,...,...
19426288,521691,2006-05-29 19:49:16,sing365.com
19426289,521691,2006-05-29 20:03:50,azlyrics.com
19426290,521691,2006-05-29 20:07:23,azlyrics.com
19426291,521691,2006-05-29 20:35:09,sing365.com


In [65]:
data[['RandID','QueryTime','Domain']].to_csv('AOL-treated.csv', index=True)

### Save unique domains to file

In [66]:
unique_domains = list(set(data['Domain'].to_list()))

In [67]:
unique = pandas.Series(unique_domains)

In [68]:
unique.to_csv('AOL-treated-unique-domains.csv', index=True)