# AOL Experimental Dataset (Google Topics v1)

In [1]:
import pandas, random
from collections import Counter

## AOL reduced dataset with Google Topics Classification v1

In [2]:
%%time
# Reads the AOL-reduced Google Topics Classification v1 CSV into the aol_reduced_google DataFrame.
# The file's first column becomes the index; malformed lines are skipped with a warning.
aol_reduced_google = pandas.read_csv(
    'AOL-reduced-Google-Topics-Classification-v1.csv',
    index_col=0,
    on_bad_lines='warn',
    low_memory=False,
)

CPU times: user 3.04 s, sys: 317 ms, total: 3.36 s
Wall time: 3.34 s


In [3]:
%%time
# Converts the QueryTime column to pandas datetimes so it can be compared and aggregated as time.
aol_reduced_google['QueryTime'] = pandas.to_datetime(aol_reduced_google['QueryTime'])

CPU times: user 803 ms, sys: 8.24 ms, total: 811 ms
Wall time: 800 ms


In [4]:
# Shows the loaded frame (pandas truncates the rendering to the first and last rows).
display(aol_reduced_google)

Unnamed: 0,RandID,QueryTime,Domain,topics
0,1,2006-03-01 11:54:19,kidshealth.org,"'258','276'"
1,1,2006-03-01 11:54:19,kidshealth.org,"'258','276'"
2,1,2006-03-01 11:54:19,kidshealth.org,"'258','276'"
3,1,2006-05-09 19:13:53,bankofamerica.com,'149'
4,1,2006-05-22 16:46:29,bankofamerica.com,'149'
...,...,...,...,...
2493890,521691,2006-03-07 21:29:17,ups.com,'103'
2493891,521691,2006-03-07 21:41:20,fedex.com,'103'
2493892,521691,2006-03-07 21:42:51,ups.com,'103'
2493893,521691,2006-03-15 19:24:17,google.com,'219'


### Drop singletons

In [5]:
%%time
# Collects the RandIDs whose browsing history contains exactly one distinct domain;
# their records are dropped in the next cell. Prints how many such RandIDs there are.
temp = aol_reduced_google.groupby('RandID')[['Domain']].nunique()
singletons = temp.loc[temp['Domain'].eq(1)].index.to_list()
print(len(singletons))

127359
CPU times: user 621 ms, sys: 47.5 ms, total: 669 ms
Wall time: 666 ms


In [6]:
# Keeps only the records whose RandID is not in the singletons list.
aol_reduced_google = aol_reduced_google.loc[~aol_reduced_google['RandID'].isin(singletons)]

In [7]:
# Renumbers the rows 0..n-1 after the filtering above; drop=True discards the old index.
aol_reduced_google = aol_reduced_google.reset_index(drop=True)

In [8]:
# Shows the frame after the singleton RandIDs were removed and the index was reset.
display(aol_reduced_google)

Unnamed: 0,RandID,QueryTime,Domain,topics
0,1,2006-03-01 11:54:19,kidshealth.org,"'258','276'"
1,1,2006-03-01 11:54:19,kidshealth.org,"'258','276'"
2,1,2006-03-01 11:54:19,kidshealth.org,"'258','276'"
3,1,2006-05-09 19:13:53,bankofamerica.com,'149'
4,1,2006-05-22 16:46:29,bankofamerica.com,'149'
...,...,...,...,...
2262666,521691,2006-03-07 21:29:17,ups.com,'103'
2262667,521691,2006-03-07 21:41:20,fedex.com,'103'
2262668,521691,2006-03-07 21:42:51,ups.com,'103'
2262669,521691,2006-03-15 19:24:17,google.com,'219'


### Drop outlier

In [9]:
%%time
# Counts the records per RandID, most frequent first.
# One individual has far more records than any other and is most probably a bot, to be dropped.
display(aol_reduced_google['RandID'].value_counts())

RandID
463921    19011
132847      933
342660      851
182883      739
196292      723
          ...  
89596         2
199364        2
199377        2
199380        2
261173        2
Name: count, Length: 198024, dtype: int64

CPU times: user 40.6 ms, sys: 8.24 ms, total: 48.8 ms
Wall time: 47.9 ms


In [10]:
# Keeps every record except those of RandID 463921, the top entry of the value_counts() listing above.
aol_reduced_google = aol_reduced_google[aol_reduced_google['RandID'] != 463921]

In [11]:
# Renumbers the rows 0..n-1 after the outlier's records were removed; drop=True discards the old index.
aol_reduced_google = aol_reduced_google.reset_index(drop=True)

In [12]:
# Shows the frame after the outlier RandID was removed and the index was reset.
display(aol_reduced_google)

Unnamed: 0,RandID,QueryTime,Domain,topics
0,1,2006-03-01 11:54:19,kidshealth.org,"'258','276'"
1,1,2006-03-01 11:54:19,kidshealth.org,"'258','276'"
2,1,2006-03-01 11:54:19,kidshealth.org,"'258','276'"
3,1,2006-05-09 19:13:53,bankofamerica.com,'149'
4,1,2006-05-22 16:46:29,bankofamerica.com,'149'
...,...,...,...,...
2243655,521691,2006-03-07 21:29:17,ups.com,'103'
2243656,521691,2006-03-07 21:41:20,fedex.com,'103'
2243657,521691,2006-03-07 21:42:51,ups.com,'103'
2243658,521691,2006-03-15 19:24:17,google.com,'219'


### Statistics

In [13]:
%%time
# Prints every column with its dtype and non-null count, plus the deep (object-aware) memory usage.
aol_reduced_google.info(verbose=True, memory_usage='deep', show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2243660 entries, 0 to 2243659
Data columns (total 4 columns):
 #   Column     Non-Null Count    Dtype         
---  ------     --------------    -----         
 0   RandID     2243660 non-null  int64         
 1   QueryTime  2243660 non-null  datetime64[ns]
 2   Domain     2243660 non-null  object        
 3   topics     2243660 non-null  object        
dtypes: datetime64[ns](1), int64(1), object(2)
memory usage: 317.7 MB
CPU times: user 850 ms, sys: 4.16 ms, total: 854 ms
Wall time: 845 ms


Number of unique values per attribute.

In [14]:
# Counts the distinct values of each column; dropna=False makes a missing value count as a value too.
display(aol_reduced_google.nunique(dropna=False))

RandID        198023
QueryTime    1613469
Domain          2652
topics           402
dtype: int64

`RandID` statistics.

In [15]:
# Summary statistics of the number of records per RandID, with extra upper percentiles.
display(
    aol_reduced_google['RandID']
    .value_counts()
    .describe(percentiles=[.25, .50, .75, .80, .85, .90, .95, .96, .97, .98, .99])
)

count    198023.000000
mean         11.330300
std          20.183602
min           2.000000
25%           3.000000
50%           6.000000
75%          12.000000
80%          14.000000
85%          18.000000
90%          24.000000
95%          38.000000
96%          43.000000
97%          51.000000
98%          63.000000
99%          90.000000
max         933.000000
Name: count, dtype: float64

Number of `RandID`s per count of records, e.g. 1 `RandID` has 933 records and 33249 `RandID`s have 2 records.

In [16]:
# For each record count, shows how many RandIDs have exactly that many records,
# ordered from the largest record count down. max_rows=None lifts the row limit so
# the whole distribution is rendered.
with pandas.option_context('display.max_rows', None):
    display(aol_reduced_google['RandID'].value_counts().value_counts().sort_index(ascending=False))

count
933        1
851        1
739        1
723        1
717        1
657        1
622        1
581        1
577        1
565        1
561        2
555        1
548        1
547        1
542        1
541        1
530        1
527        1
523        1
517        1
516        1
502        1
500        1
498        1
497        1
487        1
486        1
475        2
474        1
473        1
450        1
448        1
446        1
445        1
431        1
424        1
423        1
418        1
412        3
410        1
406        1
392        1
389        2
388        1
387        1
383        2
380        2
379        1
374        2
372        1
370        2
367        2
365        1
363        2
361        1
356        1
354        1
350        1
348        2
347        1
345        1
343        1
342        1
341        1
339        1
338        1
335        2
332        1
330        1
327        1
326        1
325        2
324        1
322        2
318        3
317        1
315   

Date and time range.

In [17]:
# Earliest QueryTime in the dataset.
display(aol_reduced_google['QueryTime'].min())

Timestamp('2006-03-01 00:01:04')

In [18]:
# Latest QueryTime in the dataset.
display(aol_reduced_google['QueryTime'].max())

Timestamp('2006-05-31 23:59:58')

Number of unique `Domain` values.

In [19]:
%%time
# Counts the records per Domain into a plain dict (domain -> number of records)
# and shows how many distinct domains there are.
aol_reduced_google_domains_counts = dict(Counter(aol_reduced_google['Domain'].to_list()))
display(len(aol_reduced_google_domains_counts))

2652

CPU times: user 145 ms, sys: 447 µs, total: 146 ms
Wall time: 144 ms


Top `Domain` values by number of records.

In [20]:
# Number of most frequent domains to show.
top = 100
# Raises the row-display options to `top` so that all of the selected rows are rendered.
with pandas.option_context('display.max_rows', top, 'display.min_rows', top):
    # Sorts the per-domain record counts in decreasing order and keeps the first `top` entries.
    display(pandas.Series(aol_reduced_google_domains_counts).sort_values(ascending=False).head(top))

google.com                323106
ebay.com                  151782
wikipedia.org             118886
amazon.com                101173
imdb.com                   98433
msn.com                    95355
mapquest.com               88099
aol.com                    56887
bankofamerica.com          43168
tripadvisor.com            30566
answers.com                25827
southwest.com              22885
microsoft.com              22219
azlyrics.com               21988
cnn.com                    21765
walmart.com                21198
irs.gov                    20724
nytimes.com                19289
mlb.com                    17500
target.com                 15409
bbc.co.uk                  14876
pogo.com                   14577
wellsfargo.com             13385
ign.com                    12840
weather.com                12756
usps.com                   12690
fidelity.com               11958
allrecipes.com             11565
qvc.com                    10993
lowes.com                  10115
kbb.com   

### Define browsing histories and lists of topics

In [21]:
%%time
# Adds a BrowsingHistory column holding, for each record, the tuple (Domain, QueryTime).
aol_reduced_google['BrowsingHistory'] = list(zip(aol_reduced_google.Domain, aol_reduced_google.QueryTime))

CPU times: user 7.86 s, sys: 261 ms, total: 8.12 s
Wall time: 8.11 s


In [22]:
# Shows the frame with the new BrowsingHistory column.
display(aol_reduced_google)

Unnamed: 0,RandID,QueryTime,Domain,topics,BrowsingHistory
0,1,2006-03-01 11:54:19,kidshealth.org,"'258','276'","(kidshealth.org, 2006-03-01 11:54:19)"
1,1,2006-03-01 11:54:19,kidshealth.org,"'258','276'","(kidshealth.org, 2006-03-01 11:54:19)"
2,1,2006-03-01 11:54:19,kidshealth.org,"'258','276'","(kidshealth.org, 2006-03-01 11:54:19)"
3,1,2006-05-09 19:13:53,bankofamerica.com,'149',"(bankofamerica.com, 2006-05-09 19:13:53)"
4,1,2006-05-22 16:46:29,bankofamerica.com,'149',"(bankofamerica.com, 2006-05-22 16:46:29)"
...,...,...,...,...,...
2243655,521691,2006-03-07 21:29:17,ups.com,'103',"(ups.com, 2006-03-07 21:29:17)"
2243656,521691,2006-03-07 21:41:20,fedex.com,'103',"(fedex.com, 2006-03-07 21:41:20)"
2243657,521691,2006-03-07 21:42:51,ups.com,'103',"(ups.com, 2006-03-07 21:42:51)"
2243658,521691,2006-03-15 19:24:17,google.com,'219',"(google.com, 2006-03-15 19:24:17)"


In [23]:
%%time
# Splits each record's comma-separated topics string into a list of topic tokens.
# Note: only the commas are removed; any other character of the source string (such as the
# single quotes around each topic id, e.g. "'258'") stays part of the token.
aol_reduced_google['topics'] = aol_reduced_google['topics'].map(lambda x : x.split(","))

CPU times: user 2.93 s, sys: 256 ms, total: 3.19 s
Wall time: 3.17 s


In [24]:
# Shows the frame after the topics strings were split into lists.
display(aol_reduced_google)

Unnamed: 0,RandID,QueryTime,Domain,topics,BrowsingHistory
0,1,2006-03-01 11:54:19,kidshealth.org,"['258', '276']","(kidshealth.org, 2006-03-01 11:54:19)"
1,1,2006-03-01 11:54:19,kidshealth.org,"['258', '276']","(kidshealth.org, 2006-03-01 11:54:19)"
2,1,2006-03-01 11:54:19,kidshealth.org,"['258', '276']","(kidshealth.org, 2006-03-01 11:54:19)"
3,1,2006-05-09 19:13:53,bankofamerica.com,['149'],"(bankofamerica.com, 2006-05-09 19:13:53)"
4,1,2006-05-22 16:46:29,bankofamerica.com,['149'],"(bankofamerica.com, 2006-05-22 16:46:29)"
...,...,...,...,...,...
2243655,521691,2006-03-07 21:29:17,ups.com,['103'],"(ups.com, 2006-03-07 21:29:17)"
2243656,521691,2006-03-07 21:41:20,fedex.com,['103'],"(fedex.com, 2006-03-07 21:41:20)"
2243657,521691,2006-03-07 21:42:51,ups.com,['103'],"(ups.com, 2006-03-07 21:42:51)"
2243658,521691,2006-03-15 19:24:17,google.com,['219'],"(google.com, 2006-03-15 19:24:17)"


In [25]:
%%time
# Builds one row per RandID (in order of first appearance, since sort=False) whose
# BrowsingHistory and topics cells are the lists of that individual's per-record values.
aol_reduced_google_generalization = (
    aol_reduced_google
    .groupby('RandID', sort=False)[['BrowsingHistory', 'topics']]
    .agg(list)
    .reset_index()
)

CPU times: user 14.3 s, sys: 145 ms, total: 14.4 s
Wall time: 14.4 s


In [26]:
# Shows the per-RandID frame; topics is still a list of per-record topic lists here.
display(aol_reduced_google_generalization)

Unnamed: 0,RandID,BrowsingHistory,topics
0,1,"[(kidshealth.org, 2006-03-01 11:54:19), (kidsh...","[['258', '276'], ['258', '276'], ['258', '276'..."
1,3,"[(google.com, 2006-03-18 15:06:00), (cdc.gov, ...","[['219'], ['239'], ['289'], ['258', '276'], ['..."
2,4,"[(foxnews.com, 2006-03-12 20:33:21), (google.c...","[['1', '243'], ['219']]"
3,5,"[(foxnews.com, 2006-03-22 00:22:53), (uhaul.co...","[['1', '243'], ['103'], ['1', '215', '243']]"
4,6,"[(google.com, 2006-03-06 15:07:17), (google.co...","[['219'], ['219'], ['219'], ['219'], ['219'], ..."
...,...,...,...
198018,521683,"[(imdb.com, 2006-04-08 17:05:00), (cbs.com, 20...","[['12', '45'], ['1', '243']]"
198019,521686,"[(flickr.com, 2006-03-12 21:19:11), (kbb.com, ...","[['42', '250', '253'], ['57', '62', '83'], ['2..."
198020,521688,"[(aol.com, 2006-03-16 16:59:47), (ebay.com, 20...","[['1', '215', '243'], ['289'], ['275']]"
198021,521689,"[(msn.com, 2006-03-04 01:07:45), (google.com, ...","[['1', '215', '243'], ['219'], ['219'], ['219'..."


In [27]:
%%time
# Flattens each individual's list of per-record topic lists into a single flat list of topics,
# keeping the original order and any repetitions.
aol_reduced_google_generalization['topics'] = aol_reduced_google_generalization['topics'].map(
    lambda per_record: [topic for record_topics in per_record for topic in record_topics]
)

CPU times: user 499 ms, sys: 28.3 ms, total: 527 ms
Wall time: 525 ms


In [28]:
# Shows the per-RandID frame after the topics lists were flattened.
display(aol_reduced_google_generalization)

Unnamed: 0,RandID,BrowsingHistory,topics
0,1,"[(kidshealth.org, 2006-03-01 11:54:19), (kidsh...","['258', '276', '258', '276', '258', '276', '14..."
1,3,"[(google.com, 2006-03-18 15:06:00), (cdc.gov, ...","['219', '239', '289', '258', '276', '258', '27..."
2,4,"[(foxnews.com, 2006-03-12 20:33:21), (google.c...","['1', '243', '219']"
3,5,"[(foxnews.com, 2006-03-22 00:22:53), (uhaul.co...","['1', '243', '103', '1', '215', '243']"
4,6,"[(google.com, 2006-03-06 15:07:17), (google.co...","['219', '219', '219', '219', '219', '219', '21..."
...,...,...,...
198018,521683,"[(imdb.com, 2006-04-08 17:05:00), (cbs.com, 20...","['12', '45', '1', '243']"
198019,521686,"[(flickr.com, 2006-03-12 21:19:11), (kbb.com, ...","['42', '250', '253', '57', '62', '83', '248', ..."
198020,521688,"[(aol.com, 2006-03-16 16:59:47), (ebay.com, 20...","['1', '215', '243', '289', '275']"
198021,521689,"[(msn.com, 2006-03-04 01:07:45), (google.com, ...","['1', '215', '243', '219', '219', '219', '334'..."


In [29]:
# Renames the flattened topics column to AllTopics.
aol_reduced_google_generalization = aol_reduced_google_generalization.rename(columns={'topics' : 'AllTopics'})

In [30]:
%%time
# Gathers the distinct topics that occur in any AllTopics list into a sorted list
# and shows how many there are.
aol_reduced_google_seen_topics = sorted(
    {topic for topic_list in aol_reduced_google_generalization['AllTopics'] for topic in topic_list}
)
display(len(aol_reduced_google_seen_topics))

169

CPU times: user 324 ms, sys: 0 ns, total: 324 ms
Wall time: 319 ms


In [31]:
%%time
# Adds an sTopics column holding, for each individual, a dict mapping every topic in AllTopics
# to its number of occurrences. A later cell replaces this dict with the selected topics.
aol_reduced_google_generalization['sTopics'] = aol_reduced_google_generalization['AllTopics'].map(lambda x : dict(Counter(x)))

CPU times: user 757 ms, sys: 44.1 ms, total: 801 ms
Wall time: 797 ms


In [32]:
# Shows the per-RandID frame with the topic -> count dictionaries in sTopics.
display(aol_reduced_google_generalization)

Unnamed: 0,RandID,BrowsingHistory,AllTopics,sTopics
0,1,"[(kidshealth.org, 2006-03-01 11:54:19), (kidsh...","['258', '276', '258', '276', '258', '276', '14...","{''258'': 3, ''276'': 3, ''149'': 2}"
1,3,"[(google.com, 2006-03-18 15:06:00), (cdc.gov, ...","['219', '239', '289', '258', '276', '258', '27...","{''219'': 2, ''239'': 1, ''289'': 1, ''258'': ..."
2,4,"[(foxnews.com, 2006-03-12 20:33:21), (google.c...","['1', '243', '219']","{''1'': 1, ''243'': 1, ''219'': 1}"
3,5,"[(foxnews.com, 2006-03-22 00:22:53), (uhaul.co...","['1', '243', '103', '1', '215', '243']","{''1'': 2, ''243'': 2, ''103'': 1, ''215'': 1}"
4,6,"[(google.com, 2006-03-06 15:07:17), (google.co...","['219', '219', '219', '219', '219', '219', '21...","{''219'': 10, ''12'': 2, ''289'': 1, ''299'': ..."
...,...,...,...,...
198018,521683,"[(imdb.com, 2006-04-08 17:05:00), (cbs.com, 20...","['12', '45', '1', '243']","{''12'': 1, ''45'': 1, ''1'': 1, ''243'': 1}"
198019,521686,"[(flickr.com, 2006-03-12 21:19:11), (kbb.com, ...","['42', '250', '253', '57', '62', '83', '248', ...","{''42'': 1, ''250'': 1, ''253'': 1, ''57'': 1,..."
198020,521688,"[(aol.com, 2006-03-16 16:59:47), (ebay.com, 20...","['1', '215', '243', '289', '275']","{''1'': 1, ''215'': 1, ''243'': 1, ''289'': 1,..."
198021,521689,"[(msn.com, 2006-03-04 01:07:45), (google.com, ...","['1', '215', '243', '219', '219', '219', '334'...","{''1'': 1, ''215'': 1, ''243'': 2, ''219'': 5,..."


In [33]:
def s_top(topics, s, allTopics, rng=None):
    """Select exactly ``s`` topics for one individual and return them sorted.

    Selection rules:

    * exactly ``s`` topics seen: all of them are returned;
    * fewer than ``s`` topics seen: the missing ones are drawn at random from the
      entries of ``allTopics`` that the individual has not seen;
    * more than ``s`` topics seen: the ``s`` most frequent are kept; when several
      topics tie for the last places, the tie is broken at random.

    Parameters
    ----------
    topics : dict
        Maps each topic seen by the individual to its number of occurrences.
    s : int
        Number of topics to return; must be non-negative.
    allTopics : iterable
        Topics from which padding is drawn when fewer than ``s`` topics were seen.
    rng : optional
        Object exposing ``sample(population, k)``, e.g. ``random.Random(seed)`` or
        ``random.SystemRandom()``. Defaults to the module-level ``random`` generator.

    Returns
    -------
    list
        ``s`` distinct topics, ordered by ``sorted``.

    Raises
    ------
    ValueError
        If ``s`` is negative, or (raised by ``sample``) if ``allTopics`` does not
        contain enough unseen topics to pad the list up to ``s``.
    """
    if s < 0:
        raise ValueError("s must be non-negative")
    if rng is None:
        rng = random

    seen = list(topics)

    # Exactly S topics seen: nothing to choose.
    if len(seen) == s:
        return sorted(seen)

    # Fewer than S topics seen: pad with unseen topics chosen at random.
    if len(seen) < s:
        # The pool keeps the order of allTopics (duplicates removed). Building it from a
        # set difference would order it by string hash, which changes between interpreter
        # runs, so even a seeded generator would not reproduce the same padding.
        pool = [topic for topic in dict.fromkeys(allTopics) if topic not in topics]
        return sorted(seen + rng.sample(pool, k=s - len(seen)))

    # More than S topics seen, but nothing requested.
    if s == 0:
        return []

    # More than S topics seen: rank once by decreasing count. The sort is stable, so
    # topics with equal counts stay in the dictionary's insertion order.
    ranked = sorted(seen, key=topics.get, reverse=True)
    # Count of the topic in the S-th position; topics above it are always kept.
    cutoff = topics[ranked[s - 1]]
    above = [topic for topic in ranked if topics[topic] > cutoff]
    tied = [topic for topic in ranked if topics[topic] == cutoff]
    missing = s - len(above)
    # Sample only when the tie group overflows the remaining places, so that no random
    # draw is consumed when the choice is forced.
    if missing < len(tied):
        picked = rng.sample(tied, k=missing)
    else:
        picked = tied
    return sorted(above + picked)

In [34]:
%%time
# Selects S topics to be in sTopics list for each RandID.
# s_top draws from the module-level `random` generator, which is seeded here so that the
# sequence of random draws is repeatable. (Merely instantiating random.SystemRandom() without
# using the returned object would not affect those draws.)
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
S = 5
# The topic counts are rebuilt from AllTopics rather than read back from the sTopics column
# that this statement overwrites, so the cell can be re-run without failing on its own output.
aol_reduced_google_generalization['sTopics'] = aol_reduced_google_generalization['AllTopics'].map(lambda x : s_top(dict(Counter(x)), S, aol_reduced_google_seen_topics))

CPU times: user 4.84 s, sys: 51.9 ms, total: 4.89 s
Wall time: 4.88 s


In [35]:
# Shows the per-RandID frame with the selected topics in sTopics.
display(aol_reduced_google_generalization)

Unnamed: 0,RandID,BrowsingHistory,AllTopics,sTopics
0,1,"[(kidshealth.org, 2006-03-01 11:54:19), (kidsh...","['258', '276', '258', '276', '258', '276', '14...","['149', '234', '258', '276', '56']"
1,3,"[(google.com, 2006-03-18 15:06:00), (cdc.gov, ...","['219', '239', '289', '258', '276', '258', '27...","['1', '180', '186', '258', '276']"
2,4,"[(foxnews.com, 2006-03-12 20:33:21), (google.c...","['1', '243', '219']","['1', '217', '219', '243', '306']"
3,5,"[(foxnews.com, 2006-03-22 00:22:53), (uhaul.co...","['1', '243', '103', '1', '215', '243']","['1', '103', '215', '239', '243']"
4,6,"[(google.com, 2006-03-06 15:07:17), (google.co...","['219', '219', '219', '219', '219', '219', '21...","['12', '219', '289', '299', '314']"
...,...,...,...,...
198018,521683,"[(imdb.com, 2006-04-08 17:05:00), (cbs.com, 20...","['12', '45', '1', '243']","['1', '12', '230', '243', '45']"
198019,521686,"[(flickr.com, 2006-03-12 21:19:11), (kbb.com, ...","['42', '250', '253', '57', '62', '83', '248', ...","['248', '250', '253', '62', '83']"
198020,521688,"[(aol.com, 2006-03-16 16:59:47), (ebay.com, 20...","['1', '215', '243', '289', '275']","['1', '215', '243', '275', '289']"
198021,521689,"[(msn.com, 2006-03-04 01:07:45), (google.com, ...","['1', '215', '243', '219', '219', '219', '334'...","['1', '215', '219', '243', '334']"


In [36]:
# Verifies that each individual's sTopics list holds exactly S entries.
display(aol_reduced_google_generalization['sTopics'].map(len).eq(S).all())

True

### Save to file

In [37]:
# Writes the per-RandID frame (RandID, BrowsingHistory, AllTopics, sTopics) to a CSV file
# in the current working directory.
aol_reduced_google_generalization.to_csv('AOL-experimental-Google-Topics-Classification-v1.csv')