# AOL Experimental Dataset (Google Topics v1)

In [1]:
import pandas, random

## AOL reduced dataset with Google Topics Classification v1

In [2]:
%%time
aol_reduced_google = pandas.read_csv('AOL-reduced-Google-Topics-Classification-v1.csv', low_memory=False, on_bad_lines='warn', index_col=0)

CPU times: user 4.13 s, sys: 357 ms, total: 4.49 s
Wall time: 4.51 s


In [3]:
%%time
aol_reduced_google['QueryTime'] = pandas.to_datetime(aol_reduced_google['QueryTime'])

CPU times: user 2.01 s, sys: 6.19 ms, total: 2.02 s
Wall time: 2.01 s


In [4]:
display(aol_reduced_google)

Unnamed: 0,RandID,QueryTime,Domain,topics
0,0,2006-04-20 17:37:26,imdb.com,"'12','45'"
1,13,2006-03-10 11:51:57,imdb.com,"'12','45'"
2,13,2006-03-25 05:10:47,imdb.com,"'12','45'"
3,20,2006-04-07 21:13:02,imdb.com,"'12','45'"
4,21,2006-05-15 21:18:53,imdb.com,"'12','45'"
...,...,...,...,...
2495271,515243,2006-05-24 21:46:51,computrabajo.com.pe,'236'
2495272,515761,2006-04-18 19:49:49,atv.com.tr,"'1','48'"
2495273,515761,2006-04-18 19:49:49,atv.com.tr,"'1','48'"
2495274,515761,2006-04-18 19:54:33,atv.com.tr,"'1','48'"


### Drop singletons

In [5]:
%%time
singletons = (aol_reduced_google[['RandID','Domain']].groupby('RandID').count() == 1).index[(aol_reduced_google[['RandID','Domain']].groupby('RandID').count() == 1)['Domain'].to_list()]

CPU times: user 1.01 s, sys: 23.1 ms, total: 1.03 s
Wall time: 1.03 s


In [6]:
%%time
rows_to_drop = []
for i in singletons:
    rows_to_drop.append(aol_reduced_google[aol_reduced_google['RandID'] == i].index.to_list()[0])

CPU times: user 7min 34s, sys: 135 ms, total: 7min 34s
Wall time: 7min 34s


In [7]:
display(len(rows_to_drop))

94047

In [8]:
aol_reduced_google = aol_reduced_google.drop(rows_to_drop)

In [9]:
aol_reduced_google = aol_reduced_google.reset_index(drop=True)

In [10]:
display(aol_reduced_google)

Unnamed: 0,RandID,QueryTime,Domain,topics
0,13,2006-03-10 11:51:57,imdb.com,"'12','45'"
1,13,2006-03-25 05:10:47,imdb.com,"'12','45'"
2,20,2006-04-07 21:13:02,imdb.com,"'12','45'"
3,21,2006-05-15 21:18:53,imdb.com,"'12','45'"
4,33,2006-05-20 02:17:44,imdb.com,"'12','45'"
...,...,...,...,...
2401224,515243,2006-05-24 21:46:51,computrabajo.com.pe,'236'
2401225,515761,2006-04-18 19:49:49,atv.com.tr,"'1','48'"
2401226,515761,2006-04-18 19:49:49,atv.com.tr,"'1','48'"
2401227,515761,2006-04-18 19:54:33,atv.com.tr,"'1','48'"


### Drop outlier

In [11]:
%%time
display(aol_reduced_google['RandID'].value_counts())

RandID
83937     18938
211028      930
443325      851
97578       771
146607      736
          ...  
280676        2
280813        2
280985        2
281645        2
468776        2
Name: count, Length: 230927, dtype: int64

CPU times: user 150 ms, sys: 8 ms, total: 158 ms
Wall time: 173 ms


In [12]:
aol_reduced_google = aol_reduced_google.drop(aol_reduced_google[aol_reduced_google['RandID'] == 83937].index.to_list())

In [13]:
aol_reduced_google = aol_reduced_google.reset_index(drop=True)

In [14]:
display(aol_reduced_google)

Unnamed: 0,RandID,QueryTime,Domain,topics
0,13,2006-03-10 11:51:57,imdb.com,"'12','45'"
1,13,2006-03-25 05:10:47,imdb.com,"'12','45'"
2,20,2006-04-07 21:13:02,imdb.com,"'12','45'"
3,21,2006-05-15 21:18:53,imdb.com,"'12','45'"
4,33,2006-05-20 02:17:44,imdb.com,"'12','45'"
...,...,...,...,...
2382286,515243,2006-05-24 21:46:51,computrabajo.com.pe,'236'
2382287,515761,2006-04-18 19:49:49,atv.com.tr,"'1','48'"
2382288,515761,2006-04-18 19:49:49,atv.com.tr,"'1','48'"
2382289,515761,2006-04-18 19:54:33,atv.com.tr,"'1','48'"


### Define browsing histories and lists of topics

In [15]:
%%time
d = {'RandID': [], 'BrowsingHistory': [], 'AllTopics': []}
for k, v in aol_reduced_google.groupby('RandID').__iter__():
    temp_history = []
    temp_topics = []
    for tup in v.itertuples():
        temp_history.append(tuple([tup.Domain, tup.QueryTime]))
        temp_topics.extend(tup.topics.split(","))
    d['BrowsingHistory'].append(str(temp_history))
    d['AllTopics'].append(temp_topics)
    d['RandID'].append(k)

CPU times: user 4min 44s, sys: 4.68 s, total: 4min 49s
Wall time: 4min 45s


In [16]:
%%time
aol_reduced_google_seen_topics = []
for case in d['AllTopics']:
    aol_reduced_google_seen_topics.extend(case)
aol_reduced_google_seen_topics = list(set(aol_reduced_google_seen_topics))
display(len(aol_reduced_google_seen_topics))

169

CPU times: user 398 ms, sys: 16 ms, total: 414 ms
Wall time: 433 ms


In [17]:
aol_reduced_google_generalization = pandas.DataFrame(data=d, columns=['RandID', 'BrowsingHistory', 'AllTopics'])

In [18]:
display(aol_reduced_google_generalization)

Unnamed: 0,RandID,BrowsingHistory,AllTopics
0,3,"[('amazon.com', Timestamp('2006-05-23 15:31:14...","['289', '179', '289']"
1,4,"[('microsoft.com', Timestamp('2006-03-29 22:32...","['129', '140', '145', '275', '129', '140', '14..."
2,7,"[('wikipedia.org', Timestamp('2006-03-20 10:15...","['275', '100', '254', '275', '100', '254', '275']"
3,10,"[('wikipedia.org', Timestamp('2006-03-06 21:22...","['275', '275', '289', '289', '275']"
4,12,"[('priceline.com', Timestamp('2006-04-04 21:17...","['332', '344', '219', '219', '219', '219', '21..."
...,...,...,...
230921,521685,"[('bbc.co.uk', Timestamp('2006-03-15 10:43:00'...","['243', '23', '289', '23', '289', '129', '183']"
230922,521686,"[('google.com', Timestamp('2006-03-04 10:02:30...","['219', '219', '219', '219', '219', '219', '21..."
230923,521687,"[('1001freefonts.com', Timestamp('2006-04-08 1...","['142', '142']"
230924,521689,"[('imdb.com', Timestamp('2006-03-29 00:35:14')...","['12', '45', '275', '219', '215', '217']"


In [19]:
aol_reduced_google_generalization['TopicsSet'] = aol_reduced_google_generalization['AllTopics']

In [20]:
%%time
for row in aol_reduced_google_generalization.itertuples():
    aol_reduced_google_generalization.at[row.Index, 'TopicsSet'] = sorted(set(aol_reduced_google_generalization.at[row.Index, 'TopicsSet']))

CPU times: user 34 s, sys: 136 ms, total: 34.1 s
Wall time: 34.2 s


In [21]:
display(aol_reduced_google_generalization)

Unnamed: 0,RandID,BrowsingHistory,AllTopics,TopicsSet
0,3,"[('amazon.com', Timestamp('2006-05-23 15:31:14...","['289', '179', '289']","['179', '289']"
1,4,"[('microsoft.com', Timestamp('2006-03-29 22:32...","['129', '140', '145', '275', '129', '140', '14...","['129', '140', '145', '275']"
2,7,"[('wikipedia.org', Timestamp('2006-03-20 10:15...","['275', '100', '254', '275', '100', '254', '275']","['100', '254', '275']"
3,10,"[('wikipedia.org', Timestamp('2006-03-06 21:22...","['275', '275', '289', '289', '275']","['275', '289']"
4,12,"[('priceline.com', Timestamp('2006-04-04 21:17...","['332', '344', '219', '219', '219', '219', '21...","['219', '248', '332', '344']"
...,...,...,...,...
230921,521685,"[('bbc.co.uk', Timestamp('2006-03-15 10:43:00'...","['243', '23', '289', '23', '289', '129', '183']","['129', '183', '23', '243', '289']"
230922,521686,"[('google.com', Timestamp('2006-03-04 10:02:30...","['219', '219', '219', '219', '219', '219', '21...","['219', '275']"
230923,521687,"[('1001freefonts.com', Timestamp('2006-04-08 1...","['142', '142']",['142']
230924,521689,"[('imdb.com', Timestamp('2006-03-29 00:35:14')...","['12', '45', '275', '219', '215', '217']","['12', '215', '217', '219', '275', '45']"


In [22]:
aol_reduced_google_generalization['sTopics'] = aol_reduced_google_generalization['AllTopics']

In [23]:
%%time
random.SystemRandom()
S = 5

aol_reduced_google_topics_by_users = {i : {t : 0 for t in aol_reduced_google_seen_topics} for i in aol_reduced_google_generalization['RandID'].unique()}

for row in aol_reduced_google_generalization.itertuples():
    for topic in row.AllTopics:
        aol_reduced_google_topics_by_users[row.RandID][topic] = aol_reduced_google_topics_by_users[row.RandID][topic] + 1

temp_top_S = {i : dict() for i in aol_reduced_google_generalization['RandID'].unique()}

for k,v in aol_reduced_google_topics_by_users.items():
    temp_dict = dict()
    for t,c in v.items():
        if c != 0:
            temp_dict[t] = c
    temp_dict = dict(sorted(temp_dict.items(), key=lambda item: item[1]))
    
    if len(temp_dict) <= S:
        temp_set = list({t for t in aol_reduced_google_seen_topics} - {k for k in temp_dict.keys()})
        
        while len(temp_dict) < S:
            topic = random.choice(temp_set)
            temp_set.remove(topic)
            temp_dict[topic] = 0
        
        temp_top_S[k].update(dict(sorted(temp_dict.items())))
    
    else:
        t,c = temp_dict.popitem()
        max_c = c
        final = dict()
        temp = {t : c}
        
        while len(final) + len(temp) < S:
            t,c = temp_dict.popitem()
            max_c = max(max_c, c)
            
            if max_c == c:
                temp[t] = c
            else:
                final.update(temp)
                temp.clear()
                temp[t] = c
        
        t,c = temp_dict.popitem()
        while (max_c == c) and (len(temp_dict) > 0):
            temp[t] = c
            t,c = temp_dict.popitem()
        
        temp_set = list({k for k in temp.keys()})
        
        while len(final) < S:
            topic = random.choice(temp_set)
            temp_set.remove(topic)
            final[topic] = temp[topic]
        
        temp_top_S[k].update(dict(sorted(final.items())))

CPU times: user 27.8 s, sys: 673 ms, total: 28.5 s
Wall time: 28.6 s


In [24]:
%%time
for row in aol_reduced_google_generalization.itertuples():
    aol_reduced_google_generalization.at[row.Index, 'sTopics'] = temp_top_S[row.RandID]

CPU times: user 12.6 s, sys: 3.97 ms, total: 12.6 s
Wall time: 12.7 s


In [25]:
display(aol_reduced_google_generalization)

Unnamed: 0,RandID,BrowsingHistory,AllTopics,TopicsSet,sTopics
0,3,"[('amazon.com', Timestamp('2006-05-23 15:31:14...","['289', '179', '289']","['179', '289']","{''179'': 1, ''247'': 0, ''248'': 0, ''289'': ..."
1,4,"[('microsoft.com', Timestamp('2006-03-29 22:32...","['129', '140', '145', '275', '129', '140', '14...","['129', '140', '145', '275']","{''129'': 2, ''140'': 2, ''145'': 2, ''275'': ..."
2,7,"[('wikipedia.org', Timestamp('2006-03-20 10:15...","['275', '100', '254', '275', '100', '254', '275']","['100', '254', '275']","{''100'': 2, ''234'': 0, ''254'': 2, ''275'': ..."
3,10,"[('wikipedia.org', Timestamp('2006-03-06 21:22...","['275', '275', '289', '289', '275']","['275', '289']","{''151'': 0, ''172'': 0, ''275'': 3, ''289'': ..."
4,12,"[('priceline.com', Timestamp('2006-04-04 21:17...","['332', '344', '219', '219', '219', '219', '21...","['219', '248', '332', '344']","{''219'': 6, ''234'': 0, ''248'': 1, ''332'': ..."
...,...,...,...,...,...
230921,521685,"[('bbc.co.uk', Timestamp('2006-03-15 10:43:00'...","['243', '23', '289', '23', '289', '129', '183']","['129', '183', '23', '243', '289']","{''129'': 1, ''183'': 1, ''23'': 2, ''243'': 1..."
230922,521686,"[('google.com', Timestamp('2006-03-04 10:02:30...","['219', '219', '219', '219', '219', '219', '21...","['219', '275']","{''108'': 0, ''219'': 10, ''275'': 2, ''279'':..."
230923,521687,"[('1001freefonts.com', Timestamp('2006-04-08 1...","['142', '142']",['142'],"{''142'': 2, ''166'': 0, ''241'': 0, ''299'': ..."
230924,521689,"[('imdb.com', Timestamp('2006-03-29 00:35:14')...","['12', '45', '275', '219', '215', '217']","['12', '215', '217', '219', '275', '45']","{''12'': 1, ''215'': 1, ''217'': 1, ''219'': 1..."


In [26]:
%%time
for row in aol_reduced_google_generalization.itertuples():
    for k,v in row.sTopics.items():
        if ((v == 0) and (k in row.AllTopics)) or ((v > 0) and (k not in row.AllTopics)):
            print("Error!")

CPU times: user 1.54 s, sys: 34 µs, total: 1.54 s
Wall time: 1.55 s


In [27]:
%%time
for row in aol_reduced_google_generalization.itertuples():
    aol_reduced_google_generalization.at[row.Index, 'sTopics'] = list(temp_top_S[row.RandID].keys())

CPU times: user 12.8 s, sys: 45 µs, total: 12.8 s
Wall time: 12.9 s


In [28]:
display(aol_reduced_google_generalization)

Unnamed: 0,RandID,BrowsingHistory,AllTopics,TopicsSet,sTopics
0,3,"[('amazon.com', Timestamp('2006-05-23 15:31:14...","['289', '179', '289']","['179', '289']","['179', '247', '248', '289', '3']"
1,4,"[('microsoft.com', Timestamp('2006-03-29 22:32...","['129', '140', '145', '275', '129', '140', '14...","['129', '140', '145', '275']","['129', '140', '145', '275', '58']"
2,7,"[('wikipedia.org', Timestamp('2006-03-20 10:15...","['275', '100', '254', '275', '100', '254', '275']","['100', '254', '275']","['100', '234', '254', '275', '322']"
3,10,"[('wikipedia.org', Timestamp('2006-03-06 21:22...","['275', '275', '289', '289', '275']","['275', '289']","['151', '172', '275', '289', '344']"
4,12,"[('priceline.com', Timestamp('2006-04-04 21:17...","['332', '344', '219', '219', '219', '219', '21...","['219', '248', '332', '344']","['219', '234', '248', '332', '344']"
...,...,...,...,...,...
230921,521685,"[('bbc.co.uk', Timestamp('2006-03-15 10:43:00'...","['243', '23', '289', '23', '289', '129', '183']","['129', '183', '23', '243', '289']","['129', '183', '23', '243', '289']"
230922,521686,"[('google.com', Timestamp('2006-03-04 10:02:30...","['219', '219', '219', '219', '219', '219', '21...","['219', '275']","['108', '219', '275', '279', '292']"
230923,521687,"[('1001freefonts.com', Timestamp('2006-04-08 1...","['142', '142']",['142'],"['142', '166', '241', '299', '315']"
230924,521689,"[('imdb.com', Timestamp('2006-03-29 00:35:14')...","['12', '45', '275', '219', '215', '217']","['12', '215', '217', '219', '275', '45']","['12', '215', '217', '219', '45']"


### Statistics

`RandID` statistics.

In [29]:
display(aol_reduced_google_generalization['RandID'].value_counts().describe([.25,.50,.75,.80,.85,.90,.95,.96,.97,.98,.99]))

count    230926.0
mean          1.0
std           0.0
min           1.0
25%           1.0
50%           1.0
75%           1.0
80%           1.0
85%           1.0
90%           1.0
95%           1.0
96%           1.0
97%           1.0
98%           1.0
99%           1.0
max           1.0
Name: count, dtype: float64

Number of `RandID`s per count of records, e.g. 1 `RandID` has 150802 records and 70406 `RandID`s have 1 record.

### Save to file

In [30]:
aol_reduced_google_generalization.to_csv('AOL-experimental-Google-Topics-Classification-v1.csv')