# AOL Experimental Dataset (Citizen Lab)

In [1]:
import pandas, random

## AOL reduced dataset with Citizen Lab Classification

In [2]:
%%time
aol_reduced_citizen_lab = pandas.read_csv('AOL-reduced-Citizen-Lab-Classification.csv', low_memory=False, on_bad_lines='warn', index_col=0)

CPU times: user 5.71 s, sys: 382 ms, total: 6.09 s
Wall time: 6.1 s


In [3]:
%%time
aol_reduced_citizen_lab['QueryTime'] = pandas.to_datetime(aol_reduced_citizen_lab['QueryTime'])

CPU times: user 2.28 s, sys: 2.11 ms, total: 2.29 s
Wall time: 2.27 s


In [4]:
# Preview the per-query frame right after loading and timestamp parsing.
display(aol_reduced_citizen_lab)

Unnamed: 0,RandID,QueryTime,Domain,topics
0,0,2006-04-20 17:37:26,imdb.com,'CULTR'
1,13,2006-03-10 11:51:57,imdb.com,'CULTR'
2,13,2006-03-25 05:10:47,imdb.com,'CULTR'
3,20,2006-04-07 21:13:02,imdb.com,'CULTR'
4,21,2006-05-15 21:18:53,imdb.com,'CULTR'
...,...,...,...,...
3128280,519201,2006-03-21 18:18:21,eloccidental.com.mx,'NEWS'
3128281,520267,2006-03-26 18:55:26,chuavietnam.com,'REL'
3128282,520460,2006-05-10 00:35:54,onlybingo.com,'GMB'
3128283,521343,2006-04-04 02:56:55,republika.co.id,'NEWS'


### Drop singletons

In [5]:
%%time
singletons = (aol_reduced_citizen_lab[['RandID','Domain']].groupby('RandID').count() == 1).index[(aol_reduced_citizen_lab[['RandID','Domain']].groupby('RandID').count() == 1)['Domain'].to_list()]

CPU times: user 1.32 s, sys: 34.7 ms, total: 1.36 s
Wall time: 1.38 s


In [6]:
%%time
rows_to_drop = []
for i in singletons:
    rows_to_drop.append(aol_reduced_citizen_lab[aol_reduced_citizen_lab['RandID'] == i].index.to_list()[0])

CPU times: user 9min, sys: 154 ms, total: 9min
Wall time: 9min


In [7]:
# Number of row labels queued for removal.
display(len(rows_to_drop))

93659

In [8]:
# Remove the collected singleton rows by index label.
aol_reduced_citizen_lab = aol_reduced_citizen_lab.drop(index=rows_to_drop)

In [9]:
# Renumber the rows from 0 after the drop; the old index labels are discarded.
aol_reduced_citizen_lab = aol_reduced_citizen_lab.reset_index(drop=True)

In [10]:
# Preview the frame after the singleton rows were removed and the index reset.
display(aol_reduced_citizen_lab)

Unnamed: 0,RandID,QueryTime,Domain,topics
0,0,2006-04-20 17:37:26,imdb.com,'CULTR'
1,13,2006-03-10 11:51:57,imdb.com,'CULTR'
2,13,2006-03-25 05:10:47,imdb.com,'CULTR'
3,20,2006-04-07 21:13:02,imdb.com,'CULTR'
4,21,2006-05-15 21:18:53,imdb.com,'CULTR'
...,...,...,...,...
3034621,519201,2006-03-21 18:18:21,eloccidental.com.mx,'NEWS'
3034622,520267,2006-03-26 18:55:26,chuavietnam.com,'REL'
3034623,520460,2006-05-10 00:35:54,onlybingo.com,'GMB'
3034624,521343,2006-04-04 02:56:55,republika.co.id,'NEWS'


### Drop outlier

In [11]:
%%time
# Number of rows per RandID; used to spot the heavy outlier removed in the next cell.
display(aol_reduced_citizen_lab['RandID'].value_counts())

RandID
83937     23437
75099      2516
25702      1379
43358      1167
443325     1011
          ...  
63719         2
63685         2
63490         2
63485         2
133897        2
Name: count, Length: 247987, dtype: int64

CPU times: user 149 ms, sys: 8 ms, total: 157 ms
Wall time: 181 ms


In [12]:
# Remove every row of RandID 83937, the user with by far the most records.
outlier_rows = aol_reduced_citizen_lab.index[aol_reduced_citizen_lab['RandID'] == 83937]
aol_reduced_citizen_lab = aol_reduced_citizen_lab.drop(index=outlier_rows)

In [13]:
# Renumber the rows from 0 after the outlier removal; old labels are discarded.
aol_reduced_citizen_lab = aol_reduced_citizen_lab.reset_index(drop=True)

In [14]:
# Preview the frame after the outlier user was removed and the index reset.
display(aol_reduced_citizen_lab)

Unnamed: 0,RandID,QueryTime,Domain,topics
0,0,2006-04-20 17:37:26,imdb.com,'CULTR'
1,13,2006-03-10 11:51:57,imdb.com,'CULTR'
2,13,2006-03-25 05:10:47,imdb.com,'CULTR'
3,20,2006-04-07 21:13:02,imdb.com,'CULTR'
4,21,2006-05-15 21:18:53,imdb.com,'CULTR'
...,...,...,...,...
3011184,519201,2006-03-21 18:18:21,eloccidental.com.mx,'NEWS'
3011185,520267,2006-03-26 18:55:26,chuavietnam.com,'REL'
3011186,520460,2006-05-10 00:35:54,onlybingo.com,'GMB'
3011187,521343,2006-04-04 02:56:55,republika.co.id,'NEWS'


### Define browsing histories and lists of topics

In [15]:
%%time
d = {'RandID': [], 'BrowsingHistory': [], 'AllTopics': []}
for k, v in aol_reduced_citizen_lab.groupby('RandID').__iter__():
    temp_history = []
    temp_topics = []
    for tup in v.itertuples():
        temp_history.append(tuple([tup.Domain, tup.QueryTime]))
        temp_topics.extend(tup.topics.replace(" ", "").split(","))
    d['BrowsingHistory'].append(str(temp_history))
    d['AllTopics'].append(temp_topics)
    d['RandID'].append(k)

CPU times: user 5min 9s, sys: 5.6 s, total: 5min 14s
Wall time: 5min 11s


In [16]:
%%time
aol_reduced_citizen_lab_seen_topics = []
for case in d['AllTopics']:
    aol_reduced_citizen_lab_seen_topics.extend(case)
aol_reduced_citizen_lab_seen_topics = list(set(aol_reduced_citizen_lab_seen_topics))
display(len(aol_reduced_citizen_lab_seen_topics))

31

CPU times: user 1.42 s, sys: 28.1 ms, total: 1.44 s
Wall time: 1.46 s


In [17]:
# One row per RandID, built from the lists accumulated in `d`.
aol_reduced_citizen_lab_generalization = pandas.DataFrame(
    d,
    columns=['RandID', 'BrowsingHistory', 'AllTopics'],
)

In [18]:
# Preview the per-user frame (RandID, BrowsingHistory, AllTopics).
display(aol_reduced_citizen_lab_generalization)

Unnamed: 0,RandID,BrowsingHistory,AllTopics
0,0,"[('imdb.com', Timestamp('2006-04-20 17:37:26')...","['CULTR', 'CULTR', 'GOVT', 'POLR', 'SRCH', 'CO..."
1,3,"[('myspace.com', Timestamp('2006-05-20 00:35:5...","['GRP', 'HOST', 'CULTR', 'COMM', 'COMM', 'ALDR']"
2,4,"[('microsoft.com', Timestamp('2006-03-29 22:32...","['COMT', 'FILE', 'COMM', 'COMT', 'FILE', 'COMM']"
3,10,"[('wikipedia.org', Timestamp('2006-03-06 21:22...","['CULTR', 'HUMR', 'POLR', 'SRCH', 'PUBH', 'MIL..."
4,12,"[('yahoo.com', Timestamp('2006-04-19 17:14:24'...","['CULTR', 'GOVT', 'POLR', 'SRCH', 'COMT', 'COM..."
...,...,...,...
247981,521683,"[('wellsfargo.com', Timestamp('2006-05-05 19:4...","['COMM', 'COMT', 'GRP', 'SRCH', 'NEWS']"
247982,521685,"[('bbc.co.uk', Timestamp('2006-03-15 10:43:00'...","['POLR', 'MILX', 'HUMR', 'NEWS', 'CULTR', 'CUL..."
247983,521686,"[('yahoo.com', Timestamp('2006-03-09 18:44:13'...","['CULTR', 'GOVT', 'POLR', 'SRCH', 'COMT', 'COM..."
247984,521689,"[('imdb.com', Timestamp('2006-03-29 00:35:14')...","['CULTR', 'CULTR', 'HUMR', 'POLR', 'SRCH', 'PU..."


In [19]:
# Start TopicsSet from the raw topic lists; the following cell replaces each
# entry with its sorted, de-duplicated form.
aol_reduced_citizen_lab_generalization['TopicsSet'] = aol_reduced_citizen_lab_generalization['AllTopics']

In [20]:
%%time
for row in aol_reduced_citizen_lab_generalization.itertuples():
    aol_reduced_citizen_lab_generalization.at[row.Index, 'TopicsSet'] = sorted(set(aol_reduced_citizen_lab_generalization.at[row.Index, 'TopicsSet']))

CPU times: user 38.1 s, sys: 192 ms, total: 38.3 s
Wall time: 38.4 s


In [21]:
# Preview the per-user frame with the new TopicsSet column.
display(aol_reduced_citizen_lab_generalization)

Unnamed: 0,RandID,BrowsingHistory,AllTopics,TopicsSet
0,0,"[('imdb.com', Timestamp('2006-04-20 17:37:26')...","['CULTR', 'CULTR', 'GOVT', 'POLR', 'SRCH', 'CO...","['COMM', 'COMT', 'CULTR', 'GOVT', 'NEWS', 'POL..."
1,3,"[('myspace.com', Timestamp('2006-05-20 00:35:5...","['GRP', 'HOST', 'CULTR', 'COMM', 'COMM', 'ALDR']","['ALDR', 'COMM', 'CULTR', 'GRP', 'HOST']"
2,4,"[('microsoft.com', Timestamp('2006-03-29 22:32...","['COMT', 'FILE', 'COMM', 'COMT', 'FILE', 'COMM']","['COMM', 'COMT', 'FILE']"
3,10,"[('wikipedia.org', Timestamp('2006-03-06 21:22...","['CULTR', 'HUMR', 'POLR', 'SRCH', 'PUBH', 'MIL...","['COMM', 'CULTR', 'HUMR', 'LGBT', 'MILX', 'NEW..."
4,12,"[('yahoo.com', Timestamp('2006-04-19 17:14:24'...","['CULTR', 'GOVT', 'POLR', 'SRCH', 'COMT', 'COM...","['COMM', 'COMT', 'CTRL', 'CULTR', 'DATE', 'ENV..."
...,...,...,...,...
247981,521683,"[('wellsfargo.com', Timestamp('2006-05-05 19:4...","['COMM', 'COMT', 'GRP', 'SRCH', 'NEWS']","['COMM', 'COMT', 'GRP', 'NEWS', 'SRCH']"
247982,521685,"[('bbc.co.uk', Timestamp('2006-03-15 10:43:00'...","['POLR', 'MILX', 'HUMR', 'NEWS', 'CULTR', 'CUL...","['CULTR', 'GAME', 'HUMR', 'MILX', 'NEWS', 'POLR']"
247983,521686,"[('yahoo.com', Timestamp('2006-03-09 18:44:13'...","['CULTR', 'GOVT', 'POLR', 'SRCH', 'COMT', 'COM...","['COMM', 'COMT', 'CTRL', 'CULTR', 'DATE', 'ENV..."
247984,521689,"[('imdb.com', Timestamp('2006-03-29 00:35:14')...","['CULTR', 'CULTR', 'HUMR', 'POLR', 'SRCH', 'PU...","['COMT', 'CTRL', 'CULTR', 'DATE', 'ENV', 'FILE..."


In [22]:
# Placeholder values for sTopics; later cells overwrite every entry with the
# per-user selection stored in `temp_top_S`.
aol_reduced_citizen_lab_generalization['sTopics'] = aol_reduced_citizen_lab_generalization['AllTopics']

In [23]:
%%time
random.SystemRandom()
S = 5

aol_reduced_citizen_lab_topics_by_users = {i : {t : 0 for t in aol_reduced_citizen_lab_seen_topics} for i in aol_reduced_citizen_lab_generalization['RandID'].unique()}

for row in aol_reduced_citizen_lab_generalization.itertuples():
    for topic in row.AllTopics:
        aol_reduced_citizen_lab_topics_by_users[row.RandID][topic] = aol_reduced_citizen_lab_topics_by_users[row.RandID][topic] + 1

temp_top_S = {i : dict() for i in aol_reduced_citizen_lab_generalization['RandID'].unique()}

for k,v in aol_reduced_citizen_lab_topics_by_users.items():
    temp_dict = dict()
    for t,c in v.items():
        if c != 0:
            temp_dict[t] = c
    temp_dict = dict(sorted(temp_dict.items(), key=lambda item: item[1]))
    
    if len(temp_dict) <= S:
        temp_set = list({t for t in aol_reduced_citizen_lab_seen_topics} - {k for k in temp_dict.keys()})
        
        while len(temp_dict) < S:
            topic = random.choice(temp_set)
            temp_set.remove(topic)
            temp_dict[topic] = 0
        
        temp_top_S[k].update(dict(sorted(temp_dict.items())))
    
    else:
        t,c = temp_dict.popitem()
        max_c = c
        final = dict()
        temp = {t : c}
        
        while len(final) + len(temp) < S:
            t,c = temp_dict.popitem()
            max_c = max(max_c, c)
            
            if max_c == c:
                temp[t] = c
            else:
                final.update(temp)
                temp.clear()
                temp[t] = c
        
        t,c = temp_dict.popitem()
        while (max_c == c) and (len(temp_dict) > 0):
            temp[t] = c
            t,c = temp_dict.popitem()
        
        temp_set = list({k for k in temp.keys()})
        
        while len(final) < S:
            topic = random.choice(temp_set)
            temp_set.remove(topic)
            final[topic] = temp[topic]
        
        temp_top_S[k].update(dict(sorted(final.items())))

CPU times: user 23.9 s, sys: 164 ms, total: 24 s
Wall time: 24.1 s


In [24]:
%%time
for row in aol_reduced_citizen_lab_generalization.itertuples():
    aol_reduced_citizen_lab_generalization.at[row.Index, 'sTopics'] = temp_top_S[row.RandID]

CPU times: user 13.7 s, sys: 20 ms, total: 13.7 s
Wall time: 13.8 s


In [25]:
# Preview the per-user frame with sTopics holding {topic: count} dicts.
display(aol_reduced_citizen_lab_generalization)

Unnamed: 0,RandID,BrowsingHistory,AllTopics,TopicsSet,sTopics
0,0,"[('imdb.com', Timestamp('2006-04-20 17:37:26')...","['CULTR', 'CULTR', 'GOVT', 'POLR', 'SRCH', 'CO...","['COMM', 'COMT', 'CULTR', 'GOVT', 'NEWS', 'POL...","{''COMM'': 4, ''CULTR'': 5, ''GOVT'': 4, ''NEW..."
1,3,"[('myspace.com', Timestamp('2006-05-20 00:35:5...","['GRP', 'HOST', 'CULTR', 'COMM', 'COMM', 'ALDR']","['ALDR', 'COMM', 'CULTR', 'GRP', 'HOST']","{''ALDR'': 1, ''COMM'': 2, ''CULTR'': 1, ''GRP..."
2,4,"[('microsoft.com', Timestamp('2006-03-29 22:32...","['COMT', 'FILE', 'COMM', 'COMT', 'FILE', 'COMM']","['COMM', 'COMT', 'FILE']","{''COMM'': 2, ''COMT'': 2, ''FILE'': 2, ''GOVT..."
3,10,"[('wikipedia.org', Timestamp('2006-03-06 21:22...","['CULTR', 'HUMR', 'POLR', 'SRCH', 'PUBH', 'MIL...","['COMM', 'CULTR', 'HUMR', 'LGBT', 'MILX', 'NEW...","{''COMM'': 2, ''CULTR'': 3, ''LGBT'': 2, ''NEW..."
4,12,"[('yahoo.com', Timestamp('2006-04-19 17:14:24'...","['CULTR', 'GOVT', 'POLR', 'SRCH', 'COMT', 'COM...","['COMM', 'COMT', 'CTRL', 'CULTR', 'DATE', 'ENV...","{''CULTR'': 17, ''GRP'': 13, ''HOST'': 16, ''N..."
...,...,...,...,...,...
247981,521683,"[('wellsfargo.com', Timestamp('2006-05-05 19:4...","['COMM', 'COMT', 'GRP', 'SRCH', 'NEWS']","['COMM', 'COMT', 'GRP', 'NEWS', 'SRCH']","{''COMM'': 1, ''COMT'': 1, ''GRP'': 1, ''NEWS'..."
247982,521685,"[('bbc.co.uk', Timestamp('2006-03-15 10:43:00'...","['POLR', 'MILX', 'HUMR', 'NEWS', 'CULTR', 'CUL...","['CULTR', 'GAME', 'HUMR', 'MILX', 'NEWS', 'POLR']","{''CULTR'': 2, ''HUMR'': 1, ''MILX'': 1, ''NEW..."
247983,521686,"[('yahoo.com', Timestamp('2006-03-09 18:44:13'...","['CULTR', 'GOVT', 'POLR', 'SRCH', 'COMT', 'COM...","['COMM', 'COMT', 'CTRL', 'CULTR', 'DATE', 'ENV...","{''CULTR'': 14, ''GOVT'': 14, ''NEWS'': 14, ''..."
247984,521689,"[('imdb.com', Timestamp('2006-03-29 00:35:14')...","['CULTR', 'CULTR', 'HUMR', 'POLR', 'SRCH', 'PU...","['COMT', 'CTRL', 'CULTR', 'DATE', 'ENV', 'FILE...","{''CULTR'': 5, ''DATE'': 3, ''HUMR'': 3, ''NEW..."


In [26]:
%%time
for row in aol_reduced_citizen_lab_generalization.itertuples():
    for k,v in row.sTopics.items():
        if ((v == 0) and (k in row.AllTopics)) or ((v > 0) and (k not in row.AllTopics)):
            print("Error!")

CPU times: user 1.69 s, sys: 61 µs, total: 1.69 s
Wall time: 1.7 s


In [27]:
%%time
for row in aol_reduced_citizen_lab_generalization.itertuples():
    aol_reduced_citizen_lab_generalization.at[row.Index, 'sTopics'] = list(temp_top_S[row.RandID].keys())

CPU times: user 15 s, sys: 3.97 ms, total: 15 s
Wall time: 15 s


In [28]:
# Preview the per-user frame with sTopics reduced to lists of labels.
display(aol_reduced_citizen_lab_generalization)

Unnamed: 0,RandID,BrowsingHistory,AllTopics,TopicsSet,sTopics
0,0,"[('imdb.com', Timestamp('2006-04-20 17:37:26')...","['CULTR', 'CULTR', 'GOVT', 'POLR', 'SRCH', 'CO...","['COMM', 'COMT', 'CULTR', 'GOVT', 'NEWS', 'POL...","['COMM', 'CULTR', 'GOVT', 'NEWS', 'SRCH']"
1,3,"[('myspace.com', Timestamp('2006-05-20 00:35:5...","['GRP', 'HOST', 'CULTR', 'COMM', 'COMM', 'ALDR']","['ALDR', 'COMM', 'CULTR', 'GRP', 'HOST']","['ALDR', 'COMM', 'CULTR', 'GRP', 'HOST']"
2,4,"[('microsoft.com', Timestamp('2006-03-29 22:32...","['COMT', 'FILE', 'COMM', 'COMT', 'FILE', 'COMM']","['COMM', 'COMT', 'FILE']","['COMM', 'COMT', 'FILE', 'GOVT', 'HACK']"
3,10,"[('wikipedia.org', Timestamp('2006-03-06 21:22...","['CULTR', 'HUMR', 'POLR', 'SRCH', 'PUBH', 'MIL...","['COMM', 'CULTR', 'HUMR', 'LGBT', 'MILX', 'NEW...","['COMM', 'CULTR', 'LGBT', 'NEWS', 'SRCH']"
4,12,"[('yahoo.com', Timestamp('2006-04-19 17:14:24'...","['CULTR', 'GOVT', 'POLR', 'SRCH', 'COMT', 'COM...","['COMM', 'COMT', 'CTRL', 'CULTR', 'DATE', 'ENV...","['CULTR', 'GRP', 'HOST', 'NEWS', 'SRCH']"
...,...,...,...,...,...
247981,521683,"[('wellsfargo.com', Timestamp('2006-05-05 19:4...","['COMM', 'COMT', 'GRP', 'SRCH', 'NEWS']","['COMM', 'COMT', 'GRP', 'NEWS', 'SRCH']","['COMM', 'COMT', 'GRP', 'NEWS', 'SRCH']"
247982,521685,"[('bbc.co.uk', Timestamp('2006-03-15 10:43:00'...","['POLR', 'MILX', 'HUMR', 'NEWS', 'CULTR', 'CUL...","['CULTR', 'GAME', 'HUMR', 'MILX', 'NEWS', 'POLR']","['CULTR', 'HUMR', 'MILX', 'NEWS', 'POLR']"
247983,521686,"[('yahoo.com', Timestamp('2006-03-09 18:44:13'...","['CULTR', 'GOVT', 'POLR', 'SRCH', 'COMT', 'COM...","['COMM', 'COMT', 'CTRL', 'CULTR', 'DATE', 'ENV...","['CULTR', 'GOVT', 'NEWS', 'POLR', 'SRCH']"
247984,521689,"[('imdb.com', Timestamp('2006-03-29 00:35:14')...","['CULTR', 'CULTR', 'HUMR', 'POLR', 'SRCH', 'PU...","['COMT', 'CTRL', 'CULTR', 'DATE', 'ENV', 'FILE...","['CULTR', 'DATE', 'HUMR', 'NEWS', 'POLR']"


### Statistics

`RandID` statistics.

In [29]:
# Summary statistics (with extra percentiles) of how often each RandID occurs
# in the per-user frame.
# NOTE(review): this frame is built with one row per RandID, so every count is
# 1 and the summary is constant. If per-user record counts were intended, they
# would have to come from `aol_reduced_citizen_lab` instead - confirm intent.
display(aol_reduced_citizen_lab_generalization['RandID'].value_counts().describe([.25,.50,.75,.80,.85,.90,.95,.96,.97,.98,.99]))

count    247986.0
mean          1.0
std           0.0
min           1.0
25%           1.0
50%           1.0
75%           1.0
80%           1.0
85%           1.0
90%           1.0
95%           1.0
96%           1.0
97%           1.0
98%           1.0
99%           1.0
max           1.0
Name: count, dtype: float64

Number of `RandID`s per count of records, e.g. 1 `RandID` has 150802 records and 70406 `RandID`s have 1 record.

### Save to file

In [30]:
# Write the per-user frame (index included) to CSV in the working directory.
aol_reduced_citizen_lab_generalization.to_csv('AOL-experimental-Citizen-Lab-Classification.csv')