# AOL Experimental Dataset (Citizen Lab)

In [1]:
import pandas, random
from collections import Counter

## AOL reduced dataset with Citizen Lab Classification

In [2]:
%%time
# Reads the AOL-reduced-Citizen-Lab-Classification CSV into the aol_reduced_citizen_lab DataFrame.
# The first CSV column is used as the row index (index_col=0); malformed lines are reported
# with a warning instead of aborting the load (on_bad_lines='warn').
aol_reduced_citizen_lab = pandas.read_csv('AOL-reduced-Citizen-Lab-Classification.csv', low_memory=False, on_bad_lines='warn', index_col=0)

CPU times: user 3.94 s, sys: 486 ms, total: 4.43 s
Wall time: 4.4 s


In [3]:
%%time
aol_reduced_citizen_lab['QueryTime'] = pandas.to_datetime(aol_reduced_citizen_lab['QueryTime'])

CPU times: user 1.01 s, sys: 6.52 ms, total: 1.01 s
Wall time: 996 ms


In [4]:
display(aol_reduced_citizen_lab)

Unnamed: 0,RandID,QueryTime,Domain,topics
0,1,2006-03-01 11:54:19,kidshealth.org,'PUBH'
1,1,2006-03-01 11:54:19,kidshealth.org,'PUBH'
2,1,2006-03-01 11:54:19,kidshealth.org,'PUBH'
3,1,2006-03-01 12:10:38,harvard.edu,"'NEWS', 'CTRL', 'HUMR', 'PUBH'"
4,1,2006-03-08 21:16:04,harvard.edu,"'NEWS', 'CTRL', 'HUMR', 'PUBH'"
...,...,...,...,...
3135265,521689,2006-05-05 01:09:28,nih.gov,'GOVT'
3135266,521689,2006-05-05 02:00:33,typepad.com,"'HOST', 'NEWS'"
3135267,521691,2006-03-04 11:36:38,go.com,'NEWS'
3135268,521691,2006-03-15 19:24:17,google.com,"'COMT', 'MMED', 'CTRL', 'CULTR', 'NEWS', 'SRCH..."


### Drop singletons

In [5]:
%%time
# Collects the RandIDs whose browsing history contains exactly one distinct domain;
# all of their records are dropped in the next cell.
temp = aol_reduced_citizen_lab.groupby('RandID')[['Domain']].nunique()
singletons = temp.loc[temp['Domain'].eq(1)].index.to_list()
print(len(singletons))

130689
CPU times: user 778 ms, sys: 81.3 ms, total: 860 ms
Wall time: 858 ms


In [6]:
aol_reduced_citizen_lab = aol_reduced_citizen_lab[~aol_reduced_citizen_lab.RandID.isin(singletons)]

In [7]:
aol_reduced_citizen_lab = aol_reduced_citizen_lab.reset_index(drop=True)

In [8]:
display(aol_reduced_citizen_lab)

Unnamed: 0,RandID,QueryTime,Domain,topics
0,1,2006-03-01 11:54:19,kidshealth.org,'PUBH'
1,1,2006-03-01 11:54:19,kidshealth.org,'PUBH'
2,1,2006-03-01 11:54:19,kidshealth.org,'PUBH'
3,1,2006-03-01 12:10:38,harvard.edu,"'NEWS', 'CTRL', 'HUMR', 'PUBH'"
4,1,2006-03-08 21:16:04,harvard.edu,"'NEWS', 'CTRL', 'HUMR', 'PUBH'"
...,...,...,...,...
2871078,521689,2006-05-05 01:09:28,nih.gov,'GOVT'
2871079,521689,2006-05-05 02:00:33,typepad.com,"'HOST', 'NEWS'"
2871080,521691,2006-03-04 11:36:38,go.com,'NEWS'
2871081,521691,2006-03-15 19:24:17,google.com,"'COMT', 'MMED', 'CTRL', 'CULTR', 'NEWS', 'SRCH..."


### Drop outlier

In [9]:
%%time
# Shows the number of records per RandID, largest first. One individual has far more records
# than anyone else and is most probably a bot; it is dropped in the next cell.
display(aol_reduced_citizen_lab['RandID'].value_counts())

RandID
463921    23505
311717     2516
352479     1379
118679     1167
342660     1011
          ...  
317417        2
120680        2
448289        2
317341        2
193287        2
Name: count, Length: 211314, dtype: int64

CPU times: user 52.6 ms, sys: 8.21 ms, total: 60.8 ms
Wall time: 59.8 ms


In [10]:
# Keeps every record except those of RandID 463921, the outlier with the largest record count above.
aol_reduced_citizen_lab = aol_reduced_citizen_lab[aol_reduced_citizen_lab['RandID'] != 463921]

In [11]:
aol_reduced_citizen_lab = aol_reduced_citizen_lab.reset_index(drop=True)

In [12]:
display(aol_reduced_citizen_lab)

Unnamed: 0,RandID,QueryTime,Domain,topics
0,1,2006-03-01 11:54:19,kidshealth.org,'PUBH'
1,1,2006-03-01 11:54:19,kidshealth.org,'PUBH'
2,1,2006-03-01 11:54:19,kidshealth.org,'PUBH'
3,1,2006-03-01 12:10:38,harvard.edu,"'NEWS', 'CTRL', 'HUMR', 'PUBH'"
4,1,2006-03-08 21:16:04,harvard.edu,"'NEWS', 'CTRL', 'HUMR', 'PUBH'"
...,...,...,...,...
2847573,521689,2006-05-05 01:09:28,nih.gov,'GOVT'
2847574,521689,2006-05-05 02:00:33,typepad.com,"'HOST', 'NEWS'"
2847575,521691,2006-03-04 11:36:38,go.com,'NEWS'
2847576,521691,2006-03-15 19:24:17,google.com,"'COMT', 'MMED', 'CTRL', 'CULTR', 'NEWS', 'SRCH..."


### Statistics

In [13]:
%%time
aol_reduced_citizen_lab.info(verbose=True, memory_usage='deep', show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2847578 entries, 0 to 2847577
Data columns (total 4 columns):
 #   Column     Non-Null Count    Dtype         
---  ------     --------------    -----         
 0   RandID     2847578 non-null  int64         
 1   QueryTime  2847578 non-null  datetime64[ns]
 2   Domain     2847578 non-null  object        
 3   topics     2847578 non-null  object        
dtypes: datetime64[ns](1), int64(1), object(2)
memory usage: 471.9 MB
CPU times: user 1.13 s, sys: 5.95 ms, total: 1.14 s
Wall time: 1.13 s


Number of unique values per attribute.

In [14]:
display(aol_reduced_citizen_lab.nunique(dropna=False))

RandID        211313
QueryTime    1922950
Domain          4872
topics           217
dtype: int64

`RandID` statistics.

In [15]:
display(aol_reduced_citizen_lab['RandID'].value_counts().describe([.25,.50,.75,.80,.85,.90,.95,.96,.97,.98,.99]))

count    211313.000000
mean         13.475640
std          27.133604
min           2.000000
25%           3.000000
50%           6.000000
75%          13.000000
80%          17.000000
85%          21.000000
90%          29.000000
95%          47.000000
96%          53.000000
97%          64.000000
98%          80.000000
99%         116.000000
max        2516.000000
Name: count, dtype: float64

Number of `RandID`s per count of records, e.g. 1 `RandID` has 2516 records and 32488 `RandID`s have 2 records.

In [16]:
# Distribution of history sizes: the index is a record count, the value is how many RandIDs
# have exactly that many records. Rows are shown untruncated, largest record count first.
# (A single descending sort_index is enough; the original applied the same sort twice.)
with pandas.option_context('display.max_rows', None):
    display(aol_reduced_citizen_lab['RandID'].value_counts().value_counts().sort_index(ascending=False))

count
2516        1
1379        1
1167        1
1011        1
980         1
902         1
863         1
822         1
774         1
759         1
755         1
748         1
719         1
715         1
699         1
690         2
677         1
656         1
651         1
644         1
640         1
636         1
634         1
633         1
627         1
622         1
621         1
611         1
609         1
597         1
596         1
591         1
586         1
582         1
581         1
578         1
572         1
571         1
566         1
564         1
561         1
558         1
550         1
547         1
543         1
541         1
538         1
529         2
524         1
521         1
520         1
518         1
516         3
515         2
512         1
511         1
510         1
498         1
497         1
496         3
495         1
494         1
493         1
490         1
489         1
484         3
483         1
482         1
480         2
473         1
472         1


Date and time range.

In [17]:
display(aol_reduced_citizen_lab['QueryTime'].min())

Timestamp('2006-03-01 00:01:04')

In [18]:
display(aol_reduced_citizen_lab['QueryTime'].max())

Timestamp('2006-05-31 23:59:58')

Number of unique `Domain` values.

In [19]:
%%time
# Maps every Domain value to its number of records, then shows how many distinct domains there are.
aol_reduced_citizen_lab_domains_counts = dict(Counter(aol_reduced_citizen_lab['Domain']))
display(len(aol_reduced_citizen_lab_domains_counts))

4872

CPU times: user 184 ms, sys: 3.22 ms, total: 188 ms
Wall time: 185 ms


Top `Domain` values by number of records.

In [20]:
top = 100
with pandas.option_context('display.max_rows', top, 'display.min_rows', top):
    display(pandas.Series(aol_reduced_citizen_lab_domains_counts).sort_values(ascending=False).head(top))

yahoo.com                 386719
google.com                332319
myspace.com               203992
ebay.com                  149904
wikipedia.org             120136
amazon.com                101643
imdb.com                   99581
msn.com                    97027
go.com                     76901
craigslist.org             60366
aol.com                    58108
nih.gov                    46997
bankofamerica.com          42151
ask.com                    36234
tripod.com                 30720
azlyrics.com               22912
microsoft.com              22484
cnn.com                    21744
walmart.com                20942
angelfire.com              20459
nytimes.com                19314
pogo.com                   15549
bbc.co.uk                  15043
gamespot.com               13783
wellsfargo.com             13557
ign.com                    13354
army.mil                   12658
nbc.com                    12651
reference.com              11470
xanga.com                  11195
sapo.pt   

### Define browsing histories and lists of topics

In [21]:
%%time
aol_reduced_citizen_lab['BrowsingHistory'] = list(zip(aol_reduced_citizen_lab.Domain, aol_reduced_citizen_lab.QueryTime))

CPU times: user 8.42 s, sys: 309 ms, total: 8.73 s
Wall time: 8.71 s


In [22]:
display(aol_reduced_citizen_lab)

Unnamed: 0,RandID,QueryTime,Domain,topics,BrowsingHistory
0,1,2006-03-01 11:54:19,kidshealth.org,'PUBH',"(kidshealth.org, 2006-03-01 11:54:19)"
1,1,2006-03-01 11:54:19,kidshealth.org,'PUBH',"(kidshealth.org, 2006-03-01 11:54:19)"
2,1,2006-03-01 11:54:19,kidshealth.org,'PUBH',"(kidshealth.org, 2006-03-01 11:54:19)"
3,1,2006-03-01 12:10:38,harvard.edu,"'NEWS', 'CTRL', 'HUMR', 'PUBH'","(harvard.edu, 2006-03-01 12:10:38)"
4,1,2006-03-08 21:16:04,harvard.edu,"'NEWS', 'CTRL', 'HUMR', 'PUBH'","(harvard.edu, 2006-03-08 21:16:04)"
...,...,...,...,...,...
2847573,521689,2006-05-05 01:09:28,nih.gov,'GOVT',"(nih.gov, 2006-05-05 01:09:28)"
2847574,521689,2006-05-05 02:00:33,typepad.com,"'HOST', 'NEWS'","(typepad.com, 2006-05-05 02:00:33)"
2847575,521691,2006-03-04 11:36:38,go.com,'NEWS',"(go.com, 2006-03-04 11:36:38)"
2847576,521691,2006-03-15 19:24:17,google.com,"'COMT', 'MMED', 'CTRL', 'CULTR', 'NEWS', 'SRCH...","(google.com, 2006-03-15 19:24:17)"


In [23]:
%%time
# Turns each comma-separated topics string into a list of labels, after removing all spaces.
# NOTE(review): only spaces and commas are handled, so any quote characters present in the raw
# labels remain part of each label -- confirm this is what downstream consumers expect.
# NOTE: this cell overwrites its own input column; re-running it on the already-split lists
# raises AttributeError, so reload the data before executing it a second time.
aol_reduced_citizen_lab['topics'] = aol_reduced_citizen_lab['topics'].map(lambda raw : "".join(raw.split(" ")).split(","))

CPU times: user 4.93 s, sys: 601 ms, total: 5.53 s
Wall time: 5.51 s


In [24]:
display(aol_reduced_citizen_lab)

Unnamed: 0,RandID,QueryTime,Domain,topics,BrowsingHistory
0,1,2006-03-01 11:54:19,kidshealth.org,['PUBH'],"(kidshealth.org, 2006-03-01 11:54:19)"
1,1,2006-03-01 11:54:19,kidshealth.org,['PUBH'],"(kidshealth.org, 2006-03-01 11:54:19)"
2,1,2006-03-01 11:54:19,kidshealth.org,['PUBH'],"(kidshealth.org, 2006-03-01 11:54:19)"
3,1,2006-03-01 12:10:38,harvard.edu,"['NEWS', 'CTRL', 'HUMR', 'PUBH']","(harvard.edu, 2006-03-01 12:10:38)"
4,1,2006-03-08 21:16:04,harvard.edu,"['NEWS', 'CTRL', 'HUMR', 'PUBH']","(harvard.edu, 2006-03-08 21:16:04)"
...,...,...,...,...,...
2847573,521689,2006-05-05 01:09:28,nih.gov,['GOVT'],"(nih.gov, 2006-05-05 01:09:28)"
2847574,521689,2006-05-05 02:00:33,typepad.com,"['HOST', 'NEWS']","(typepad.com, 2006-05-05 02:00:33)"
2847575,521691,2006-03-04 11:36:38,go.com,['NEWS'],"(go.com, 2006-03-04 11:36:38)"
2847576,521691,2006-03-15 19:24:17,google.com,"['COMT', 'MMED', 'CTRL', 'CULTR', 'NEWS', 'SRC...","(google.com, 2006-03-15 19:24:17)"


In [25]:
%%time
# Builds one row per RandID (groups kept in order of first appearance, since sort=False) whose
# BrowsingHistory and topics cells are lists holding that RandID's per-record values.
aol_reduced_citizen_lab_generalization = (
    aol_reduced_citizen_lab
    .groupby('RandID', sort=False)[['BrowsingHistory', 'topics']]
    .agg(list)
    .reset_index()
)

CPU times: user 15.6 s, sys: 183 ms, total: 15.8 s
Wall time: 15.8 s


In [26]:
display(aol_reduced_citizen_lab_generalization)

Unnamed: 0,RandID,BrowsingHistory,topics
0,1,"[(kidshealth.org, 2006-03-01 11:54:19), (kidsh...","[['PUBH'], ['PUBH'], ['PUBH'], ['NEWS', 'CTRL'..."
1,3,"[(dogpile.com, 2006-03-12 16:09:37), (dogpile....","[['SRCH'], ['SRCH'], ['HOST', 'NEWS'], ['PUBH'..."
2,4,"[(foxnews.com, 2006-03-12 20:33:21), (google.c...","[['NEWS', 'REL'], ['COMT', 'MMED', 'CTRL', 'CU..."
3,5,"[(foxnews.com, 2006-03-22 00:22:53), (senate.g...","[['NEWS', 'REL'], ['GOVT'], ['SRCH', 'POLR', '..."
4,6,"[(technorati.com, 2006-03-03 20:48:41), (googl...","[['HOST'], ['COMT', 'MMED', 'CTRL', 'CULTR', '..."
...,...,...,...
211308,521674,"[(yahoo.com, 2006-03-06 10:04:29), (yahoo.com,...","[['COMT', 'CULTR', 'COMM', 'NEWS', 'POLR', 'SR..."
211309,521686,"[(flickr.com, 2006-03-12 21:19:11), (partypoke...","[['LGBT', 'MMED'], ['GMB'], ['HOST', 'GRP', 'C..."
211310,521688,"[(ytmnd.com, 2006-03-12 20:00:14), (aol.com, 2...","[['REL'], ['SRCH', 'POLR', 'HUMR', 'REL'], ['C..."
211311,521689,"[(globalwitness.org, 2006-03-04 01:04:05), (ms...","[['ENV', 'POLR'], ['NEWS', 'SRCH', 'GRP', 'COM..."


In [27]:
%%time
# Flattens each RandID's list of per-record topic lists into one flat list of labels
# (duplicates are kept, so a label appears once per record it was attached to).
aol_reduced_citizen_lab_generalization['topics'] = aol_reduced_citizen_lab_generalization['topics'].map(
    lambda per_record : [label for labels in per_record for label in labels]
)

CPU times: user 1.27 s, sys: 8.07 ms, total: 1.28 s
Wall time: 1.28 s


In [28]:
display(aol_reduced_citizen_lab_generalization)

Unnamed: 0,RandID,BrowsingHistory,topics
0,1,"[(kidshealth.org, 2006-03-01 11:54:19), (kidsh...","['PUBH', 'PUBH', 'PUBH', 'NEWS', 'CTRL', 'HUMR..."
1,3,"[(dogpile.com, 2006-03-12 16:09:37), (dogpile....","['SRCH', 'SRCH', 'HOST', 'NEWS', 'PUBH', 'COMT..."
2,4,"[(foxnews.com, 2006-03-12 20:33:21), (google.c...","['NEWS', 'REL', 'COMT', 'MMED', 'CTRL', 'CULTR..."
3,5,"[(foxnews.com, 2006-03-22 00:22:53), (senate.g...","['NEWS', 'REL', 'GOVT', 'SRCH', 'POLR', 'HUMR'..."
4,6,"[(technorati.com, 2006-03-03 20:48:41), (googl...","['HOST', 'COMT', 'MMED', 'CTRL', 'CULTR', 'NEW..."
...,...,...,...
211308,521674,"[(yahoo.com, 2006-03-06 10:04:29), (yahoo.com,...","['COMT', 'CULTR', 'COMM', 'NEWS', 'POLR', 'SRC..."
211309,521686,"[(flickr.com, 2006-03-12 21:19:11), (partypoke...","['LGBT', 'MMED', 'GMB', 'HOST', 'GRP', 'CULTR'..."
211310,521688,"[(ytmnd.com, 2006-03-12 20:00:14), (aol.com, 2...","['REL', 'SRCH', 'POLR', 'HUMR', 'REL', 'COMM']"
211311,521689,"[(globalwitness.org, 2006-03-04 01:04:05), (ms...","['ENV', 'POLR', 'NEWS', 'SRCH', 'GRP', 'COMT',..."


In [29]:
aol_reduced_citizen_lab_generalization = aol_reduced_citizen_lab_generalization.rename(columns={'topics' : 'AllTopics'})

In [30]:
%%time
# Gathers the distinct topic labels seen across all RandIDs (sorted), then shows how many there are.
aol_reduced_citizen_lab_seen_topics = sorted(
    {topic for topics in aol_reduced_citizen_lab_generalization['AllTopics'] for topic in topics}
)
display(len(aol_reduced_citizen_lab_seen_topics))

31

CPU times: user 1.09 s, sys: 43.6 ms, total: 1.13 s
Wall time: 1.13 s


In [31]:
%%time
# Tallies, per RandID, how often each topic occurs in AllTopics (plain topic -> count dictionary).
# The tally is stored in the 'sTopics' column for now; a later cell replaces it with the selected topics.
aol_reduced_citizen_lab_generalization['sTopics'] = (
    aol_reduced_citizen_lab_generalization['AllTopics'].map(Counter).map(dict)
)

CPU times: user 1.49 s, sys: 24.4 ms, total: 1.51 s
Wall time: 1.51 s


In [32]:
display(aol_reduced_citizen_lab_generalization)

Unnamed: 0,RandID,BrowsingHistory,AllTopics,sTopics
0,1,"[(kidshealth.org, 2006-03-01 11:54:19), (kidsh...","['PUBH', 'PUBH', 'PUBH', 'NEWS', 'CTRL', 'HUMR...","{''PUBH'': 6, ''NEWS'': 4, ''CTRL'': 3, ''HUMR..."
1,3,"[(dogpile.com, 2006-03-12 16:09:37), (dogpile....","['SRCH', 'SRCH', 'HOST', 'NEWS', 'PUBH', 'COMT...","{''SRCH'': 8, ''HOST'': 5, ''NEWS'': 6, ''PUBH..."
2,4,"[(foxnews.com, 2006-03-12 20:33:21), (google.c...","['NEWS', 'REL', 'COMT', 'MMED', 'CTRL', 'CULTR...","{''NEWS'': 2, ''REL'': 2, ''COMT'': 1, ''MMED'..."
3,5,"[(foxnews.com, 2006-03-22 00:22:53), (senate.g...","['NEWS', 'REL', 'GOVT', 'SRCH', 'POLR', 'HUMR'...","{''NEWS'': 1, ''REL'': 2, ''GOVT'': 3, ''SRCH'..."
4,6,"[(technorati.com, 2006-03-03 20:48:41), (googl...","['HOST', 'COMT', 'MMED', 'CTRL', 'CULTR', 'NEW...","{''HOST'': 11, ''COMT'': 19, ''MMED'': 10, ''C..."
...,...,...,...,...
211308,521674,"[(yahoo.com, 2006-03-06 10:04:29), (yahoo.com,...","['COMT', 'CULTR', 'COMM', 'NEWS', 'POLR', 'SRC...","{''COMT'': 7, ''CULTR'': 4, ''COMM'': 7, ''NEW..."
211309,521686,"[(flickr.com, 2006-03-12 21:19:11), (partypoke...","['LGBT', 'MMED', 'GMB', 'HOST', 'GRP', 'CULTR'...","{''LGBT'': 3, ''MMED'': 2, ''GMB'': 1, ''HOST'..."
211310,521688,"[(ytmnd.com, 2006-03-12 20:00:14), (aol.com, 2...","['REL', 'SRCH', 'POLR', 'HUMR', 'REL', 'COMM']","{''REL'': 2, ''SRCH'': 1, ''POLR'': 1, ''HUMR'..."
211311,521689,"[(globalwitness.org, 2006-03-04 01:04:05), (ms...","['ENV', 'POLR', 'NEWS', 'SRCH', 'GRP', 'COMT',...","{''ENV'': 6, ''POLR'': 6, ''NEWS'': 12, ''SRCH..."


In [33]:
def s_top(topics, s, allTopics, rng=None):
    """Select exactly ``s`` topics for one individual.

    Parameters
    ----------
    topics : dict
        Maps each topic seen by the individual to its number of occurrences.
        The dictionary is not modified.
    s : int
        Number of topics to return; must be non-negative.
    allTopics : iterable
        The taxonomy used to pad the selection when fewer than ``s`` topics were seen.
    rng : object with a ``sample(population, k)`` method, optional
        Source of randomness (e.g. ``random.Random(seed)`` or ``random.SystemRandom()``).
        Defaults to the module-level ``random`` generator.

    Returns
    -------
    list
        ``s`` topics in sorted order:
        * exactly ``s`` seen topics -> those topics;
        * fewer than ``s`` -> the seen topics plus topics drawn at random from the rest of the taxonomy;
        * more than ``s`` -> the ``s`` most frequent topics, where topics tied on the last qualifying
          count are drawn at random among themselves.

    Raises
    ------
    ValueError
        If ``s`` is negative, or if the taxonomy has too few unseen topics to pad up to ``s``.
    """
    if s < 0:
        raise ValueError(f"s must be non-negative, got {s}")
    sampler = random if rng is None else rng

    chosen = list(topics)
    if len(chosen) <= s:
        missing = s - len(chosen)
        if missing:
            # Sorting gives the candidate pool a stable order; iterating a set of strings directly
            # depends on hash randomisation, which would make the draw differ between interpreter
            # runs even when the generator is seeded.
            pool = sorted(set(allTopics) - set(chosen))
            if missing > len(pool):
                raise ValueError(
                    f"cannot pad to {s} topics: only {len(pool)} unseen topics are available in the taxonomy"
                )
            chosen.extend(sampler.sample(pool, k=missing))
        return sorted(chosen)

    # From here on the individual has seen more than s topics.
    if s == 0:
        return []

    # Stable sort: topics with equal counts keep their insertion order, which is the order in which
    # repeated max() extraction would have visited them.
    ranked = sorted(topics, key=topics.get, reverse=True)
    # Count of the s-th ranked topic: everything strictly above it is selected for sure,
    # everything equal to it competes for the remaining slots.
    cutoff = topics[ranked[s - 1]]
    chosen = [topic for topic in ranked if topics[topic] > cutoff]
    tied = [topic for topic in ranked if topics[topic] == cutoff]
    open_slots = s - len(chosen)
    if len(tied) == open_slots:
        # The tie group fits exactly; no random draw is needed (and none is consumed).
        chosen.extend(tied)
    else:
        chosen.extend(sampler.sample(tied, k=open_slots))

    return sorted(chosen)

In [34]:
%%time
# Selects S topics to be in sTopics list for each RandID.
# The former bare `random.SystemRandom()` statement built a generator object and discarded it, so it
# never affected the module-level `random.sample` calls made inside s_top. The module-level
# generator is seeded explicitly instead, so the random draws can be repeated (to the extent that
# s_top presents its candidates in a stable order).
SEED = 42
random.seed(SEED)
S = 5
# The topic -> count tallies are rebuilt from AllTopics rather than read from the 'sTopics' column:
# this cell overwrites 'sTopics', so reading it back would make a second execution fail.
aol_reduced_citizen_lab_generalization['sTopics'] = aol_reduced_citizen_lab_generalization['AllTopics'].map(lambda x : s_top(dict(Counter(x)), S, aol_reduced_citizen_lab_seen_topics))

CPU times: user 5.01 s, sys: 20 ms, total: 5.03 s
Wall time: 5.03 s


In [35]:
display(aol_reduced_citizen_lab_generalization)

Unnamed: 0,RandID,BrowsingHistory,AllTopics,sTopics
0,1,"[(kidshealth.org, 2006-03-01 11:54:19), (kidsh...","['PUBH', 'PUBH', 'PUBH', 'NEWS', 'CTRL', 'HUMR...","['COMM', 'CTRL', 'HUMR', 'NEWS', 'PUBH']"
1,3,"[(dogpile.com, 2006-03-12 16:09:37), (dogpile....","['SRCH', 'SRCH', 'HOST', 'NEWS', 'PUBH', 'COMT...","['COMT', 'HOST', 'NEWS', 'PUBH', 'SRCH']"
2,4,"[(foxnews.com, 2006-03-12 20:33:21), (google.c...","['NEWS', 'REL', 'COMT', 'MMED', 'CTRL', 'CULTR...","['ENV', 'LGBT', 'NEWS', 'REL', 'SRCH']"
3,5,"[(foxnews.com, 2006-03-22 00:22:53), (senate.g...","['NEWS', 'REL', 'GOVT', 'SRCH', 'POLR', 'HUMR'...","['GOVT', 'HUMR', 'NEWS', 'POLR', 'REL']"
4,6,"[(technorati.com, 2006-03-03 20:48:41), (googl...","['HOST', 'COMT', 'MMED', 'CTRL', 'CULTR', 'NEW...","['COMT', 'CULTR', 'GOVT', 'NEWS', 'POLR']"
...,...,...,...,...
211308,521674,"[(yahoo.com, 2006-03-06 10:04:29), (yahoo.com,...","['COMT', 'CULTR', 'COMM', 'NEWS', 'POLR', 'SRC...","['COMM', 'COMT', 'GOVT', 'NEWS', 'SRCH']"
211309,521686,"[(flickr.com, 2006-03-12 21:19:11), (partypoke...","['LGBT', 'MMED', 'GMB', 'HOST', 'GRP', 'CULTR'...","['CULTR', 'GOVT', 'HOST', 'LGBT', 'NEWS']"
211310,521688,"[(ytmnd.com, 2006-03-12 20:00:14), (aol.com, 2...","['REL', 'SRCH', 'POLR', 'HUMR', 'REL', 'COMM']","['COMM', 'HUMR', 'POLR', 'REL', 'SRCH']"
211311,521689,"[(globalwitness.org, 2006-03-04 01:04:05), (ms...","['ENV', 'POLR', 'NEWS', 'SRCH', 'GRP', 'COMT',...","['COMT', 'CTRL', 'GOVT', 'GRP', 'NEWS']"


In [36]:
# Verifies that every sTopics list holds exactly S topics (displays True when they all do).
display(aol_reduced_citizen_lab_generalization['sTopics'].map(len).eq(S).all())

True

### Save to file

In [37]:
# Writes the per-RandID table (RandID, BrowsingHistory, AllTopics, sTopics) to CSV.
# No index argument is given, so the DataFrame's integer index is written as the first column.
# NOTE(review): the list-valued columns end up as text in the CSV; readers presumably need to
# parse them back into lists -- confirm against the consuming notebooks.
aol_reduced_citizen_lab_generalization.to_csv('AOL-experimental-Citizen-Lab-Classification.csv')