### Install and import libraries

In [1]:
import pickle
import pandas as pd

import nltk
# nltk.download('punkt')      # uncomment to download punkt

# !pip install -U textblob    # uncomment to install textblob
from textblob import TextBlob
from textblob.classifiers import NaiveBayesClassifier


### Read Pickled Data
This file reads and processes data that has previously been pickled by Parse_opp_115

In [2]:
# Choose one of several data files.
# NOTE(review): pickle.load can execute arbitrary code; only load files that
# were produced by a trusted run of the parsing step.
with open('Data/parsed-annotation-0.5.pk', 'rb') as pfile:
    # The `with` block closes the file on exit, so no explicit close() is needed.
    data = pickle.load(pfile)


### Data structure
`data[site][category][topic] =` pandas dataframe

### Count the number of classifications

In [3]:
# Tally how often each value occurs, keyed as features[category][topic][value].
features = {}

for site in data:
    site_annotations = data[site]
    for category in site_annotations:
        # Reuse the per-category dict if an earlier site already created it.
        topic_counts = features.setdefault(category, {})

        for topic in site_annotations[category]:
            value_counts = topic_counts.setdefault(topic, {})

            # One increment per dataframe row, counted under that row's "value".
            for index, rowdata in site_annotations[category][topic].iterrows():
                value = rowdata["value"]
                value_counts[value] = value_counts.get(value, 0) + 1
                    

### Print out a tree of categories, topics, and values
Every count is printed here, largest first; the cutoff of at least 100 classifications is applied later, when the feature map is read in.

In [4]:
# Build one "count;category;topic;value" line per distinct value and print
# them from the most frequent to the least frequent.
counted_lines = []

for category in sorted(features):
    for topic in sorted(features[category]):
        for value in sorted(features[category][topic]):
            n = features[category][topic][value]
            # Keep the numeric count next to the text so the ordering below
            # stays correct even when a count needs more than four digits
            # (a plain string sort would put "10000" before " 999").
            counted_lines.append(
                (n, "%4d;%s;%s;%s" % (n, category, topic, value))
            )

# `lines` keeps the formatted strings in category/topic/value order.
lines = [text for count, text in counted_lines]

# Sort on (count, text) in descending order; ties on the count fall back to
# the formatted text.
for count, line in sorted(counted_lines, reverse=True):
    # Call form of print works under both Python 2 and Python 3.
    print(line)


6907;First Party Collection/Use;Does/Does Not;Does
4600;First Party Collection/Use;User Type;Unspecified
4106;First Party Collection/Use;Choice Scope;Unspecified
3878;First Party Collection/Use;Choice Type;Unspecified
3841;Third Party Sharing/Collection;Does/Does Not;Does
3278;First Party Collection/Use;Identifiability;Unspecified
2912;First Party Collection/Use;Action First-Party;Unspecified
2896;Third Party Sharing/Collection;User Type;Unspecified
2706;Third Party Sharing/Collection;Action Third Party;Receive/Shared with
2698;First Party Collection/Use;Action First-Party;Collect on website
2546;Third Party Sharing/Collection;Choice Scope;Unspecified
2407;Third Party Sharing/Collection;Choice Type;Unspecified
2095;First Party Collection/Use;Purpose;Unspecified
1980;First Party Collection/Use;Choice Scope;not-selected
1963;Third Party Sharing/Collection;Identifiability;Unspecified
1939;First Party Collection/Use;Collection Mode;Unspecified
1933;Third Party Sharing/Collection;Third Part

### Read in feature map
Map category, topic, and value to a label

In [5]:
# Load the label map and index it as feature_map[category][topic][value] -> label.
feature_map_file = pd.read_csv("Data/label_map.csv", sep=",", header=0)
feature_map = {}

for index, rowdata in feature_map_file.iterrows():
    category, topic, value, label, n = (
        rowdata[column]
        for column in ("Category", "Topic", "Value", "Label", "n")
    )

    # only consider features with at least 100 classifications
    if n < 100:
        continue

    # Create the nested dictionaries on first use, then store the label.
    topics_for_category = feature_map.setdefault(category, {})
    values_for_topic = topics_for_category.setdefault(topic, {})
    values_for_topic[value] = label

# Last expression of the cell: show the first rows of the raw label map.
feature_map_file.head(3)


Unnamed: 0,Category,Topic,Value,Label,n
0,Data Retention,Personal Information Type,Computer information,Data Retention,5
1,Data Retention,Personal Information Type,Contact,Data Retention,30
2,Data Retention,Personal Information Type,Cookies and tracking elements,Data Retention,22


### Use the feature map to label data

In [6]:
# Pair every annotated text span with the label the feature map assigns to
# its (category, topic, value) combination.
labels = set()
labeled_data = []

for site in data:
    for category in data[site]:
        for topic in data[site][category]:
            for index, rowdata in data[site][category][topic].iterrows():

                try:
                    value = rowdata["value"]
                    selectedText = rowdata["selectedText"]
                    if selectedText.lower() == "nan":
                        continue
                except (KeyError, AttributeError):
                    # KeyError: the row lacks one of the two columns.
                    # AttributeError: selectedText has no .lower(), i.e. it is
                    # not a string.
                    continue

                try:
                    label = feature_map[category][topic][value]
                except KeyError:
                    # No label is mapped for this combination; skip it.
                    # Only KeyError is caught so that unrelated failures
                    # (including KeyboardInterrupt) are not swallowed.
                    continue

                # "-" is compared against as a label that should not be kept.
                if label == "-":
                    continue

                labels.add(label)
                labeled_data.append((selectedText, label))

# Call form of print works under both Python 2 and Python 3.
print("Length of labeled data: %d" % (len(labeled_data)))
print("Number of labels: %d" % (len(labels)))
# Sets have no guaranteed order; sort so the output is the same on every run.
print("Labels: %s" % ("; ".join(sorted(labels))))

Length of labeled data: 76165
Number of labels: 9
Labels: Data Retention; First Party Collection/Use; Policy Change; International and Specific Audiences; User Access, Edit and Deletion; User Choice/Control; Data Security; Third Party Sharing/Collection; Other


### Print a few labels with selectedText

In [7]:
# Show the first ten (text, label) pairs as "label: text".
for text, label in labeled_data[:10]:
    # Call form of print works under both Python 2 and Python 3.
    print("%s: %s" % (label, text))


Data Retention: eparately from any account information that directly identifies the user, such as name, e-mail address, or phone numbers.
Data Retention: which are retained
Data Retention: retained
Data Retention: We store search terms (and the cookie IDs associated with search terms) separately from any account information that directly identifies the user, such as name, e-mail address, or phone numbers. We have technological safeguards in place designed to prevent the unauthorized correlation of this data and we remove the entirety of the IP address after 6 months, cookies and other cross session identifiers, after 18 months.
Data Retention: separately from any account information that directly identifies the user, such as name, e-mail address, or phone numbers.
Data Retention: nformation from our standard search logs, which are retained and anonymized as described in the Collecting Your Information Section.
First Party Collection/Use: If you sign into
First Party Collection/Use: If 