### Install and import libraries

In [2]:
import pickle
import pandas as pd

import nltk
# nltk.download('punkt')      # uncomment to download punkt

# !pip install -U textblob    # uncomment to install textblob
from textblob import TextBlob
from textblob.classifiers import NaiveBayesClassifier


## Read Pickled Data
This notebook reads and processes data that was previously pickled by Parse_opp_115.

In [3]:
# Choose one of several data files.
# NOTE: pickle.load can execute arbitrary code while deserializing, so only
# point this at files written by a trusted run of the parsing step.
with open('Data/parsed-annotation-0.5.pk', 'rb') as pfile:
    # The `with` block closes the file on exit; no explicit close() is needed.
    data = pickle.load(pfile)


### Data structure
`data[site][category][topic] =` pandas dataframe

### Sites, Categories, and Topics
Print out all sites, categories, and topics

In [18]:
# Gather every distinct site, category, topic and annotation value found in
# the nested `data[site][category][topic]` mapping, then print a summary.
sites = set()
categories = set()
topics = set()
values = set()

for site in data:
    sites.add(site)
    for category in data[site]:
        categories.add(category)
        for topic in data[site][category]:
            topics.add(topic)
            frame = data[site][category][topic]
            # Read the whole "value" column in one go instead of walking the
            # frame row by row; a frame without rows contributes nothing.
            if not frame.empty:
                values.update(frame["value"])

# print(...) with a single parenthesised argument behaves the same on
# Python 2 and Python 3; print("") emits the blank separator line on both.
summary = [
    ("Sites", sites),
    ("Categories", categories),
    ("Topics", topics),
    ("Values", values),
]
for position, (label, members) in enumerate(summary):
    if position:
        print("")
    print("{} [{}]: ".format(label, len(members)) + "; ".join(sorted(members)))


Sites [115]: abcnews.com; abita.com; acbj.com; adweek.com; allstate.com; amazon.com; aol.com; archives.gov; austincc.edu; bankofamerica.com; barnesandnoble.com; boardgamegeek.com; buffalowildwings.com; cariboucoffee.com; cbsinteractive.com; chasepaymentech.com; cincymuseum.org; citizen.org; coffeereview.com; communitycoffee.com; dailyillini.com; dailynews.com; dairyqueen.com; dcccd.edu; disinfo.com; dogbreedinfo.com; earthkam.org; eatchicken.com; education.jlab.org; enthusiastnetwork.com; esquire.com; everydayhealth.com; foodallergy.org; fool.com; fortune.com; foxsports.com; fredericknewspost.com; freep.com; gamestop.com; gawker.com; geocaching.com; google.com; gwdocs.com; highgearmedia.com; honda.com; ifsa-butler.org; imdb.com; instagram.com; internetbrands.com; ironhorsevineyards.com; jibjab.com; kaleidahealth.org; kraftrecipes.com; latinpost.com; lids.com; liquor.com; lodgemfg.com; lynda.com; meredith.com; miaminewtimes.com; military.com; minecraft.gamepedia.com; mlb.mlb.com; mohega

### Convert to a Pandas DataFrame

In [7]:
# Tabular view of the nested mapping: one column per site, one row per
# category; each populated cell holds that site's topic -> frame mapping.
df = pd.DataFrame(data)
df.loc['Data Retention', 'abcnews.com']['Personal Information Type']

Unnamed: 0,endIndexInSegment,section,selectedText,startIndexInSegment,value
0,599,26,personal information,579,Generic personal information
1,599,26,personal information,579,Generic personal information


In [8]:
# Drill down to a single annotation: row label 0 of the "value" column.
df.loc['Data Retention', 'abcnews.com']['Personal Information Type'].loc[0, 'value']

'Generic personal information'

In [9]:
# List the topics recorded for this site/category pair.
df.loc['Data Retention', 'usa.gov'].keys()

['Retention Period', 'Retention Purpose', 'Personal Information Type']

In [10]:
# Show the annotation frame stored under one topic of this site/category pair.
df.loc['Data Retention', 'usa.gov']['Personal Information Type']

Unnamed: 0,endIndexInSegment,section,selectedText,startIndexInSegment,value
0,172,3,the Internet protocol address,143,IP address and device IDs
1,217,3,date and time,204,Other
2,346,3,the name of the web site from which you linked...,219,User online activities
3,388,3,the browser and operating system used,351,Computer information
4,-1,6,,-1,Unspecified


### Problem with data conversion, or inherent in the data?
Why do the data contain so many "NaN" values?

In [11]:
# Inspect a frame in which several rows have no selected text.
df.loc['First Party Collection/Use', 'abcnews.com']['Collection Mode']

Unnamed: 0,endIndexInSegment,section,selectedText,startIndexInSegment,value
0,126,4,We collect two basic types of information pers...,64,Unspecified
1,-1,4,,-1,Unspecified
2,-1,4,,-1,Unspecified
3,-1,4,,-1,Unspecified
4,-1,4,,-1,Unspecified
5,46,5,you provide,35,Explicit
6,46,5,provide,39,Explicit
7,46,5,provide,39,Explicit
8,195,5,egistration information you provide when you c...,11,Explicit
9,195,5,egistration information you provide when you c...,11,Explicit


### Create a training data set

In [23]:
# Build the classifier's training set: a list of (selectedText, value) pairs.
train = []

# Start with just one site. To train on more, replace this list,
# e.g. with list(data)[:10]. Note that this rebinds the name `sites`.
sites = ["washingtonpost.com"]

for site in sites:
    for category in data[site]:
        for topic in data[site][category]:
            # Read the frame from `data`, the same mapping the loops iterate,
            # so this cell does not depend on the DataFrame view `df`.
            frame = data[site][category][topic]

            # Nothing to collect from a frame that has no rows or that lacks
            # a "selectedText" column.
            if frame.empty or "selectedText" not in frame.columns:
                continue

            for text, value in zip(frame["selectedText"], frame["value"]):
                # Skip annotations without selected text. Testing the raw
                # cell for null avoids comparing str(text) with "nan", which
                # would also discard a genuine "nan" string.
                if pd.isnull(text):
                    continue
                train.append((str(text), value))

# print(...) with a single argument works on both Python 2 and Python 3.
print(len(train))

772


### Train a Naive Bayes Classifier

In [24]:
%%time
# Fit the NaiveBayesClassifier imported from textblob.classifiers on the
# (selectedText, value) pairs collected in `train`.
# The recorded run of this cell reports about two minutes of wall time.
cl = NaiveBayesClassifier(train)

CPU times: user 1min 59s, sys: 1.21 s, total: 2min
Wall time: 2min


In [25]:
# Spot-check the trained classifier on a hand-written sentence about IP addresses.
cl.classify("Your IP Address is stored")

'Generic personal information'

In [26]:
# Spot-check on a sentence about e-mail collection.
# NOTE(review): the recorded output is 'Does', which is not a descriptive
# label on its own; the training pairs mix values from every topic.
cl.classify("We collect your email address")

'Does'

In [28]:
# Spot-check on a sentence about cookies and tracking.
cl.classify("cookies are used to track our users")

'Does'

### Value mapping
We need to ignore unhelpful values or map them to something useful

In [29]:
# Annotation values considered useful; the remaining values are to be
# ignored or mapped onto one of these.
useful_values = [
    "Advertising",
    "Children",
    "Cookies and tracking elements",
    "Financial",
    "First party collection",
    "First party use",
]
# print(...) with a single argument works on both Python 2 and Python 3.
print(len(useful_values))

6
