### Install and import libraries

In [97]:
import pickle
import pandas as pd

import nltk
# nltk.download('punkt')      # uncomment to download punkt

# !pip install -U textblob    # uncomment to install textblob
from textblob import TextBlob
from textblob.classifiers import NaiveBayesClassifier


## Read Pickled Data
This notebook reads and processes data that was previously pickled by Parse_opp_115.

In [98]:
# Choose one of several data files.
# NOTE: pickle.load can execute arbitrary code while deserializing; only load
# files that were written by a trusted run of Parse_opp_115.
with open('Data/parsed-annotation-0.5.pk', 'rb') as pfile:
    # The `with` block closes the file on exit, so no explicit close() is needed.
    data = pickle.load(pfile)


### Data structure
`data[site][category][topic] =` pandas dataframe

### Sites, Categories, and Topics
Print out all sites, categories, and topics

In [2]:
# Collect the distinct keys found at each of the three nesting levels of
# data[site][category][topic].
sites = set()
categories = set()
topics = set()

for site in data:
    sites.add(site)
    for category in data[site]:
        categories.add(category)
        # Iterating the innermost mapping yields its topic keys.
        topics.update(data[site][category])

# Single-argument print(...) behaves the same under Python 2 and Python 3.
# Sets have no defined iteration order, so sort for reproducible output.
print("Sites: " + ", ".join(sorted(sites)))
print("")
print("Categories: " + ", ".join(sorted(categories)))
print("")
print("Topics: " + ", ".join(sorted(topics)))


Sites: playstation.com, honda.com, sports-reference.com, sltrib.com, foxsports.com, style.com, wnep.com, sheknows.com, mlb.mlb.com, coffeereview.com, jibjab.com, theatlantic.com, kaleidahealth.org, uptodate.com, gwdocs.com, meredith.com, ticketmaster.com, abita.com, www.loc.gov, washingtonian.com, lynda.com, highgearmedia.com, walmart.com, ted.com, nytimes.com, lodgemfg.com, enthusiastnetwork.com, kraftrecipes.com, rockstargames.com, geocaching.com, tangeroutlet.com, voxmedia.com, ironhorsevineyards.com, gawker.com, abcnews.com, eatchicken.com, allstate.com, austincc.edu, sidearmsports.com, uh.edu, dogbreedinfo.com, amazon.com, si.edu, ocregister.com, boardgamegeek.com, yahoo.com, freep.com, neworleansonline.com, instagram.com, tulsaworld.com, liquor.com, stlouisfed.org, gamestop.com, msn.com, cincymuseum.org, cariboucoffee.com, esquire.com, vikings.com, bankofamerica.com, dailynews.com, lids.com, tgifridays.com, thehill.com, sciencemag.org, dcccd.edu, acbj.com, restaurantnews.com, com

### Convert to a Pandas DataFrame

In [3]:
# Building a frame from the nested dict puts the outer keys (sites) in the
# columns and the second-level keys (categories) in the row index; each cell
# holds that site/category's topic mapping.
df = pd.DataFrame(data)
df.loc['Data Retention', 'abcnews.com']['Personal Information Type']

Unnamed: 0,endIndexInSegment,section,selectedText,startIndexInSegment,value
0,599,26,personal information,579,Generic personal information
1,599,26,personal information,579,Generic personal information


In [4]:
# Label of the first annotation row for this site / category / topic.
df.loc['Data Retention', 'abcnews.com']['Personal Information Type'].loc[0, 'value']

'Generic personal information'

In [5]:
# Topics annotated under the 'Data Retention' category for usa.gov.
df.loc['Data Retention', 'usa.gov'].keys()

['Retention Period', 'Retention Purpose', 'Personal Information Type']

In [55]:
# Annotation rows for one usa.gov topic.
df.loc['Data Retention', 'usa.gov']['Personal Information Type']

Unnamed: 0,endIndexInSegment,section,selectedText,startIndexInSegment,value
0,172,3,the Internet protocol address,143,IP address and device IDs
1,217,3,date and time,204,Other
2,346,3,the name of the web site from which you linked...,219,User online activities
3,388,3,the browser and operating system used,351,Computer information
4,-1,6,,-1,Unspecified


### Problem with data conversion, or inherent in the data?
Why do the data contain so many "NaN" values?

In [60]:
# Inspect a topic whose rows include entries with no selected text.
df.loc['First Party Collection/Use', 'abcnews.com']['Collection Mode']

Unnamed: 0,endIndexInSegment,section,selectedText,startIndexInSegment,value
0,126,4,We collect two basic types of information pers...,64,Unspecified
1,-1,4,,-1,Unspecified
2,-1,4,,-1,Unspecified
3,-1,4,,-1,Unspecified
4,-1,4,,-1,Unspecified
5,46,5,you provide,35,Explicit
6,46,5,provide,39,Explicit
7,46,5,provide,39,Explicit
8,195,5,egistration information you provide when you c...,11,Explicit
9,195,5,egistration information you provide when you c...,11,Explicit


### Create a training data set

In [101]:
# Build the classifier's training set: a list of (selectedText, value) pairs,
# one per annotation row that actually has selected text.
train = []

# Let's start with just 1 site. To train on more, list them here,
# e.g. the first ten: sites = list(data)[:10]
sites = ["washingtonpost.com"]

for site in sites:
    for category in data[site]:
        for topic in data[site][category]:
            annotations = df[site][category][topic]

            # Frames without a selectedText column contribute no examples.
            if "selectedText" not in annotations.columns:
                continue

            # Read the two needed columns directly instead of building a
            # Series per row with iterrows().
            for selected, value in zip(annotations["selectedText"],
                                       annotations["value"]):
                # Skip rows with no selected text. Testing for a real missing
                # value (rather than comparing str(x) to "nan") keeps genuine
                # text that happens to read "nan" and also drops None.
                if pd.isnull(selected):
                    continue
                train.append((str(selected), value))

# Single-argument print(...) behaves the same under Python 2 and Python 3.
print(len(train))

772


### Train a Naive Bayes Classifier

In [104]:
%%time
# Fit TextBlob's Naive Bayes classifier on the (selectedText, value) pairs in
# `train`. %%time must stay on the first line of the cell; it reports how long
# the fit takes (about two minutes for 772 examples in the recorded run).
cl = NaiveBayesClassifier(train)

CPU times: user 2min 1s, sys: 952 ms, total: 2min 2s
Wall time: 2min 1s


In [105]:
# Spot check. The recorded run returns 'Generic personal information' here,
# even though the annotations also use an 'IP address and device IDs' label.
cl.classify("IP Address")

'Generic personal information'

In [106]:
# Spot check: which label does the classifier assign to a contact detail?
cl.classify("email address")

'Generic personal information'

In [107]:
# Spot check; the recorded run labels this 'Cookies and tracking elements'.
cl.classify("cookies")

'Cookies and tracking elements'