In [1]:
## import labeled data from pickled object
import pickle
import random
import pandas as pd
from IPython.display import display, HTML
import glob
from bs4 import BeautifulSoup
import os
import re
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.tokenize import sent_tokenize
import re
import itertools
import string
engstop=set(stopwords.words("english"))



### Match the selected text with corresponding sentences

In [2]:
%%time
policyFiles=glob.glob("/share/pub/OPP-115/sanitized_policies/*.html")
print(len(policyFiles))

## read in annotation
def readAnno(filelist):
    annotations={}
    for filename in filelist:
        website=re.sub(".json", '', os.path.basename(filename))
        with open(filename, "r") as f:
            annotations[website]=json.load(f)
        f.close()
    return annotations

def readPolicies(filelist):
    soups={}
    for filename in filelist:
        base=os.path.basename(filename).split("_")[1]
        website=re.sub(".html", '', base)
        soups[website]=BeautifulSoup(open(filename, "r").read(), 'html.parser')
    return soups

with open("Data/parsed-annotation-1.0.pk", 'rb') as f:
    annotations=pickle.load(f, encoding='latin1')
  
  
## Get the texts from beautifulsoup objects

def extractTexts(soups):
    policyTexts={}
    for website in soups:
        policyTexts[website]=soups[website].get_text()
    return policyTexts
### read in sanitized policy texts
policySoups=readPolicies(policyFiles)
## get texts from soup objects
policyTexts=extractTexts(policySoups)

115
CPU times: user 2.88 s, sys: 204 ms, total: 3.08 s
Wall time: 3.09 s


In [3]:
%%time
## Each policy text is separated by sections using "|||" 
## Each selectedText has the section number and the character indices at which they start and end in the section

def getSentIdx(raw):
    """Get the sentences and index of each sentence and their section number"""
    allIdx={}
    secList=raw.split("|||")
    for i in range(len(secList)):
        try:
            secText=secList[i]
        except IndexError:
            print(i)
        secSents=sent_tokenize(secText)
        secIdxLists=[]
        for sent in secSents:
            m=re.search(re.escape(sent), secText)  ## escape to account for quotes in the string
            secIdxLists.append((sent, i, m.start(),m.end()))
        allIdx[i]=secIdxLists
    return allIdx

def getAllIdx(textDict):
    results={}
    for website, text in textDict.items():
        try:
            senidxes=getSentIdx(text)
            results[website]=senidxes
        except IndexError:
            print(website)
    return results


sentIdx=getAllIdx(policyTexts)
# Structure : {website:{section:[(sentence, section, startIdx, endIdx)]}}

CPU times: user 7.92 s, sys: 12 ms, total: 7.93 s
Wall time: 7.93 s


In [28]:
sentIdx[list(sentIdx.keys())[0]][0]

[('PRIVACY POLICY   This Privacy Policy explains what information is collected via the Mohegan Sun website located at mohegansun.com, why the information is collected and how it is used.',
  0,
  0,
  183),
 ('Mohegan Sun does not share or sell your personal information to anyone.',
  0,
  184,
  255),
 ('Mohegan Sun does not participate in any format of SPAM and will not send any unsolicited emails.',
  0,
  256,
  352)]

In [29]:
## repack the annotations
def annotation2table(anno):
    siteTableList=[]
    for website, cat in anno.items():
        cattableList=[]
        for category, topictable in cat.items():
            topictablelist=[]
            for topic, table in topictable.items():
                tableMod=table.assign(Topic=[topic]*table.shape[0])
                topictablelist.append(tableMod)
        
            catFrame=pd.concat(topictablelist, axis=0)
            catFrameMod=catFrame.assign(Category=[category]*catFrame.shape[0])
            cattableList.append(catFrameMod)
      
        siteFrame=pd.concat(cattableList, axis=0)
        siteFrameMod=siteFrame.assign(Website=[website]*siteFrame.shape[0])
        siteTableList.append(siteFrameMod)
        
    annoFrame=pd.concat(siteTableList, axis=0)
    
    return annoFrame

annotable1_0=annotation2table(annotations)

In [9]:
annotable1_0.shape

(142850, 8)

In [34]:
%%time
# for each selectedText in annotation, search for corresponding sentence in the corpus 
# if a selectedText is more than one sentence long, it will be discarded, since it does not provide much information
# for the importance of words
def labelRel(anno, sentIdx):
    sentlabels={}
    for website, siteanno in anno.items():
        sentlabels[website]={}
        siteSents=sentIdx[website]
        for section, sentList in siteSents.items():
            sentlabels[website][section]=[list(sentTuple) for sentTuple in sentList]
            for sentEntry in sentlabels[website][section]:
                sentEntry.append([])

        for category in siteanno:
            for topic in siteanno[category]:
                topicFrame=siteanno[category][topic].dropna()
                for idx in topicFrame.index:
                    
                    if topicFrame.loc[idx]["startIndexInSegment"]!=-1 and topicFrame.loc[idx]["value"]!="Unspecified":
                        entry=topicFrame.loc[idx]
                        anno_start=entry["endIndexInSegment"]
                        anno_end=entry["startIndexInSegment"]
                        corpusSents=sentlabels[website][entry["section"]]
                        
                        for sent in corpusSents:
                            corpus_start=sent[2]
                            corpus_end=sent[3]
                            if corpus_start <=anno_start and corpus_end >= anno_end:
                                sent[4].append((category, topic, entry["value"]))
                            elif abs(corpus_start-anno_start) <10 and abs(corpus_end-anno_end)<10:
                                sent[4].append((category,topic, entry["value"]))
                            elif anno_start<=corpus_start and anno_end >= corpus_end:
                                sent[4].append((category,topic, entry["value"]))
                            sent[4]=list(set(sent[4]))
              
    return sentlabels

labeledSents=labelRel(annotations, sentIdx)

CPU times: user 1min 9s, sys: 40 ms, total: 1min 9s
Wall time: 1min 9s


In [35]:
labeledSents.keys()

dict_keys(['playstation.com', 'honda.com', 'sports-reference.com', 'sltrib.com', 'foxsports.com', 'style.com', 'wnep.com', 'sheknows.com', 'mlb.mlb.com', 'coffeereview.com', 'jibjab.com', 'theatlantic.com', 'kaleidahealth.org', 'uptodate.com', 'gwdocs.com', 'meredith.com', 'ticketmaster.com', 'abita.com', 'www.loc.gov', 'washingtonian.com', 'lynda.com', 'highgearmedia.com', 'walmart.com', 'ted.com', 'nytimes.com', 'lodgemfg.com', 'enthusiastnetwork.com', 'kraftrecipes.com', 'rockstargames.com', 'geocaching.com', 'tangeroutlet.com', 'voxmedia.com', 'ironhorsevineyards.com', 'gawker.com', 'abcnews.com', 'eatchicken.com', 'allstate.com', 'austincc.edu', 'sidearmsports.com', 'uh.edu', 'dogbreedinfo.com', 'amazon.com', 'si.edu', 'ocregister.com', 'boardgamegeek.com', 'yahoo.com', 'freep.com', 'neworleansonline.com', 'instagram.com', 'tulsaworld.com', 'liquor.com', 'stlouisfed.org', 'gamestop.com', 'msn.com', 'cincymuseum.org', 'cariboucoffee.com', 'esquire.com', 'vikings.com', 'bankofameric

In [12]:
#Random Checking
"""
table1=annotable1_0.loc[annotable1_0.Website=='coffeereview.com',:]
table2=table1.loc[table1.section==2]
table3=table2
table3.loc[(table3["endIndexInSegment"]<1000) & (table3["startIndexInSegment"]>0) ]
"""

'\ntable1=annotable1_0.loc[annotable1_0.Website==\'coffeereview.com\',:]\ntable2=table1.loc[table1.section==2]\ntable3=table2\ntable3.loc[(table3["endIndexInSegment"]<1000) & (table3["startIndexInSegment"]>0) ]\n'

In [13]:
## import the labeled score that we annotated
labelpolarity=pd.read_csv("/home/nyao/Code/Data/Label_score_170327.csv")
labelpolarity.head()

Unnamed: 0,Category,Topic,Value,n,Use,Polarity
0,Data Retention,Personal Information Type,Unspecified,87,,0
1,Data Retention,Personal Information Type,Generic personal information,54,,-1
2,Data Retention,Personal Information Type,Contact,30,,-1
3,Data Retention,Personal Information Type,Other,25,No,0
4,Data Retention,Personal Information Type,Cookies and tracking elements,22,,-1


In [14]:
labelpolarityUse=labelpolarity.loc[labelpolarity.Use!="No"]
labelpolarityUse.shape

(247, 6)

In [15]:
labelpolarityUse.values

array([['Data Retention', 'Personal Information Type', 'Unspecified', 87,
        nan, 0],
       ['Data Retention', 'Personal Information Type',
        'Generic personal information', 54, nan, -1],
       ['Data Retention', 'Personal Information Type', 'Contact', 30, nan,
        -1],
       ..., 
       ['User Choice/Control', 'User Type', 'User with account', 142, nan,
        0],
       ['User Choice/Control', 'User Type', 'Other', 35, nan, 0],
       ['User Choice/Control', 'User Type', 'User without account', 18,
        nan, 0]], dtype=object)

In [16]:
## make the polarity table into a dictionary so that we can look up the 
## polarity given category, topic and value
polarityMap={}
for row in labelpolarityUse.values:
  polarityMap[(row[0], row[1], row[2])]=row[5]

In [17]:
websites=list(annotations.keys())

## choose the websites for training and validating 
seed=1324
random.seed(seed)
trainWebsites=random.sample(websites, 102)
valWebsites=[thing for thing in websites if thing not in trainWebsites]

In [18]:
## A list of all topics 

topicList=[('Data Retention', 'Personal Information Type'),
 ('Data Retention', 'Retention Period'),
 ('Data Retention', 'Retention Purpose'),
 ('Data Security', 'Security Measure'),
 ('Do Not Track', 'Do Not Track policy'),
 ('First Party Collection/Use', 'Action First-Party'),
 ('First Party Collection/Use', 'Choice Scope'),
 ('First Party Collection/Use', 'Choice Type'),
 ('First Party Collection/Use', 'Collection Mode'),
 ('First Party Collection/Use', 'Does/Does Not'),
 ('First Party Collection/Use', 'Identifiability'),
 ('First Party Collection/Use', 'Personal Information Type'),
 ('First Party Collection/Use', 'Purpose'),
 ('First Party Collection/Use', 'User Type'),
 ('International and Specific Audiences', 'Audience Type'),
 ('Other', 'Other Type'),
 ('Policy Change', 'Change Type'),
 ('Policy Change', 'Notification Type'),
 ('Policy Change', 'User Choice'),
 ('Third Party Sharing/Collection', 'Action Third Party'),
 ('Third Party Sharing/Collection', 'Choice Scope'),
 ('Third Party Sharing/Collection', 'Choice Type'),
 ('Third Party Sharing/Collection', 'Does/Does Not'),
 ('Third Party Sharing/Collection', 'Identifiability'),
 ('Third Party Sharing/Collection', 'Personal Information Type'),
 ('Third Party Sharing/Collection', 'Purpose'),
 ('Third Party Sharing/Collection', 'Third Party Entity'),
 ('Third Party Sharing/Collection', 'User Type'),
 ('User Access, Edit and Deletion', 'Access Scope'),
 ('User Access, Edit and Deletion', 'Access Type'),
 ('User Access, Edit and Deletion', 'User Type'),
 ('User Choice/Control', 'Choice Scope'),
 ('User Choice/Control', 'Choice Type'),
 ('User Choice/Control', 'Personal Information Type'),
 ('User Choice/Control', 'Purpose'),
 ('User Choice/Control', 'User Type')]

In [22]:
def extractFeatureLabels(labeledSents):
    results={}
    for website, data in labeledSents.items():
        websiteresult=[]
        for section, infoList in data.items():
            for sentence in infoList:
                websiteresult.append([sentence[0], sentence[4]])
            results[website]=websiteresult
    return results

simpleSentLabels=extractFeatureLabels(labeledSents)

In [23]:
simpleSentLabels["playstation.com"][5][1]  # each sentence is grouped with the topic they are related to and the label

[('Other', 'Other Type', 'Practice not covered'),
 ('Data Security', 'Security Measure', 'Privacy/Security program'),
 ('Data Security', 'Security Measure', 'Privacy review/audit')]

In [24]:
## The "Other" category will be removed from each sentence
## and also organize the data  into a table 

def sentLabel2Table(sentLabel, remove="Other"):
  counter=0
  for website, sentList in sentLabel.items():
    table=[]
    for sent in sentList:
      labelist=list(set(sent[1]))
      labels={"Label"+str(i):labelist[i] for i in range(len(labelist)) if remove not in labelist[i] }
      entry={"Sentence":sent[0]}
      entry.update(labels)
      table.append(entry)
    websiteTable=pd.DataFrame(table)
    websiteTable=websiteTable.assign(Website=[website]*websiteTable.shape[0])
    return websiteTable
    

In [25]:
### For each sentence, get the score for each (category, topic, value) combination
### make the result in to a list of dictionaries, each dictionary for a sentence
### Save the the category of each value,and the score  ({category: score })
### If the category is not in the map provdied .i.e, it is marked as not used
### add the label  ("Not_used", 0)
### so "Not_used" will be its own category

def scoreSents(sentsWithLabels, scoreMap):
  results={}
  for website, sentlist in sentsWithLabels.items():
    results.setdefault(website, [])
    for sent in sentlist:
      entry=[]
      entry.append(sent[0])
      labellist=sent[1]
      entrylabel=[]
      for label in set(labellist):
        
        try:
          score=scoreMap[label]
          cat=label[0]
        except KeyError:
          score=0
          cat="Not_used"
        finally:
          entrylabel.append((cat, score))
    
      entry.append(entrylabel)
      results[website].append(entry)
    
  return results

sentsWithScore=scoreSents(simpleSentLabels, polarityMap)

In [26]:
def majVote(scoreList):
  return max(set(scoreList), key=scoreList.count)
from itertools import groupby
test=sentsWithScore["honda.com"][8][1]
test=sorted(test, key=lambda x: x[0])
for key, group in groupby(test, lambda x: x[0]):
  scorelist=[x[1] for x in group]
  print(key, scorelist)
  print(key, majVote(scorelist))

First Party Collection/Use [0, 0, -2, 0, 0, 0, 0]
First Party Collection/Use 0
Not_used [0]
Not_used 0
User Choice/Control [1, 2, 0, 1, 2, 1]
User Choice/Control 1


In [27]:
## make the final form of the training data
## for each sentence, have a list of unique categories that they belong to, and the scores 
## for each sentence, if it has multiple scores for a category, then the scores will be merged 
## by taking a majority vote of the scores
## e.g. if a sentence has 5 scores for a category -1,1,-1,-2,0, then the sentence likley contains 
## information that are negative , and therefore, will be given a score of -1
## The final form will be a table, with website, and a score for category, "NaN" if the sentence does not belong to that category
## 

def majVote(scoreList):
  return max(set(scoreList), key=scoreList.count)


def scoreSentUnify(sentscores):
  resultlist=[]
  for website, sentList in sentscores.items():
    for sent in sentList:
      # for each sentence, make a dictionary. This will help making the columns for the data frame later on
      entry={"Website":website, "Sentence": sent[0]}
      sortedlabels=sorted(sent[1], key=lambda x: x[0])
      for cat, group in groupby(sortedlabels, lambda x: x[0]):
        # for scores that are for the same category, use majority voting to select one
        scorelist = [x[1] for x in group]
        entry[cat]=majVote(scorelist)
      resultlist.append(entry)
  resulttable=pd.DataFrame(resultlist)
  ## reorganize the column order
  oldcolumns=resulttable.columns.tolist()
  oldcolumns.remove("Website")
  oldcolumns.remove("Sentence")
  newcolumns=["Website", "Sentence"]+oldcolumns
  return resulttable[newcolumns]

scoresUnified=scoreSentUnify(sentsWithScore)

In [36]:
## Ta daaaa
scoresUnified.head(100).loc[5]["Sentence"]

'To protect your privacy to the maximum extent possible, we have undertaken this privacy initiative and our websites have been reviewed and certified by ESRB Privacy Online to meet established online information collection and use practices.'

In [34]:
dataretention=scoresUnified[["Sentence", "Data Retention"]]
dataretention.head(200)

Unnamed: 0,Sentence,Data Retention
0,"Privacy Policy Last Revised: April, 2011",
1,"Sony Computer Entertainment America LLC (""SCEA...",
2,This privacy policy is intended to provide you...,
3,"If you have any questions, complaints or comme...",
4,This Privacy Statement and the certification s...,
5,To protect your privacy to the maximum extent ...,
6,"As part of the privacy program, we are subject...",
7,ESRB Privacy Online is a third-party seal prov...,
8,ESRB Privacy Online promotes and enforces esta...,
9,Whenever you visit a website that displays the...,


In [315]:
## output to a file 
with open("Data/sentence_score_conso10.pk", "wb") as f:
    pickle.dump(scoresUnified, f)

In [29]:
## output to a file that is readable in python2
import pickle
with open("Data/sentence_score_conso10_py2.pk", "wb") as f:
    pickle.dump(scoresUnified, f, 2)

## Generate Training data for each category

In [31]:
trainData={website:simpleSentLabels[website] for website in trainWebsites}
valData={website:simpleSentLabels[website] for website in valWebsites}

In [42]:
## Make a way to organize the training texts and labels
## we need to make a model for each topic
def generateXY(inputdata, topic):
    results=[]
    for website, data in inputdata.items():
        for item in data:    
            if topic in [x[0:2] for x in item[1]]:   # if the topic is in a (sentence, topic+label)  tuple
                label=[x[2] for x in item[1] if x[0:2]==topic][0]
                results.append([item[0], label])
        text, labels=zip(*results)
    return text, labels

In [43]:
topic=('Third Party Sharing/Collection', 'Personal Information Type')
trainText, trainLabels=generateXY(trainData, topic)

### Testing random functions


In [37]:
pd.DataFrame([{"a":2}, {"a":3}, {"a":4, "b":2}])

Unnamed: 0,a,b
0,2,
1,3,
2,4,2.0


In [39]:
left = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'],
                      'key2': ['K0', 'K1', 'K0', 'K1'],
                     'A': ['A0', 'A1', 'A2', 'A3'],
                     'B': ['B0', 'B1', 'B2', 'B3']})


right = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2', 'K1', "K2"],
                       'key2': ['K0', 'K0', 'K0', 'K0', "K0", "K1"],
                      'C': ['C0', 'C1', 'C2', 'C3', "C4", "C5"],
                      'D': ['D0', 'D1', 'D2', 'D3',"D4", "D5"]})

display(left)
display(right)

result=pd.merge(left, right, how='inner', on=["key1", "key2"])
display(result)

Unnamed: 0,A,B,key1,key2
0,A0,B0,K0,K0
1,A1,B1,K0,K1
2,A2,B2,K1,K0
3,A3,B3,K2,K1


Unnamed: 0,C,D,key1,key2
0,C0,D0,K0,K0
1,C1,D1,K1,K0
2,C2,D2,K1,K0
3,C3,D3,K2,K0
4,C4,D4,K1,K0
5,C5,D5,K2,K1


Unnamed: 0,A,B,key1,key2,C,D
0,A0,B0,K0,K0,C0,D0
1,A2,B2,K1,K0,C1,D1
2,A2,B2,K1,K0,C2,D2
3,A2,B2,K1,K0,C4,D4
4,A3,B3,K2,K1,C5,D5


In [40]:
a=left.columns.values.tolist()
print(a)
random.shuffle(a)
print(a)

['A', 'B', 'key1', 'key2']
['key1', 'B', 'A', 'key2']


In [41]:
import random
columnslist=left.columns.values.tolist()
random.shuffle(columnslist)
leftshuffle=left.loc[:,columnslist]

pd.concat([leftshuffle, right], axis=0, ignore_index=True)

Unnamed: 0,A,B,C,D,key1,key2
0,A0,B0,,,K0,K0
1,A1,B1,,,K0,K1
2,A2,B2,,,K1,K0
3,A3,B3,,,K2,K1
4,,,C0,D0,K0,K0
5,,,C1,D1,K1,K0
6,,,C2,D2,K1,K0
7,,,C3,D3,K2,K0
8,,,C4,D4,K1,K0
9,,,C5,D5,K2,K1


In [53]:
print(left.shape)
extra="Hello"
new=pd.concat([left, [1,2,3,4]], axis=1)
display(new)


(4, 4)


TypeError: cannot concatenate a non-NDFrame object