In [2]:
import json
import pandas as pd 
import glob
import os
import re
from bs4 import BeautifulSoup 
from IPython.display import display, HTML
import pickle
import numpy as np

In [3]:
## Read in consolidated annotations
#annotations={}
#annofiles=glob.glob("Data/opp115-parsed-annotation-1.0/*.json")
#print len(annofiles)
policyFiles=glob.glob("/share/pub/OPP-115/sanitized_policies/*.html")
print(len(policyFiles))

115


###  Read in the annotations and original policies

In [4]:
def readAnno(filelist):
  annotations={}
  for filename in filelist:
    website=re.sub(".json", '', os.path.basename(filename))
    with open(filename, "r") as f:
      annotations[website]=json.load(f)
    f.close()
  return annotations

def readPolicies(filelist):
  soups={}
  for filename in filelist:
    base=os.path.basename(filename).split("_")[1]
    website=re.sub(".html", '', base)
    soups[website]=BeautifulSoup(open(filename, "r").read(), 'html.parser')
  return soups

In [5]:
with open("Data/parsed-annotation-0.5.pk", 'rb') as f:
  annotations=pickle.load(f, encoding='latin1')

In [6]:
annotations["playstation.com"]["Data Retention"]['Retention Purpose']

Unnamed: 0,endIndexInSegment,section,selectedText,startIndexInSegment,value
0,153,18,promotional purpose through one of our websites,106,Marketing
1,201,18,or to make a purchase from the PlayStation Shop,154,Perform service
2,226,22,Email addresses collected from consumers durin...,0,Perform service
3,200,24,so that we may assist these customers with cur...,124,Perform service
4,159,40,necessary to fulfill the purposes outlined in ...,102,Other


In [7]:
### read in sanitized policy texts
policySoups=readPolicies(policyFiles)

In [8]:
## make sure that the website names are the same in both dictinoaries
test1=filter(lambda website: website in annotations.keys(), policySoups.keys())
test2=filter(lambda website: website in policySoups.keys(), annotations.keys())
print(len(list(test1)))
print(len(list(test2)))

115
115


## Get a list of ngrams from all of the policies

In [9]:
## Get the texts from beautifulsoup objects

def extractTexts(soups):
  policyTexts={}
  for website in soups:
    policyTexts[website]=soups[website].get_text()
  return policyTexts

In [10]:
## get texts from soup objects
policyTexts=extractTexts(policySoups)

In [11]:
## get the training and validation set 
import random
websites=policyTexts.keys()
seed=124
random.seed(seed)
trainWebsites=random.sample(websites, 105)
valWebsites=[web for web in websites if web not in trainWebsites]

trainTexts={web:policyTexts[web] for web in trainWebsites}

In [12]:
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.tokenize import sent_tokenize
import re
import itertools
import string
engstop=set(stopwords.words("english"))


In [13]:
print (string.punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [56]:
from collections import Counter
import itertools


def cleantext(rawtext):
  # remove punctuations
  # remove=string.punctuations
  cleaned3=re.sub("\|","", rawtext.lower())
  return cleaned3


def getNgrams(rawtext, length, stopwords):  # length defines number of words in the token, i.e. unigrams, bigrams ec
  punctuations = re.compile('[%s]' % re.escape(string.punctuation))
  if length==1:
    cleaned=cleantext(rawtext)
    tokenlist=word_tokenize(cleaned)
    tokensNoStop=[token for token in tokenlist if token not in stopwords]
    return list(set(tokensNoStop))
  
  if length>=2:
    sentenceList=sent_tokenize(rawtext)
    sentenceClean=[cleantext(sentence) for sentence in sentenceList]
    unigramLists=[word_tokenize(sentence) for sentence in sentenceClean]
    
    bigramLists=[zip(*[sentUnigram[i::] for i in range(length)]) for sentUnigram in unigramLists]
    bigrams=list(itertools.chain.from_iterable(bigramLists))
    
    return list(set(bigrams))

In [52]:
def getAllTokens(textDict, ngram, stopwords):
  allTokens=[]
  for website in textDict:
    allTokens.extend(getNgrams(textDict[website], ngram, stopwords))
  return list(set(allTokens))

#### Generate Training data

In [16]:
## Get annotation of the training documents
trainAnno={key:annotations[key] for key in trainTexts.keys()}

## Determine Relevance of each sentence in training set

### 1. Get the index of each sentences in the policy texts

In [19]:
## Label the sentences in a policy as relevant or irrelevant
## 
# Topic: "Personal Information Type"

In [17]:
def getSentIdx(raw):
  allIdx={}
  secList=raw.split("|||")
  for i in range(len(secList)):
    try:
      secText=secList[i]
    except IndexError:
      print(i)
    secSents=sent_tokenize(secText)
    secIdxLists=[]
    for sent in secSents:
      m=re.search(re.escape(sent), secText)  ## escape to account for quotes in the string
      secIdxLists.append((sent, i, m.start(),m.end()))
    allIdx[i]=secIdxLists
  return allIdx

def getAllIdx(textDict):
  results={}
  for website, text in textDict.items():
    try:
      senidxes=getSentIdx(text)
      results[website]=senidxes
    except IndexError:
      print(website)
  return results


In [18]:
%%time
trainSentIdx=getAllIdx(trainTexts)

CPU times: user 14.3 s, sys: 16 ms, total: 14.3 s
Wall time: 14.3 s


In [19]:
## An example to see if indexing is correct:
exampleSite="uptodate.com"
annoEg=trainAnno[exampleSite]['First Party Collection/Use']['Personal Information Type'].loc[4]
print (annoEg)
selecTextAnno=annoEg["selectedText"]

print (selecTextAnno)

print ("\n")
corpusText=trainSentIdx[exampleSite][annoEg['section']]
sentenceNumber=[idx for idx in range(len(corpusText))][0]

print ("Text in corpus : ")
display(corpusText)

print ("\n")

corpusStart=re.search(selecTextAnno, corpusText[sentenceNumber][0]).start()
corpusEnd= re.search(selecTextAnno, corpusText[sentenceNumber][0]).end()

print ("start index in corpus section :", corpusText[sentenceNumber][2]+corpusStart)
print ("end index in corpus section :", corpusText[sentenceNumber][2]+corpusEnd)

print ("Annotation indices is ahead of corpus indices by {} characters ".format(annoEg["startIndexInSegment"]-corpusStart))


print (trainAnno[exampleSite]['First Party Collection/Use']['Personal Information Type'].loc[4]["startIndexInSegment"])

endIndexInSegment                                              119
section                                                          2
selectedText           nformation that specifically identifies you
startIndexInSegment                                             76
value                                 Generic personal information
Name: 4, dtype: object
nformation that specifically identifies you


Text in corpus : 


[('Subscriber Information   UpToDate never automatically collects any information that specifically identifies you such as your name, address, or e-mail address.',
  2,
  0,
  158),
 ('This information is collected only when you voluntarily provide it as part of the subscription process ("Subscriber Information").',
  2,
  159,
  289),
 ('We will ask you whenever we need Subscriber Information that identifies you or allows us to contact you.',
  2,
  290,
  394)]



start index in corpus section : 68
end index in corpus section : 111
Annotation indices is ahead of corpus indices by 8 characters 
76


### Gather the sentences and put them in a convenient structure

In [20]:
%%time
# for each selectedText in annotation, search for corresponding sentence in the corpus 
# if a selectedText is more than one sentence long, it will be discarded, since it does not provide much information
# for the importance of words
def labelRel(anno, sentIdx):
  sentlabels={}
  for website, siteanno in anno.items():
    sentlabels[website]={}
    siteSents=sentIdx[website]
    for section, sentList in siteSents.items():

      sentlabels[website][section]=[list(sentTuple) for sentTuple in sentList]
      for sentEntry in sentlabels[website][section]:
        sentEntry.append([])

    for category in siteanno:
      for topic in siteanno[category]:
        topicFrame=siteanno[category][topic]
        for idx in topicFrame.index:
          if topicFrame.loc[idx]["startIndexInSegment"]!=-1 and topicFrame.loc[idx]["value"]!="Unspecified":
            entry=topicFrame.loc[idx]
            anno_start=entry["endIndexInSegment"]
            anno_end=entry["startIndexInSegment"]
            corpusSents=sentlabels[website][entry["section"]]
            for sent in corpusSents:
              corpus_start=sent[2]
              corpus_end=sent[3]
              if corpus_start <=anno_start and corpus_end >= anno_end:
                sent[4].append((category, topic, entry["value"]))
              elif  abs(corpus_start-anno_start) <20 and abs(corpus_end-anno_end)<20:
                sent[4].append((category,topic, entry["value"]))
              elif anno_start<=corpus_start and anno_end >= corpus_end:
                sent[4].append((category,topic, entry["value"]))
                
  return sentlabels

labeledTrainSents=labelRel(trainAnno, trainSentIdx)

CPU times: user 1min 2s, sys: 80 ms, total: 1min 2s
Wall time: 1min 2s


In [None]:
labeledTrainSents['sheknows.com'][1]

In [22]:
%%time
### Label all the sentences in all of the texts, just like in  "labeledTrainSents" , output to binary
allSentIdx=getAllIdx(policyTexts)
allLabeledSents=labelRel(annotations, allSentIdx)

In [533]:
with open("LabeledFullSentences.pk", 'wb') as f:
  pickle.dump(allLabeledSents, f)

In [23]:
### Collect all the sentences , along with their topics, for ease of processing later
### Leave out the value for now

def gatherSentsTopics(labeldIdxSet):
  topiclist=[]
  allsentences=[]
  for website, corpus in labeldIdxSet.items():
    for section, sentLists in corpus.items():
      for sent in sentLists:
        allsentences.append([website, sent[0], [(item[0], item[1]) for item in sent[4] if item[0]!="Other"]])
        for item in sent[4]:
          topiclist.append((item[0], item[1]))
  return set(topiclist), allsentences

alltopics, allLabeledSentences=gatherSentsTopics(labeledTrainSents)

In [24]:
allLabeledSentences[20]

['sheknows.com',
 'You can change the settings on your browser or Local Device Storage to prevent cookies being stored on your Local Device Storage without your explicit consent.',
 [('First Party Collection/Use', 'Choice Type'),
  ('First Party Collection/Use', 'Choice Type'),
  ('First Party Collection/Use', 'Choice Type'),
  ('Third Party Sharing/Collection', 'Choice Type'),
  ('Third Party Sharing/Collection', 'Choice Type')]]

In [25]:
%%time
## for each topic, we get alist of related sentences and a list of unrelated sentences

def getTopicRelevanceList(labeledSentences, topiclist):
  topicSentsCollection={topic:{"Related":[], "Unrelated":[]} for topic in topiclist}
  for entry in labeledSentences:
    labelset=set(entry[2])
    for topic in topiclist:
      if topic not in labelset:
        topicSentsCollection[topic]["Unrelated"].append(entry[0:2])
      else:
         topicSentsCollection[topic]["Related"].append(entry[0:2])
          
  results={}
  for topic in topiclist:
    results[topic]={}
    results[topic]["Related"]=topicSentsCollection[topic]["Related"]
    results[topic]["Unrelated"]=topicSentsCollection[topic]["Unrelated"]
  
  return results

relevantSetences=getTopicRelevanceList(allLabeledSentences, alltopics)

CPU times: user 1.68 s, sys: 20 ms, total: 1.7 s
Wall time: 1.7 s


In [26]:
len(relevantSetences[('User Choice/Control', 'Personal Information Type')]["Related"])+\
len(relevantSetences[('User Choice/Control', 'Personal Information Type')]["Unrelated"])

10417

In [27]:
print(len(relevantSetences[('First Party Collection/Use', 'Does/Does Not')]["Related"]))

1509


In [28]:
relevantSetences[('User Choice/Control', 'Purpose')]["Related"][2]

['sheknows.com',
 'You may also "opt-out" of other third party programs using OBA technology on your web browser: (a) for users targeted in the United States by visiting http://www.aboutads.info/choices; (b) for users targeted in Europe by visiting http:://www.youronlinechoices.com, selecting the country where you are located; (c) for users targeted in Australia by visiting http://youronlinechoices.com.au, and selecting "Your Ad Choices" or "Your Choices" as applicable, and (d) for users targeted in Canada by visiting http://youradchoices.ca/choices.']

In [114]:
relevantSetences[('First Party Collection/Use', 'User Type')]['Related'][0]

['sheknows.com',
 ' Information We Collect    User-Provided Information: You provide us information about yourself, such as your name and email address, when you register with the Service.']

In [38]:
## put all the topics and relevance labels in the 
from itertools import groupby
from operator import itemgetter


def embedLabels(relSent):
  resultsDict={}
  for topic, releSents in relSent.items():
    for relevance, labeledSents in releSents.items():
      for sentence in labeledSents:
        resultsDict.setdefault(tuple(sentence), [])
        resultsDict[tuple(sentence)].append((topic, relevance))
      
  results=[list(key)+[list(set(value))] for key, value in resultsDict.items()]
  
  return results

relSentTuples=embedLabels(relevantSetences)

In [65]:
relSentTuples[0][1]

'If you wish to make changes to any personal information you have provided us, or if you have any questions about what we do with your personal information, please contact us by sending an e-mail to privacy@washingtonian.com.'

In [58]:
%%time
## Get ngrams from the the labeled sentences
def getNgramsSentTup(sentsRelLabels, n, stopwords):
  ngramsList=[]
  for sentTuple in sentsRelLabels:
    ngrams=getNgrams(sentTuple[1], n, stopwords)
    ngramsList.extend(ngrams)
  return list(set(ngramsList))


trainUnigrams=getNgramsSentTup(relSentTuples, 1, engstop)
trainBigrams=getNgramsSentTup(relSentTuples, 2, engstop)
trainTrigrams=getNgramsSentTup(relSentTuples, 3, engstop)

CPU times: user 12 s, sys: 64 ms, total: 12.1 s
Wall time: 12.1 s


In [62]:
### Index the topics and ngrams
alltopicsList=list(alltopics)
topicIdx={alltopicsList[n]:n for n in range(len(alltopicsList))}
trainUnigramIdx={trainUnigrams[n]:n for n in range(len(trainUnigrams))}
trainBigramIdx={trainBigrams[n]:n for n in range(len(trainBigrams))}
trainTrigramIdx={trainTrigrams[n]:n for n in range(len(trainTrigrams))}

## make revers index
topicRev={topicIdx[key]:key for key in topicIdx}
trainUnigramRev={trainUnigramIdx[key]:key for key in trainUnigramIdx.keys()}
trainBigramRev={trainBigramIdx[key]:key for key in trainBigramIdx.keys()}
trainTrigramRev={trainTrigramIdx[key]:key for key in trainTrigramIdx.keys()}

In [76]:
%%time
## for each sentence in relSentTuples, index the topics
## 
def indexTopics(sentTuples, topicIdx):
  results=[]
  for thing in sentTuples:
    item=thing[0:2]
    item.append([(topicIdx[topic],rel) for topic, rel in thing[2]])
    results.append(item)
  return results

relSentTuplesIdx=indexTopics(relSentTuples, topicIdx)

CPU times: user 152 ms, sys: 24 ms, total: 176 ms
Wall time: 175 ms


In [79]:
relSentTuplesIdx[0][1]

'If you wish to make changes to any personal information you have provided us, or if you have any questions about what we do with your personal information, please contact us by sending an e-mail to privacy@washingtonian.com.'

In [72]:
print(len(relSentTuplesIdx))  ## this will be the input for the subsequent steps

10231


## Topic Signature Model 

### STEP 1: Calculate the scores by indexing the words and processing with numpy array

Get ngram collections from the sentences collections

In [80]:
print(len(trainUnigrams))
print(len(trainBigrams))
print(len(trainTrigrams))

6541
57617
130350


In [81]:
np.seterr(divide='warn', invalid='warn')
def lambdaScoresNP(relSentTupAllTopics, n,topicIdxTable, ngramIdxTable, printEvery=5000):
  # Initiate Final count Matrix
  countresult=np.zeros([len(ngramIdxTable),len(topicIdxTable)*4])+10**-10
  # Some lookup tables to help assign number
  
  totalnumber=len(relSentTupAllTopics)
  
  obsidx={"O11":0, "O12":1, "O21":2, "O22":3}
  assignlabel={"In":{"Related":0, "Unrelated":1},"Out":{"Related":2, "Unrelated":3}}
  
  # initiate some temporary variables
  counter=0
  
  # Some other things
  
  ngramIdxSet=set(ngramIdxTable.values())
  ngramIdxValue=list(ngramIdxTable.values())
  
  for sentTuple in relSentTupAllTopics:
    counter+=1
    if counter%printEvery==0:
      print("Processed {} sentences out of {}".format(counter, totalnumber))
      
    sentNgrams=getNgrams(rawtext=sentTuple[1], length=n, stopwords=engstop)
    
    topicRels=sentTuple[1]
    
    for topic, relevance in sentTuple[2]:
      
      ngramIdxIN=[]
      for ngram in sentNgrams:
        try:
          ngramIdxIN.append(ngramIdxTable[ngram])
        except:
          pass
      
    
      columnIdx=np.repeat(topic*4+assignlabel["Out"][relevance],len(ngramIdxTable))
      columnIdx[ngramIdxIN]=topic*4+assignlabel["In"][relevance]
      countresult[ngramIdxValue, columnIdx]+=1
    
  print("Calculating -2lambda score")
  O11cols=list(range(0,len(topicIdxTable)*4,4))
  O12cols=list(range(1,len(topicIdxTable)*4,4))
  O21cols=list(range(2,len(topicIdxTable)*4,4))
  O22cols=list(range(3,len(topicIdxTable)*4,4))
  
  O11values=countresult[:,O11cols]
  O12values=countresult[:,O12cols]
  O21values=countresult[:,O21cols]
  O22values=countresult[:,O22cols]
  
  p1=O11values/(O11values+O12values)
  p2=O21values/(O21values+O22values)
  P=(O11values+O21values)/(O11values+O12values+O21values+O22values)
  
  #change topics back 
  lambdascore=-2*(( O11values+O21values )*np.log(P)+(O12values+O22values)*np.log(1-P)-(O11values*np.log(p1)+O12values*np.log(1-p1)+O21values*np.log(p2)+O22values*np.log(1-p2)))
  
  assert lambdascore.shape==(len(ngramIdxValue),len(topicIdxTable)), "Score matrix doesn't have the right dimensions"
  
  print("Truning Score matrx to dictionary ")
  
  topicReverse={topicIdxTable[key]:key for key in topicIdxTable.keys()}
  ngramReverse={ngramIdxTable[key]:key for key in ngramIdxTable.keys()}
  
  finalresult={}
  for idx in range(lambdascore.shape[1]):
    topic=topicReverse[idx]
    topicScores=lambdascore[:,idx]
    finalresult.setdefault(topic, {})
    finalresult[topic]={ngramReverse[ngramIdx]:topicScores[ngramIdx] for ngramIdx in range(lambdascore.shape[0])}
  
  return finalresult

In [125]:
%%time
unigramScoreNP=lambdaScoresNP(relSentsAllTopicIndexed, 1, topicIdx,trainUnigramIdx, printEvery=1000)

Processed 1000 sentences out of 9752
Processed 2000 sentences out of 9752
Processed 3000 sentences out of 9752
Processed 4000 sentences out of 9752
Processed 5000 sentences out of 9752
Processed 6000 sentences out of 9752
Processed 7000 sentences out of 9752
Processed 8000 sentences out of 9752
Processed 9000 sentences out of 9752
Calculating -2lambda score
Truning Score matrx to dictionary 
CPU times: user 8min 43s, sys: 32 ms, total: 8min 43s
Wall time: 8min 43s


In [136]:
%%time
bigramScoreNP=lambdaScoresNP(relSentsAllTopicIndexed, 2, topicIdx,trainBigramIdx, printEvery=500)

Processed 500 sentences out of 9752
Processed 1000 sentences out of 9752
Processed 1500 sentences out of 9752
Processed 2000 sentences out of 9752
Processed 2500 sentences out of 9752
Processed 3000 sentences out of 9752
Processed 3500 sentences out of 9752
Processed 4000 sentences out of 9752
Processed 4500 sentences out of 9752
Processed 5000 sentences out of 9752
Processed 5500 sentences out of 9752
Processed 6000 sentences out of 9752
Processed 6500 sentences out of 9752
Processed 7000 sentences out of 9752
Processed 7500 sentences out of 9752
Processed 8000 sentences out of 9752
Processed 8500 sentences out of 9752
Processed 9000 sentences out of 9752
Processed 9500 sentences out of 9752
Calculating -2lambda score
Truning Score matrx to dictionary 
CPU times: user 1h 15min 29s, sys: 520 ms, total: 1h 15min 29s
Wall time: 1h 15min 29s


In [137]:
%%time
trigramScoreNP=lambdaScoresNP(relSentsAllTopicIndexed, 3, topicIdx,trainTrigramIdx, printEvery=2000)

Processed 2000 sentences out of 9752
Processed 4000 sentences out of 9752
Processed 6000 sentences out of 9752
Processed 8000 sentences out of 9752
Calculating -2lambda score
Truning Score matrx to dictionary 
CPU times: user 2h 50min 54s, sys: 1.97 s, total: 2h 50min 56s
Wall time: 2h 50min 55s


In [162]:
## Save the scores

In [167]:
import json
import pickle

with open("/home/nyao/Results/unigramScoresNP_170312.pk", 'wb') as f:
  pickle.dump(unigramScoreNP ,f)
  
with open("/home/nyao/Results/bigramScoresNP_170312.pk", "wb") as f:
  pickle.dump(bigramScoreNP, f)

with open("/home/nyao/Results/trigramScoresNP_170312.pk", "wb") as f:
  pickle.dump(trigramScoreNP, f)

#### STEP2 : Validation phase

In [59]:
### Take the output of 'labelRel'
### label each sentence in each document with the topic, but keep only the topic
### makes it easier for validation later on
def labelRelTopicOnly(labeledSentsWithSections):
  results={}
  for website, sectionList in labeledSentsWithSections.items():
    results[website]=[]
    for section, sentList in sectionList.items():
      for sentTuple in sentList:
        results[website].append([sentTuple[0],[topicTuple[0:2] for topicTuple in sentTuple[4]] ])
  return results
      

## Split text into features that can be used to calculate scores 
## can be used to for creating trainning or validation data

def sentWithTopic(ValTextDict, ValAnnoTation):
  sentIdx=getAllIdx(ValTextDict)
  labelSectionIdx=labelRel(ValAnnoTation, sentIdx)
  labelTopOnly=labelRelTopicOnly(labelSectionIdx)
  return labelTopOnly
  

In [82]:
### take a raw document, convert to the same output produced by 'getTopicRelevanceList'
def text2Label(rawtext, annotationMap):
  sentIdx=getAllIdx(rawtext)
  anno={key:annotationMap[key] for key in rawtext.keys()}
  labeledSents=labelRel(anno, sentIdx)
  allTop, allLabeledSents=gatherSentsTopics(labeledSents)
  relSents=getTopicRelevanceList(allLabeledSents, allTop)
  relSentTup=embedLabels(relSents)
  return relSentTup

In [83]:
%%time
valTexts={web:policyTexts[web] for web in valWebsites}
valAnno={web: annotations[web] for web in valWebsites}
validateRelSentsTuples=text2Label(valTexts, valAnno)

CPU times: user 4.37 s, sys: 4 ms, total: 4.37 s
Wall time: 4.37 s


In [85]:
validateRelSentsTuples[0][2]

[(('First Party Collection/Use', 'Choice Type'), 'Unrelated'),
 (('First Party Collection/Use', 'Collection Mode'), 'Unrelated'),
 (('First Party Collection/Use', 'Action First-Party'), 'Unrelated'),
 (('International and Specific Audiences', 'Audience Type'), 'Related'),
 (('Third Party Sharing/Collection', 'Choice Scope'), 'Unrelated'),
 (('First Party Collection/Use', 'Identifiability'), 'Unrelated'),
 (('User Choice/Control', 'User Type'), 'Related'),
 (('User Access, Edit and Deletion', 'Access Type'), 'Related'),
 (('User Choice/Control', 'Choice Type'), 'Related'),
 (('Third Party Sharing/Collection', 'Purpose'), 'Unrelated'),
 (('User Choice/Control', 'Choice Scope'), 'Related'),
 (('Third Party Sharing/Collection', 'Action Third Party'), 'Unrelated'),
 (('User Access, Edit and Deletion', 'Access Scope'), 'Related'),
 (('Other', 'Other Type'), 'Unrelated'),
 (('Policy Change', 'Notification Type'), 'Unrelated'),
 (('User Choice/Control', 'Personal Information Type'), 'Related')

In [60]:
%%time
valLabeledData=sentWithTopic(valTexts, valAnno)

CPU times: user 7.56 s, sys: 8 ms, total: 7.56 s
Wall time: 7.56 s


In [446]:
valLabeledData['everydayhealth.com'][120]

['We do not knowingly collect Personal Information from individuals under 13 years of age.',
 [('First Party Collection/Use', 'Personal Information Type'),
  ('First Party Collection/Use', 'Does/Does Not'),
  ('International and Specific Audiences', 'Audience Type')]]

In [460]:
#See which topic are in all of the texts
def countTopicsInDoc(labeledData):
  websiteTopics={}
  for website, sentList in labeledData.items():
    topicList=[]
    for sent in sentList:
      for topic in sent[1]:
        topicList.append(topic)
    websiteTopics[website]=dict(Counter(topicList))
  return websiteTopics
  

In [459]:
dict(Counter({"a":2}))

{'a': 2}

In [464]:
valSetTopics=countTopicsInDoc(valLabeledData)

## check which topics exist in all of the docs
## Drop any rows that have NaN
completetopics=pd.DataFrame.from_dict(valSetTopics).dropna(axis=0, how="any")

In [465]:
completetopics

Unnamed: 0,Unnamed: 1,austincc.edu,boardgamegeek.com,dairyqueen.com,dcccd.edu,dogbreedinfo.com,everydayhealth.com,lids.com,naturalnews.com,randomhouse.com,style.com
First Party Collection/Use,Action First-Party,27.0,14.0,17.0,3.0,1.0,59.0,50.0,1.0,102,32.0
First Party Collection/Use,Purpose,26.0,14.0,16.0,12.0,2.0,83.0,77.0,1.0,106,63.0
Other,Other Type,21.0,2.0,23.0,12.0,4.0,44.0,27.0,5.0,52,15.0
Third Party Sharing/Collection,Action Third Party,17.0,11.0,22.0,9.0,6.0,52.0,41.0,3.0,43,47.0
Third Party Sharing/Collection,Does/Does Not,15.0,9.0,4.0,9.0,5.0,50.0,25.0,3.0,21,37.0
Third Party Sharing/Collection,Personal Information Type,16.0,4.0,21.0,5.0,6.0,36.0,23.0,2.0,33,26.0
Third Party Sharing/Collection,Purpose,9.0,8.0,25.0,6.0,7.0,44.0,36.0,2.0,39,42.0
Third Party Sharing/Collection,Third Party Entity,15.0,7.0,24.0,6.0,9.0,75.0,46.0,5.0,40,49.0


In [128]:
# Make a list of topics that are in all of the documents 
valTopcis=[('First Party Collection/Use', 'Action First-Party'), 
          ('First Party Collection/Use', 'Purpose'),
          ('Third Party Sharing/Collection', 'Action Third Party'), 
          ('Third Party Sharing/Collection', 'Does/Does Not'),
          ('Third Party Sharing/Collection', 'Personal Information Type'),
          ('Third Party Sharing/Collection', 'Purpose'),
          ('Third Party Sharing/Collection', 'Third Party Entity')]

In [380]:
## take a sentence and score it 
def scoreSentence(rawSent, n, stopwords, scoreMap, scoreCutoff):
  cleanSent=re.sub("\\|", "", rawSent.lower())
  ngramList=getNgrams(cleanSent, n , stopwords)
  totalscore=0
  for ngram in ngramList:
    try:
      rawscore=scoreMap[ngram]
      totalscore+=int(rawscore>=scoreCutoff)
    except KeyError:
      pass
  return totalscore/len(ngramList)
    
## feed in score table of a specific topic
## rank all the sentence in the document, can be used on external data in practice
def rankSentencesInCorpus(rawText, cutOff,nList, topic, scoreMapList):
  scoredSentences=[]
  sentenceList=sent_tokenize(rawText)
  for sentence in sentenceList:
    cleanSent=re.sub("\\|", "", sentence.lower())
    sentScore=0
    for i in range(len(scoreMapList)):
      sentScore+=scoreSentence(sentence, nList[i], engstop, scoreMapList[i][topic], cutOff )
    scoredSentences.append([cleanSent, sentScore])
  return sorted(scoredSentences, reverse=True, key=lambda x: x[1])

## feed in the output from  'sentWithTopic', calculate the scores and rank the sentences, and also keep the labels
# 
def rankSentencesWithLabels(valSet, topic,nList,scoreCutOff,rankCutOff, *scoretableList):
  # if a sentence ranks higher than a cut off, then it is relevant
  fulltuples={}
  predLabels={}  # a smplified list of [sentence, topic label, predicted topic]
  for website, sentList in valSet.items():

    sentenceScoreList=[]
    
    for sentTuple in sentList:
      sentence=sentTuple[0]
      if len(sentence.split(" "))>4:
        sentResult=[sentTuple[0], sentTuple[1]]
        sentScore=0
        
        for i in range(len(scoretableList)):
          sentScore+=scoreSentence(sentence, nList[i], engstop, scoretableList[i][topic], scoreCutOff )
        sentResult.append([topic, sentScore])
        sentenceScoreList.append(sentResult)
        
    sentScoresRanked=sorted(sentenceScoreList, reverse=True, key=lambda x: x[2][1])
    fulltuples[website]=sentScoresRanked
    
    sentLabelPredList=[]
    
    for i, sentFeatureLabel in enumerate(sentScoresRanked):
      sentLabelPred=[sentFeatureLabel[0], int(topic in sentFeatureLabel[1]), int(i<=rankCutOff), i]
      sentLabelPredList.append(sentLabelPred)
    
    predLabels[website]=sentLabelPredList
    
  return sentScoresRanked, predLabels


In [447]:
valSentLabeled, valSentPredVectors=rankSentencesWithLabels(valLabeledData,('First Party Collection/Use', 'Does/Does Not'), [1], 3.84, 10, unigramScoreNP)

In [445]:
np.logical_and(np.array([2,3,4])==1,np.array([4,2,1])==1)

array([False, False, False], dtype=bool)

In [452]:
## calculate accuracy for all the topics
def getAccuracy(valLabelPred):
  websiteAccu={}
  for website, result in valLabelPred.items():
    labelPred=list(zip(*result))
    label=np.array(labelPred[1])
    pred=np.array(labelPred[2])
    accuracy=np.sum(label==pred)/len(label)
    precision=np.sum(np.logical_and(label==1, pred==1))/np.sum(pred==1)
    recall=np.sum(np.logical_and(label==1, pred==1))/np.sum(label==1)
    websiteAccu[website]=[accuracy,precision, recall]
  return websiteAccu

def getAccuracyAlltopic(valSet, topicList, nList,scoreCutOff,rankCutOff, *scoretableList):
  allPredTuples={}
  allLabelPredVectors={}
  allAccuracy={}
  for topic in topicList:
    allInfo,labelPred=rankSentencesWithLabels(valSet, topic,nList, scoreCutOff, rankCutOff, *scoretableList)
    allPredTuples[topic]=allInfo
    allLabelPredVectors[topic]=labelPred
    allAccuracy[topic]=getAccuracy(labelPred)
  return allPredTuples, allLabelPredVectors, allAccuracy

In [477]:
valAllInfo, valPredLabels, valAccuracy=getAccuracyAlltopic(valLabeledData, valTopcis, [1], 3.84, 15, unigramScoreNP)

In [478]:
valAccuracyTable=pd.DataFrame.from_dict(valAccuracy)

Unnamed: 0_level_0,First Party Collection/Use,First Party Collection/Use,Third Party Sharing/Collection,Third Party Sharing/Collection,Third Party Sharing/Collection,Third Party Sharing/Collection,Third Party Sharing/Collection
Unnamed: 0_level_1,Action First-Party,Purpose,Action Third Party,Does/Does Not,Personal Information Type,Purpose,Third Party Entity
austincc.edu,"[0.717948717949, 0.5, 0.727272727273]","[0.538461538462, 0.3125, 0.416666666667]","[0.692307692308, 0.375, 0.75]","[0.615384615385, 0.25, 0.571428571429]","[0.641025641026, 0.25, 0.666666666667]","[0.615384615385, 0.125, 0.666666666667]","[0.641025641026, 0.25, 0.666666666667]"
boardgamegeek.com,"[0.444444444444, 0.375, 1.0]","[0.222222222222, 0.125, 1.0]","[0.333333333333, 0.25, 1.0]","[0.388888888889, 0.3125, 1.0]","[0.222222222222, 0.125, 1.0]","[0.222222222222, 0.125, 1.0]","[0.388888888889, 0.3125, 1.0]"
dairyqueen.com,"[0.6, 0.0, 0.0]","[0.66, 0.1875, 0.428571428571]","[0.68, 0.3125, 0.5]","[0.68, 0.0625, 0.5]","[0.64, 0.25, 0.4]","[0.68, 0.4375, 0.5]","[0.7, 0.4375, 0.538461538462]"
dcccd.edu,"[0.6, 0.125, 1.0]","[0.457142857143, 0.125, 0.285714285714]","[0.657142857143, 0.25, 1.0]","[0.657142857143, 0.25, 1.0]","[0.542857142857, 0.0625, 0.5]","[0.6, 0.125, 1.0]","[0.6, 0.125, 1.0]"
dogbreedinfo.com,"[0.0909090909091, 0.0909090909091, 1.0]","[0.181818181818, 0.181818181818, 1.0]","[0.272727272727, 0.272727272727, 1.0]","[0.272727272727, 0.272727272727, 1.0]","[0.454545454545, 0.454545454545, 1.0]","[0.272727272727, 0.272727272727, 1.0]","[0.363636363636, 0.363636363636, 1.0]"
everydayhealth.com,"[0.792307692308, 0.125, 0.133333333333]","[0.746153846154, 0.1875, 0.130434782609]","[0.823076923077, 0.4375, 0.333333333333]","[0.776923076923, 0.3125, 0.217391304348]","[0.8, 0.1875, 0.1875]","[0.784615384615, 0.1875, 0.166666666667]","[0.8, 0.5625, 0.321428571429]"
lids.com,"[0.78125, 0.1875, 0.272727272727]","[0.760416666667, 0.5, 0.347826086957]","[0.791666666667, 0.1875, 0.3]","[0.802083333333, 0.1875, 0.333333333333]","[0.791666666667, 0.125, 0.25]","[0.822916666667, 0.3125, 0.454545454545]","[0.791666666667, 0.3125, 0.357142857143]"
naturalnews.com,"[0.142857142857, 0.142857142857, 1.0]","[0.142857142857, 0.142857142857, 1.0]","[0.285714285714, 0.285714285714, 1.0]","[0.285714285714, 0.285714285714, 1.0]","[0.285714285714, 0.285714285714, 1.0]","[0.142857142857, 0.142857142857, 1.0]","[0.285714285714, 0.285714285714, 1.0]"
randomhouse.com,"[0.766129032258, 0.5, 0.275862068966]","[0.741935483871, 0.5, 0.25]","[0.846774193548, 0.4375, 0.411764705882]","[0.870967741935, 0.375, 0.5]","[0.870967741935, 0.375, 0.5]","[0.83064516129, 0.375, 0.352941176471]","[0.879032258065, 0.5, 0.533333333333]"
style.com,"[0.833333333333, 0.4375, 0.636363636364]","[0.782051282051, 0.5, 0.470588235294]","[0.74358974359, 0.25, 0.333333333333]","[0.769230769231, 0.375, 0.428571428571]","[0.74358974359, 0.1875, 0.3]","[0.846153846154, 0.5, 0.666666666667]","[0.769230769231, 0.375, 0.428571428571]"


In [375]:
dqRankSent=rankSentencesInCorpus(valTexts['dairyqueen.com'], 1, [1],('Third Party Sharing/Collection', 'Personal Information Type') , [unigramScoreNP])

###  Try supervised classification model for sentence selection

In [96]:
#### Try random forest


## Generate Training Features
def generateXY(relSentsWithLabels, topic):
  trainTexts=[]
  trainLabels=[]
  for sent in relSentsWithLabels:
    trainTexts.append(sent[1])
    topicTuple=[x for x in sent[2] if x[0] == topic][0]
    trainLabels.append(int(topicTuple[1]=="Related"))
  return trainTexts, trainLabels

train1, labels1=generateXY(relSentTuples,('Third Party Sharing/Collection', 'Personal Information Type'))
valtext, valLabels=generateXY(validateRelSentsTuples, ('Third Party Sharing/Collection', 'Personal Information Type'))

In [108]:
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import TfidfTransformer

vectorizer=TfidfVectorizer(max_df=3, stop_words="english")
vectorizer.fit(train1)
trainVec=vectorizer.transform(train1)

In [122]:
valVec=vectorizer.transform(valtext)
model = RandomForestClassifier(n_estimators=500)
model.fit(trainVec, labels1)
pred=model.predict(valVec)
f1score=f1_score(valLabels, pred, average="weighted")

In [132]:
# Make a list of topics that are in all of the documents 
valTopics=[('First Party Collection/Use', 'Action First-Party'), 
          ('First Party Collection/Use', 'Purpose'),
          ('Third Party Sharing/Collection', 'Action Third Party'), 
          ('Third Party Sharing/Collection', 'Does/Does Not'),
          ('Third Party Sharing/Collection', 'Personal Information Type'),
          ('Third Party Sharing/Collection', 'Purpose'),
          ('Third Party Sharing/Collection', 'Third Party Entity')]

In [163]:
def modelAndValidate(trainSentTuples, valSentTuples,topiclist, vectorizer, classifier):
  results={}

  for topic in topiclist:
    ## fit the model
    traintexts, trainlabels=generateXY(trainSentTuples, topic)
    vectorizer.fit(traintexts)
    trainCounts=vectorizer.transform(traintexts)
    valTexts, valLabels=generateXY(valSentTuples, topic)
    valCounts=vectorizer.transform(valTexts)
    classifier.fit(trainCounts, trainlabels)
    predictions=classifier.predict(valCounts)
    f1score=f1_score(valLabels, predictions, average="weighted")
    results[topic]={"model":classifier, "validation_labels":valLabels, "predictions":predictions, "f1_score":f1score}
    print("Finished modeling for topic {}".format(topic))
  return results, vectorizer

In [171]:
%%time
vectorizerFinal=TfidfVectorizer(max_df=0.2, stop_words="english")
classifierFinal=RandomForestClassifier(n_estimators=500)
extractionModels, fittedVectorizer =modelAndValidate(relSentTuples, validateRelSentsTuples, valTopics, vectorizerFinal, classifierFinal)

Finished modeling for topic ('First Party Collection/Use', 'Action First-Party')
Finished modeling for topic ('First Party Collection/Use', 'Purpose')
Finished modeling for topic ('Third Party Sharing/Collection', 'Action Third Party')
Finished modeling for topic ('Third Party Sharing/Collection', 'Does/Does Not')
Finished modeling for topic ('Third Party Sharing/Collection', 'Personal Information Type')
Finished modeling for topic ('Third Party Sharing/Collection', 'Purpose')
Finished modeling for topic ('Third Party Sharing/Collection', 'Third Party Entity')
CPU times: user 6min 4s, sys: 180 ms, total: 6min 5s
Wall time: 6min 5s


In [172]:
accuracyTables=pd.DataFrame.from_dict(extractionModels, orient="index")
display(accuracyTables.loc[:, ["f1_score", "model"]])


Unnamed: 0,Unnamed: 1,f1_score,model
First Party Collection/Use,Action First-Party,0.861534,"(DecisionTreeClassifier(class_weight=None, cri..."
First Party Collection/Use,Purpose,0.809102,"(DecisionTreeClassifier(class_weight=None, cri..."
Third Party Sharing/Collection,Action Third Party,0.878209,"(DecisionTreeClassifier(class_weight=None, cri..."
Third Party Sharing/Collection,Does/Does Not,0.852496,"(DecisionTreeClassifier(class_weight=None, cri..."
Third Party Sharing/Collection,Personal Information Type,0.874944,"(DecisionTreeClassifier(class_weight=None, cri..."
Third Party Sharing/Collection,Purpose,0.879136,"(DecisionTreeClassifier(class_weight=None, cri..."
Third Party Sharing/Collection,Third Party Entity,0.870839,"(DecisionTreeClassifier(class_weight=None, cri..."


In [166]:
extractionModelsOnly={topic:value["model"] for topic, value in extractionModels.items()}

In [173]:
with open("/share/pub/Results/extractionModels_randomForest_upgrade.pk", "wb") as f:
  pickle.dump([extractionModelsOnly, valTopics, fittedVectorizer], f)

In [170]:
alltopics

{('Data Retention', 'Personal Information Type'),
 ('Data Retention', 'Retention Period'),
 ('Data Retention', 'Retention Purpose'),
 ('Data Security', 'Security Measure'),
 ('Do Not Track', 'Do Not Track policy'),
 ('First Party Collection/Use', 'Action First-Party'),
 ('First Party Collection/Use', 'Choice Scope'),
 ('First Party Collection/Use', 'Choice Type'),
 ('First Party Collection/Use', 'Collection Mode'),
 ('First Party Collection/Use', 'Does/Does Not'),
 ('First Party Collection/Use', 'Identifiability'),
 ('First Party Collection/Use', 'Personal Information Type'),
 ('First Party Collection/Use', 'Purpose'),
 ('First Party Collection/Use', 'User Type'),
 ('International and Specific Audiences', 'Audience Type'),
 ('Other', 'Other Type'),
 ('Policy Change', 'Change Type'),
 ('Policy Change', 'Notification Type'),
 ('Policy Change', 'User Choice'),
 ('Third Party Sharing/Collection', 'Action Third Party'),
 ('Third Party Sharing/Collection', 'Choice Scope'),
 ('Third Party Sha

#### Mutual information approach (not finish)

In [55]:
x=np.random.randint(0,2,size=(1000,200))
y=np.random.randint(0,2,size=(1000,200))

print(x)
print(y)

[[0 0 0 ..., 0 1 1]
 [0 0 1 ..., 0 1 1]
 [0 1 0 ..., 1 1 1]
 ..., 
 [1 0 0 ..., 0 1 0]
 [1 1 0 ..., 0 0 1]
 [1 0 1 ..., 0 0 1]]
[[0 1 0 ..., 1 0 0]
 [1 0 1 ..., 1 1 1]
 [0 0 0 ..., 1 1 1]
 ..., 
 [1 0 1 ..., 1 0 0]
 [1 0 0 ..., 0 1 0]
 [0 1 1 ..., 0 0 0]]


In [53]:
# Given matrices of 1s and 0s, row is number of observations
# column is the number of variables , implement a function that calculates MI

def MIscorePart(X,Y,xlabel, ylabel):
  assert X.shape==Y.shape, "Shape of the two input matrices must be the same"
  nrow=X.shape[0]
  ncol=X.shape[1]
  rightX=X==xlabel
  rightY=Y==ylabel
  margX=(np.sum(rightX, axis=0)+10**-10)/nrow
  margY=(np.sum(rightY, axis=0)+10**-10)/nrow
  
  inter=(np.sum(np.logical_and(rightX, rightY), axis=0)+10**-10)/nrow
  #print(margX, margY, inter)
  #print(inter*np.log(margX*margY))
  result=inter*np.log(inter/(margX*margY))
  return result
  


def MIscore(X,Y):
  assert X.shape==Y.shape, "Shape of the two input matrices must be the same"
  nrow=X.shape[0]
  ncol=X.shape[1]
  part1=MIscorePart(X,Y,1,1) #x=1, y=1 
  part2=MIscorePart(X,Y,1,0)
  part3=MIscorePart(X,Y,0,1)
  part4=MIscorePart(X,Y,0,0)
  
  score=part1+part2+part3+part4
  return score

In [57]:
np.seterr(divide='warn', invalid='warn')
testresult=MIscore(x,y)

In [None]:
from sklearn.metrics import mutual_info_score
def compareMI(X,Y):
  scikitMI=mutual_info_score(X,Y)
  myMI=MIscore(X,Y)
  return np.equal(scikitMI, myMI)

In [None]:
testx=np.random.randint(0,2,10000)
testy=np.random.randint(0,2,10000)

result=compareMI(testx, testy)
print(result)

In [None]:
#index All the Sentences, 
#for each sentences

def getScoreMI(relSentsAllTopic, n,topicIdxTable, ngramIdxTable, printEvery=50000):
  relSentsAllTopic