In [11]:
## import labeled data from pickled object
import pickle
import random

In [3]:
with open("/share/pub/Results/LabeledFullSentences.pk", 'rb') as f:
  labeledSentences=pickle.load(f)

In [None]:
"""
The structure of the labeledSentences is {website:{section:[sentence, section, startIdx, endIdx, (topic, value)]}}
The topic is made up of two parts : category ,
"""

In [16]:
labeledSentences['playstation.com'][2]

[["This Privacy Statement and the certification seal located to your right confirms that SCEA is a valid licensee and participating member in the Entertainment Software Rating Board's Privacy Online Program: ESRB Privacy Online.",
  2,
  0,
  225,
  [('Other', 'Other Type', 'Practice not covered')]],
 ['To protect your privacy to the maximum extent possible, we have undertaken this privacy initiative and our websites have been reviewed and certified by ESRB Privacy Online to meet established online information collection and use practices.',
  2,
  226,
  466,
  [('Other', 'Other Type', 'Practice not covered'),
   ('Data Security', 'Security Measure', 'Privacy/Security program')]],
 ['As part of the privacy program, we are subject to frequent audits of our sites and other enforcement and accountability mechanisms administered independently by ESRB.',
  2,
  467,
  633,
  [('Other', 'Other Type', 'Practice not covered'),
   ('Data Security', 'Security Measure', 'Privacy/Security program

In [9]:
websites=list(labeledSentences.keys())

In [30]:
## choose the websites for training and validating 
seed=1324
random.seed(seed)
trainWebsites=random.sample(websites, 102)
valWebsites=[thing for thing in websites if thing not in trainWebsites]

In [17]:
## A list of all topics 

topicList=[('Data Retention', 'Personal Information Type'),
 ('Data Retention', 'Retention Period'),
 ('Data Retention', 'Retention Purpose'),
 ('Data Security', 'Security Measure'),
 ('Do Not Track', 'Do Not Track policy'),
 ('First Party Collection/Use', 'Action First-Party'),
 ('First Party Collection/Use', 'Choice Scope'),
 ('First Party Collection/Use', 'Choice Type'),
 ('First Party Collection/Use', 'Collection Mode'),
 ('First Party Collection/Use', 'Does/Does Not'),
 ('First Party Collection/Use', 'Identifiability'),
 ('First Party Collection/Use', 'Personal Information Type'),
 ('First Party Collection/Use', 'Purpose'),
 ('First Party Collection/Use', 'User Type'),
 ('International and Specific Audiences', 'Audience Type'),
 ('Other', 'Other Type'),
 ('Policy Change', 'Change Type'),
 ('Policy Change', 'Notification Type'),
 ('Policy Change', 'User Choice'),
 ('Third Party Sharing/Collection', 'Action Third Party'),
 ('Third Party Sharing/Collection', 'Choice Scope'),
 ('Third Party Sharing/Collection', 'Choice Type'),
 ('Third Party Sharing/Collection', 'Does/Does Not'),
 ('Third Party Sharing/Collection', 'Identifiability'),
 ('Third Party Sharing/Collection', 'Personal Information Type'),
 ('Third Party Sharing/Collection', 'Purpose'),
 ('Third Party Sharing/Collection', 'Third Party Entity'),
 ('Third Party Sharing/Collection', 'User Type'),
 ('User Access, Edit and Deletion', 'Access Scope'),
 ('User Access, Edit and Deletion', 'Access Type'),
 ('User Access, Edit and Deletion', 'User Type'),
 ('User Choice/Control', 'Choice Scope'),
 ('User Choice/Control', 'Choice Type'),
 ('User Choice/Control', 'Personal Information Type'),
 ('User Choice/Control', 'Purpose'),
 ('User Choice/Control', 'User Type')]

In [13]:
## We can take the data and put them in a better format 
## For example, we can just keep the website information, the sentence, the topic and the value

In [20]:
def extractFeatureLabels(labeledSents):
  results={}
  for website, data in labeledSents.items():
    websiteresult=[]
    for section, infoList in data.items():
      for sentence in infoList:
        websiteresult.append([sentence[0], sentence[4]])
    results[website]=websiteresult
  return results

simpleSentLabels=extractFeatureLabels(labeledSentences)

In [28]:
simpleSentLabels["playstation.com"][4:6]  # each sentence is grouped with the topic they are related to and the label

[["This Privacy Statement and the certification seal located to your right confirms that SCEA is a valid licensee and participating member in the Entertainment Software Rating Board's Privacy Online Program: ESRB Privacy Online.",
  [('Other', 'Other Type', 'Practice not covered')]],
 ['To protect your privacy to the maximum extent possible, we have undertaken this privacy initiative and our websites have been reviewed and certified by ESRB Privacy Online to meet established online information collection and use practices.',
  [('Other', 'Other Type', 'Practice not covered'),
   ('Data Security', 'Security Measure', 'Privacy/Security program')]]]

In [31]:
trainData={website:simpleSentLabels[website] for website in trainWebsites}
valData={website:simpleSentLabels[website] for website in valWebsites}

In [42]:
## Make a way to organize the training texts and labels
## we need to make a model for each topic
def generateXY(inputdata, topic):
  results=[]
  for website, data in inputdata.items():
    for item in data:    
      if topic in [x[0:2] for x in item[1]]:   # if the topic is in a (sentence, topic+label)  tuple
        label=[x[2] for x in item[1] if x[0:2]==topic][0]
        results.append([item[0], label])
  text, labels=zip(*results)
  return text, labels

In [43]:
topic=('Third Party Sharing/Collection', 'Personal Information Type')
trainText, trainLabels=generateXY(trainData, topic)

In [44]:
print(trainText[:5])
print(trainLabels[:5])

("To do this, we use 'cookies' or alphanumeric identifiers that we transfer to your computer's hard drive through your web browser to enable our systems to recognize your browser and to provide features such as storage of items in your Shopping Cart between visits.", 'This provider may collect anonymous information about your visits to the Sites and may also use information about your visits to this and other Web sites to refine our services.', 'We may also collect anonymous information about your interactions with us through the use of pixel tags (also called "web beacons" or "clear gifs"), which are tiny graphic images on the Sites and in our emails.', 'Does Hat World share personal information about me-   By using the Sites, you agree that we may share your personal information in the following ways:     When you provide personal information to one of our retail stores or affiliates, we may share that information with our other retail stores or affiliates in the United States or Can

In [45]:
valText, valLabels=generateXY(valData, topic)

In [47]:
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import TfidfTransformer

# make features using tfidf vectorizer
vectorizer=TfidfVectorizer(max_df=0.2)  

# max_df = maximum document frequency , this makes it ignores the terms 
# that have higher document frequency than the specified threshold, in this case it's 20% of the documents
# so if a term appears in 20% of all the sentences, it will be ignored
# http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
# There are also several other parameters we can play with

vectorizer.fit(trainText)
trainVec=vectorizer.transform(trainText )  # this is the input to the model

model=RandomForestClassifier(n_estimators=1000)  # instantiate a classifier object

model.fit(trainVec, trainLabels)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=1000, n_jobs=1, oob_score=False,
            random_state=None, verbose=0, warm_start=False)

In [51]:
## Get the prediction of the validation data
valVec=vectorizer.transform(valText)
pred=model.predict(valVec)
print(pred[:10])

['Generic personal information' 'Generic personal information'
 'Cookies and tracking elements' 'Cookies and tracking elements'
 'Generic personal information' 'Generic personal information'
 'Generic personal information' 'Generic personal information'
 'Generic personal information' 'Generic personal information']


In [54]:
#http://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_recall_fscore_support.html 
#we can show precision and recall 


from sklearn.metrics import precision_recall_fscore_support

prec_recall=precision_recall_fscore_support(valLabels, pred, average="macro")


(0.44297507993749757, 0.37142857142857133, 0.36475513939559961, None)


  'precision', 'predicted', average, warn_for)


In [56]:
# or we can show f1_score
f1score=f1_score(valLabels, pred, average="macro")

  'precision', 'predicted', average, warn_for)


In [57]:
f1score

0.36475513939559961