In [1]:
import pickle
import random
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import TfidfTransformer
import gzip # for compressing Pickeled data
import xml.etree.ElementTree as ET
from textblob import TextBlob
from glob import glob

### Read vectorizers and classifiers from disk

In [2]:
# Load the pickled classifiers; classifySentences indexes this object by category name.
# NOTE: unpickling can execute arbitrary code -- only load archives from a trusted source.
with gzip.open('./Data/classifiers_rforest.pk.gz', 'rb') as model_file:
    models = pickle.load(model_file)

In [3]:
# Load the pickled vectorizers; classifySentences indexes this object by category name.
# NOTE: unpickling can execute arbitrary code -- only load archives from a trusted source.
with gzip.open('./Data/vectorizers.pk.gz', 'rb') as vectorizer_file:
    vectorizers = pickle.load(vectorizer_file)

### Define functions to read privacy policies from the corpus and classify their sentences
Uses the ACL/COLING 2014 privacy-policy corpus from https://www.usableprivacy.org/static/data/acl-coling-2014-corpus.zip

In [4]:
def getSentencesFromXML(filename):
    """Parse one policy XML file and split its text into sentences.

    Parameters
    ----------
    filename
        Anything accepted by ``ET.parse`` (a path or an open file object).

    Returns
    -------
    tuple
        ``(website, sentences)`` where ``website`` is the root element's
        ``website_url`` attribute and ``sentences`` is a list of ``str``,
        one per TextBlob sentence, in document order.

    Raises
    ------
    KeyError
        If the root element has no ``website_url`` attribute.
    """
    root = ET.parse(filename).getroot()

    # The site identifier lives on the root element.
    website = root.attrib['website_url']

    sentences = []
    # Only the text of second-level elements (children of the root's children)
    # is read; deeper nesting is not visited.
    for section in root:
        for subsection in section:
            try:
                blob = TextBlob(subsection.text)
            except TypeError:
                # TextBlob rejected the value (presumably ``text`` is None for
                # an element without text) -- skip this element.
                continue
            sentences.extend(str(sentence) for sentence in blob.sentences)

    return website, sentences


def classifySentences(vectorizers, models, sentences):
    """Group sentences under every category whose model labels them ``1``.

    Parameters
    ----------
    vectorizers
        Mapping of category name -> object exposing ``transform(sentences)``.
    models
        Mapping of category name -> object exposing ``predict(X)``. Its keys
        determine the categories present in the result.
    sentences
        Sequence of sentence strings to classify.

    Returns
    -------
    dict
        Category name -> list of the sentences whose prediction equals ``1``
        for that category, in input order. A sentence may appear under
        several categories or under none.
    """
    # A policy can yield no sentences at all. scikit-learn's input validation
    # rejects zero-sample input, so short-circuit instead of letting one empty
    # document abort a long corpus run.
    if len(sentences) == 0:
        return {category: [] for category in models}

    classifiedSentences = {}

    for category in models:
        # Each category has its own vectorizer/model pair.
        X = vectorizers[category].transform(sentences)
        predictions = models[category].predict(X)

        classifiedSentences[category] = [
            sentence
            for sentence, predict in zip(sentences, predictions)
            if predict == 1
        ]

    return classifiedSentences


### Loop through all files and classify each sentence

In [5]:
%%time

# Classify every policy file in the corpus directory.
filenames = glob('./Data/corpus/*.xml')

# website_url -> {category -> [sentences]}; a later file with the same
# website_url replaces the earlier entry.
corpus_sentences = {}

for filename in filenames:
    website, sentences = getSentencesFromXML(filename)
    classified = classifySentences(vectorizers, models, sentences)
    corpus_sentences[website] = classified

# The reported number is the count of distinct website keys, which is lower
# than len(filenames) if two files share a website_url.
processed_count = len(corpus_sentences)
print("{} files processed\n".format(processed_count))

998 files processed

CPU times: user 13min 8s, sys: 34.8 s, total: 13min 43s
Wall time: 13min 43s


### Print out an example of how to access a sentence

In [6]:
# Access pattern: corpus_sentences[website][category][index].
ask_policy = corpus_sentences['ask.com']
print(ask_policy['Data Security'][0])

We take the security of your personal information seriously and use appropriate technical and organizational measures to protect your personal information against unauthorized or unlawful processing and against accidental loss, destruction or damage.


### Write corpus_sentences to disk

In [7]:
# Persist the classified sentences as a plain (non-gzipped) pickle.
with open('./Data/corpus_sentences.pk', 'wb') as output_file:
    pickle.dump(corpus_sentences, output_file)