In [11]:
import pickle
import random
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import TfidfTransformer
import gzip # for compressing Pickeled data
import xml.etree.ElementTree as ET
from textblob import TextBlob
from glob import glob
import os.path as path

### Read vectorizers and classifiers from disk

In [2]:
# Load the per-category classifiers (a mapping indexed by category name)
# from a gzip-compressed pickle.
# NOTE(review): unpickling can execute arbitrary code -- only load files
# that were produced by a trusted source.
with gzip.open('./Data/classifiers_rforest.pk.gz', mode='rb') as model_file:
    models = pickle.load(model_file)

In [3]:
# Load the per-category vectorizers (indexed by the same category names as
# the classifiers) from a gzip-compressed pickle.
# NOTE(review): unpickling can execute arbitrary code -- only load files
# that were produced by a trusted source.
with gzip.open('./Data/vectorizers.pk.gz', mode='rb') as vectorizer_file:
    vectorizers = pickle.load(vectorizer_file)

### Define functions to read privacy policies and classify their sentences
The corpus is the ACL COLING 2014 dataset, available from https://www.usableprivacy.org/static/data/acl-coling-2014-corpus.zip

In [7]:
def getSentencesFromXML(filename):
    """Parse a privacy-policy XML file and split its text into sentences.

    The ``website_url`` attribute is read from the root element. The text of
    every grandchild of the root (each child of each top-level element) is
    then split into sentences with TextBlob.

    Args:
        filename: Source passed directly to ``ET.parse``.

    Returns:
        A ``(website, sentences)`` tuple: the root's ``website_url`` value and
        a list of sentence strings in document order.

    Raises:
        KeyError: If the root element has no ``website_url`` attribute.
    """
    root = ET.parse(filename).getroot()

    # The site this policy belongs to is stored on the root element.
    website = root.attrib['website_url']

    sentences = []
    for section in root:
        for subsection in section:
            try:
                blob = TextBlob(subsection.text)
            except TypeError:
                # TextBlob rejected this element's text (presumably because
                # it is None, i.e. the element has no text) -- skip it.
                continue
            sentences.extend(str(sentence) for sentence in blob.sentences)

    return website, sentences

def getSentencesFromTXT(filename):
    """Read a plain-text file and split its contents into sentences.

    The file is opened in text mode with the platform's default encoding and
    its full contents are tokenised into sentences with TextBlob.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list of sentence strings in the order they appear in the file.
    """
    with open(filename, 'r') as policy_file:
        policy_text = policy_file.read()

    blob = TextBlob(policy_text)
    return [str(sentence) for sentence in blob.sentences]


def classifySentences(vectorizers, models, sentences):
    """Group sentences by every category whose classifier flags them.

    For each category in ``models``, the sentences are transformed with that
    category's vectorizer and passed to that category's classifier. Every
    sentence whose prediction equals ``1`` is listed under the category. A
    sentence can appear under several categories, or under none.

    Args:
        vectorizers: Mapping of category -> object with ``transform(sentences)``.
        models: Mapping of category -> object with ``predict(X)``.
        sentences: Sequence of sentence strings to classify.

    Returns:
        A dict mapping each category in ``models`` to the list of sentences
        predicted as ``1`` for it, in their original order. If ``sentences``
        is empty, every category maps to an empty list.

    Raises:
        KeyError: If a category in ``models`` has no entry in ``vectorizers``.
    """
    # scikit-learn estimators reject zero-sample input with a ValueError, so
    # a document with no extractable text must not reach transform/predict.
    # Returning empty lists keeps a corpus-wide loop running.
    if len(sentences) == 0:
        return {category: [] for category in models}

    classifiedSentences = {}

    for category in models:
        features = vectorizers[category].transform(sentences)
        predictions = models[category].predict(features)

        # predictions[i] is the label for sentences[i]; keep the positives.
        classifiedSentences[category] = [
            sentence
            for sentence, label in zip(sentences, predictions)
            if label == 1
        ]

    return classifiedSentences


### Loop through all files and classify each sentence

In [14]:
%%time

# Input selection: the commented-out pattern picks up every XML file in the
# corpus directory; the active one targets a single plain-text policy.
#filenames = glob('./Data/corpus/*.xml')
filenames = glob('./Data/unroll.me')

# website -> {category -> [sentences predicted as that category]}
corpus_sentences = {}

for filename in filenames:
    if filename.endswith(".xml"):
        website, sentences = getSentencesFromXML(filename)
    else:
        # Anything without an .xml suffix is read as plain text and keyed
        # by its file name.
        website = path.basename(filename)
        sentences = getSentencesFromTXT(filename)
    corpus_sentences[website] = classifySentences(vectorizers, models, sentences)

print("{} files processed\n".format(len(corpus_sentences)))

1 files processed

CPU times: user 742 ms, sys: 32.5 ms, total: 775 ms
Wall time: 774 ms


### Print out an example of how to access a sentence

In [17]:
# Index path: website key -> category label -> position in that category's
# list of matching sentences.
print (corpus_sentences['unroll.me']['First Party Collection/Use'][5])

We collect such commercial transactional messages so that we can better understand the behavior of the senders of such messages, and better understand our customer behavior and improve our products, services, and advertising.


### Write corpus_sentences to disk

In [18]:
# Persist the per-website classification results as an uncompressed pickle.
with open('./Data/unrollme.pk', mode='wb') as output_file:
    pickle.dump(corpus_sentences, output_file)