In [1]:
import pickle
import random
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import TfidfTransformer
import gzip # for reading gzip-compressed pickled data
import xml.etree.ElementTree as ET
from textblob import TextBlob


### Read vectorizers and classifiers from disk

In [2]:
# Load the classifiers (indexed by category name further down) from a
# gzip-compressed pickle file.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load files from a trusted source.
with gzip.open(filename='./Data/classifiers_rforest.pk.gz', mode='rb') as f:
    models = pickle.load(f)


In [3]:
# Load the fitted vectorizers (indexed by category name further down) from a
# gzip-compressed pickle file.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load files from a trusted source.
with gzip.open(filename='./Data/vectorizers.pk.gz', mode='rb') as f:
    vectorizers = pickle.load(f)

### Read a privacy policy from the corpus
The policy is taken from the ACL COLING 2014 privacy policy corpus, available at https://www.usableprivacy.org/static/data/acl-coling-2014-corpus.zip

In [10]:
tree = ET.parse('./Data/corpus/www_nytimes.xml')

# Collect every sentence of the privacy policy as a plain string.
sentences = []
for section in tree.getroot():
    for subsection in section:
        # Elements without text content have `.text is None`, which TextBlob
        # cannot process. Skip them explicitly instead of hiding every
        # possible error (including KeyboardInterrupt) behind a bare `except:`.
        if subsection.text is None:
            continue
        blob = TextBlob(subsection.text)
        # TextBlob yields Sentence objects; keep them as plain strings.
        sentences.extend(str(s) for s in blob.sentences)


# Preview a few of the extracted sentences (indices 1 through 5).
for sentence in sentences[1:6]:
    print(sentence)
    print("-"*5)


For the purposes of this Privacy Policy, unless otherwise noted, all references to "The New York Times" include NYTimes.com and The New York Times newspaper.
-----
The New York Times Replica Edition, which is maintained by NewspaperDirect, maintains its own Privacy Policy.
-----
The New York Times advertising portal, for advertisers of The Times, also maintains a separate Privacy Policy.
-----
The NYT Services may contain links to other Web sites for your convenience and information.
-----
We are not responsible for the privacy practices or the content of those sites.
-----


### Apply classifiers to new document

In [11]:
# Run each category's classifier over the policy sentences.
# The predictions are collected per category; previously `predictions` was
# overwritten on every iteration, so only the last category's result survived.
predictions_by_category = {}
for category in models:
    vectorizer = vectorizers[category]
    model = models[category]

    # `n_features_` was deprecated in scikit-learn 1.0 and removed in 1.2 in
    # favour of `n_features_in_`; prefer the new attribute and fall back to the
    # old one so the cell works on either side of that change.
    n_model_features = getattr(model, "n_features_in_", None)
    if n_model_features is None:
        n_model_features = model.n_features_

    # Sanity check: the vectorizer's vocabulary size should equal the number
    # of features the model reports.
    print("vectorizer: {}, model: {}, category: {}".format(len(vectorizer.vocabulary_), n_model_features, category))

    X = vectorizer.transform(sentences)
    # `predictions` is still assigned (last category wins) for any later code
    # that relies on it; the per-category results live in the dict.
    predictions = model.predict(X)
    predictions_by_category[category] = predictions
        

vectorizer: 53636, model: 53636, category: Data Retention
vectorizer: 52872, model: 52872, category: Data Security
vectorizer: 53066, model: 53066, category: Do Not Track
vectorizer: 53195, model: 53195, category: First Party Collection/Use
vectorizer: 53392, model: 53392, category: International and Specific Audiences
vectorizer: 53738, model: 53738, category: Policy Change
vectorizer: 53413, model: 53413, category: Third Party Sharing/Collection
vectorizer: 53267, model: 53267, category: User Access, Edit and Deletion
vectorizer: 52942, model: 52942, category: User Choice/Control
