In [1]:
import pickle
import random
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import TfidfTransformer
import gzip # for compressing Pickeled data
import xml.etree.ElementTree as ET
from textblob import TextBlob

### Read vectorizers and classifiers from disk

In [2]:
# Load the trained classifiers from a gzip-compressed pickle.
# `models` is later iterated and indexed by category name, one classifier per category.
# NOTE: unpickling can execute arbitrary code, so only load archives you produced or trust.
with gzip.open('./Data/classifiers_rforest.pk.gz', mode='rb') as models_file:
    models = pickle.load(models_file)

In [3]:
# Load the fitted vectorizers from a gzip-compressed pickle.
# `vectorizers` is later indexed by category name to turn sentences into model features.
# NOTE: unpickling can execute arbitrary code, so only load archives you produced or trust.
with gzip.open('./Data/vectorizers.pk.gz', mode='rb') as vectorizers_file:
    vectorizers = pickle.load(vectorizers_file)

### Read a privacy policy from the corpus
The policy is taken from the ACL COLING 2014 corpus, available at https://www.usableprivacy.org/static/data/acl-coling-2014-corpus.zip

In [44]:
# Parse a single privacy policy from the corpus.
tree = ET.parse('./Data/corpus/www_nytimes.xml')

# Build a flat list of sentences: visit every element two levels below the
# root and let TextBlob split that element's text into sentences.
sentences = []
for section in tree.getroot():
    for subsection in section:
        try:
            blob = TextBlob(subsection.text)
        except TypeError:
            # TextBlob rejected the value (presumably an element whose text is
            # None); skip that element and carry on.
            continue
        else:
            sentences.extend([str(s) for s in blob.sentences])

# Preview five sentences (indices 1 through 5), each followed by a separator line.
for sentence in sentences[1:6]:
    print(sentence, "---", sep="\n")

For the purposes of this Privacy Policy, unless otherwise noted, all references to "The New York Times" include NYTimes.com and The New York Times newspaper.
---
The New York Times Replica Edition, which is maintained by NewspaperDirect, maintains its own Privacy Policy.
---
The New York Times advertising portal, for advertisers of The Times, also maintains a separate Privacy Policy.
---
The NYT Services may contain links to other Web sites for your convenience and information.
---
We are not responsible for the privacy practices or the content of those sites.
---


### Classify sentences into categories

In [32]:
%%time

# Run every category's classifier over all sentences.
# Each category has its own vectorizer, so the sentences are re-vectorized
# once per category before being passed to that category's model.
predictions = {}

for category, model in models.items():
    features = vectorizers[category].transform(sentences)
    # One predicted label per sentence, in the same order as `sentences`.
    predictions[category] = model.predict(features)


CPU times: user 1.29 s, sys: 39 ms, total: 1.33 s
Wall time: 1.33 s


### Print a few example sentences from each category

In [33]:
# Show a few example sentences for every category.
#
# For each category this prints a banner with the category name, then up to
# MAX_EXAMPLES sentences whose predicted label is 1, then a blank line.
# The "---" separator is written only *between* examples, so the output looks
# the same whether a category has fewer than MAX_EXAMPLES matches or not
# (previously a dangling "---" followed the last sentence in the former case).
MAX_EXAMPLES = 3  # maximum number of sentences shown per category

for category in models:
    # Frame the category name with a row of '#' of matching width.
    banner = "#" * len(category)
    print(banner)
    print(category)
    print(banner)

    # Gather the first MAX_EXAMPLES sentences labelled 1 for this category;
    # stop scanning as soon as enough have been found.
    examples = []
    for i, predict in enumerate(predictions[category]):
        if predict == 1:
            examples.append(sentences[i])
            if len(examples) >= MAX_EXAMPLES:
                break

    # Nothing is printed for a category without any matching sentence.
    if examples:
        print("\n---\n".join(examples))

    print()
        

##############
Data Retention
##############
This information will only be shared with third parties who perform tasks required to complete the purchase transaction.
---
The e-mail addresses that you supply to this service are saved for your convenience for future articles you may wish to e-mail; these addresses are not used for any other purpose, and will not be shared with any third parties.
---

#############
Data Security
#############
Compliance with legal process
Children's Guidelines
Your California Privacy Rights
Changes to this Privacy Policy

TRUSTe: The New York Times has been awarded TRUSTe's Privacy Seal signifying that this privacy policy and our practices have been reviewed by TRUSTe for compliance with TRUSTe's program requirements including transparency, accountability and choice regarding the collection and use of your personal information.
---
TRUSTe's mission, as an independent third party, is to accelerate online trust among consumers and organizations globally thr