# Extracting Data Collection Practices from Privacy Labels (Demo)

Once we collect the URL of an app's privacy policy from the App Store, we can visit this URL and parse its text contents.

In this demo, we provide code to visit example policies and clean the response text. We then independently classify text segments (or paragraphs) and interpret the classifier outputs into privacy labels.

💡 This demo needs to be run on GPUs. To do so, navigate to the notebook's menu at the top, `Runtime > Change runtime type` and select `T4 GPU`.

In [1]:
!pip install readabilipy langdetect beautifulsoup4



In [2]:
import os
import pandas as pd
import requests
from readabilipy import simple_json_from_html_string
import random
import string
from langdetect import detect
from bs4 import BeautifulSoup
import torch
from transformers import AutoTokenizer

# 1. Crawl Privacy Policy

### Functions to Clean Extracted Text

In [3]:
def filter_out_headings(policy_text, html_content):
    def getTextFromTag(html_string, tag):
        header_lines = []
        soup = BeautifulSoup(html_string, 'html.parser')
        for element in soup.find_all(tag):
            header_lines.append(element.text)
        return header_lines
    policy_headings_text = getTextFromTag(html_content, ['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
    policy_text_filtered_headers = [x for x in policy_text if x not in policy_headings_text]
    return policy_text_filtered_headers

def merge_lists(policy_text):
    policy_text_filtered_lists = []
    for line_index in range(len(policy_text)):
        if policy_text[line_index][-1] == ',':
            whole_segment = policy_text[line_index].split('*')
            avg_len = 0
            for list_element in whole_segment:
                avg_len += len(list_element.split())
            avg_len = avg_len / len(whole_segment)
            if (avg_len >= 20):
                for list_element in whole_segment:
                    policy_text_filtered_lists.append(list_element.strip())
            else:
                if (len(policy_text_filtered_lists) == 0):
                    policy_text_filtered_lists = [policy_text[line_index]]
                else:
                    policy_text_filtered_lists[-1] += policy_text[line_index]
        else:
            policy_text_filtered_lists.append(policy_text[line_index])
    return policy_text_filtered_lists

def remove_short_sentences(policy_text):
    policy_text_filtered_lists = []
    for line_index in range(len(policy_text)):
        num_words = len(policy_text[line_index].split(' '))
        if (num_words >= 20):
            policy_text_filtered_lists.append(policy_text[line_index].strip())
    return policy_text_filtered_lists

def find_and_remove_large_string(strings):
    def preprocess_string(s):
        # Convert to lowercase and remove punctuation
        return s.lower().translate(str.maketrans('', '', string.punctuation))

    def is_substring_found(substring, large_string):
        # Check if 90% of the substring is in the large string
        substring_length = len(substring)
        match_length = int(substring_length * 0.9)

        for i in range(len(large_string) - match_length + 1):
            if substring[:match_length] in large_string[i:i+match_length]:
                return True
        return False

    def contains_all_substrings(large_string, substrings):
        count = 0
        for substring in substrings:
            if is_substring_found(substring, large_string):
                count += 1
        return count / len(substrings) >= 0.50

    preprocessed_strings = [preprocess_string(s) for s in strings]

    for large_string in strings:
        preprocessed_large_string = preprocess_string(large_string)

        # Check if this string contains 90% of the other preprocessed strings
        other_strings = [s for s in preprocessed_strings if s != preprocessed_large_string]
        if contains_all_substrings(preprocessed_large_string, other_strings):
            strings.remove(large_string)
            return strings

    return strings

 ### Functions to Viait URL and Collect Policy Text

In [4]:
def get_policy_text(privacy_policy_url):
    user_agents = [
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.16; rv:86.0) Gecko/20100101 Firefox/86.0"
    ]

    request_headers = {
        "Accept": "text/html",
        "Connection": "keep-alive",
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
        "Origin": "https://apps.apple.com",
        "Referer": "https://apps.apple.com",
        "User-Agent": random.choice(user_agents),
    }

    try:
        response = requests.get(privacy_policy_url, headers=request_headers, timeout=15)
    except:
        print("Error with Request")
        return

    if response.status_code >= 200 and response.status_code < 400:
        try:
            html_content = response.text
        except:
            print("Error extracting HTML content")
            return

        try:
            article = simple_json_from_html_string(html_content, use_readability=True)
        except:
            try:
                article = simple_json_from_html_string(html_content, use_readability=False)
            except:
                print("Error parsing text from HTML content")

        if 'plain_text' in article and article['plain_text']:
            return list(set(list(map(lambda x: x['text'], article['plain_text'])))), article['content']

# 2. Classification Demo: Extract Data Collection Practices from Policy Segments

## Load Pre-trained Classification Models

In [5]:
!rm -rf PrivBERT-*

In [6]:
!git lfs install
!git clone https://huggingface.co/masoodali/PrivBERT-Main

Git LFS initialized.
Cloning into 'PrivBERT-Main'...
remote: Enumerating objects: 14, done.[K
remote: Counting objects: 100% (10/10), done.[K
remote: Compressing objects: 100% (10/10), done.[K
remote: Total 14 (delta 1), reused 0 (delta 0), pack-reused 4 (from 1)[K
Unpacking objects: 100% (14/14), 1.09 MiB | 4.81 MiB/s, done.


In [7]:
!git lfs install
!git clone https://huggingface.co/masoodali/PrivBERT-Purpose

Git LFS initialized.
Cloning into 'PrivBERT-Purpose'...
remote: Enumerating objects: 14, done.[K
remote: Counting objects: 100% (10/10), done.[K
remote: Compressing objects: 100% (10/10), done.[K
remote: Total 14 (delta 1), reused 0 (delta 0), pack-reused 4 (from 1)[K
Unpacking objects: 100% (14/14), 1.09 MiB | 2.36 MiB/s, done.


In [8]:
!git lfs install
!git clone https://huggingface.co/masoodali/PrivBERT-Identifiability

Git LFS initialized.
Cloning into 'PrivBERT-Identifiability'...
remote: Enumerating objects: 12, done.[K
remote: Counting objects: 100% (8/8), done.[K
remote: Compressing objects: 100% (8/8), done.[K
remote: Total 12 (delta 0), reused 0 (delta 0), pack-reused 4 (from 1)[K
Unpacking objects: 100% (12/12), 1.08 MiB | 1.44 MiB/s, done.


In [9]:
!git lfs install
!git clone https://huggingface.co/masoodali/PrivBERT-Does-or-Does-Not

Git LFS initialized.
Cloning into 'PrivBERT-Does-or-Does-Not'...
remote: Enumerating objects: 12, done.[K
remote: Counting objects: 100% (8/8), done.[K
remote: Compressing objects: 100% (8/8), done.[K
remote: Total 12 (delta 0), reused 0 (delta 0), pack-reused 4 (from 1)[K
Unpacking objects: 100% (12/12), 1.08 MiB | 1.62 MiB/s, done.


In [10]:
!git lfs install
!git clone https://huggingface.co/masoodali/PrivBERT-Personal-Information-Type

Git LFS initialized.
Cloning into 'PrivBERT-Personal-Information-Type'...
remote: Enumerating objects: 15, done.[K
remote: Counting objects: 100% (11/11), done.[K
remote: Compressing objects: 100% (11/11), done.[K
remote: Total 15 (delta 1), reused 0 (delta 0), pack-reused 4 (from 1)[K
Unpacking objects: 100% (15/15), 1.09 MiB | 5.14 MiB/s, done.


In [11]:
!git lfs install
!git clone https://huggingface.co/masoodali/PrivBERT-Action-First-Party

Git LFS initialized.
Cloning into 'PrivBERT-Action-First-Party'...
remote: Enumerating objects: 12, done.[K
remote: Counting objects: 100% (8/8), done.[K
remote: Compressing objects: 100% (8/8), done.[K
remote: Total 12 (delta 0), reused 0 (delta 0), pack-reused 4 (from 1)[K
Unpacking objects: 100% (12/12), 1.08 MiB | 1.42 MiB/s, done.


In [12]:
!git lfs install
!git clone https://huggingface.co/masoodali/PrivBERT-Action-Third-Party

Git LFS initialized.
Cloning into 'PrivBERT-Action-Third-Party'...
remote: Enumerating objects: 12, done.[K
remote: Counting objects: 100% (8/8), done.[K
remote: Compressing objects: 100% (8/8), done.[K
remote: Total 12 (delta 0), reused 0 (delta 0), pack-reused 4 (from 1)[K
Unpacking objects: 100% (12/12), 1.08 MiB | 4.83 MiB/s, done.


In [13]:
!git lfs install
!git clone https://huggingface.co/masoodali/PrivBERT-Audience-Type

Git LFS initialized.
Cloning into 'PrivBERT-Audience-Type'...
remote: Enumerating objects: 12, done.[K
remote: Counting objects: 100% (8/8), done.[K
remote: Compressing objects: 100% (8/8), done.[K
remote: Total 12 (delta 0), reused 0 (delta 0), pack-reused 4 (from 1)[K
Unpacking objects: 100% (12/12), 1.08 MiB | 2.87 MiB/s, done.


In [14]:
from torch import cuda
cuda.empty_cache()
device = 'cuda' if cuda.is_available() else 'cpu'

In [15]:
majority_model = torch.load("PrivBERT-Main/pytorch-privbert.bin")
majority_tokenizer = AutoTokenizer.from_pretrained("PrivBERT-Main/")

identifiability_model = torch.load("PrivBERT-Identifiability/pytorch-privbert.bin")
identifiability_tokenizer = AutoTokenizer.from_pretrained("PrivBERT-Identifiability/")

does_model = torch.load("PrivBERT-Does-or-Does-Not/pytorch-privbert.bin")
does_tokenizer = AutoTokenizer.from_pretrained("PrivBERT-Does-or-Does-Not/")

purpose_model = torch.load("PrivBERT-Purpose/pytorch-privbert.bin")
purpose_tokenizer = AutoTokenizer.from_pretrained("PrivBERT-Purpose/")

information_model = torch.load("PrivBERT-Personal-Information-Type/pytorch-privbert.bin")
information_tokenizer = AutoTokenizer.from_pretrained("PrivBERT-Personal-Information-Type/")

action_first_model = torch.load("PrivBERT-Action-First-Party/pytorch-privbert.bin")
action_first_tokenizer = AutoTokenizer.from_pretrained("PrivBERT-Action-First-Party/")

action_third_model = torch.load("PrivBERT-Action-Third-Party/pytorch-privbert.bin")
action_third_tokenizer = AutoTokenizer.from_pretrained("PrivBERT-Action-Third-Party/")

audience_model = torch.load("PrivBERT-Audience-Type/pytorch-privbert.bin")
audience_tokenizer = AutoTokenizer.from_pretrained("PrivBERT-Audience-Type/")

## Classify Extracted Policy Segments

In [16]:
def parse_classifier_result(classifier_result, segment_text):
    return {
        'segment_text': segment_text,
        'action_first_mobile': 1 if classifier_result['action_first'][0][0] > 0.5 else 0,
        'action_first_website': 1 if classifier_result['action_first'][0][1] > 0.5 else 0,
        'action_third_website': 1 if  classifier_result['action_third'][0][0] > 0.5 else 0,
        'action_third_see': 1 if classifier_result['action_third'][0][1] > 0.5 else 0,
        'children': 1 if classifier_result['audience'][0][0] > 0.5 else 0,
        'does': 1 if classifier_result['does'][0][0] > 0.5 else 0,
        'does_not': 1 if classifier_result['does'][0][1] > 0.5 else 0,
        'aggregated': 1 if classifier_result['identifiability'][0][0] > 0.5 else 0,
        'identifiable': 1 if classifier_result['identifiability'][0][1] > 0.5 else 0,
        'main_first': 1 if classifier_result['main'][0][0] > 0.5 else 0,
        'main_third': 1 if classifier_result['main'][0][1] > 0.5 else 0,
        'main_audience': 1 if classifier_result['main'][0][5] > 0.5 else 0,
        'computer_info': 1 if classifier_result['information'][0][0] > 0.5 else 0,
        'contact': 1 if classifier_result['information'][0][1] > 0.5 else 0,
        'cookies': 1 if classifier_result['information'][0][2] > 0.5 else 0,
        'demographic': 1 if classifier_result['information'][0][3] > 0.5 else 0,
        'financial': 1 if classifier_result['information'][0][4] > 0.5 else 0,
        'generic': 1 if classifier_result['information'][0][5] > 0.5 else 0,
        'health': 1 if classifier_result['information'][0][6] > 0.5 else 0,
        'ip': 1 if classifier_result['information'][0][7] > 0.5 else 0,
        'location': 1 if classifier_result['information'][0][8] > 0.5 else 0,
        'personal_id': 1 if classifier_result['information'][0][9] > 0.5 else 0,
        'social': 1 if classifier_result['information'][0][10] > 0.5 else 0,
        'survey': 1 if classifier_result['information'][0][11] > 0.5 else 0,
        'online_activities': 1 if classifier_result['information'][0][12] > 0.5 else 0,
        'profile': 1 if classifier_result['information'][0][13] > 0.5 else 0,
        'info_unspecified': 1 if classifier_result['information'][0][14] > 0.5 else 0,
        'additional': 1 if classifier_result['purpose'][0][0] > 0.5 else 0,
        'advertising': 1 if classifier_result['purpose'][0][1] > 0.5 else 0,
        'analytics': 1 if classifier_result['purpose'][0][2] > 0.5 else 0,
        'basic': 1 if classifier_result['purpose'][0][3] > 0.5 else 0,
        'legal': 1 if classifier_result['purpose'][0][4] > 0.5 else 0,
        'marketing': 1 if classifier_result['purpose'][0][5] > 0.5 else 0,
        'merger': 1 if classifier_result['purpose'][0][6] > 0.5 else 0,
        'personalization': 1 if classifier_result['purpose'][0][7] > 0.5 else 0,
        'operation': 1 if classifier_result['purpose'][0][8] > 0.5 else 0,
        'purpose_unspecified': 1 if classifier_result['purpose'][0][9] > 0.5 else 0
    }

In [17]:
def classify_policy(policy_segments):
    def classify_segment(segment, model, tokenizer):
        inputs = tokenizer.encode_plus(
            segment,
            add_special_tokens=True,
            max_length=512,
            pad_to_max_length=True,
            return_token_type_ids=True,
            return_tensors = 'pt'
        )
        ids = inputs['input_ids'].to(device, dtype = torch.long)
        mask = inputs['attention_mask'].to(device, dtype = torch.long)
        token_type_ids = inputs['token_type_ids'].to(device, dtype = torch.long)
        outputs = model(ids, mask, token_type_ids)
        outputs = outputs.logits
        fin_outputs=[]
        fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
        return fin_outputs

    classifier_results_raw = []
    for policy_segment in policy_segments:
        classifier_results_raw.append({
            'segment' : policy_segment,
            'main': classify_segment(policy_segment, majority_model, majority_tokenizer),
            'identifiability': classify_segment(policy_segment, identifiability_model, identifiability_tokenizer),
            'does': classify_segment(policy_segment, does_model, does_tokenizer),
            'purpose': classify_segment(policy_segment, purpose_model, purpose_tokenizer),
            'information': classify_segment(policy_segment, information_model, information_tokenizer),
            'action_first': classify_segment(policy_segment, action_first_model, action_first_tokenizer),
            'action_third': classify_segment(policy_segment, action_third_model, action_third_tokenizer),
            'audience': classify_segment(policy_segment, audience_model, audience_tokenizer),
        })

    classifier_results_parsed = [parse_classifier_result(classifier_results_raw[i], policy_segments[i]) for i in range(len(policy_segments))]

    return classifier_results_parsed

## Create Privacy Label from Privacy Policy

In [18]:
sensitive_info_keywords = ['racial', 'ethnic', 'ethnicity', 'sexual orientation', 'sexual preference', 'pregnancy', 'pregnant', 'childbirth', 'child birth', 'child-birth', 'disability', 'religion', 'religious', 'religious belief', 'trade union', 'union member', 'politics', 'political', 'genetic', 'genetic information', 'biometric']
def classifier_to_label(classifier_results):
    generated_privacy_label = []
    for classifier_result in classifier_results:
        segment_privacy_label = {
            'privacy_type_track': 1 if classifier_result['does'] == 1 and ((classifier_result['main_third'] == 1 and not (classifier_result['action_third_website'] == 0 and classifier_result['action_third_see'] == 1)) and classifier_result['advertising'] == 1) else 0,
            'privacy_type_linked': 1 if classifier_result['does'] == 1 and (((classifier_result['main_first'] == 1 and not (classifier_result['action_first_mobile'] == 0 and classifier_result['action_first_website'] == 1)) or (classifier_result['main_third'] == 1 and not (classifier_result['action_third_website'] == 0 and classifier_result['action_third_see'] == 1))) and classifier_result['identifiable'] == 1) else 0,
            'privacy_type_not_linked': 1 if classifier_result['does'] == 1 and (((classifier_result['main_first'] == 1 and not (classifier_result['action_first_mobile'] == 0 and classifier_result['action_first_website'] == 1)) or (classifier_result['main_third'] == 1 and not (classifier_result['action_third_website'] == 0 and classifier_result['action_third_see'] == 1))) and classifier_result['aggregated'] == 1) else 0,
            'privacy_type_not_collected': 1 if (((classifier_result['main_first'] == 1 and not (classifier_result['action_first_mobile'] == 0 and classifier_result['action_first_website'] == 1)) or (classifier_result['main_third'] == 1 and not (classifier_result['action_third_website'] == 0 and classifier_result['action_third_see'] == 1))) and classifier_result['does'] == 0 and classifier_result['does_not'] == 1) else 0,
            'purpose_third_party_advertising': 1 if classifier_result['does'] == 1 and ((classifier_result['main_third'] == 1 and not (classifier_result['action_third_website'] == 0 and classifier_result['action_third_see'] == 1)) and classifier_result['advertising'] == 1) else 0,
            'purpose_developer_advertising_marketing': 1 if classifier_result['does'] == 1 and ((classifier_result['main_first'] == 1 and not (classifier_result['action_first_mobile'] == 0 and classifier_result['action_first_website'] == 1)) and classifier_result['advertising'] == 1) else 0,
            'purpose_analytics': 1 if classifier_result['does'] == 1 and (((classifier_result['main_first'] == 1 and not (classifier_result['action_first_mobile'] == 0 and classifier_result['action_first_website'] == 1)) or (classifier_result['main_third'] == 1 and not (classifier_result['action_third_website'] == 0 and classifier_result['action_third_see'] == 1))) and classifier_result['analytics'] == 1) else 0,
            'purpose_product_personalization': 1 if classifier_result['does'] == 1 and (((classifier_result['main_first'] == 1 and not (classifier_result['action_first_mobile'] == 0 and classifier_result['action_first_website'] == 1)) or (classifier_result['main_third'] == 1 and not (classifier_result['action_third_website'] == 0 and classifier_result['action_third_see'] == 1))) and classifier_result['personalization'] == 1) else 0,
            'purpose_app_functionality': 1 if classifier_result['does'] == 1 and (((classifier_result['main_first'] == 1 and not (classifier_result['action_first_mobile'] == 0 and classifier_result['action_first_website'] == 1)) == 1 or (classifier_result['main_third'] == 1 and not (classifier_result['action_third_website'] == 0 and classifier_result['action_third_see'] == 1))) and (classifier_result['basic'] == 1 or classifier_result['additional'] == 1 or classifier_result['operation'] == 1)) else 0,
            'purpose_other':  1 if classifier_result['does'] == 1 and (((classifier_result['main_first'] == 1 and not (classifier_result['action_first_mobile'] == 0 and classifier_result['action_first_website'] == 1)) or (classifier_result['main_third'] == 1 and not (classifier_result['action_third_website'] == 0 and classifier_result['action_third_see'] == 1))) and (classifier_result['merger'] == 1 or classifier_result['legal'] == 1 or classifier_result['purpose_unspecified'] == 1)) else 0,
            'data_category_contact_info': 1 if classifier_result['does'] == 1 and (((classifier_result['main_first'] == 1 and not (classifier_result['action_first_mobile'] == 0 and classifier_result['action_first_website'] == 1)) or (classifier_result['main_third'] == 1 and not (classifier_result['action_third_website'] == 0 and classifier_result['action_third_see'] == 1))) and classifier_result['contact'] == 1) else 0,
            'data_category_health_fitness': 1 if classifier_result['does'] == 1 and (((classifier_result['main_first'] == 1 and not (classifier_result['action_first_mobile'] == 0 and classifier_result['action_first_website'] == 1)) or (classifier_result['main_third'] == 1 and not (classifier_result['action_third_website'] == 0 and classifier_result['action_third_see'] == 1))) and classifier_result['health'] == 1) else 0,
            'data_category_financial_info': 1 if classifier_result['does'] == 1 and (((classifier_result['main_first'] == 1 and not (classifier_result['action_first_mobile'] == 0 and classifier_result['action_first_website'] == 1)) or (classifier_result['main_third'] == 1 and not (classifier_result['action_third_website'] == 0 and classifier_result['action_third_see'] == 1))) and classifier_result['financial'] == 1) else 0,
            'data_category_location': 1 if classifier_result['does'] == 1 and (((classifier_result['main_first'] == 1 and not (classifier_result['action_first_mobile'] == 0 and classifier_result['action_first_website'] == 1)) or (classifier_result['main_third'] == 1 and not (classifier_result['action_third_website'] == 0 and classifier_result['action_third_see'] == 1))) and classifier_result['location'] == 1) else 0,
            'data_category_sensitive': 1 if classifier_result['does'] == 1 and (((classifier_result['main_first'] == 1 and not (classifier_result['action_first_mobile'] == 0 and classifier_result['action_first_website'] == 1)) or (classifier_result['main_third'] == 1 and not (classifier_result['action_third_website'] == 0 and classifier_result['action_third_see'] == 1))) and (classifier_result['demographic'] == 1 or any (sensitive_keyword in classifier_result['segment_text'].lower() for sensitive_keyword in sensitive_info_keywords))) else 0,
            'data_category_contacts': 1 if classifier_result['does'] == 1 and (((classifier_result['main_first'] == 1 and not (classifier_result['action_first_mobile'] == 0 and classifier_result['action_first_website'] == 1)) or (classifier_result['main_third'] == 1 and not (classifier_result['action_third_website'] == 0 and classifier_result['action_third_see'] == 1))) and ((classifier_result['social'] == 1 and any (contact_keyword in classifier_result['segment_text'].lower() for contact_keyword in ['contact', 'friend'])) or any (contact_keyword in classifier_result['segment_text'].lower() for contact_keyword in ['phone book', 'address book']))) else 0,
            'data_category_user_content': 1 if classifier_result['does'] == 1 and (((classifier_result['main_first'] == 1 and not (classifier_result['action_first_mobile'] == 0 and classifier_result['action_first_website'] == 1)) or (classifier_result['main_third'] == 1 and not (classifier_result['action_third_website'] == 0 and classifier_result['action_third_see'] == 1))) and (classifier_result['profile'] == 1 or classifier_result['social'] == 1)) else 0,
            'data_category_browsing_history': 1 if classifier_result['does'] == 1 and ((classifier_result['main_third'] == 1 and not (classifier_result['action_third_website'] == 0 and classifier_result['action_third_see'] == 1)) and classifier_result['online_activities'] == 1) else 0,
            'data_category_search_history': 1 if classifier_result['does'] == 1 and ((classifier_result['main_first'] == 1 and not (classifier_result['action_first_mobile'] == 0 and classifier_result['action_first_website'] == 1)) and (classifier_result['online_activities'] == 1 and 'search' in classifier_result['segment_text'].lower())) else 0,
            'data_category_identifiers': 1 if classifier_result['does'] == 1 and (((classifier_result['main_first'] == 1 and not (classifier_result['action_first_mobile'] == 0 and classifier_result['action_first_website'] == 1)) or (classifier_result['main_third'] == 1 and not (classifier_result['action_third_website'] == 0 and classifier_result['action_third_see'] == 1))) and (classifier_result['cookies'] == 1 or classifier_result['ip'] == 1)) else 0,
            'data_category_purchases': 1 if classifier_result['does'] == 1 and (((classifier_result['main_first'] == 1 and not (classifier_result['action_first_mobile'] == 0 and classifier_result['action_first_website'] == 1)) or (classifier_result['main_third'] == 1 and not (classifier_result['action_third_website'] == 0 and classifier_result['action_third_see'] == 1))) and (classifier_result['financial'] == 1 and classifier_result['online_activities'] == 1)) else 0,
            'data_category_usage_data': 1 if classifier_result['does'] == 1 and (((classifier_result['main_first'] == 1 and not (classifier_result['action_first_mobile'] == 0 and classifier_result['action_first_website'] == 1)) or (classifier_result['main_third'] == 1 and not (classifier_result['action_third_website'] == 0 and classifier_result['action_third_see'] == 1))) and classifier_result['online_activities'] == 1) else 0,
            'data_category_diagnostics': 1 if classifier_result['does'] == 1 and (((classifier_result['main_first'] == 1 and not (classifier_result['action_first_mobile'] == 0 and classifier_result['action_first_website'] == 1)) or (classifier_result['main_third'] == 1 and not (classifier_result['action_third_website'] == 0 and classifier_result['action_third_see'] == 1))) and (classifier_result['computer_info'] == 1 or classifier_result['ip'] == 1)) else 0
        }

        if not set(list(segment_privacy_label.values())) == {0}:
            generated_privacy_label.append(segment_privacy_label)
    return generated_privacy_label

# 3. Putting it all Together

In [19]:
privacy_policy_urls = {
    'som': 'https://www.som.org.uk/privacy-and-cookie-policy'
}

In [20]:
policy_segments, html_content = get_policy_text(privacy_policy_urls['som'])
policy_segments = merge_lists(policy_segments)
policy_segments = filter_out_headings(policy_segments, html_content)
policy_segments = remove_short_sentences(policy_segments)
policy_segments = find_and_remove_large_string(policy_segments)

In [21]:
classifier_results = classify_policy(policy_segments)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-s

In [22]:
generated_label = classifier_to_label(classifier_results)
generated_label

[{'privacy_type_track': 1,
  'privacy_type_linked': 1,
  'privacy_type_not_linked': 0,
  'privacy_type_not_collected': 0,
  'purpose_third_party_advertising': 1,
  'purpose_developer_advertising_marketing': 0,
  'purpose_analytics': 0,
  'purpose_product_personalization': 0,
  'purpose_app_functionality': 1,
  'purpose_other': 0,
  'data_category_contact_info': 0,
  'data_category_health_fitness': 0,
  'data_category_financial_info': 0,
  'data_category_location': 0,
  'data_category_sensitive': 0,
  'data_category_contacts': 0,
  'data_category_user_content': 0,
  'data_category_browsing_history': 0,
  'data_category_search_history': 0,
  'data_category_identifiers': 1,
  'data_category_purchases': 0,
  'data_category_usage_data': 0,
  'data_category_diagnostics': 0}]