<a target="_blank" href="https://colab.research.google.com/github/masood/2024-pets-privacy-labels-policies/blob/main/template_detection.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

# Comparing Privacy Policy against Popular Template (Demo)

In [1]:
!pip install readabilipy langdetect beautifulsoup4



In [2]:
#Imports needed from pytorch
import torch
from torch import nn

#Some built-in imports
import numpy as np
import pickle
import os
import requests
import random
import string
import json

from readabilipy import simple_json_from_html_string
from langdetect import detect
from bs4 import BeautifulSoup

import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# 0. Helper/Utility Functions to Process Text

In [3]:
# Fetch the pickled vocabulary file and save it locally as word2idx_300.pkl.
!wget "https://raw.githubusercontent.com/masood/2024-pets-privacy-labels-policies/main/template-files/word2idx_300.pkl" -O word2idx_300.pkl

--2024-05-29 16:41:46--  https://raw.githubusercontent.com/masood/2024-pets-privacy-labels-policies/main/template-files/word2idx_300.pkl
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2064330 (2.0M) [application/octet-stream]
Saving to: ‘word2idx_300.pkl’


2024-05-29 16:41:46 (114 MB/s) - ‘word2idx_300.pkl’ saved [2064330/2064330]



In [4]:
# Load the vocabulary that is later passed as `word2idx` to vectorize_string.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; this
# file was just downloaded, so only run this against a source you trust.
with open('word2idx_300.pkl', 'rb') as dictionary_file:
    dictionary = pickle.load(dictionary_file)

In [5]:
def sentence_serialization(sentence, word2idx, lower_case = True):
    """
    Map a sentence to the vocabulary indexes of its tokens.

    The sentence is split with nltk.word_tokenize. Tokens that are not keys of
    word2idx are skipped, so the result may be shorter than the token count.

    Args:
        sentence: string, sentence to serialize.
        word2idx: dictionary, words as keys and indexes as values.
        lower_case: boolean, lower-case every token before the lookup. Useful
            if word2idx doesn't support upper case words.
    Returns:
        list, indexes of the tokens found in word2idx, in sentence order.
    """
    words = nltk.word_tokenize(sentence)
    if lower_case:
        words = [word.lower() for word in words]

    indexes = []
    for word in words:
        try:
            indexes.append(word2idx[word])
        except KeyError:
            # Out-of-vocabulary token: contributes nothing to the output.
            continue
    return indexes

In [6]:
def vectorize_string(word2idx, segments, max_template_len=-1):
    """
    Turn text segments into a single zero-padded tensor of word indexes.

    Every segment is serialized with sentence_serialization, truncated to the
    maximum length, centred inside a zero vector that leaves `clearance` (2)
    extra zeros of margin on each side, and the rows are stacked.

    Args:
        word2idx: dictionary, forwarded to sentence_serialization.
        segments: sequence of strings to vectorize.
        max_template_len: int, maximum number of indexes kept per segment.
            With the default of -1, the length of the longest serialized
            segment is used instead.

    Returns:
        (segments_tensor, max_template_len): an int64 tensor of shape
        (len(segments), 1, max_len + 4) and the maximum length that was used.

    Raises:
        ValueError: if segments is empty (there is nothing to stack).
    """

    def stack_segments(segments, max_template_len=-1, clearance=2):
        """Pad/truncate serialized segments to one length and stack them."""
        if len(segments) == 0:
            raise ValueError("vectorize_string needs at least one segment")

        if max_template_len == -1:
            max_template_len = max(len(segment) for segment in segments)
        max_len = max_template_len

        output_len = max_len + clearance * 2

        rows = []
        for segment in segments:
            indexes = np.asarray(segment[:max_len], dtype=np.int64)

            # Centre the indexes; when the padding is odd the extra zero goes
            # at the end, matching a floor-divided left margin.
            start = (output_len - len(indexes)) // 2
            row = np.zeros(output_len, dtype=np.int64)
            row[start:start + len(indexes)] = indexes

            rows.append(torch.from_numpy(row))

        # Stack once after the loop instead of re-stacking on every iteration.
        return torch.stack(rows).unsqueeze(1), max_template_len

    serialized_segments = [
        sentence_serialization(segment, word2idx) for segment in segments
    ]

    return stack_segments(serialized_segments, max_template_len)

In [7]:
def get_sentences(policy_text):
    """
    Split text into sentences.

    Each line is tokenized on its own with sent_tokenize, so a returned
    sentence never spans a line break.
    """
    return [
        sentence
        for line in policy_text.splitlines()
        for sentence in sent_tokenize(line)
    ]

# 1. Download and Prepare Template Files

In [8]:
base_path = "https://raw.githubusercontent.com/masood/2024-pets-privacy-labels-policies/main/template-files/"

# One (template file, generator site) pair per known template. The two lists
# below are looked up by the same index, so they are derived from this table.
# NOTE(review): 'privacypolicygenarator.info' is spelled with "genarator";
# confirm whether that is the intended URL.
template_sources = [
    ('app-privacy-policy-generator-firebaseapp.txt', 'https://app-privacy-policy-generator.firebaseapp.com/'),
    ('enzuzo.txt', 'https://www.enzuzo.com/privacy-policy-generator'),
    ('FreePrivacyPolicy.txt', 'https://www.freeprivacypolicy.com/free-privacy-policy-generator/'),
    ('getterms.txt', 'https://getterms.io/'),
    ('iubenda.txt', 'https://www.iubenda.com/'),
    ('pandadoc.txt', 'https://www.pandadoc.com/free-privacy-policy-template/'),
    ('PrivacyPolicies.txt', 'https://www.privacypolicies.com/'),
    ('PrivacyPolicyGenerator.txt', 'https://www.privacypolicygenerator.org'),
    ('PrivacyPolicyGeneratorInfo.txt', 'https://www.privacypolicygenarator.info/'),
    ('securiti-ai.txt', 'https://securiti.ai/privacy-center/'),
    ('shopify.txt', 'https://www.shopify.com/tools/policy-generator'),
    ('Termly.txt', 'https://termly.io/products/privacy-policy-generator/'),
    ('TermsFeed.txt', 'https://www.termsfeed.com'),
    ('website-policies-com.txt', 'https://websitepolicies.com'),
    ('WebsitePrivacyPolicyGenerator.txt', 'https://www.websiteprivacypolicygenerator.com/'),
]
template_files = [file_name for file_name, site_url in template_sources]
template_sites = [site_url for file_name, site_url in template_sources]

In [9]:
# Download every template once and keep, per template, its sentences, its
# padded index tensor and the sentence length the tensor was built with.
processed_templates = []
template_vectors = []
max_template_lens = []
for template_file in template_files:

    # Get Template from File. The timeout stops a stalled connection from
    # hanging the notebook; raise_for_status prevents an HTTP error page from
    # being vectorized as if it were a template.
    response = requests.get(base_path + template_file, timeout=15)
    response.raise_for_status()

    # Gather Template as Sentences
    processed_template = get_sentences(response.text)

    # Vectorize Template
    template_vector, max_template_len = vectorize_string(dictionary, processed_template)

    processed_templates.append(processed_template)
    template_vectors.append(template_vector)
    max_template_lens.append(max_template_len)

# 2. Download and Process Privacy Policy

### Functions to Clean Extracted Text

In [10]:
def filter_out_headings(policy_text, html_content):
    """
    Remove segments whose text is exactly the text of an HTML heading.

    Args:
        policy_text: list of strings, the policy segments.
        html_content: string, HTML that is searched for h1-h6 elements.
    Returns:
        list of the segments that do not equal any heading's text.
    """
    heading_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
    soup = BeautifulSoup(html_content, 'html.parser')
    heading_texts = [heading.text for heading in soup.find_all(heading_tags)]
    return [segment for segment in policy_text if segment not in heading_texts]

def merge_lists(policy_text):
    """
    Fold list fragments (lines ending with a comma) into their context.

    A line that ends with ',' is split on '*'. If its pieces average at least
    20 words, each piece is kept as its own stripped segment; otherwise the
    whole line is concatenated onto the previous output segment (or kept as
    is when there is no previous segment). All other lines pass through.

    Args:
        policy_text: list of strings; empty strings are allowed.
    Returns:
        list of strings with the short list fragments merged.
    """
    merged = []
    for line in policy_text:
        # endswith() is safe on empty strings, unlike indexing line[-1].
        if not line.endswith(','):
            merged.append(line)
            continue

        items = line.split('*')
        avg_words = sum(len(item.split()) for item in items) / len(items)

        if avg_words >= 20:
            # Long list entries read as standalone sentences.
            merged.extend(item.strip() for item in items)
        elif merged:
            merged[-1] += line
        else:
            merged.append(line)
    return merged

def remove_short_sentences(policy_text):
    """
    Keep only segments with at least 20 space-separated pieces, stripped.

    The count uses split(' ') on the unstripped segment, so leading, trailing
    or repeated spaces add to the count.
    """
    return [
        segment.strip()
        for segment in policy_text
        if len(segment.split(' ')) >= 20
    ]

def find_and_remove_large_string(strings):
    """
    Remove the first string that looks like a container of the other strings.

    Strings are compared lower-cased and without punctuation. A string is
    removed when at least 50% of the other (differently normalized) strings
    have the first 90% of their characters occurring somewhere inside it.
    At most one string is removed.

    Args:
        strings: list of strings. The list is modified in place.
    Returns:
        The same list object, with the first qualifying string removed. A
        list with nothing to compare against is returned unchanged.
    """
    strip_punctuation = str.maketrans('', '', string.punctuation)

    def preprocess_string(s):
        # Convert to lowercase and remove punctuation
        return s.lower().translate(strip_punctuation)

    def is_substring_found(substring, large_string):
        # Check if the leading 90% of the substring occurs in the large string
        match_length = int(len(substring) * 0.9)
        return substring[:match_length] in large_string

    def contains_all_substrings(large_string, substrings):
        # Nothing to compare against: avoid dividing by zero below.
        if not substrings:
            return False
        count = sum(
            1 for substring in substrings
            if is_substring_found(substring, large_string)
        )
        return count / len(substrings) >= 0.50

    preprocessed_strings = [preprocess_string(s) for s in strings]

    for large_string, preprocessed_large_string in zip(strings, preprocessed_strings):
        # Check if this string contains at least 50% of the other preprocessed strings
        other_strings = [s for s in preprocessed_strings if s != preprocessed_large_string]
        if contains_all_substrings(preprocessed_large_string, other_strings):
            strings.remove(large_string)
            return strings

    return strings

 ### Functions to Visit URL and Collect Policy Text

In [11]:
def get_policy_text(privacy_policy_url):
    """
    Download a privacy policy page and extract its text segments.

    Args:
        privacy_policy_url: string, URL of the policy page.

    Returns:
        A tuple (segments, content): segments is the list of unique text
        segments in the order they appear in the extracted article, and
        content is article['content'] as produced by
        simple_json_from_html_string. Returns None when the request fails,
        the status code is outside 200-399, or no text could be extracted.
    """
    user_agents = [
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.16; rv:86.0) Gecko/20100101 Firefox/86.0"
    ]

    request_headers = {
        "Accept": "text/html",
        "Connection": "keep-alive",
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
        "Origin": "https://apps.apple.com",
        "Referer": "https://apps.apple.com",
        "User-Agent": random.choice(user_agents),
    }

    try:
        response = requests.get(privacy_policy_url, headers=request_headers, timeout=15)
    except requests.RequestException as error:
        print(f"Error with Request: {error}")
        return None

    if not (200 <= response.status_code < 400):
        print(f"Unexpected HTTP status: {response.status_code}")
        return None

    try:
        html_content = response.text
    except Exception:
        print("Error extracting HTML content")
        return None

    try:
        article = simple_json_from_html_string(html_content, use_readability=True)
    except Exception:
        # Fall back to the parser that does not rely on readability.
        try:
            article = simple_json_from_html_string(html_content, use_readability=False)
        except Exception:
            print("Error parsing text from HTML content")
            # Without this return, `article` would be undefined below.
            return None

    if 'plain_text' in article and article['plain_text']:
        # De-duplicate while keeping document order: merge_lists later glues
        # list fragments onto the preceding segment, so order matters and a
        # set() would shuffle it differently on every run.
        segments = list(dict.fromkeys(x['text'] for x in article['plain_text']))
        return segments, article['content']

    return None

# 3. Compare Policy to Templates

In [12]:
def evaluate_policy(policy_text):
    """
    Return the generator sites whose template the policy text resembles.

    The policy is split into sentences and, for every loaded template,
    vectorized with that template's maximum sentence length. A template
    sentence counts as matched when its cosine similarity with at least one
    policy sentence is >= 0.80. A template is reported when the number of
    matched template sentences is at least 50% of the template's sentences
    and at least 50% of the number of policy sentences.

    Relies on the module-level template_vectors, template_files,
    template_sites, max_template_lens and dictionary.

    Args:
        policy_text: string, the cleaned policy text.
    Returns:
        list of template site URLs (possibly empty).
    """
    # Parse policy text as a list of sentences
    policy_sentences = get_sentences(policy_text)
    if not policy_sentences:
        # Nothing to compare, so no template can match.
        return []

    matched_templates = []

    for template_index, template_vector in enumerate(template_vectors):

        template_file_name = template_files[template_index]
        template_site = template_sites[template_index]
        max_template_len = max_template_lens[template_index]

        try:
            # Vectorize Policy Text, Use Max Template Len as cutoff
            policy_vector, _policy_len = vectorize_string(dictionary, policy_sentences, max_template_len)
        except Exception as error:
            print(f'Could not vectorize policy for template {template_file_name}: {error}')
            continue

        # Drop the singleton middle axis: one row of word indexes per sentence.
        template_matrix = template_vector[:, 0].float()
        policy_matrix = policy_vector[:, 0].float()

        matched = torch.zeros(template_matrix.shape[0], dtype=torch.bool)
        for policy_row in policy_matrix:
            # Compare one policy sentence against every template sentence at once.
            similarities = nn.functional.cosine_similarity(
                template_matrix, policy_row.unsqueeze(0), dim=1
            )
            matched |= similarities >= 0.80

        num_matched = int(matched.sum())

        if (num_matched / template_matrix.shape[0]) >= 0.50:
            if (num_matched / policy_matrix.shape[0]) >= 0.50:
                matched_templates.append(template_site)
    return matched_templates

# 4. Putting it all Together

In [13]:
# Policy pages to evaluate, keyed by a short label that later cells use to pick one.
privacy_policy_urls = {
    'som': 'https://www.som.org.uk/privacy-and-cookie-policy'
}

In [14]:
# get_policy_text returns None when the page cannot be fetched or parsed;
# stop here with a clear message instead of failing on tuple unpacking.
policy_result = get_policy_text(privacy_policy_urls['som'])
if policy_result is None:
    raise RuntimeError('Could not download or parse the privacy policy.')

raw_segments, html_content = policy_result
merged_segments = merge_lists(raw_segments)
body_segments = filter_out_headings(merged_segments, html_content)
long_segments = remove_short_sentences(body_segments)

# Final cleaned policy text as a single string, consumed by evaluate_policy.
policy_segments = ' '.join(find_and_remove_large_string(long_segments))

In [15]:
# Compare the cleaned policy text against every loaded template and report.
matched_templates = evaluate_policy(policy_segments)

if matched_templates:
    print('Policy is similar to templates from the following sites:')
    print(json.dumps(matched_templates))
else:
    print('Policy did not match any template against which we check.')

Policy is similar to templates from the follwing sites:
["https://www.freeprivacypolicy.com/free-privacy-policy-generator/", "https://securiti.ai/privacy-center/", "https://termly.io/products/privacy-policy-generator/"]
