In [81]:
#Imports needed from pytorch
import torch
from torch.utils.data import Dataset
from collections import OrderedDict
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.optim import SGD,Adam
!pip install skorch

#Some built-in imports
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import pickle
from os.path import join, isfile
from os import listdir
import json

# SKLearn and Skorch
from sklearn.datasets import make_classification
from skorch import NeuralNet, NeuralNetClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import f1_score, make_scorer
from sklearn.metrics import classification_report
from skorch.callbacks import EarlyStopping

#Imports from the repository
from data_processing import get_weights_matrix, get_tokens
import data_processing as dp
from privacy_policies_dataset import PrivacyPoliciesDataset as PPD
from database import Database

from urllib.parse import unquote
from bs4 import BeautifulSoup

import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Load Dictionary

In [82]:
# Load the three pickled embedding artefacts from ./embeddings.
# Judging by the file names these are a word->index map, a word->vector map and
# the embedding weight matrix -- TODO confirm against the code that wrote them.
# `dictionary` is later handed to dp.process_policy_of_interest and
# `weights_matrix` is later passed to CNN as the pre-trained embeddings.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('./embeddings/word2idx_300.pkl', 'rb') as dictionary_file:
    dictionary = pickle.load(dictionary_file)
with open('./embeddings/word2vector_300.pkl', 'rb') as word2vector_file:
    word2vector = pickle.load(word2vector_file)
with open('./embeddings/weights_matrix_300.pkl', 'rb') as weights_matrix_file:
    weights_matrix = pickle.load(weights_matrix_file)

# Define CNN and Collate Data Fn.

In [83]:
class CNN(nn.Module):
    """
    Convolutional text classifier over frozen, pre-trained word embeddings.

    One Conv2d branch is created per kernel size in ``Ks``; every branch has
    ``Co`` filters, so ``Co * len(Ks)`` feature maps are max-pooled over the
    sequence and fed to a hidden linear layer followed by a sigmoid output
    layer (one independent probability per class).

    Args:
        embeddings: array-like of shape (vocab_size, emb_dim) holding the
            pre-trained embedding vectors. It is converted to a tensor and
            used as the (frozen) embedding table.
        vocab_size: integer, number of rows of ``embeddings``. Stored as
            ``num_embeddings``.
        emb_dim: integer, dimensionality of the word embeddings. It is also
            the width of every convolution kernel, so it must match the
            second dimension of ``embeddings``.
        Co: integer, "channels out": number of kernels per kernel size.
        Hu: sequence of integers. ``Hu[0]`` is the output width of the hidden
            layer and ``Hu[-1]`` the input width of the output layer; only one
            hidden layer is built, so a single-element list is expected.
        C: integer, number of units in the last layer (number of classes).
        Ks: list of integers with the kernel heights (in tokens) to use.
        dropout: float, probability used to build ``self.drop_out``.
            NOTE(review): ``forward`` never applies this layer; it is kept
            unchanged so existing checkpoints behave identically -- confirm
            whether dropout was meant to be used.
        name: string suffix appended to ``cnn_name``.
    """

    def __init__(self, embeddings, vocab_size, emb_dim, Co, Hu, C, Ks, dropout, name = 'generic'):

        super(CNN, self).__init__()

        self.num_embeddings = vocab_size
        self.embeddings_dim = emb_dim
        # Index 0 is the padding token (collate_data pads with zeros).
        self.padding_index = 0

        self.cnn_name = 'cnn_' + str(emb_dim) + '_' + str(Co) + '_' + str(Hu) + '_' + str(C) + '_' + str(Ks) + '_' + name

        self.Co = Co
        self.Hu = Hu
        self.C = C
        self.Ks = Ks

        # from_pretrained is a classmethod that builds a brand-new layer, so
        # call it on the class directly and forward padding_idx explicitly;
        # building a throw-away nn.Embedding first would silently lose it.
        self.embedding = nn.Embedding.from_pretrained(
            torch.tensor(embeddings).float(),
            freeze=True,
            padding_idx=self.padding_index,
        )

        # One convolution per kernel height; each kernel spans the full embedding width.
        self.convolutions = nn.ModuleList([nn.Conv2d(1, self.Co, (k, self.embeddings_dim)) for k in self.Ks])

        # Activation function for the convolution and hidden layers.
        self.relu = nn.ReLU()

        self.drop_out = nn.Dropout(p=dropout)

        self.linear1 = nn.Linear(self.Co * len(self.Ks), self.Hu[0])

        self.linear2 = nn.Linear(self.Hu[-1], self.C)

        # Activation function of the output layer (independent per-class probabilities).
        self.sigmoid = nn.Sigmoid()

        # Cast all parameters (including the embedding table) to float64.
        self.double()

    def forward(self, x):
        """Map token indices of size (N, 1, length) to probabilities of size (N, C)."""

        # (N, 1, length) -> (N, 1, length, emb_dim)
        x = self.embedding(x)

        # Per kernel height k: (N, 1, length, emb_dim) -> (N, Co, length - k + 1)
        x = [self.relu(conv(x)).squeeze(3) for conv in self.convolutions]

        # Max over the sequence axis: (N, Co, length - k + 1) -> (N, Co)
        x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x]

        # Concatenate the branches: -> (N, Co * len(Ks))
        x = torch.cat(x, 1)

        x = self.linear1(x)

        x = self.relu(x)

        x = self.linear2(x)

        x = self.sigmoid(x)

        return x
    

In [84]:
def collate_data(batch):
    """Collate (segment, label) pairs into zero-padded tensors for a DataLoader.

    Args:
        batch: sequence of items where ``item[0]`` is a sequence of token
            indices and ``item[1]`` is a label tensor.

    Returns:
        list ``[segments_tensor, labels_tensor]``. ``segments_tensor`` is an
        int64 tensor of size (N, 1, max_len + 2 * clearance) in which every
        segment is centred and padded with zeros; ``labels_tensor`` is
        ``torch.stack`` of the labels.

    Raises:
        ValueError: if ``batch`` is empty.
    """

    def stack_segments(segments, clearance = 2):
        """Centre-pad every segment with zeros to a common length and stack them."""

        if len(segments) == 0:
            raise ValueError("cannot collate an empty batch of segments")

        # Use the GPU when one is present, otherwise fall back to the CPU so
        # the function also works on CPU-only hosts.
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Every segment gets at least `clearance` zeros on each side.
        output_len = max(map(len, segments)) + clearance * 2

        segments_list = []

        for segment in segments:

            segment_array = np.array(segment)

            missing = output_len - len(segment_array)
            zeros_to_prepend = missing // 2
            # Any odd remainder goes to the right-hand side.
            zeros_to_append = missing - zeros_to_prepend

            resized_array = np.append(np.zeros(zeros_to_prepend), segment_array)
            resized_array = np.append(resized_array, np.zeros(zeros_to_append))

            segments_list.append(torch.tensor(resized_array, dtype = torch.int64, device = device))

        # Stack once, after the loop: (N, output_len) -> (N, 1, output_len).
        return torch.stack(segments_list).unsqueeze(1)

    segments = [item[0] for item in batch]

    labels = [item[1] for item in batch]

    segments_tensor = stack_segments(segments)

    labels_tensor = torch.stack(labels)

    return [segments_tensor, labels_tensor]

# Load Models

In [85]:
def load_model(current_attribute, current_num_levels):
    """Build a skorch NeuralNet around CNN and restore its saved state.

    Args:
        current_attribute: attribute name; it is used in the module name and
            as the sub-directory of ``trained_models`` to load from.
        current_num_levels: number of output units of the CNN (``module__C``).

    Returns:
        The initialized NeuralNet with parameters, optimizer state and
        history loaded from ``trained_models/<current_attribute>/``.
    """
    # Arguments forwarded to the CNN constructor.
    module_settings = dict(
        module__embeddings = weights_matrix,
        module__vocab_size = weights_matrix.shape[0],
        module__emb_dim = weights_matrix.shape[1],
        module__Co = 200,
        module__Hu = [100],
        module__C = current_num_levels,
        module__Ks = [3],
        module__name = f'{current_attribute}_zeros_60-20-(no-val)_polisis',
        module__dropout = 0.5,
    )

    # Optimisation / training-loop configuration.
    training_settings = dict(
        max_epochs = 300,
        lr = 0.01,
        optimizer = SGD,
        optimizer__weight_decay = 0,
        optimizer__momentum = 0.9,
        criterion = nn.BCELoss(),
        batch_size = 40,
        # No validation split.
        train_split = None,
        # Shuffle training data on each epoch.
        iterator_train__shuffle = True,
        iterator_train__collate_fn = collate_data,
        iterator_valid__collate_fn = collate_data,
        # Silence skorch's progress output.
        verbose = 0,
        device = 'cuda',
    )

    classifier = NeuralNet(CNN, **module_settings, **training_settings)
    classifier.initialize()

    # Restore weights, optimizer state and training history from disk.
    classifier.load_params(
        f_params=f'trained_models/{current_attribute}/model.pkl',
        f_optimizer=f'trained_models/{current_attribute}/optimizer.pkl',
        f_history=f'trained_models/{current_attribute}/history.json',
    )
    return classifier

In [86]:
# Restore one trained classifier per attribute. The first argument selects the
# trained_models/<attribute>/ directory; the second is the number of output
# units (module__C) the network is built with.
main_category_model = load_model('Majority', 12)
does_not_model = load_model('Does or Does Not', 2)
identifiability_model = load_model('Identifiability', 3)
purpose_model = load_model('Purpose', 10)
information_type_model = load_model('Personal Information Type', 15)

# Load Labels

In [87]:
def load_labels(current_attribute):
    """Load the pickled label collection for one attribute.

    Args:
        current_attribute: attribute name; the file read is
            ``labels/labels_<current_attribute>.pkl``.

    Returns:
        ``list(labels)`` of the unpickled object (for a dict this is the list
        of its keys in iteration order).

    Raises:
        OSError: if the labels file cannot be opened.
    """
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    # The context manager closes the file even when unpickling fails.
    with open(f"labels/labels_{current_attribute}.pkl", "rb") as labels_file:
        labels = pickle.load(labels_file)
    return list(labels)

In [89]:
# Load the label list of every attribute; the attribute names are the same
# ones passed to load_model, and the lists are indexed by predicted column
# position in the classification loop.
main_category_labels = load_labels('Majority')
does_not_labels = load_labels('Does or Does Not')
identifiability_labels = load_labels('Identifiability')
purpose_labels = load_labels('Purpose')
information_type_labels = load_labels('Personal Information Type')

# Pre-process Policy Segments

In [90]:
def merge_lists(policy_text):
    """Merge or split list-like lines of a policy.

    A line that ends with a comma is treated as a list whose elements are
    separated by ``'*'``:

    * if the elements average at least 20 words, each element becomes its own
      (stripped) segment;
    * otherwise the whole line is concatenated onto the previous segment (or
      becomes the first segment when there is none yet).

    Every other line, including an empty one, is kept as its own segment.

    Args:
        policy_text: list of strings, one per line of the policy.

    Returns:
        New list of strings with the list-like lines merged or split.
    """
    merged_segments = []
    for line in policy_text:
        # endswith() is safe for empty lines, unlike indexing with [-1].
        if not line.endswith(','):
            merged_segments.append(line)
            continue

        list_elements = line.split('*')
        # split() always returns at least one element, so no division by zero.
        avg_len = sum(len(element.split()) for element in list_elements) / len(list_elements)

        if avg_len >= 20:
            # Long elements stand on their own.
            merged_segments.extend(element.strip() for element in list_elements)
        elif merged_segments:
            # Short list: glue it to the preceding segment.
            # NOTE(review): no separator is inserted between the two -- confirm intent.
            merged_segments[-1] += line
        else:
            merged_segments.append(line)
    return merged_segments

In [91]:
def filter_out_headings(policy_text, html_content):
    """Drop the lines of ``policy_text`` that are headings in ``html_content``.

    Args:
        policy_text: list of strings (policy segments).
        html_content: HTML string; the text of every ``h1``-``h6`` element in
            it is considered a heading.

    Returns:
        New list with the segments of ``policy_text`` whose text is not
        exactly equal to the text of one of the heading elements, in the
        original order.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    # A set gives O(1) membership tests instead of scanning a list per segment.
    heading_texts = {
        element.text
        for element in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
    }
    return [segment for segment in policy_text if segment not in heading_texts]

# Connect To Database

In [92]:
def get_raw_html(policy_id):
    """Function to extract raw HTML from the raw_policy table.

    :param policy_id: The ID of the corresponding row of the raw_policy table.
    :return: The URL-unquoted ``raw_html`` value of the matching row (of the
        last one if several rows are returned), or an empty string when no
        row matches.
    """

    select_raw_html = """
        SELECT `raw_html`
        FROM `run_41_raw_policy`
        WHERE `id` = %s
    """

    # Establish a database connection
    database_connection = Database().get_database_connection()

    # Use the connection and the cursor as context-managed resources.
    with database_connection:
        with database_connection.cursor() as cursor:
            # Run the parameterized SELECT. The parameters are passed as a
            # one-element tuple; "(policy_id)" without the trailing comma is
            # just the bare value, not a sequence.
            cursor.execute(select_raw_html,
                           (policy_id,))

            # Fetch the matching rows.
            result_rows = cursor.fetchall()

    # Initialize a rawHTML string variable
    raw_html = ''

    # Rows are accessed by column name; keep the value of the last row returned.
    for row in result_rows:
        raw_html = unquote(row['raw_html'], encoding='utf-8', errors='replace')

    return raw_html

# Example Policy Classification

In [94]:
column_list = ['policy_id', 'app_id', 'policy_url', 'content', 'plain_content', 'plain_text']
policy_df = pd.read_csv('/data/policy.csv', names = column_list)

# The three text columns are stored URL-quoted: stringify and unquote each
# value, replacing undecodable byte sequences.
for quoted_column in ('content', 'plain_content', 'plain_text'):
    policy_df[quoted_column] = policy_df[quoted_column].apply(
        lambda value: unquote(str(value), encoding='utf-8', errors='replace')
    )

In [95]:
# Sanity check: (rows, columns) of the loaded policy table.
policy_df.shape

(25, 6)

In [96]:
# Main categories whose segments are additionally run through the
# attribute-level classifiers.
ATTRIBUTE_TRIGGER_CATEGORIES = ('First Party Collection/Use', 'Third Party Sharing/Collection')

# (output key, model, label list, label indices that may be reported).
# The index ranges match the number of output units each model was loaded
# with; for "Does Not" only the label at index 1 is ever reported.
ATTRIBUTE_CLASSIFIERS = (
    ('Does Not', does_not_model, does_not_labels, (1,)),
    ('Identifiability', identifiability_model, identifiability_labels, range(3)),
    ('Purpose', purpose_model, purpose_labels, range(10)),
    ('Personal Information Type', information_type_model, information_type_labels, range(15)),
)


def select_predicted_labels(probabilities, labels, label_indices, threshold=0.5):
    """Return ``labels[i]`` for every ``i`` in ``label_indices`` whose probability exceeds ``threshold``."""
    return [labels[i] for i in label_indices if probabilities[i] > threshold]


def classify_segment_attributes(segment_text):
    """Run every attribute classifier on one segment and return ``{output key: predicted labels}``."""
    segment_tensor = dp.process_policy_of_interest(dictionary, [segment_text,])
    return {
        output_key: select_predicted_labels(model.predict_proba(segment_tensor)[0, :], labels, label_indices)
        for output_key, model, labels, label_indices in ATTRIBUTE_CLASSIFIERS
    }


results = []
for _, row in policy_df.iterrows():

    # Parse policy text as a list of strings
    policy_text = row['plain_text'].splitlines()

    # Preprocess text: merge lists with the previous paragraph, then eliminate headers
    policy_text = merge_lists(policy_text)
    policy_text = filter_out_headings(policy_text, row['plain_content'])

    # Vectorize policy segments
    segments_tensor = dp.process_policy_of_interest(dictionary, policy_text)

    # Make predictions using the main-category CNN model (one row per segment)
    predictions = main_category_model.predict_proba(segments_tensor)

    result = {
        'policy_id': row['policy_id'],
        'app_id': row['app_id'],
        'policy_url': row['policy_url'],
        'segments': []
    }

    # Append the result of each segment that received at least one main category
    for segment_index, segment_text in enumerate(policy_text):
        # Keep the labels with >50% probability; 12 is the number of output
        # units of main_category_model.
        main_labels = select_predicted_labels(predictions[segment_index, :], main_category_labels, range(12))
        if not main_labels:
            continue

        current_segment = {
            'Segment Text': segment_text,
            'Main Category': main_labels
        }
        if any(category in main_labels for category in ATTRIBUTE_TRIGGER_CATEGORIES):
            current_segment.update(classify_segment_attributes(segment_text))

        result['segments'].append(current_segment)

    results.append(result)
    

In [99]:
# Display the collected classification results (one dict per policy row).
results

[{'policy_id': 1,
  'app_id': 1434844,
  'policy_url': 'https://www.subsplash.com/legal/privacy',
  'segments': [{'Segment Text': 'Haga clic aquí para el español/Click here for Spanish.',
    'Main Category': ['Privacy contact information']},
   {'Segment Text': 'Last updated September 10, 2020 – Version 4.2',
    'Main Category': ['Policy Change', 'Introductory/Generic']},
   {'Segment Text': 'This privacy policy describes the collection, use, protection, disclosure, correction and deletion of your personal information by Subsplash. Please take a moment to read the following to learn more about our information practices, including what type of information is gathered, how the information is used and for what purposes, to whom we disclose the information, and how we safeguard your personal information. Your privacy is a priority at Subsplash, and we go to great lengths to protect it.',
    'Main Category': ['Introductory/Generic']},
   {'Segment Text': 'This privacy policy applies to t

In [98]:
# Scratch check, independent of the pipeline above: `in` on a list yields a
# bool, and int() turns that bool into 0 or 1.
sample_list = ['abc', 'EDF', 'jhi']
sample_result = 'abc' in sample_list
int(sample_result)

1