In [1]:
#Imports needed from pytorch
import torch
from torch.utils.data import Dataset
from collections import OrderedDict
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.optim import SGD,Adam

#Some built-in imports
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import pickle
from os.path import join, isfile
from os import listdir
import json

# SKLearn and Skorch
from sklearn.datasets import make_classification
from skorch import NeuralNet, NeuralNetClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import f1_score, make_scorer
from sklearn.metrics import classification_report
from skorch.callbacks import EarlyStopping

#Imports from the repository
from data_processing import get_weights_matrix, get_tokens
import data_processing as dp
from privacy_policies_dataset import PrivacyPoliciesDataset as PPD

from urllib.parse import unquote
from bs4 import BeautifulSoup

import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Load Dictionary

In [2]:
with open('./embeddings/word2idx_300.pkl', 'rb') as dictionary_file:
    dictionary = pickle.load(dictionary_file)
with open('./embeddings/word2vector_300.pkl', 'rb') as word2vector_file:
    word2vector = pickle.load(word2vector_file)
with open('./embeddings/weights_matrix_300.pkl', 'rb') as weights_matrix_file:
    weights_matrix = pickle.load(weights_matrix_file)

# Define CNN and Collate Data Fn.

In [3]:
class CNN(nn.Module):


    """
    
    Convolutional Neural Model used for training the models. The total number of kernels that will be used in this
    CNN is Co * len(Ks). 
    
    Args:
        weights_matrix: numpy.ndarray, the shape of this n-dimensional array must be (words, dims) were words is
        the number of words in the vocabulary and dims is the dimensionality of the word embeddings.
        Co (number of filters): integer, stands for channels out and it is the number of kernels of the same size that will be used.
        Hu: integer, stands for number of hidden units in the hidden layer.
        C: integer, number of units in the last layer (number of classes)
        Ks: list, list of integers specifying the size of the kernels to be used. 
     
    """
    
    def __init__(self, embeddings, vocab_size, emb_dim, Co, Hu, C, Ks, dropout, name = 'generic'):
        
        super(CNN, self).__init__()
              
        self.num_embeddings = vocab_size
        
        self.embeddings_dim = emb_dim

        self.padding_index = 0
        
        self.cnn_name = 'cnn_' + str(emb_dim) + '_' + str(Co) + '_' + str(Hu) + '_' + str(C) + '_' + str(Ks) + '_' + name

        self.Co = Co
        
        self.Hu = Hu
        
        self.C = C
        
        self.Ks = Ks
        
        self.embedding = nn.Embedding(self.num_embeddings, self.embeddings_dim, self.padding_index)
        self.embedding = self.embedding.from_pretrained(torch.tensor(embeddings).float(), freeze=True)

        self.convolutions = nn.ModuleList([nn.Conv2d(1,self.Co,(k, self.embeddings_dim)) for k in self.Ks])
        
        # activation function for hidden layers =  Rectified Linear Unit
        self.relu = nn.ReLU()
        
        self.drop_out = nn.Dropout(p=dropout)
        
        self.linear1 = nn.Linear(self.Co * len(self.Ks), self.Hu[0])
        
        self.linear2 = nn.Linear(self.Hu[-1], self.C)
        
        # activation function of output layer
        self.sigmoid = nn.Sigmoid()
        
        self.double()
    
    def forward(self,x):
        
        #size(N,1,length) to size(N,1,length,dims)
        
        x = self.embedding(x)
        
        #size(N,1,length,dims) to size(N,1,length)
        
        x = [self.relu(conv(x)).squeeze(3) for conv in self.convolutions]
        
        #size(N,1,length) to (N, Co * len(Ks))
        
        x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x]
        
        x = torch.cat(x,1)
        
        x = self.linear1(x)
        
        x = self.relu(x)
        
        x = self.linear2(x)

        x = self.sigmoid(x)
        
        return x
    

In [4]:
def collate_data(batch):

    def stack_segments(segments, clearance = 2):

        import numpy as np

        segments_len = map(len, segments)
        max_len = max(segments_len)

        segments_list = []

        output_len = max_len + clearance * 2

        for i, segment in enumerate(segments):

            segment_array = np.array(segment)

            zeros_to_prepend = int((output_len - len(segment_array))/2)

            zeros_to_append = output_len - len(segment_array) - zeros_to_prepend

            resized_array = np.append(np.zeros(zeros_to_prepend), segment_array)

            resized_array = np.append(resized_array, np.zeros(zeros_to_append))

            segments_list.append(torch.tensor(resized_array, dtype = torch.int64, device=torch.device("cuda")))

            segments_tensor = torch.stack(segments_list).unsqueeze(1)

        return segments_tensor                         

    segments = [item[0] for item in batch]

    labels = [item[1] for item in batch]

    segments_tensor = stack_segments(segments)

    labels_tensor = torch.stack(labels)

    return [segments_tensor, labels_tensor]

# Load Models

In [5]:
def load_model(current_attribute, current_num_levels):
    # Load Trained Model
    net = NeuralNet(
        CNN,
        module__embeddings = weights_matrix,
        module__vocab_size = weights_matrix.shape[0],
        module__emb_dim = weights_matrix.shape[1],
        module__Co = 200,
        module__Hu = [100],
        module__C = current_num_levels,
        module__Ks = [3],
        module__name = f'{current_attribute}_zeros_60-20-(no-val)_polisis',
        module__dropout = 0.5,
        max_epochs = 300,
        lr = 0.01,
        optimizer = SGD,
        optimizer__weight_decay = 0,
        optimizer__momentum=0.9,
        criterion = nn.BCELoss(),
        batch_size=40,
        # Turn the validation split off once we have the metadata values set
        train_split = None,
        # Shuffle training data on each epoch
        iterator_train__shuffle=True,
        iterator_train__collate_fn=collate_data,
        iterator_valid__collate_fn=collate_data,
        # Turn off verbose
        verbose = 0,
        device='cuda',
    ).initialize()
    net.load_params(f_params=f'trained_models/{current_attribute}/model.pkl',f_optimizer=f'trained_models/{current_attribute}/optimizer.pkl', f_history=f'trained_models/{current_attribute}/history.json')

In [6]:
main_category_model = load_model('Majority', 12)

RuntimeError: No CUDA GPUs are available