# Training the models

This is the notebook that was used to train the models for my capstone. The original code is meant to be run on Kaggle:
https://www.kaggle.com/lukasbusch/privbert
Parts of it are mainly modelled after:
https://towardsdatascience.com/transformers-for-multilabel-classification-71a1a0daf5e1
In this notebook:
- All classifier models are trained for the 5-fold sets
- A grid search is performed for each classifier and the results are saved

In [1]:
#importing libraries
# NOTE: the relative order of the original imports is kept on purpose: names imported
# after `from transformers import *` must keep winning over anything the star import exports.
import itertools
import os
import pandas as pd
import numpy as np
import tensorflow as tf
import torch
from torch.nn import BCEWithLogitsLoss, BCELoss
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report, confusion_matrix, multilabel_confusion_matrix, f1_score, accuracy_score
import pickle
from transformers import *
from tqdm import tqdm, trange
from ast import literal_eval


In [None]:
# A GPU is required (don't bother without): abort early when TensorFlow cannot see one.
EXPECTED_GPU_NAME = '/device:GPU:0'
device_name = tf.test.gpu_device_name()
if not device_name == EXPECTED_GPU_NAME:
    raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

# PyTorch side: pick the device that models and batches are moved to.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
n_gpu = torch.cuda.device_count()
# last expression of the cell: shows the name of GPU 0 as the cell output
torch.cuda.get_device_name(0)

In [2]:
# The custom helper modules live in the kaggle input dataset; they are copied into the
# working directory so that they can be imported by the cells below.
from shutil import copyfile

INPUT_DIR = "../input/privbert-data/"
WORKING_DIR = "../working/"

# helper modules keep their own file name (make sure it has the .py suffix)
for module_file in ("data_processing.py", "pytorch_classifier.py"):
    copyfile(src = INPUT_DIR + module_file, dst = WORKING_DIR + module_file)

# the stored gridsearch results ("...3.csv") are copied under the generic file name
copyfile(src = INPUT_DIR + "privbert_gridsearches3.csv", dst = WORKING_DIR + "privbert_gridsearches.csv")

'../working/privbert_gridsearches.csv'

In [3]:
# Initiate our data: read the train / validation csv of one fold plus the full data set,
# and turn each of them into one-hot label structures through Op115OneHots.
from data_processing import Op115OneHots
import random
TRAIN_PATH = '../input/privbert-data/op115_data/op115_train_k0.csv' # here k0 is used, for each k this had to be changed
VAL_PATH = '../input/privbert-data/op115_data/op115_val_k0.csv'
ALL_PATH = '../input/privbert-data/op115_processed.csv'
op115_train = pd.read_csv(TRAIN_PATH)
op115_val = pd.read_csv(VAL_PATH)
op115_all = pd.read_csv(ALL_PATH)

# For more information on the Op115OneHots class please look at the notebook on data_processing,
# i.e:  data_processing_explanation.ipynb

# Initiating data for respective k-set
# The full data set is processed first; the names it returns (`uniques`) are passed as
# `class_tup` to the train and validation objects below.
# NOTE(review): presumably this keeps the label columns identical across the splits -- confirm in data_processing.py
op115_all_c = Op115OneHots(op115_all)
op115_all_c.go2()
uniques = op115_all_c.return_oh_names()

op115_t_c = Op115OneHots(op115_train)
op115_t_c.go2(majority = True, class_tup = uniques)

op115_v_c = Op115OneHots(op115_val)
op115_v_c.go2(majority = True, class_tup = uniques)


# new_onehots() returns seven objects; the cells below use the *catsub / *catval mappings,
# the category (cats) and subcategory (subs) labels and the texts.
v_catsub,v_catval,v_subval,v_cats,v_subs,v_vals,v_my_texts = op115_v_c.new_onehots() # validation data
catsub,catval,subval,cats,subs,vals,my_texts = op115_t_c.new_onehots() # training data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df['str_polsegs'] = str_polseg
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df.sort_values(by=['str_polsegs'], inplace=True)


In [None]:
# Builds a dictionary that links every category name to the list of index values of its
# subcategories, using the index lookup (`inds`) returned by len_onehots().
a,b,c,inds = op115_t_c.len_onehots()
all_inds = list(inds.keys())
# keep only the keys of length 2 whose first element is a string
only2 = [key for key in all_inds if len(key) == 2 and isinstance(key[0], str)]

sub_dict = {}
for cat, sub in only2:
    cat_ind = inds[cat]
    # the subcategory entry is stored in `inds` under the key (category index, category, subcategory)
    sub_ind = inds[(cat_ind,cat,sub)]
    sub_dict.setdefault(cat, []).append(sub_ind)

print(sub_dict)

In [None]:
##Performing a grid search for the classifiers

# Setting the parameters over which I want to search; every key maps to the list of values to try
params = {  'max_length' : [128,512],
            'learning_rate': [5e-6, 1e-5, 2e-5],
            'batch_size' : [16],
            'n_epochs' : [3,6,9],
            'treshold' : [0.5]}


def _append_pairs(prefix, onehots_by_key, name_list, pair_list):
    """
    Append one classifier per key of `onehots_by_key`, in sorted key order.

    For every key the name `prefix + key` is appended to `name_list` and the pair
    (value[1], value[2]) -- used as (texts, labels) by the cells below -- to `pair_list`.
    Both lists are modified in place and therefore stay index-aligned.
    """
    for akey in sorted(onehots_by_key):
        valtup = onehots_by_key[akey]
        name_list.append(prefix + akey)
        pair_list.append((valtup[1], valtup[2]))


# Initiate list with the 'category' and 'all subcategory' texts and labels
cat_pair = [my_texts,cats]
sub_pair = [my_texts, subs]
names = ["Categories","Subcategories"]
text_label_pairs = [cat_pair,sub_pair]

# now for each other classifier we want to train we create a pair with the respective text and labels
_append_pairs('cs_', catsub, names, text_label_pairs)   # category -> subcategory classifiers
_append_pairs('cv_', catval, names, text_label_pairs)   # category -> value classifiers


# list with the texts and respective labels for the validation set, built the same way
val_cat_pair = [v_my_texts,v_cats]
val_sub_pair = [v_my_texts, v_subs]
val_names = ["Categories","Subcategories"]
val_text_label_pairs = [val_cat_pair,val_sub_pair]

# val_names is filled as well (it used to stay at its two initial entries) so that the
# validation pairs can be checked against the training pairs.
_append_pairs('cs_', v_catsub, val_names, val_text_label_pairs)
_append_pairs('cv_', v_catval, val_names, val_text_label_pairs)

# Later cells look up validation pairs with the index of the training name, which is only
# correct when both lists contain the same classifiers in the same order.
if val_names != names:
    print("WARNING: validation classifiers do not line up with the training classifiers")

In [None]:
def train_model(train_dataload, val_data_load, optimizer, model, nb_labels, epochs = 3, threshold = 0.5, ret_macro = False):
    """
    Fine-tune `model` as a multilabel classifier and score it on the validation data after every epoch.

    Uses the notebook-level `device` for all batches.

    Parameters
    ----------
    train_dataload, val_data_load : iterables of batches
        Every batch must unpack into four tensors: input ids, attention mask, labels, token types.
    optimizer : optimizer stepping the parameters of `model`
    model : callable as model(input_ids, token_type_ids=None, attention_mask=mask);
        element 0 of its output is used as the logits.
    nb_labels : int, number of labels (used to reshape logits and labels for the loss)
    epochs : int, number of passes over `train_dataload`; must be at least 1
    threshold : float, a label is predicted when its sigmoid output is larger than this value
    ret_macro : bool, when True the macro F1 of the last epoch is returned as well

    Returns
    -------
    The micro F1 (in percent) on the validation data after the last epoch, or the tuple
    (micro F1, macro F1) when `ret_macro` is True.

    Raises
    ------
    ValueError : when `epochs` is smaller than 1 (there would be no score to return).
    """
    if epochs < 1:
        raise ValueError("epochs must be at least 1, got {}".format(epochs))

    # The loss module holds no state, so one instance serves every step.
    loss_func = BCEWithLogitsLoss()

    # trange is a tqdm wrapper around the normal python range
    for _ in trange(epochs, desc="Epoch"):

        # ---- Training ----
        model.train()
        tr_loss = 0 # running loss
        nb_tr_steps = 0

        for batch in train_dataload:
            # Add batch to the device and unpack it
            batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_input_mask, b_labels, b_token_types = batch
            # Clear out the gradients (by default they accumulate)
            optimizer.zero_grad()

            # Forward pass for multilabel classification
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
            logits = outputs[0]
            # labels are converted to the dtype of the logits for the loss calculation
            loss = loss_func(logits.view(-1,nb_labels), b_labels.type_as(logits).view(-1,nb_labels))

            # Backward pass, then update the parameters with the computed gradient
            loss.backward()
            optimizer.step()

            tr_loss += loss.item()
            nb_tr_steps += 1

        print("Train loss: {}".format(tr_loss/nb_tr_steps))

        # ---- Validation ----
        model.eval()
        true_labels, pred_labels = [], []

        with torch.no_grad():
            for batch in val_data_load:
                batch = tuple(t.to(device) for t in batch)
                b_input_ids, b_input_mask, b_labels, b_token_types = batch
                outs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
                # only CPU copies are kept, so no device memory is held across batches
                pred_labels.append(torch.sigmoid(outs[0]).to('cpu').numpy())
                true_labels.append(b_labels.to('cpu').numpy())

        # Stack the per-batch arrays into one (examples x labels) array each
        pred_bools = np.concatenate(pred_labels, axis=0) > threshold
        true_bools = np.concatenate(true_labels, axis=0) == 1

        val_f1_accuracy = f1_score(true_bools,pred_bools,average='micro')*100
        val_flat_accuracy = accuracy_score(true_bools, pred_bools)*100

        print('F1 Validation Accuracy: ', val_f1_accuracy)
        print('Flat Validation Accuracy: ', val_flat_accuracy)

    # the scores of the last epoch are the ones that are returned
    if ret_macro:
        val_f1_macro_accuracy = f1_score(true_bools,pred_bools,average='macro')*100
        return val_f1_accuracy, val_f1_macro_accuracy
    return val_f1_accuracy

In [None]:
from pytorch_classifier import BertClassification
from keras import backend as K


def RunBertOneLoop(labels,texts,labels_val, texts_val, max_length,batch_size,learning_rate,n_epochs,nb_labels, return_model = False, ret_macro = False, threshold = 0.5):
    """
    Build the train / validation data from raw texts and labels, load a fresh model and train it once.

    Relies on the notebook-level names `device`, `TOK_NAME`, `MODEL_NAME` and `train_model`.

    Parameters
    ----------
    labels, texts : training labels and texts
    labels_val, texts_val : validation labels and texts
    max_length : passed to `encode_texts` of the project class BertClassification
    batch_size : passed to `encode_texts` as `batchsize`
    learning_rate : passed to `init_optimizer`
    n_epochs : number of epochs handed to `train_model`
    nb_labels : number of labels of the classifier (`num_labels` of the model)
    return_model : when True (and `ret_macro` is False) the trained model is returned as well
    ret_macro : when True both the micro and the macro F1 are returned; takes precedence over `return_model`
    threshold : decision threshold handed to `train_model` (default 0.5, the value `train_model` uses itself)

    Returns
    -------
    (micro_f1, macro_f1) when `ret_macro` is True, else (micro_f1, model) when `return_model`
    is True, else only micro_f1.
    """
    tryout = BertClassification(device=device)
    tryout.train = False
    tryout.test = True
    tryout.init_tokenizer(TOK_NAME)
    tryout.input_labels(labels)
    tryout.input_texts(texts)

    # NOTE(review): the validation helper is built with gpu=False while the training helper gets
    # device=device; kept as in the original -- confirm against pytorch_classifier.py
    tryout_v = BertClassification(gpu = False)
    tryout_v.test = True
    tryout_v.train = False
    tryout_v.init_tokenizer(TOK_NAME)
    tryout_v.input_labels(labels_val)
    tryout_v.input_texts(texts_val)

    model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=nb_labels)
    # move the model to the same device train_model moves the batches to
    model.to(device)
    train_data = tryout.encode_texts(max_length,batchsize = batch_size)
    val_data = tryout_v.encode_texts(max_length,batchsize = batch_size)
    optimizer = tryout.init_optimizer(model,learning_rate)

    result = train_model(train_data, val_data, optimizer, model, nb_labels, n_epochs,
                         threshold = threshold, ret_macro = ret_macro)
    if ret_macro:
        # result is the tuple (micro f1, macro f1)
        return result
    if return_model:
        return result, model
    return result

# Running the gridsearch

In the code below the gridsearch is performed using the parameters that were set above.

In [None]:
# set file name and column names
SAVEFILE = 'privbert_gridsearches.csv'
TOK_NAME = 'bert-base-uncased'
MODEL_NAME = '../input/privbert-model'
PARAM_NAMES = ['max_length','learning_rate','batch_size','epochs','treshold']

# Positions (in `names` / `text_label_pairs`) of the classifiers that are searched in this run.
MODEL_INDICES = [12]

# Values to try per parameter; a default is used when a parameter is not defined in `params`.
max_length = params.get('max_length', [128])
learning_rate = params.get('learning_rate', [2e-5])
batch_size = params.get('batch_size', [32])
n_epochs = params.get('n_epochs', [3])
treshold = params.get('treshold', [0.5])

for model_index in MODEL_INDICES:
    cur_name = names[model_index]
    cur_texts,cur_labels = text_label_pairs[model_index]
    # Validation data is looked up at the same position, as the final training cell does;
    # this assumes the training and validation pair lists are aligned.
    cur_val_texts,cur_val_labels = val_text_label_pairs[model_index]
    nb_labels = len(cur_labels[0])
    print(nb_labels)
    if nb_labels <= 1:
        # nothing to search for a classifier with a single label
        continue

    param_list = []
    acc_list = []
    # go through all parameter combinations (same order as nested loops over the five lists)
    for ml, lr, bs, n, tr in itertools.product(max_length, learning_rate, batch_size, n_epochs, treshold):
        # run the training function for every combination, returning only the accuracy (return_model = False).
        # RunBertOneLoop needs the validation labels / texts as 3rd and 4th argument.
        # NOTE: `tr` is only recorded in the results; it is not forwarded to the training here.
        acc = RunBertOneLoop(cur_labels,cur_texts,cur_val_labels,cur_val_texts,ml,bs,lr,n,nb_labels)
        acc_list.append(acc)
        param_list.append([ml,lr,bs,n,tr])

        # clear memory after each loop
        K.clear_session()
        torch.cuda.empty_cache()

    # Add the scores as a new column to the existing results, or start a new file.
    # Only a missing file starts a new table: any other problem (e.g. a results file with a
    # different number of rows) now raises instead of silently overwriting earlier results.
    try:
        results_df = pd.read_csv(SAVEFILE)
    except FileNotFoundError:
        results_df = pd.DataFrame(param_list,columns=PARAM_NAMES)
    results_df[cur_name] = acc_list
    # index=False in both cases: an extra index column would shift the columns that are
    # later selected by position when the best parameters are read back.
    results_df.to_csv(SAVEFILE,index=False)
    print("Saved")

                        

# Training the models

After finding the optimal parameters for each classifier it is now time to actually train the models
For this we will train a model with the optimal parameters 10 times and choose the one that yielded the best results on the validation set

In [None]:
# Some settings are repeated from above so that this part can be run without re-running
# the whole gridsearch code every time.
SAVEFILE = 'privbert_gridsearches.csv'
PARAM_NAMES = ['max_length','learning_rate','batch_size','epochs','treshold']

# read the stored gridsearch results back in
gridsearch_results = pd.read_csv(SAVEFILE)
gr_cols = gridsearch_results.columns.tolist()
# the parameter combinations, one row per gridsearch run
param_df = gridsearch_results.loc[:, PARAM_NAMES]
# every column from position 5 onwards is treated as the score column of one classifier
model_names = gr_cols[5:]

In [None]:
# For every classifier pick the parameter row with the highest gridsearch score
# (the first such row when several rows share the highest score).
best_params = {}
for model_name in model_names:
    scores = gridsearch_results[model_name].to_list()
    best_row = scores.index(max(scores))
    best_params[model_name] = param_df.iloc[best_row].to_list()

# Dictionary that links a model name to its position in `names` (defined earlier).
# Example name: 'cs_Data_Retention'
names_index = {name: position for position, name in enumerate(names)}

cat_names = sorted(op115_t_c.unique_cats)


In [None]:
# set parameters and the folder where the models will be stored as well as the language model used to train upon
N_TRIES = 10
SAVEFOLDER = 'trained_models'
TOK_NAME = 'bert-base-uncased'
MODEL_NAME = '../input/privbert-model'

for amodel in model_names:
    # set the parameters for each model (they are read back from csv, so the counts are cast to int)
    # NOTE: `tr` (the threshold column) is read but not used by the training call below.
    ml,lr,bs,epoch,tr = best_params[amodel]
    ml = int(ml)
    bs = int(bs)
    epoch = int(epoch)

    # get the texts and labels per model; validation pairs are looked up with the training index
    texts, labels = text_label_pairs[names_index[amodel]]
    val_texts, val_labels = val_text_label_pairs[names_index[amodel]]
    nb_labels = len(labels[0])

    path = SAVEFOLDER + '/' + amodel

    n = 0
    highest_accuracy = 0
    # retrain up to N_TRIES times, stopping early once a perfect score has been reached
    while n < N_TRIES and highest_accuracy < 100:
        # return both the model and the score for a trained model
        acc,model = RunBertOneLoop(labels,texts,val_labels, val_texts,ml,bs,lr,epoch,nb_labels, return_model = True)
        saved = False
        # only save the model if the accuracy is higher than all previous ones
        if acc > highest_accuracy:
            # make sure the target folder exists before the model is written to it
            os.makedirs(path, exist_ok=True)
            model.save_pretrained(path)
            highest_accuracy = acc
            saved = True
        n+=1

        print("finshed try-{} for {} \n accuracy={} \n saved={}".format(n,amodel,acc,saved))

        # release the model of this try before the next one is loaded (as the gridsearch loop does)
        del model
        torch.cuda.empty_cache()
    

In [None]:
!tar -zcvf trained_modelsk3.tar.gz /kaggle/working/trained_models #create a zipped folder

In [None]:
from IPython.display import FileLink

# Displaying the link as the cell output makes it possible to download the archive.
ARCHIVE_PATH = './trained_modelsk0.tar.gz'
FileLink(ARCHIVE_PATH)

# Testing different stratification methods

To test the effect of choosing different stratification methods I perform a 15-fold test using stratification either on the segments or on the policies. The code used for that is found below. Some code might be redundant; it is repeated so that not the whole notebook has to be run.

In [4]:
from data_processing import Op115OneHots
import pandas as pd
import numpy as np

# load all data
# NOTE: this cell rebinds several names that were defined earlier for a single fold
# (op115_all_c, uniques, inds, catsub, catval, subval, cats, subs, vals, my_texts);
# from here on they refer to the complete data set.
ALL_POLS = '../input/privbert-data/op115_processed.csv'
all_pols_df = pd.read_csv(ALL_POLS)
poll_uids = sorted(all_pols_df.policy_uid.unique())  # sorted unique policy ids
labels_per_pol = []

# one-hot structures for the complete data set
op115_all_c = Op115OneHots(all_pols_df)
op115_all_c.go2(majority = True)
uniques = op115_all_c.return_oh_names()  # later passed as class_tup when the data is split per policy
catsub_index, catval_index, subval_index, inds = op115_all_c.len_onehots()

# cats (category labels) and my_texts (texts) are the inputs of the stratification tests below
catsub,catval,subval,cats,subs,vals,my_texts = op115_all_c.new_onehots()

In [5]:
# Get, for every category column of the one-hot matrix, the indexes of the segments that are
# labelled with that category. Every entry of `indexes` is the tuple returned by np.where, so
# the row positions of category i are found in indexes[i][0].
cat_arry = np.array(cats)
# The number of categories is taken from the matrix itself, and the comprehension keeps its
# helper variables local so that the `inds` lookup of the previous cell is not overwritten.
indexes = [np.where(cat_arry[:, col] == 1) for col in range(cat_arry.shape[1])]
    

In [7]:
# For each of the 10 categories: the first element of the pol_seg() entry of every segment
# labelled with that category (used below as the policy uid of the segment).
polsegs = op115_all_c.pol_seg()
policies = [
    [polsegs[ind][0] for ind in indexes[i][0].astype(int)]
    for i in range(10)
]

specials = [0,2,6,8] # the 'specials' are the categories with few labels, these I want to make sure are somewhat divided
special_pols = []
for special in specials:
    pols_of_special = policies[special]
    print(set(pols_of_special))
    special_pols.extend(pols_of_special)

# remove the duplicates: every special policy is listed once
special_pols = list(set(special_pols))
print(special_pols)

{32, 33, 98, 58, 133, 70, 135, 164, 105, 144, 82, 20, 21, 26, 93}
{32, 98, 58, 164, 70, 135, 105, 144, 82, 20, 21, 26, 93}
{32, 33, 98, 58, 133, 70, 135, 164, 105, 175, 144, 82, 20, 21, 26, 59, 93}
{32, 33, 98, 58, 133, 70, 135, 164, 105, 175, 144, 82, 20, 21, 26, 59, 93}
[32, 33, 98, 58, 133, 70, 135, 164, 105, 175, 144, 82, 20, 21, 26, 59, 93]


In [8]:
from skmultilearn.model_selection import IterativeStratification

def iterative_train_test_split(X, y, train_size):
    """
    Split X and y into a train and a test part with skmultilearn's IterativeStratification
    (two folds, order=1), which 'maintains balanced representation with respect to
    order-th label combinations.'

    The fold distribution handed to the stratifier is [1 - train_size, train_size]; the
    first split yielded by the stratifier is the one that is used.

    Returns X_train, y_train, X_test, y_test (X and y indexed with the returned indices).
    """
    fold_distribution = [1.0-train_size, train_size, ]
    stratifier = IterativeStratification(
        n_splits=2, order=1, sample_distribution_per_fold=fold_distribution)
    splits = stratifier.split(X, y)
    train_indices, test_indices = next(splits)
    return X[train_indices], y[train_indices], X[test_indices], y[test_indices]


In [None]:
all_poluids = all_pols_df['policy_uid'].unique()
without_special = [poluid for poluid in all_poluids if poluid not in special_pols]
def split_per_uid(X, y, train_size, special_pols = special_pols, without_special = without_special):
    """
    Split the data per policy instead of per segment.

    The 'special' policies are always divided 50 - 50 over train and validation: if policies
    were picked completely at random there is a decent chance that one ends up with no
    support for certain labels. The other policies are split with a test size of
    (1 - train_size) - 0.04.

    X and y are accepted so that the call mirrors iterative_train_test_split, but they are
    not used: texts and labels are rebuilt from `all_pols_df` for the selected policies.

    Returns train_texts, train_labels, val_texts, val_labels, where the labels are the
    category labels returned by Op115OneHots.new_onehots().
    """
    def _texts_and_cat_labels(policy_df):
        # texts and category labels of one subset, produced the same way as earlier in the notebook
        onehots = Op115OneHots(policy_df)
        onehots.go2(majority = True, class_tup=uniques)
        _catsub, _catval, _subval, cat_labels, _subs, _vals, texts = onehots.new_onehots()
        return texts, cat_labels

    test_size = 1 - train_size
    # split the special pols 50 - 50
    special_train, special_val = train_test_split(special_pols, test_size = 0.5)
    # split the other pols based on user choice
    other_train, other_val = train_test_split(without_special, test_size = test_size - 0.04)

    # policy uids for training and for validating
    train_uids = other_train + special_train
    val_uids = other_val + special_val
    train_df = all_pols_df[all_pols_df['policy_uid'].isin(train_uids)]
    val_df = all_pols_df[all_pols_df['policy_uid'].isin(val_uids)]

    train_texts, train_labels = _texts_and_cat_labels(train_df)
    val_texts, val_labels = _texts_and_cat_labels(val_df)

    return train_texts, train_labels, val_texts, val_labels
    

# Perform the 15 tests

In [None]:
TOK_NAME = 'bert-base-uncased'
MODEL_NAME = '../input/privbert-model'

n_per_strat = 15

# result columns with the number of training / testing labels per category
tr_cats = ['tr_' + cat for cat in cat_names]
te_cats = ['te_' + cat for cat in cat_names]

# start from the best parameters found for the category classifier
ml,lr,bs,epoch,tr = best_params['Categories']
ml = int(ml)
bs = int(bs)
epoch = 5 #less epochs as it is not as important
train_size = 0.75

X = np.array(my_texts)
Y = np.array(cats)
nb_labels = len(Y[0])

COLNAMES = ['k', 'split_method', 'train_segments', 'micro_f1', 'macro_f1'] + tr_cats + te_cats
outdf = pd.DataFrame(columns = COLNAMES)

# (name stored in the results, function producing one split, progress message)
# first the standard (iterative) stratification, then the split per policy
split_methods = [
    ('iterative', lambda: iterative_train_test_split(X,Y, train_size), "finished with {} for iterative"),
    ('uid', lambda: split_per_uid(X,Y, train_size+ 0.03), "finished with {} for uid"),
]

for method_name, make_split, done_message in split_methods:
    for i in range(n_per_strat):
        train_texts, train_labels, val_texts, val_labels = make_split()
        tr_labels = list(np.sum(train_labels, axis = 0)) #number of training labels per category
        te_labels = list(np.sum(val_labels, axis = 0)) #number of testing labels per category
        # micro and macro scores for this set (based on validation/testing set), hence ret_macro = True
        micro_f1,macro_f1 = RunBertOneLoop(list(train_labels),list(train_texts),list(val_labels), list(val_texts),ml,bs,lr,epoch,nb_labels, ret_macro = True)
        row = [i, method_name, len(train_texts), micro_f1, macro_f1] + tr_labels + te_labels
        outdf.loc[len(outdf)] = row

        print(done_message.format(i))

outdf.to_csv('stratification_comparison.csv')

In [None]:
# Store the results of the individual runs ...
outdf.to_csv('new_strats.csv')

# ... and the mean of every column per split method.
grpd = (
    outdf
    .groupby(['split_method'])
    .mean()
)
grpd.to_csv('strat_grouped.csv')