# Full Hierarchical System

In this notebook one can find the code that is used for the full hierarchical classifier as described in my capstone:
https://github.com/luka5132/NLPToS
It uses the trained classification models that were created using the privbert.ipynb notebook (https://www.kaggle.com/lukasbusch/privbert) and the training data that can be found in privbert-data.

In this notebook I:
- Define the 'hierarchical classification' function which leverages all the classification models to perform a multi level classification
- Perform a grid search to find the optimal parameters for the advice function as well as the best 'candidate threshold' and 'final threshold'

In [None]:
#load libraries
import pandas as pd
import numpy as np
import tensorflow as tf
import torch
from torch.nn import BCEWithLogitsLoss, BCELoss
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report, confusion_matrix, multilabel_confusion_matrix, f1_score, accuracy_score
import pickle
from transformers import *
from tqdm import tqdm, trange 
from ast import literal_eval


In [None]:
# Select the compute device; the notebook stops here when TensorFlow sees no GPU.
device_name = tf.test.gpu_device_name()
gpu_found = device_name == '/device:GPU:0'
if not gpu_found:
    raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

# PyTorch device handle and GPU count.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
n_gpu = torch.cuda.device_count()
torch.cuda.get_device_name(0)

In [None]:
# The custom modules and the grid-search results are copied from the input
# dataset into the working directory (modules need the .py suffix).
from shutil import copyfile

# (source, destination) pairs; the grid-search file gets a different name on copy.
FILES_TO_COPY = [
    ("../input/privbert-data/data_processing.py", "../working/data_processing.py"),
    ("../input/privbert-data/pytorch_classifier.py", "../working/pytorch_classifier.py"),
    ("../input/privbert-data/hierarchical_data.py", "../working/hierarchical_data.py"),
    ("../input/privbert-data/privbert_gridsearches3.csv", "../working/privbert_gridsearches.csv"),
]
for source_path, target_path in FILES_TO_COPY:
    copyfile(src = source_path, dst = target_path)
# disabled: copyfile(src = "../input/privbert-data/tresholds.csv", dst = "../working/treshold.csv")

In [None]:
from data_processing import Op115OneHots

# Load data; for more information on Op115OneHots please look at the notebook file with the same name.
# NOTE(review): the constants are named *_K1 but point at the k0 fold files — confirm which fold is meant.
VAL_PATH_K1 = '../input/privbert-data/op115_data/op115_val_k0.csv'
TEST_PATH_K1 = '../input/privbert-data/op115_data/op115_test_k0.csv'
TRAIN_PATH_K1 = '../input/privbert-data/op115_data/op115_train_k0.csv'
ALL_PATH = '../input/privbert-data/op115_processed.csv'
op115_val = pd.read_csv(VAL_PATH_K1)
op115_test= pd.read_csv(TEST_PATH_K1)
op115_train= pd.read_csv(TRAIN_PATH_K1)
op115_all = pd.read_csv(ALL_PATH)

# Process the complete data set first: its one-hot names are passed to the
# validation and test splits below as class_tup.
op115_all_c = Op115OneHots(op115_all)
op115_all_c.go2()

uniques = op115_all_c.return_oh_names() # need this to make sure all one-hot vectors have the same dimension
# catsub_index / catval_index are read later to size the per-category models and
# inds is used to build sub_indexes; their exact layout is defined in
# data_processing.py (not visible here).
catsub_index, catval_index, subval_index, inds = op115_all_c.len_onehots()

# Sorted category names; their positions are used as category indexes throughout.
cat_names = sorted(op115_all_c.unique_cats)

# load validation data
op115_v_c = Op115OneHots(op115_val)
op115_v_c.go2(majority = True, class_tup = uniques)

# load test data
op115_t_c = Op115OneHots(op115_test)
op115_t_c.go2(majority = True, class_tup = uniques)

# new_onehots() yields seven values; by the names used here these are the
# category-subcategory, category-value and subcategory-value labels, the
# category, subcategory and value labels, and the texts — TODO confirm against
# data_processing.py.
v_catsub,v_catval,v_subval,v_cats,v_subs,v_vals,v_my_texts = op115_v_c.new_onehots()
t_catsub,t_catval,t_subval,t_cats,t_subs,t_vals,t_my_texts = op115_t_c.new_onehots()

In [None]:
# Build, per category, the list of indexes that belong to it in the label of the
# suball classifier, so that one knows which label positions belong to which category.
indskeys = list(inds.keys())

# Only the two-element keys whose first element is a category name are relevant.
len2 = sorted(key for key in indskeys if len(key) == 2 and key[0] in cat_names)

sub_indexes = {}
for cat, sub in len2:
    # the index of a subcategory is stored under (category index, category, subcategory)
    sub_tup = (inds[cat], cat, sub)
    position = inds[sub_tup]
    sub_indexes.setdefault(cat, []).append(position)

print(sub_indexes)
                  

In [None]:
# BertClassification is imported here because this cell is the first one that
# uses it; the notebook's other import of it only appears in a later cell, so
# running the cells top to bottom would otherwise raise a NameError.
from pytorch_classifier import BertClassification

# Stand-alone classifier wrapper that is only used to try out the tokenizer.
tokenizer = 'bert-base-uncased'
tryout = BertClassification(gpu = False)
tryout.test = True
tryout.train = False
tryout.init_tokenizer(tokenizer)

In [None]:
# Show how one validation segment is split into tokens.
sample_text = v_my_texts[11]
tryout.tokenizer.tokenize(sample_text)

In [None]:
from pytorch_classifier import BertClassification

def load_test_data(tokenizer, texts, batch_size, max_length, labels = None):
    """
    Encode texts into a dataloader, using 'BertClassification' as defined in
    pytorch_classifier.py.

    Parameters:
        tokenizer:  name of the tokenizer, passed to BertClassification.init_tokenizer
        texts:      the texts/segments to encode
        batch_size: batch size handed to encode_texts
        max_length: maximum length handed to encode_texts
        labels:     optional labels; when given (and non-empty) they are encoded
                    together with the texts

    Returns whatever BertClassification.encode_texts returns for the given input.
    """
    tryout = BertClassification(gpu = False)
    tryout.test = True
    tryout.train = False
    tryout.init_tokenizer(tokenizer)

    # `if labels:` is ambiguous for NumPy arrays (it raises a ValueError), so test
    # explicitly; None and empty label collections both mean "no labels".
    has_labels = labels is not None and len(labels) > 0
    if has_labels:
        tryout.init_data(texts,labels)
        return tryout.encode_texts(max_length,batchsize = batch_size)

    tryout.input_texts(texts)
    return tryout.encode_texts(max_length,batchsize = batch_size, with_labels = False)

   

In [None]:
def perform_test(model, test_dataloader, with_labels = False):
    """
    Run a model over a dataloader and return its sigmoid prediction scores.

    Parameters:
        model:           the classifier; it is called with the input ids and the
                         attention mask and its first output is taken as the logits
        test_dataloader: iterable of batches. Without labels a batch is unpacked as
                         (input ids, attention mask, token types), with labels as
                         (input ids, attention mask, labels, token types)
        with_labels:     set to True when the batches contain labels

    Returns:
        a list with one array of prediction scores per input text; when
        with_labels is True a tuple (predictions, true_bools) where true_bools
        holds, per text, the element-wise result of label == 1.
    """
    # Put model in evaluation mode
    model.eval()

    pred_labels = []
    true_labels = []

    for batch in test_dataloader:
        # move every tensor of the batch to the device selected for this notebook
        batch = tuple(t.to(device) for t in batch)
        if with_labels:
            b_input_ids, b_input_mask, b_labels, _ = batch
        else:
            b_input_ids, b_input_mask, _ = batch

        with torch.no_grad():
            # Forward pass; the token types are deliberately not passed to the model
            outs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
            probabilities = torch.sigmoid(outs[0])

        # extend() adds one row per text, which keeps the result flat over all batches.
        # The raw logits are not collected: they were never part of the result.
        pred_labels.extend(probabilities.to('cpu').numpy())
        if with_labels:
            true_labels.extend(b_labels.to('cpu').numpy())

    if with_labels:
        true_bools = [tl==1 for tl in true_labels]
        return (pred_labels, true_bools)
    return pred_labels
        

## Set the variables for a gridsearch

In [None]:
import os

# Results of the grid searches that were run while training the classifiers.
ML_AND_BS = ['max_length','batch_size']
SAVEFILE = 'privbert_gridsearches.csv'
gridsearch_results = pd.read_csv(SAVEFILE)
gr_cols = list(gridsearch_results.columns)
param_df = gridsearch_results[ML_AND_BS]
# the first five columns hold the search parameters, every later column is a model
model_names = gr_cols[5:]

# Best [max_length, batch_size] per model: the dataloader for a model should use
# the values of the run in which that model scored highest.
model_parameter = {}
for model_name in model_names:
    scores = gridsearch_results[model_name].to_list()
    winning_row = scores.index(max(scores))
    model_parameter[model_name] = param_df.iloc[winning_row].to_list()

# The models were saved in such a way that a '/' in the name was seen as a
# subfolder. The files have been taken out of that subfolder, so the path of a
# model is its name up to the first '/' (the whole name when there is no '/').
models_dir = '../input/privbert-models/trained_models'
models_paths = {
    model_name: os.path.join(models_dir, model_name.split('/')[0])
    for model_name in model_names
}

In [None]:
# Complete parameter row (all five parameter columns) of the best run per model.
# This is stored under its own name on purpose: hierarchical_classification
# unpacks every entry of model_parameter as exactly two values
# (max_length, batch_size), so overwriting model_parameter with five-element rows
# would make that unpacking fail.
model_parameter_full = {}
for model_name in model_names:
    col_results = gridsearch_results[model_name].to_list()
    best_res = col_results.index(max(col_results))
    model_parameter_full[model_name] = gridsearch_results[gr_cols[:5]].iloc[best_res].to_list()

In [None]:
# Number of model columns in the grid-search results.
model_columns = gr_cols[5:]
print(len(model_columns))

In [None]:
# Show the selected parameters of every model.
for mp, chosen in model_parameter.items():
    print("for {}: \n {} \n".format(mp, chosen))

# module_sizes is a dictionary that stores the output dimension of each classifier.
# For example: Categories : 10
#              Subcategories : 36
# The assignment has to be real code: with the dictionary left undefined the
# loop below fails with a NameError on its first iteration.
module_sizes = {}
for amodel in model_names:
    if amodel.startswith('cs'):
        # 'cs_<category>': size is looked up in catsub_index
        module_sizes[amodel] = catsub_index[amodel.split('_')[1]]
    elif amodel.startswith('cv'):
        # 'cv_<category>': size is looked up in catval_index
        module_sizes[amodel] = catval_index[amodel.split('_')[1]]
    elif amodel == 'Subcategories':
        module_sizes[amodel] = 36
    else:
        module_sizes[amodel] = 10

print(module_sizes)

In [None]:
# Set parameters for gridsearch.
# In gridsearch mode hierarchical_classification tries every combination of the
# four lists below and passes each combination to the advice methods of
# HierarchicalData (return_suball_advice / return_advice).
PARAMS = {  'label_treshold' : [0.4,0.5,0.6,0.7,0.8,0.9],
            'advice_val' : [0.05,0.1,0.15,0.2,0.25],
            'subseq_val' : [0.02,0.04,0.06,0.08,0.10],
            'negative_advice' : [-0.05,-0.1,-0.15,-0.2,-0.25,-0.3,-0.35,-0.4]}


In [None]:
from hierarchical_data import HierarchicalData #for more information please see the respective notebook
from keras import backend as K

def _predict_with_model(model_path, num_labels, tokname, texts, batch_size, max_length):
    """Load one trained classifier, get its predictions for texts and free the GPU memory again."""
    model = BertForSequenceClassification.from_pretrained(model_path, num_labels=num_labels)
    model.cuda()

    test_dataloader = load_test_data(tokname, texts, batch_size, max_length)
    preds = perform_test(model, test_dataloader)

    # Drop the reference before emptying the cache: memory that is still held by
    # a live model cannot be released.
    del model
    K.clear_session()
    torch.cuda.empty_cache()
    return preds


def hierarchical_classification(tokname, texts, params_per_model, model_to_path, sub_index, candidate_treshold, 
                                model_size_dict = module_sizes, cat_names = cat_names, gridsearch = False, params = PARAMS,
                                true_labels = None):
    """
    Function that performs the full hierarchical classification process on a list of texts/segments.

    Parameters:
        tokname:            name of the tokenizer, passed on to 'load_test_data'
        texts:              the texts/segments to classify
        params_per_model:   dictionary: model name -> (max_length, batch_size), used to
                            turn the texts into a dataloader for that model
        model_to_path:      dictionary: model name -> location of the trained model. Must
                            contain 'Categories' and 'Subcategories'; every other key is
                            expected to look like '<prefix>_<category name>', where the
                            prefix 'cv' marks a category-values model and any other
                            prefix a category-subcategories model
        sub_index:          dictionary that links the indexes of the suball classification
                            model to the respective categories
        candidate_treshold: determines which prediction scores require further evaluation
        model_size_dict:    dictionary: model name -> number of labels of that model
        cat_names:          list of category names; the position is the category index
        gridsearch:         when True all combinations of the lists in params are evaluated
        params:             with gridsearch: dictionary with the lists 'label_treshold',
                            'advice_val', 'subseq_val' and 'negative_advice'.
                            Without gridsearch: dictionary: category name -> dictionary
                            with those four keys, of which the first element is used
        true_labels:        optional labels of texts; they are not used for the
                            classification and are only handed back in gridsearch mode

    Returns:
        with gridsearch:    (cat_preds, true_labels, all_params, all_sub_advices,
                            gridsearch_advices), where all_params[i] is the tuple
                            (candidate_treshold, label_treshold, advice_val, subseq_val,
                            negative_advice) that belongs to all_sub_advices[i]
        without gridsearch: (predictions per advice layer, pred_dict), where pred_dict
                            holds the raw predictions of the used models
    """
    # imported locally: the notebook itself only imports itertools in a later cell
    import itertools

    pred_dict = {}

    # Start by loading the Categories and Subcategories predictions into the HierarchicalData class
    cat_ml, cat_bs = params_per_model['Categories']
    sub_ml, sub_bs = params_per_model['Subcategories']

    cat_preds = _predict_with_model(model_to_path['Categories'], 10, tokname, texts, cat_bs, cat_ml)

    # initiate the advice class and store the predictions
    advice_class = HierarchicalData(candidate_treshold)
    advice_class.set_parameters(params)
    advice_class.set_variables(cat_names)
    advice_class.read_cat_predictions(np.array(cat_preds))
    advice_class.define_candidates(candidate_treshold) # determine candidates

    # predictions for the subcategory layer
    sub_preds = _predict_with_model(model_to_path['Subcategories'], 36, tokname, texts, sub_bs, sub_ml)
    pred_dict['suball'] = sub_preds
    advice_class.read_sub_predictions(np.array(sub_preds)) # save predictions

    if gridsearch:
        # Every parameter combination, in the same order as four nested loops over
        # label_treshold, advice_val, subseq_val and negative_advice. The position in
        # this list is the number under which the advice of a combination is stored.
        grid = list(itertools.product(params['label_treshold'], params['advice_val'],
                                      params['subseq_val'], params['negative_advice']))
        # a fresh array per call, so the advice function never sees an array that an
        # earlier call may have changed
        all_sub_advices = [advice_class.return_suball_advice(sub_index, np.array(sub_preds), lt, av, sb, na)
                           for lt, av, sb, na in grid]
        all_params = [(candidate_treshold,) + combination for combination in grid]
    else:
        # otherwise we use the normal parameters; the call is kept because the advice
        # class may record the advice internally (not visible from this notebook)
        advice_class.return_suball_advice(sub_index, np.array(sub_preds))

    model_names = [name for name in sorted(model_to_path) if name not in ('Categories', 'Subcategories')]

    texts = np.array(texts)

    if gridsearch:
        # reserve room in the class for the advice of every parameter combination
        advice_class.create_gridsearch_advices(len(texts),len(all_sub_advices))

    # go through each classifier
    for amodel_name in model_names:
        prefix, cat_name = amodel_name.split('_') # get category name
        cat_index = cat_names.index(cat_name)

        candidates = advice_class.return_candidate(cat_index) # which texts are candidates for this category
        if not any(candidates):
            continue # no candidates, so this model does not have to be loaded

        indexes = np.where(np.array(candidates))
        candidate_texts = texts[candidates]
        # cv = category-values, thus we are working with the 'values' = val here
        sub_or_val = 'val' if prefix == 'cv' else 'sub'

        model_ml, model_bs = params_per_model[amodel_name]
        preds = _predict_with_model(model_to_path[amodel_name], model_size_dict[amodel_name],
                                    tokname, candidate_texts, model_bs, model_ml)

        # kept for interpretation of the results
        pred_dict[amodel_name] = {'prediction': preds, 'index': indexes}

        if gridsearch:
            for n, (lt, av, sb, na) in enumerate(grid):
                advice = advice_class.return_advice(cat_index, preds, lt, av, sb, na)
                advice_class.save_gridsearch_advice(n,advice,sub_or_val,cat_index)
        else:
            # get unique parameters for this category
            cat_params = params[cat_name]
            advice = advice_class.return_advice(cat_index, preds,
                                                cat_params['label_treshold'][0],
                                                cat_params['advice_val'][0],
                                                cat_params['subseq_val'][0],
                                                cat_params['negative_advice'][0])
            advice_class.save_advice(advice,cat_index,sub_or_val)

    if gridsearch:
        # return all advices for all parameters
        gridsearch_advices = advice_class.return_gridsearch_advice()
        return cat_preds, true_labels, all_params, all_sub_advices, gridsearch_advices

    # otherwise return only the predictions for each advice layer with the input parameters
    all_combination_preditions = advice_class.return_predictions_layers()
    return (all_combination_preditions,pred_dict)
    

In [None]:
TOKNAME = 'bert-base-uncased'
# The labels are not an argument of the classification itself, so they are kept
# here under the name the later evaluation code reads.
true_labels = t_cats
# Five values are unpacked, which is what the function returns in gridsearch
# mode. model_parameter is the dictionary with (max_length, batch_size) per model.
cat_preds, _returned_labels, all_params, all_sub, gridsearch_advices = hierarchical_classification(
    TOKNAME, t_my_texts, model_parameter, models_paths, sub_indexes, 0.15, gridsearch = True)

In [None]:
def return_predictions(cat_preds, all_sub = None, sub = None,  val = None, all_sub_go = False, sub_go = False, val_go = False):
    """
    Add the selected advice layers to the base predictions and return the sum.

    A layer (all_sub, sub, val) is only added when its switch (all_sub_go, sub_go,
    val_go) is set. Without any switch the base predictions are returned unchanged.
    """
    total = cat_preds
    switches_and_layers = ((all_sub_go, all_sub), (sub_go, sub), (val_go, val))
    for enabled, advice in switches_and_layers:
        if enabled:
            total = total + advice
    return total


In [None]:
import itertools
def return_combinations(cat_preds, all_sub, sub, val):
    """
    Return the predictions for every possible combination of advice layers.

    The first element is cat_preds itself, followed by the combinations of one,
    two and three layers in the order produced by itertools.combinations.
    """
    layer_ids = (1, 2, 3)  # 1 = all_sub, 2 = sub, 3 = val
    all_combinations = [cat_preds]
    for size in range(1, len(layer_ids) + 1):
        for chosen in itertools.combinations(layer_ids, size):
            combined = return_predictions(cat_preds, all_sub, sub, val,
                                          all_sub_go = 1 in chosen,
                                          sub_go = 2 in chosen,
                                          val_go = 3 in chosen)
            all_combinations.append(combined)
    return all_combinations

# Gridsearch advice parameters

In [None]:
# Set variables and savenames
SAVEFILE = 'treshold_gridsearch.csv' 
TOKNAME = 'bert-base-uncased'
final_tresholds = [0.4,0.5,0.6,0.7,0.8,0.9]
candidate_treshold = [0.1,0.15,0.2,0.25,0.3,0.4]
ADVICE_NAMES = ['cat', 'sub','allsub','val','allsub_sub', 'sub_val','allsub_val','allsub_sub_val']
# colnames are a bit different than in the paper as the naming was done differently
COLNAMES = ['class', 'candidate_treshold','label_treshold','advice_val','subseq_val','negative_advice','final_treshold'] + ADVICE_NAMES
saved = False

# progress bar (roughly 570000 different parameters)
BAR = tf.keras.utils.Progbar(570000)

if not saved:
    result_df= pd.DataFrame(columns = COLNAMES)
else:
    result_df = pd.read_csv(SAVEFILE)

# The labels of the validation texts; the classification itself does not need
# them, they are only used for scoring below.
true_labels = v_cats
true_label_matrix = np.array(true_labels)

# Rows are collected in a list and turned into a dataframe once at the end:
# appending to a dataframe row by row gets slower with every row.
new_rows = []

# we call the hierarchical_classification function for the different values of candidate_tresholds
for ct in candidate_treshold:
    cat_preds, _returned_labels, all_params, all_sub, gridsearch_advices = hierarchical_classification(
        TOKNAME, v_my_texts, model_parameter, models_paths, sub_indexes, ct, gridsearch = True)
    print(ct)
    for i,param in enumerate(all_params):
        # predictions of the eight advice layers for this parameter set, in the order
        # cat, allsub, sub, val, allsub_sub, allsub_val, sub_val, allsub_sub_val
        layers = [np.array(layer) for layer in
                  return_combinations(cat_preds, all_sub[i], gridsearch_advices[0][i], gridsearch_advices[1][i])]

        for final_treshold in final_tresholds:
            for cat_index, cat_name in enumerate(cat_names):
                true_class_labels = true_label_matrix[:,cat_index]

                # micro f1 score per advice layer for this category and this set of parameters.
                # NOTE(review): on a single binary column the micro average equals the
                # accuracy — confirm this is the intended metric.
                result_scores = [f1_score(true_class_labels, layer[:,cat_index] > final_treshold, average='micro')*100
                                 for layer in layers]

                new_rows.append([cat_name] + list(param) + [final_treshold] + result_scores)
                BAR.add(1)

new_results = pd.DataFrame(new_rows, columns = COLNAMES)
if result_df.empty:
    result_df = new_results
else:
    result_df = pd.concat([result_df, new_results], ignore_index = True)

# save dataframe
result_df.to_csv('tresholds_per_class.csv')
        


# Using parameters

The parameter df that was created above has been processed, and the best parameters have been selected and put into a .json file. This is, however, done in a different notebook, namely the data_processing notebook ('data_processing.ipynb').

In [None]:
import json

# Read the best parameters as saved in the json file.
with open('../input/privbert-data/advice_parameters.json') as params_file:
    best_parameters = json.load(params_file)

# The two tresholds are taken from the entry of the 'Data Retention' category.
example_params = best_parameters['Data Retention']
candidate_treshold, final_treshold = (
    example_params[key][0] for key in ('candidate_treshold', 'final_treshold')
)
    
    

In [None]:
# Load the fold that is used for the final test run.
testf = pd.read_csv('../input/privbert-data/op115_data/op115_test_k4.csv') # NOTE(review): this reads fold k4, while the earlier cells and the output file name refer to k0 — confirm which fold is intended
onlyfortexts = Op115OneHots(testf)
onlyfortexts.go2(majority = True, class_tup = uniques)

# This overwrites t_catsub, t_catval, t_subval, t_subs and t_vals from the earlier
# test split. The category labels of this fold end up in train_cats (not t_cats)
# and belong to the texts in `texts`.
t_catsub,t_catval,t_subval,train_cats,t_subs,t_vals,texts = onlyfortexts.new_onehots()

In [None]:
# Perform the test with the best parameters. With gridsearch = False the function
# returns a tuple (predictions per advice layer, pred_dict), so `results` holds both.
TOKNAME = 'bert-base-uncased'
results = hierarchical_classification(TOKNAME,texts,model_parameter,models_paths,sub_indexes,candidate_treshold, gridsearch = False, params=best_parameters)

# Save results

Save results in a df and a json

In [None]:
# Column names for the true labels: every category name prefixed with 'true_'.
true_cat_names = ['true_' + name for name in cat_names]

In [None]:
# Columns of the prediction table: the advice layer, one score per category and
# one true label per category.
COLNAMES = ['advice_name', *cat_names, *true_cat_names]

In [None]:
result_dict = {}

# hierarchical_classification returns (predictions per advice layer, pred_dict)
# when it is not run as a gridsearch; only the layer predictions are scored here.
if isinstance(results, tuple) and len(results) == 2 and isinstance(results[1], dict):
    layer_predictions = results[0]
else:
    layer_predictions = results

# The labels that belong to the classified texts: `texts` and `train_cats` were
# returned together by the same new_onehots() call.
true_cats = train_cats

prediction_rows = []
for i, amodel in enumerate(layer_predictions):
    # one entry of ADVICE_NAMES per advice layer
    advice_name = ADVICE_NAMES[i]

    pred_bools = [pl>final_treshold for pl in amodel]
    val_f1_accuracy = f1_score(true_cats,pred_bools,average='micro')*100
    val_flat_accuracy = accuracy_score(true_cats, pred_bools)*100

    print('F1 Validation Accuracy: ', val_f1_accuracy)
    print('Flat Validation Accuracy: ', val_flat_accuracy)

    clf_report = classification_report(true_cats,pred_bools, target_names = cat_names, output_dict =True)
    result_dict[advice_name] = clf_report

    # one row per text: the layer name, the prediction scores and the true labels
    for preds, trues in zip(amodel, true_cats):
        prediction_rows.append([advice_name] + list(preds) + list(trues))

# build the table in one go instead of appending to a dataframe row by row
prediction_df = pd.DataFrame(prediction_rows, columns = COLNAMES)

# sanity check: both numbers should be equal (texts x advice layers)
print(len(true_cats) * len(layer_predictions))
print(len(prediction_df))
        
    

In [None]:
import json

# Store the classification report of every advice layer as json.
with open('test_results.json', 'w') as results_file:
    json.dump(result_dict, results_file)

In [None]:
# Store the prediction table as csv.
predictions_path = "predictions_k0.csv"
prediction_df.to_csv(predictions_path)