# Interpreting PrivBERT Hierarchical Classification Results
## Lukas Busch (AUC)
### 

Created on Kaggle:
https://www.kaggle.com/lukasbusch/privbert-results

This notebook was used in combination with the privbert-data dataset, which contains the classification results and label-support data for my capstone (https://github.com/luka5132/NLPToS) on multi-label classification of the OPP-115 dataset (https://www.usableprivacy.org/data).
Scroll down to see how:
- The 5-fold data was combined to create the final results
- The label support for the data was calculated
...


In [None]:
# loading the needed libraries
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix, multilabel_confusion_matrix, f1_score, accuracy_score

# Label distribution:

With the code below, the mean label distribution of the 5-fold validation sets was calculated.

In [None]:
# copyfile is needed to make the custom modules importable from the working directory
from shutil import copyfile

# (source, destination) pair for every helper module that ships with the dataset;
# the destination must keep the correct suffix (.py) so that it can be imported
module_copies = [
    ("../input/privbert-data/data_processing.py", "../working/data_processing.py"),
    ("../input/privbert-data/pytorch_classifier.py", "../working/pytorch_classifier.py"),
    ("../input/privbert-data/hierarchical_data.py", "../working/hierarchical_data.py"),
]
for module_src, module_dst in module_copies:
    copyfile(src=module_src, dst=module_dst)

In [None]:
from data_processing import Op115OneHots
ALL_PATH = '../input/privbert-data/op115_processed.csv'

# Read ALL data so that we know how many different classes there are for each level:
    # Categories     10
    # Subcategories  36
    # Values         259

# Process the data using the Op115OneHots class located in data_processing.py
op115_all = pd.read_csv(ALL_PATH)
op115_all_c = Op115OneHots(op115_all)
op115_all_c.go2(majority = True)

# Get the unique labels. Since some labels are sparse, it is possible that a value label does not
# appear in the test or train set; that is why every Op115OneHots instance below is initialised with
# all labels (via class_tup = uniques), so that all one-hot vectors have the same length.
uniques = op115_all_c.return_oh_names()
catsub_index, catval_index, subval_index, inds = op115_all_c.len_onehots()

# Name per category. Note that the names are sorted, so the order differs from the one in the paper.
cat_names = sorted(op115_all_c.unique_cats)

In [None]:
# Initialise the dataframe in which the label counts will be stored:
# one row per (fold k, category) with the number of labels in the train / val / test split.
COLNAMES = ['k', 'class', 'train_size', 'val_size', 'test_size']
label_df = pd.DataFrame(columns = COLNAMES , dtype=float)

for i in range(5):
    # Read the train / val / test split of fold i
    TRAIN_DATAPATH = '../input/privbert-data/op115_data/op115_train_k{}.csv'.format(i)
    VAL_DATAPATH = '../input/privbert-data/op115_data/op115_val_k{}.csv'.format(i)
    TEST_DATAPATH = '../input/privbert-data/op115_data/op115_test_k{}.csv'.format(i)
    traindf = pd.read_csv(TRAIN_DATAPATH)
    valdf = pd.read_csv(VAL_DATAPATH)
    testdf = pd.read_csv(TEST_DATAPATH)

    # Get the multi-label one-hot vectors (train_cats, val_cats and test_cats).
    # Every split is processed with class_tup = uniques so that all one-hot vectors share one layout.
    # Only the 4th value returned by new_onehots() is used here; the t_* names are throw-away
    # and are overwritten by each of the three calls.
    op115_tr = Op115OneHots(traindf)
    op115_tr.go2(majority = True, class_tup = uniques)

    t_catsub,t_catval,t_subval,train_cats,t_subs,t_vals,t_my_texts = op115_tr.new_onehots()

    op115_val = Op115OneHots(valdf)
    op115_val.go2(majority = True, class_tup = uniques)

    t_catsub,t_catval,t_subval,val_cats,t_subs,t_vals,t_my_texts = op115_val.new_onehots()

    op115_te = Op115OneHots(testdf)
    op115_te.go2(majority = True, class_tup = uniques)

    t_catsub,t_catval,t_subval,test_cats,t_subs,t_vals,t_my_texts = op115_te.new_onehots()

    # Turn the one-hot vectors from type list into type np.array so columns can be sliced
    traincats = np.array(train_cats)
    valcats = np.array(val_cats)
    testcats = np.array(test_cats)
    print('set {}:'.format(i))
    # 10 = number of categories (see the class counts listed where the full dataset is loaded)
    for j in range(10):
        # For each column, count the number of labels by summing that column (a label is either 0 or 1)
        cat_name = cat_names[j] # name of the category
        train_labels = sum(traincats[:,j]) # number of labels in the train set
        val_labels = sum(valcats[:,j]) # number of labels in the val set
        test_labels = sum(testcats[:,j]) # number of labels in the test set

        # Create a row and append it at the end of the dataframe
        row = [i,cat_name,train_labels,val_labels,test_labels]
        label_df.loc[len(label_df)] = row

In [None]:
label_df.to_csv('label_distribution.csv') #save the df

In [None]:
# Read the stored label distribution (one row per fold and class)
labeldist = pd.read_csv('../input/privbert-data/label_distribution.csv')
labeldist = labeldist.groupby(['class']).mean() # mean of every remaining column per class, i.e. over the folds
labeldist['all_size'] = labeldist['train_size'] + labeldist['test_size'] +labeldist['val_size'] # labels over all three splits
totsum = sum(labeldist['all_size']) # total number of labels
print(totsum)
# Despite the column name, 'tot %' is a fraction (all_size / totsum), not a value multiplied by 100
labeldist['tot %'] = labeldist['all_size'] / totsum # share of labels for a category with respect to the total
labeldist.head()

In [None]:
# total number of labels: first over all splits, then per split (train / val / test)
for size_column in ('all_size', 'train_size', 'val_size', 'test_size'):
    print(sum(labeldist[size_column]))

In [None]:
# Quick check to see how well the data is stratified over the 5 test sets
# (not really needed after the code above)
for i in range(5):
    DATAPATH = '../input/privbert-data/op115_data/op115_test_k{}.csv'.format(i)
    testdf = pd.read_csv(DATAPATH)

    # Build the one-hot vectors with the full label set so that the columns line up across folds
    op115_t_c = Op115OneHots(testdf)
    op115_t_c.go2(majority = True, class_tup = uniques)

    t_catsub,t_catval,t_subval,t_cats,t_subs,t_vals,t_my_texts = op115_t_c.new_onehots()
    numpcats = np.array(t_cats)
    print('testset {}:'.format(i))
    # Print the number of labels per category column (10 categories)
    for j in range(10):
        print(sum(numpcats[:,j]))

# The following code calculates the F1 scores for the 5 test sets

It is not completely automated; instead, I loaded the 5 different prediction files separately and combined them later. That way I didn't really need to copy my code too many times.

In [None]:
# Start with k = 0.
# In general this cell is run for every k in range(5), but manually: change k, re-run this cell
# and the cells below it, and store the result of each run.
k = 0
PRED_PATH = '../input/privbert-data/predictions_k{}.csv'.format(k)
pred_df = pd.read_csv(PRED_PATH) # read the predictions of fold k
pred_df = pred_df.drop(['Unnamed: 0'],axis=1)
pred_df.head()

advice_names = pred_df.advice_name.unique() # the distinct advice-layer names
# Column slices are positional and only valid AFTER 'Unnamed: 0' has been dropped above.
# presumably column 0 is 'advice_name', followed by 10 score columns and then the true labels — TODO confirm
pred_cols = pred_df.columns[1:11] # scores for the predicted labels
true_cols = pred_df.columns[11:] # actual (true) labels

print(pred_cols,true_cols) # names

In [None]:
# Maps advice-layer name -> classification report (dict) for the fold currently loaded in pred_df
result_dict = {}
for adv_name in advice_names:
    adv_df = pred_df[pred_df['advice_name'] == adv_name] # get the scores for the respective advice method
    preds = adv_df[pred_cols].values

    true_labels = adv_df[true_cols].values.tolist()
    pred_bools = [pl>0.6 for pl in preds] # transform predictions into bools using the final threshold of 0.6 as obtained
                                          # by gridsearch

    # NOTE(review): the printed labels say 'Validation', while the surrounding text describes these
    # prediction files as test-set results — confirm which split the files contain.
    val_f1_accuracy = f1_score(true_labels,pred_bools,average='macro')*100 # macro-averaged F1 in %, only for printing
    val_flat_accuracy = accuracy_score(true_labels, pred_bools)*100 # accuracy in %, only for printing

    print('F1 Validation Accuracy: ', val_f1_accuracy)
    print('Flat Validation Accuracy: ', val_flat_accuracy)

    # Save the scores using the 'classification_report' function from sklearn.metrics;
    # output_dict = True returns the report as a nested dict instead of a string
    clf_report = classification_report(true_labels,pred_bools, target_names = pred_cols, output_dict =True)
    result_dict[adv_name] = clf_report


In [None]:
#COLUMNS = ['class', 'support'] + list(advice_names)
def get_nice_res(k, results=None, advice=None):
    """
    Turn the classification-report dictionaries into one easy-to-use dataframe.

    The returned frame has one row per entry of the reports (the keys are
    sorted, so this includes any aggregate entries the reports contain) and
    the columns 'class', 'support', 'k', followed by one column per advice
    layer holding that layer's 'f1-score'.

    Parameters
    ----------
    k : int
        Index of the fold the reports belong to; repeated in the 'k' column.
    results : dict, optional
        Maps advice-layer name -> report dict, where every report maps an
        entry name to a dict with at least 'f1-score' and 'support'.
        Defaults to the notebook-level `result_dict`.
    advice : iterable of str, optional
        Advice-layer names to include, in column order.
        Defaults to the notebook-level `advice_names`.

    Returns
    -------
    pandas.DataFrame
        Empty when `advice` is empty.
    """
    # Fall back to the notebook globals so existing calls `get_nice_res(k)` keep working
    if results is None:
        results = result_dict
    if advice is None:
        advice = advice_names

    columns = {}
    for adv_name in advice:
        report = results[adv_name]
        classnames = sorted(report)

        # class names and support are taken from the first advice layer only
        if not columns:
            columns['class'] = classnames
            columns['support'] = [report[name]['support'] for name in classnames]
            columns['k'] = [k] * len(classnames)

        columns[adv_name] = [report[name]['f1-score'] for name in classnames]

    # Build the frame once instead of adding columns to an empty frame one by one
    return pd.DataFrame(columns)



In [None]:
k0 = get_nice_res(k) # save results for k0

In [None]:
k1 = get_nice_res(k) # '' k1

In [None]:
k2 = get_nice_res(k) # '' k2

In [None]:
k3 = get_nice_res(k) # '' k3

In [None]:
k4 = get_nice_res(k) # '' k4

In [None]:
# adding the df's together

# Stack the per-fold result frames (k0 ... k4) into one frame with a fresh index.
# pd.concat replaces the chain of DataFrame.append calls: most of that chain was
# commented out, which left `alldf` undefined when this cell was run on its own,
# and DataFrame.append is no longer available in recent pandas versions.
alldf = pd.concat([k0, k1, k2, k3, k4], ignore_index = True)


In [None]:
import json
# Automated variant of the manual per-fold cells above:
# all_k_dict maps fold index -> {advice-layer name -> classification report dict}
all_k_dict = {}
for i in range(5):
    pred_df = pd.read_csv('../input/privbert-data/predictions_k{}.csv'.format(i))
    advice_names = pred_df.advice_name.unique() # the distinct advice-layer names
    # 'Unnamed: 0' is NOT dropped here, so the positional slices start one column later
    # than in the manual cell (2:12 instead of 1:11)
    pred_cols = pred_df.columns[2:12] # scores for the predicted labels
    true_cols = pred_df.columns[12:] # actual (true) labels
    adv_names = pred_df['advice_name'].unique()
    k_dict = {}
    for adv in adv_names:
        adv_df = pred_df[pred_df['advice_name'] == adv]
        preds = adv_df[pred_cols].values
        bools = [p > 0.6 for p in preds] # binarise with the 0.6 threshold
        trues = list(adv_df[true_cols].values)
        clf = classification_report(trues,bools, target_names = pred_cols, output_dict = True)
        k_dict[adv] = clf

    all_k_dict[i] = k_dict

In [None]:
all_k_dict[4]

In [None]:
# Collect, per advice layer and per report entry, the precision and recall of all 5 folds:
# resdict[advice][entry] == {'precision': [p_fold0, ...], 'recall': [r_fold0, ...]}
resdict = {}
relevant_names = ['cat', 'sub_val']
for i in range(5):
    res = all_k_dict[i]
    for adv in relevant_names:
        # setdefault creates the nested containers on first use, so no throw-away
        # per-fold dictionary is needed and an entry that first appears in a later
        # fold no longer raises a KeyError
        adv_res = resdict.setdefault(adv, {})
        for akey, scores in res[adv].items():
            entry = adv_res.setdefault(akey, {'precision': [], 'recall': []})
            entry['precision'].append(scores['precision'])
            entry['recall'].append(scores['recall'])
    

In [None]:
# Table with, per report entry, the fold-averaged precision and recall
# of the 'cat' and the 'sub_val' advice layer
COLNAMES = ['name','cat_prec', 'cat_rec', 'sv_prec', 'sv_rec']
to_df = pd.DataFrame(columns = COLNAMES)
cat_dict = resdict['cat']
sv_dict = resdict['sub_val']
for cat in sorted(cat_dict):
    # order of the means: cat precision, cat recall, sub_val precision, sub_val recall
    fold_means = [
        np.mean(layer_scores[cat][metric])
        for layer_scores in (cat_dict, sv_dict)
        for metric in ('precision', 'recall')
    ]
    # append the row at the end of the frame
    to_df.loc[len(to_df)] = [cat] + fold_means

to_df


In [None]:
# saving the (mean) results
alldf.to_csv('all_k_results2.csv')
# mean of every score column per class, i.e. averaged over the folds
testd = alldf.groupby(['class']).mean()
# Write the grouped means. The original wrote `testdf` here, which is a raw
# test split loaded earlier in the notebook and not the grouped results.
testd.to_csv('all_k_grouped.csv')

In [None]:
all_results = pd.read_csv('../input/privbert-data/all_k_prec_recall.csv')
all_results

In [None]:
subvals = all_results[list(all_results.columns[-4:])].values
for i in range(4):
   # print(subvals[:,i][:10])
    print(np.mean(subvals[:,i][:10]))

In [None]:
subvals

# Error Analysis

The following code is used to analyze the prediction results in more depth, i.e. I have a look at the number of True Positives, False Positives, True Negatives and False Negatives.

In [None]:
def find_differences(predicts, truths,treshold = 0.6):
    """
    Compare prediction scores with the true labels, label by label.

    A score counts as a positive prediction when it is strictly greater than
    `treshold`. This is the same rule (`score > 0.6`) that the rest of the
    notebook uses to binarise predictions before computing F1 scores, so the
    counts returned here agree with those scores.

    Parameters
    ----------
    predicts : sequence of sequences of numbers
        One row of label scores per sample.
    truths : sequence of sequences
        True labels (0 or 1), indexed exactly like `predicts`.
    treshold : float, default 0.6
        Decision threshold.

    Returns
    -------
    tuple
        ``((TP, TN, FP, FN), (TP_L, TN_L, FP_L, FN_L), mistake_set)`` where the
        first element holds the counts, the second holds for every outcome the
        list of ``abs(treshold - score)`` values, and ``mistake_set`` is the set
        of row indices that contain at least one false positive or false negative.
    """
    TP = FP = TN = FN = 0
    TP_L, FP_L, TN_L, FN_L = [], [], [], []
    mistake_set = set()

    # go over each prediction row
    for i, predict_array in enumerate(predicts):
        truths_array = truths[i]
        # look at each label individually
        for j, label_score in enumerate(predict_array):
            bool_score = 1 if label_score > treshold else 0
            # distance between the score and the decision threshold
            diff_from_treshold = abs(treshold - label_score)
            truth_score = truths_array[j]

            if bool_score == 0 and truth_score == 0:
                TN += 1
                TN_L.append(diff_from_treshold)
            elif bool_score == 1 and truth_score == 1:
                TP += 1
                TP_L.append(diff_from_treshold)
            elif bool_score == 0 and truth_score == 1:
                FN += 1
                FN_L.append(diff_from_treshold)
                mistake_set.add(i) # one wrong label marks the whole row as a mistake
            else:
                # everything that is left is counted as a false positive
                FP += 1
                FP_L.append(diff_from_treshold)
                mistake_set.add(i) # one wrong label marks the whole row as a mistake

    return ((TP,TN,FP,FN),(TP_L,TN_L,FP_L,FN_L),mistake_set)


In [None]:
# In the paper I had a closer look at the results for the fifth (i.e. k4) set
DATAPATH = '../input/privbert-data/predictions_k4.csv'
result_df = pd.read_csv(DATAPATH)
prednames = result_df.columns[2:12]
truenames = result_df.columns[12:]
print(result_df['advice_name'].unique())

In [None]:
# Similar to the code above that analyses the mean scores; however, here we only look at 2 advice names:
# 'cat' and 'sub_val' ('sub_val' in reality is 'suball_val', but I made a mistake in naming)
from sklearn.metrics import f1_score
catlist = [] # per fold: find_differences(...) result for the 'cat' layer
svlist = []  # per fold: find_differences(...) result for the 'sub_val' layer
catf1 = []   # per fold: micro-averaged F1 of the 'cat' layer
svvf1 = []   # per fold: micro-averaged F1 of the 'sub_val' layer
for i in range(5):
    DATAPATH = '../input/privbert-data/predictions_k{}.csv'.format(i)
    result_df = pd.read_csv(DATAPATH)
    catdf = result_df[result_df['advice_name'] == 'cat']
    subvaldf = result_df[result_df['advice_name'] == 'sub_val']

    # prednames / truenames are the column selections made in the cell above
    cat_preds = catdf[prednames].values
    cat_labels = catdf[truenames].values
    cat_bools = [pred > 0.6 for pred in cat_preds] # get the boolean predictions for the 'cat' / 'base' advice layer

    sv_preds = subvaldf[prednames].values
    sv_labels = subvaldf[truenames].values
    sv_bools = [pred > 0.6 for pred in sv_preds] # get the boolean predictions for the 'sub_val'= 'suball_val' advice layer

    # counts of TP / TN / FP / FN, distances to the threshold and indices of rows with a mistake
    cat_diffs = find_differences(cat_preds, cat_labels)
    sv_diffs = find_differences(sv_preds, sv_labels)

    #print(cat_preds[0],type(cat_labels[0]))

    cf1 = f1_score(cat_labels,cat_bools, average = 'micro') # calculate scores
    svf1 = f1_score(sv_labels,sv_bools, average = 'micro')

    catf1.append(cf1) # add score
    svvf1.append(svf1)

    catlist.append(cat_diffs)
    svlist.append(sv_diffs)

# After the loop, result_df, catdf, subvaldf, cat_bools, cat_labels, sv_bools and sv_labels
# still hold the values of the last fold (i = 4); later cells rely on that.

In [None]:
# save the number of TP, TN, FP, FN for the test sets
# catlist[i][0] / svlist[i][0] is the (TP, TN, FP, FN) tuple of fold i. The lists
# below transpose them: entry j holds the 5 per-fold values of count j.
# Comprehensions are used instead of index loops so that the notebook-level fold
# index `k` is no longer overwritten by a loop variable, and so that no `[0] * 4`
# placeholder has to be told apart from real data by its truthiness.
catallpr = [[catlist[i][0][j] for i in range(5)] for j in range(4)] # for the categories
svallpr = [[svlist[i][0][j] for i in range(5)] for j in range(4)] # for the suball_val

In [None]:
catallpr

In [None]:
svallpr

In [None]:
print(svallpr) # example of results, first row is all the TP 
# second row is TN,
# third row is FP
# fourth row is FN

In [None]:
# average each of the four counts (TP, TN, FP, FN) over the folds, for both layers
kmean, svmean = [], []
for count_idx in range(4):
    cat_avg = np.mean(catallpr[count_idx])
    sv_avg = np.mean(svallpr[count_idx])
    kmean.append(cat_avg)
    svmean.append(sv_avg)
    print(cat_avg, sv_avg)

# Below are extra functions that were used more on the fly, for example for when choosing three example segments

In [None]:
# Split the currently loaded predictions (result_df) into the two advice layers of interest
k4_cat = result_df[result_df['advice_name'] == 'cat']
k4_sv = result_df[result_df['advice_name'] == 'sub_val']

# scores, boolean predictions (threshold 0.6) and true labels of the 'cat' layer
k4_preds_cat = np.array(list(k4_cat[prednames].values))
k4_bools_cat = [p > 0.6 for p in k4_preds_cat]
k4_true_cat = np.array(list(k4_cat[truenames].values))

# the same for the 'sub_val' layer
k4_preds_sv = np.array(list(k4_sv[prednames].values))
k4_bools_sv = [p > 0.6 for p in k4_preds_sv]
k4_true_sv = np.array(list(k4_sv[truenames].values))

# NOTE(review): the already binarised predictions are passed as scores here, so the
# distance-to-threshold lists in the results are not meaningful; the cells below only
# read element [2], the set of row indices with at least one mistake.
res_cat = find_differences(k4_bools_cat,k4_true_cat)
res_sv = find_differences(k4_bools_sv,k4_true_sv)

In [None]:
uncats = [i for i in res_cat[2] if i not in res_sv[2]]
unsv = [i for i in res_sv[2] if i not in res_cat[2]]

In [None]:
# uncats and unsv stand for unique-cats and unique-sv. That is unique mistakes, so a unique-cat is a mistake that was only
# a mistake in that advice layer, but not in the next

uncatbools = np.array(cat_bools)[uncats]
unsvbools = np.array(sv_bools)[list(unsv)]
uncatlabels = np.array(cat_labels)[uncats]
unsvlabels = np.array(sv_labels)[list(unsv)]


In [None]:
# per selected segment: sum over its true-label vector
catsumlab = [sum(label_row) for label_row in uncatlabels]

# per selected segment: sum over its boolean prediction vector
catsumbool = [sum(bool_row) for bool_row in uncatbools]

print(catsumlab)
print(catsumbool)

In [None]:
# per selected segment: sum over its true-label vector
sumlabels = [sum(label_row) for label_row in unsvlabels]

In [None]:
# per selected segment: sum over its boolean prediction vector
sumbools = [sum(bool_row) for bool_row in unsvbools]

In [None]:
# rows that are a mistake in the 'cat' results but are not listed in uncats
stayedsame = [idx for idx in res_cat[2] if idx not in uncats]
print(len(stayedsame))

# boolean predictions (b) and true labels (l) of both layers for those rows
catsameb, catsamel = np.array(cat_bools)[stayedsame], np.array(cat_labels)[stayedsame]
svsameb, svsamel = np.array(sv_bools)[stayedsame], np.array(sv_labels)[stayedsame]

# row-wise sums of the label vectors and of the prediction vectors
catsumlabs = [sum(label_row) for label_row in catsamel]
catsumbools = [sum(bool_row) for bool_row in catsameb]
svsumlabs = [sum(label_row) for label_row in svsamel]
svsumbools = [sum(bool_row) for bool_row in svsameb]

# (label sum - prediction sum) per row for each layer, and the two added together
diffarr_cat = np.array(catsumlabs) - np.array(catsumbools)
diffarr_lab = np.array(svsumlabs) - np.array(svsumbools)
totdiff = diffarr_cat + diffarr_lab
print(totdiff)

In [None]:
slist = []
for i in range(10):
    aa = sum(catsameb[:,i]) - sum(catsamel[:,i])
    print(aa)
    slist.append(aa)

print()
print(sum(slist))

In [None]:
fplist, fnlist = [], []
for i in range(10):
    # label minus prediction: -1 marks a false positive, 1 a false negative
    aal = svsamel[:, i] - svsameb[:, i]
    fp_count = np.count_nonzero(aal == -1)
    fn_count = np.count_nonzero(aal == 1)
    print("FP count for {} : {}".format(cat_names[i], fp_count))
    print("FN count for {} : {}".format(cat_names[i], fn_count))
    #print(sum(aal))
    fplist.append(fp_count)
    fnlist.append(fn_count)

print("total false positives: ", sum(fplist))
print("total false negatives: ", sum(fnlist))

In [None]:
# Keep the current FP / FN count lists under a `cat_` name.
# NOTE(review): plain assignment binds the very same list objects, it does not copy them.
# The names only diverge once `fplist` / `fnlist` are rebound to new lists (e.g. by re-running
# the counting cell); presumably the counting cell was run for the 'cat' arrays before this cell
# and for the 'sub_val' arrays afterwards — confirm the intended execution order.
cat_fp = fplist
cat_fn = fnlist

In [None]:
fp_inc = [fp / cat_fp[i] for i,fp in enumerate(fplist) if cat_fp[i]]
fn_inc = [fn / cat_fn[i] for i,fn in enumerate(fnlist) if cat_fn[i]]

In [None]:
fp_inc = [1] + fp_inc
fp_inc = fp_inc[:2] + [1] + fp_inc[2:]
fp_inc

In [None]:
fn_inc

In [None]:
cat_names[9]

In [None]:
# false positives / false negatives per category, relative to that category's label count (lcount)
sv_fpperc = [fp / lcount[i] for i,fp in enumerate(fplist)]
# The original bound this list to `sfnperc`, but the difference computed further down
# reads `sv_fnperc`, which raised a NameError. Define the name that is actually used
# and keep the old spelling as an alias.
sv_fnperc = [fn / lcount[i] for i,fn in enumerate(fnlist)]
sfnperc = sv_fnperc

In [None]:
cat_fpperc = [fp / lcount[i] for i,fp in enumerate(fplist)]
cat_fnperc = [fn / lcount[i] for i,fn in enumerate(fnlist)]

In [None]:
diff_fp = np.array(cat_fpperc) - np.array(sv_fpperc)
diff_fn = np.array(cat_fnperc) - np.array(sv_fnperc)

In [None]:
slist, sslist = [], []
for i in range(10):
    # label minus prediction: -1 marks a false positive, 1 a false negative
    aal = unsvlabels[:, i] - unsvbools[:, i]
    fp_count = np.count_nonzero(aal == -1)
    fn_count = np.count_nonzero(aal == 1)
    print("FP count for {} : {}".format(cat_names[i], fp_count))
    print("FN count for {} : {}".format(cat_names[i], fn_count))
    #print(sum(aal))
    slist.append(fp_count)
    sslist.append(fn_count)

print(sum(slist))
print(sum(sslist))

In [None]:
catsameb

In [None]:
#example of poorly calculated one
sumbools.index(3)

In [None]:
sum([True,False,True])

In [None]:
# poorer :
madeworse = list(unsv)[17]
# better:
madebetter = uncats[0]
#same :
madesame = stayedsame[7]
print(madebetter,madeworse,madesame)

In [None]:
truenames

In [None]:
result_df.iloc[madeworse]

In [None]:
len(onlyfortexts.segments_vals)

In [None]:
made_df = onlyfortexts.df[onlyfortexts.df['segment_text'] == texts[madeworse]]

In [None]:
made_df

In [None]:
texts[madeworse]

In [None]:
# Load the test split of fold 4 again, this time to get hold of the segment texts
testf = pd.read_csv('../input/privbert-data/op115_data/op115_test_k4.csv')
onlyfortexts = Op115OneHots(testf)
onlyfortexts.go2(majority = True, class_tup = uniques)

# NOTE(review): the 4th return value is stored in `train_cats`, which overwrites the earlier
# variable of that name; from here on `train_cats` holds the one-hot vectors of this TEST split.
t_catsub,t_catval,t_subval,train_cats,t_subs,t_vals,texts = onlyfortexts.new_onehots()
print(len(texts))
    

In [None]:
# number of labels per category column of train_cats
lcount = []
label_matrix = np.array(train_cats)
for i in range(10):
    sumcounts = sum(label_matrix[:, i])
    print("label count for {} : {}".format(cat_names[i], sumcounts))
    lcount.append(sumcounts)

In [None]:
k4_sv.iloc[madeworse]

In [None]:
# Collect, for the chosen example segment(s), the labels, the predictions of the
# 'cat' and 'sub_val' layers, and the change in score introduced by two further layers.
labelz_c = []
predz_c = []
labelz_sv = []
predz_sv = []
subadvices = []
valadvices = []
textz = []

for index in [madeworse]:
    catsub = catdf.iloc[index]
    svsub = subvaldf.iloc[index]
    # NOTE(review): `subdf` is not defined anywhere in the visible notebook, and `valdf` is the raw
    # validation split loaded in the label-distribution cell, not a prediction frame. Presumably both
    # were meant to be rows of result_df filtered on the respective advice_name — confirm and define them.
    subsub = subdf.iloc[index]
    valsub = valdf.iloc[index]
    text = texts[index]


    # NOTE: these assignments overwrite the fold-level arrays cat_preds / cat_labels / sv_preds /
    # sv_labels from the per-fold loop with the values of a single row
    cat_preds = catsub[prednames].values
    cat_labels = catsub[truenames].values

    sv_preds = svsub[prednames].values
    sv_labels = svsub[truenames].values

    sub_preds = subsub[prednames].values
    val_preds = valsub[prednames].values

    # difference of each layer's scores to the 'cat' scores
    sub_adv = sub_preds - cat_preds
    val_adv = val_preds - cat_preds

    subadvices.append(sub_adv)
    valadvices.append(val_adv)

    labelz_c.append(cat_labels)
    labelz_sv.append(sv_labels)

    predz_c.append(cat_preds)
    predz_sv.append(sv_preds)

    textz.append(text)