## Tuning threshold

This notebook demonstrates the intended threshold tuning mechanism:

- For each outer fold:
    - Choose the best model with ROC AUC
    - For each inner fold:
        - make predictions
        - for each threshold:
            - calculate metrics
    - Calculate best threshold on average
    - Calculate metrics of best model with best threshold against test set

In [1]:
import pandas as pd
import numpy as np
import cv_setup_simplified as cvs
import time

In [106]:
class args():
    pass
args.model_name = "climatebert/distilroberta-base-climate-f"
#args.model_name = "distilbert-base-uncased"
args.y_prefix = "INCLUDE"
#.y_prefix = "4 -"
args.n_splits = 3
args.make_predictions = False
args.roundup = False

In [107]:
t0 = time.time()

# Establish what task number we have if running from slurm, otherwise just get a random number
# This means we are just running the script in test mode
try:
    from mpi4py import MPI
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    test = False
except:
    import random
    rank = random.randint(0,args.n_splits**2)
    print(rank)
    test = True
    
rank_i = rank%args.n_splits
rank_j = rank//args.n_splits

print("Rank I ", rank_i, "Rank j", rank_j)
# Import the rest of the libraries
import gc
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import GridSearchCV, cross_val_score, cross_validate, KFold
import pickle
import re

# Load data
seen_df = pd.read_csv('../data/0_labelled_documents.csv')

seen_df = (seen_df
      .sort_values('id')
      .sample(frac=1, random_state=1)
      .reset_index(drop=True)
)

weights_df = pd.read_csv('../data/0_label_weights.csv')

# Get the target labels from the y_prefix argument passed to this script
if len(args.y_prefix) < 2:
    args.y_prefix+=" "
cols = [x for x in seen_df.columns if re.match(f"^{args.y_prefix}",x)]


# If the target is inclusion, use only those documents for which we have a non-na value
# Otherwise, only use those documents which are included
# also define what subset is to be treated as a random representative sample
# For labels beyond inclusion, we treat all those that are representative of the included
# studies as representative
if "INCLUDE" in args.y_prefix:
    y_var = cols[0]
    seen_df = seen_df.loc[pd.notna(seen_df[y_var]),:].reset_index(drop=True)
    seen_df['random'] = seen_df['representative_sample']
else:
    seen_df = seen_df[seen_df['INCLUDE']==1]
    seen_df['random'] = seen_df['representative_relevant']
    

# Turn the columns into target variables and get class-weights to counteract class imbalances
if len(cols)==1:
    y_var = cols[0]
    seen_df = seen_df.loc[pd.notna(seen_df[y_var]),:].reset_index(drop=True)
    print(seen_df.shape)
    seen_df['labels'] = list(seen_df[y_var].values.astype(int))
    cw = seen_df[(seen_df['random']==1) & (seen_df[y_var]==0)].shape[0] / seen_df[(seen_df['random']==1) & (seen_df[y_var]==1)].shape[0]
    class_weight={1:cw}
    scorer = "F1"
    weights_df["sample_weight"] = list(weights_df[y_var+"_sample_weight"].fillna(1).values)
else:
    num_labels = len(cols) 
    weights_df['sample_weight'] = list(weights_df[[x+"_sample_weight" for x in cols]].fillna(1).values)
    seen_df = seen_df.replace(2,1)
    seen_df['labels'] = list(seen_df[cols].values.astype(int))
    seen_df = seen_df.dropna(subset=cols)
    seen_df = seen_df.reset_index(drop=True)
    scorer = "ROC AUC macro"
    class_weight = {}
    for i, t in enumerate(cols):
        cw = seen_df[(seen_df['random']==1) & (seen_df[t]==0)].shape[0] / seen_df[(seen_df['random']==1) & (seen_df[t]==1)].shape[0]
        class_weight[i] = cw

# Remove unneccessary columns
seen_df = seen_df[["id","title","content","labels","random"]+cols].merge(
    weights_df[["doc__id","sample_weight"]].rename(columns={"doc__id":"id"})
)    


# Merge with the unseen data if necessary
seen_df['seen']  = 1
if args.make_predictions=="True":
    unseen_df = pd.read_csv('../data/0_unlabelled_documents.csv')
    unseen_df['seen'] = 0
    df = (pd.concat([seen_df,unseen_df])
          .sort_values('id')
          .sample(frac=1, random_state=1)
          .reset_index(drop=True)
    )
    df.content = df.content.astype(str)
else:
    df = seen_df

# This is the index of nonrandom/nonrepresentative documents, and these will be removed from validation sets
nonrandom_index = df[(df['random']!=1) & (df['seen']==1)].index
random_index = df[df['random']==1].index
seen_index = df[df['seen']==1].index
unseen_index = df[df['seen']==0].index

print("seen_index", seen_index)
print("nonrandom_index", nonrandom_index)

8
Rank I  2 Rank j 2
(2513, 104)
seen_index Int64Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,
            ...
            2503, 2504, 2505, 2506, 2507, 2508, 2509, 2510, 2511, 2512],
           dtype='int64', length=2513)
nonrandom_index Int64Index([   0,    1,    2,    3,    4,    7,    8,   10,   11,   12,
            ...
            2495, 2498, 2500, 2503, 2504, 2505, 2507, 2508, 2510, 2512],
           dtype='int64', length=1811)


In [108]:
import cv_setup as cvs
from sklearn.metrics import f1_score
import ast

# Get the BERT parameters, and include class_weight as a parameter to be tested
bert_params = cvs.bert_params
bert_params['class_weight'].append(class_weight)
bert_params['class_weight'] = [class_weight]
param_space = list(cvs.product_dict(**bert_params))
params = list(bert_params.keys())

outer_cv = cvs.KFoldRandom(args.n_splits, seen_index, nonrandom_index, discard=False)

# Iterate through the folds
for k, (train, test) in enumerate(outer_cv):   
    inner_scores = []
    for l in range(args.n_splits):
        fname = f'cv/df_{len(seen_index)}_cv_results_{args.y_prefix}_{args.model_name.replace("/","__")}_{k}_{l}.csv'
        inner_df = pd.read_csv(fname)
        inner_df = inner_df.sort_values(scorer,ascending=False).reset_index(drop=True)
        inner_scores += inner_df.to_dict('records')

    inner_scores = pd.DataFrame.from_dict(inner_scores).fillna(-1)
    best_model = (inner_scores
                  .groupby(params)[scorer]
                  .mean()
                  .sort_values(ascending=False)
                  .reset_index() 
                 ).to_dict('records')[0]
    
    del best_model[scorer]
    if best_model['class_weight']==-1:
        best_model['class_weight']=None
    elif isinstance(best_model['class_weight'],str):
        best_model['class_weight'] = ast.literal_eval(best_model['class_weight'])
    
    print("Best model from this round: ", best_model)

    inner_cv = cvs.KFoldRandom(args.n_splits, train, nonrandom_index, discard=False)
    inner_scores = []
    
    threshold_results = []
    
    for l, (l_train, l_test) in enumerate(inner_cv):
        # outer_scores, y_preds = cvs.train_eval_bert(
        #     #args.model_name, 
        #     "distilbert-base-uncased",
        #     best_model, df=df, targets=cols, 
        #     train=l_train, test=l_test, roundup=args.roundup, return_predictions=True
        # )
        
        if "INCLUDE" in args.y_prefix:
            y_preds = np.zeros((len(l_test),2))
            y_preds[:,1] = np.random.uniform(size=len(l_test))
            y_preds[:,0] = 1 - y_preds[:,1]
            for t in np.linspace(0.1, 0.9, 50):
                y_pred_bin = np.where(y_preds[:,1]>t,1,0)
                threshold_results.append({
                    "t": t, 
                    "f1": f1_score(df['labels'][l_test], y_pred_bin),
                    "k": l,
                    "label_i": 0
                })
        else:
            y_preds = np.zeros((len(l_test),num_labels))
            for label_i, col in enumerate(cols):
                y_preds[:,label_i] = np.random.uniform(size=len(l_test))                
                for t in np.linspace(0.1, 0.9, 50):
                    y_pred_bin = np.where(y_preds[:,label_i]>t,1,0)
                    y_true = [x[label_i] for x in df.labels[l_test]]
                    threshold_results.append({
                        "t": t, 
                        "f1": f1_score(y_true, y_pred_bin),
                        "k": l,
                        "label_i": label_i
                    })
            
    thresh_df = pd.DataFrame.from_dict(threshold_results)
    optimal_t = (thresh_df.groupby(["label_i","t"])["f1"]
     .mean()
     .sort_values(ascending=False)
     .reset_index()
     .groupby(["label_i"])
     .first()
    )
    fname = f'cv/df_{len(seen_index)}_tune_results_{args.y_prefix}_{args.model_name.replace("/","__")}_{k}.csv'
    thresh_df.to_csv(fname, index=False)

        
        
            

    
    

Best model from this round:  {'class_weight': {1: 14.954545454545455}, 'sample_weighted': True, 'batch_size': 16, 'weight_decay': 0.1, 'learning_rate': 7e-05, 'num_epochs': 4}
Best model from this round:  {'class_weight': {1: 14.954545454545455}, 'sample_weighted': False, 'batch_size': 32, 'weight_decay': 0.1, 'learning_rate': 5e-05, 'num_epochs': 5}
Best model from this round:  {'class_weight': {1: 14.954545454545455}, 'sample_weighted': True, 'batch_size': 16, 'weight_decay': 0.0, 'learning_rate': 2e-05, 'num_epochs': 7}


array([0.81836735])

In [116]:
f1_score(df["labels"][l_test], y_preds.round())

ValueError: Classification metrics can't handle a mix of binary and multilabel-indicator targets

In [80]:
thresh_df = pd.DataFrame.from_dict(threshold_results)
thresh_df.groupby(["label_i","t"])["f1"].mean().sort_values(ascending=False)

label_i  t       
0        0.165306    0.671711
         0.100000    0.670946
         0.116327    0.669059
         0.181633    0.666093
         0.148980    0.663371
                       ...   
2        0.802041    0.018018
         0.785714    0.015873
         0.769388    0.015152
         0.753061    0.014493
         0.736735    0.013072
Name: f1, Length: 250, dtype: float64

In [100]:
optimal_t = (thresh_df.groupby(["label_i","t"])["f1"]
 .mean()
 .sort_values(ascending=False)
 .reset_index()
 .groupby(["label_i"])
 .first()
)


Unnamed: 0_level_0,t,f1
label_i,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.165306,0.671711
1,0.1,0.303203
2,0.508163,0.087644
3,0.116327,0.572968
4,0.181633,0.341797


In [34]:
thresh_df = pd.DataFrame.from_dict(threshold_results)
optimal_t = thresh_df.groupby("t")["f1"].mean().sort_values(ascending=False).index[:1].values
optimal_t

array([0.1])

In [11]:
y_pred = np.random.uniform(size=len(train))
y_pred

array([0.1938433 , 0.04409522, 0.87590919, ..., 0.65369151, 0.39914938,
       0.11373069])

In [15]:
y_pred = np.zeros((len(train),2))
y_pred[:,1] = np.random.uniform(size=len(train))
y_pred[:,0] = 1 - y_pred[:,1]
y_pred

array([[0.27605815, 0.72394185],
       [0.98910636, 0.01089364],
       [0.31293071, 0.68706929],
       ...,
       [0.65493963, 0.34506037],
       [0.35898006, 0.64101994],
       [0.62577374, 0.37422626]])

In [9]:
from transformers import DistilBertForSequenceClassification
model_name = "distilbert-base-uncased"
num_labels = 2
model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels, cache_dir="transformers")

Widget Javascript not detected.  It may not be installed or enabled properly. Reconnecting the current kernel may help.


OSError: Can't load the configuration of 'distilbert-base-uncased'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure 'distilbert-base-uncased' is the correct path to a directory containing a config.json file