In [1]:
import sys



In [2]:
from transformers import AutoTokenizer, BertTokenizer, RobertaTokenizer, AutoModel, trainer, RobertaForSequenceClassification


model_name = "climatebert/distilroberta-base-climate-f"

model = RobertaForSequenceClassification.from_pretrained(model_name)

Some weights of the model checkpoint at climatebert/distilroberta-base-climate-f were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at climatebert/distilroberta-base-climate-f and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.ou

In [3]:
tokenizer = RobertaTokenizer.from_pretrained(model_name)

In [4]:
import gc
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import GridSearchCV, cross_val_score, cross_validate, KFold
import pickle
import re



class args():
    pass
args.model_name = "climatebert/distilroberta-base-climate-f"
args.y_prefix = "INCLUDE"
args.y_prefix = "4 -"
args.n_splits = 3
args.make_predictions = False

try:
    from mpi4py import MPI
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    test = False
except:
    import random
    rank = random.randint(0,args.n_splits**2)
    print(rank)
    test = True



9


In [5]:
# Load data                                                                                                                                                                
seen_df = pd.read_csv('../data/0_labelled_documents.csv')

seen_df = (seen_df
      .sort_values('id')
      .sample(frac=1, random_state=1)
      .reset_index(drop=True)
)

# Get the target labels from the y_prefix argument passed to this script                                                                                                   
if len(args.y_prefix) < 2:
    args.y_prefix+=" "
cols = [x for x in seen_df.columns if re.match(f"^{args.y_prefix}",x)]
print(cols)
num_labels=len(cols)

# If the target is inclusion, use only those documents for which we have a non-na value                                                                                    
# Otherwise, only use those documents which are included                                                                                                                   
# also define what subset is to be treated as a random representative sample                                                                                               
# For labels beyond inclusion, we treat all those that are representative of the included                                                                                  
# studies as representative                                                                                                                                                
if "INCLUDE" in args.y_prefix:
    y_var = cols[0]
    seen_df = seen_df.loc[pd.notna(seen_df[y_var]),:].reset_index(drop=True)
    seen_df['random'] = seen_df['representative_sample']
else:
    seen_df = seen_df[seen_df['INCLUDE']==1]
    seen_df['random'] = seen_df['representative_relevant']

weights_df = pd.read_csv('../data/0_label_weights.csv')

# Turn the columns into target variables and get class-weights to counteract class imbalances                                                                              
if len(cols)==1:
    y_var = cols[0]
    seen_df = seen_df.loc[pd.notna(seen_df[y_var]),:].reset_index(drop=True)
    print(seen_df.shape)
    seen_df['labels'] = list(seen_df[y_var].values.astype(int))
    cw = seen_df[(seen_df['random']==1) & (seen_df[y_var]==0)].shape[0] / seen_df[(seen_df['random']==1) & (seen_df[y_var]==1)].shape[0]
    class_weight={1:cw}
    scorer = "F1"
    weights_df["sample_weight"] = list(weights_df[y_var+"_sample_weight"].fillna(1).values)
else:
    num_labels = len(cols)
    weights_df['sample_weight'] = list(weights_df[[x+"_sample_weight" for x in cols]].fillna(1).values)
    seen_df = seen_df.replace(2,1)
    seen_df['labels'] = list(seen_df[cols].values.astype(int))
    seen_df = seen_df.dropna(subset=cols)
    seen_df = seen_df.reset_index(drop=True)
    scorer = "F1 macro"
    class_weight = {}
    for i, t in enumerate(cols):
        cw = seen_df[(seen_df['random']==1) & (seen_df[t]==0)].shape[0] / seen_df[(seen_df['random']==1) & (seen_df[t]==1)].shape[0]
        class_weight[i] = cw

seen_df = seen_df[["id","title","content","labels","random"]].merge(weights_df[["doc__id","sample_weight"]].rename(columns={"doc__id":"id"}))

print(seen_df.shape)

# Merge with the unseen data if necessary                                                                                                                                  
seen_df['seen']  = 1
if args.make_predictions=="True":
    unseen_df = pd.read_csv('../data/0_unlabelled_documents.csv')
    unseen_df['seen'] = 0
    df = (pd.concat([seen_df,unseen_df])
          .sort_values('id')
          .sample(frac=1, random_state=1)
          .reset_index(drop=True)
    )
    df.content = df.content.astype(str)
else:
    df = seen_df


df.head()

['4 - 1. Economic instruments', '4 - 2. Regulatory Instruments', '4 - 3. Information, education and training', '4 - 4. Governance, strategies and targets', '4 - 5. Agreements']
(740, 6)


Unnamed: 0,id,title,content,labels,random,sample_weight,seen
0,127799,Implications of emissions timing on the cost-e...,Conventional cost-effectiveness calculations i...,"[0, 0, 0, 1, 0]",1,"[1.0, 1.0, 1.0, 1.0, 1.0]",1
1,437752,Ways of Seeing in Environmental Law: How Defor...,Few areas of law are as deeply implicated with...,"[0, 0, 0, 0, 1]",1,"[1.0, 1.0, 1.0, 1.0, 0.6666666666666667]",1
2,551194,Defining zero carbon and zero energy homes fro...,The development of a framework for defining ne...,"[0, 1, 0, 0, 0]",1,"[1.0, 1.0, 1.0, 1.0, 1.0]",1
3,892366,Income elasticity of willingness-to-pay for a ...,Climate change policy is steadily gaining mome...,"[1, 0, 0, 0, 0]",1,"[1.0, 1.0, 1.0, 1.0, 1.0]",1
4,3595910,Impacts of horizontal integration on social we...,Both a carbon tax and green subsidies are effi...,"[1, 0, 0, 0, 0]",1,"[1.0, 1.0, 1.0, 1.0, 1.0]",1


In [6]:
class_weight

{0: 0.7670886075949367,
 1: 4.67479674796748,
 2: 20.151515151515152,
 3: 1.5474452554744527,
 4: 4.287878787878788}

In [9]:
x = df['content']
y = df['labels']
w = df['sample_weight']


array([array([1., 1., 1., 1., 1.]),
       array([1.        , 1.        , 1.        , 1.        , 0.66666667]),
       array([1., 1., 1., 1., 1.]), array([1., 1., 1., 1., 1.]),
       array([1., 1., 1., 1., 1.]),
       array([0.66666667, 1.        , 1.        , 0.83333333, 0.5       ]),
       array([1., 1., 1., 1., 1.]),
       array([1.        , 0.5       , 1.        , 0.83333333, 0.66666667]),
       array([1., 1., 1., 1., 1.]),
       array([0.66666667, 1.        , 1.        , 1.        , 0.83333333]),
       array([0.83333333, 1.        , 1.        , 0.5       , 1.        ]),
       array([0.66666667, 1.        , 1.        , 0.66666667, 0.66666667]),
       array([1.        , 0.83333333, 1.        , 0.66666667, 1.        ]),
       array([0.83333333, 1.        , 0.66666667, 1.        , 1.        ]),
       array([0.6       , 1.        , 1.        , 0.66666667, 1.        ]),
       array([1., 1., 1., 1., 1.]),
       array([0.83333333, 1.        , 1.        , 1.        , 1.       

In [14]:
w

0                     [1.0, 1.0, 1.0, 1.0, 1.0]
1      [1.0, 1.0, 1.0, 1.0, 0.6666666666666667]
2                     [1.0, 1.0, 1.0, 1.0, 1.0]
3                     [1.0, 1.0, 1.0, 1.0, 1.0]
4                     [1.0, 1.0, 1.0, 1.0, 1.0]
                         ...                   
735    [0.8333333333333334, 1.0, 1.0, 1.0, 1.0]
736    [0.7795207240779508, 1.0, 1.0, 1.0, 1.0]
737                   [1.0, 1.0, 1.0, 1.0, 1.0]
738                   [1.0, 1.0, 1.0, 1.0, 1.0]
739                   [1.0, 1.0, 1.0, 1.0, 1.0]
Name: sample_weight, Length: 740, dtype: object

In [31]:
np.stack(w)[:,3]

array([1.        , 1.        , 1.        , 1.        , 1.        ,
       0.83333333, 1.        , 0.83333333, 1.        , 1.        ,
       0.5       , 0.66666667, 0.66666667, 1.        , 0.66666667,
       1.        , 1.        , 0.66666667, 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       0.5       , 1.        , 1.        , 0.66666667, 0.66666667,
       1.        , 1.        , 1.        , 0.66666667, 1.        ,
       1.        , 1.        , 1.        , 0.66666667, 0.66666667,
       0.66666667, 1.        , 1.        , 0.5       , 1.        ,
       1.        , 0.5       , 1.        , 1.        , 1.        ,
       1.        , 0.5       , 1.        , 1.        , 1.        ,
       0.66666667, 1.        , 0.66666667, 0.66666667, 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 0.66666667, 1.        , 1.        ,
       0.68573032, 1.        , 1.        , 1.        , 1.     

In [8]:
MAX_LEN = 512

train = df.index[0:100]
val = df.index[100:200]

train_encodings= tokenizer(list(x[train]),truncation=True,padding=True,max_length=512)
val_encodings = tokenizer(list(x[val]),truncation=True,padding=True,max_length=512)

import torch
class WTDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels, weights):
        self.encodings = encodings
        self.labels = labels
        self.sample_weight = weights

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx],dtype=torch.float32)
        item['sample_weight'] = torch.tensor(self.sample_weight[idx],dtype=torch.float32)
        return item

    def __len__(self):
        return len(self.labels)
    

    
train_dataset = WTDataset(train_encodings, list(y[train]), list(w[train]))
val_dataset = WTDataset(val_encodings, list(y[val]), list(w[val]))

In [83]:
import itertools

def product_dict(**kwargs):
    keys = kwargs.keys()
    vals = kwargs.values()
    for instance in itertools.product(*vals):
        yield dict(zip(keys, instance))
        
bert_params = {
  "class_weight": [None],
  "batch_size": [16, 32],
  "weight_decay": (0, 0.3),
  "learning_rate": (1e-5, 5e-5),
  "num_epochs": [2, 3, 4]
}
bert_params['class_weight'].append(class_weight)
params = list(bert_params.keys())
print(params)
param_space = list(product_dict(**bert_params))
params = param_space[1]
params

['class_weight', 'batch_size', 'weight_decay', 'learning_rate', 'num_epochs']


{'class_weight': None,
 'batch_size': 16,
 'weight_decay': 0,
 'learning_rate': 1e-05,
 'num_epochs': 3}

In [84]:
model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)


loading configuration file https://huggingface.co/climatebert/distilroberta-base-climate-f/resolve/main/config.json from cache at /home/max/.cache/huggingface/transformers/82c3d717cd158a55810d12484b95ea019844ecb6e5e860ead378d0d98b2c190a.2ae894217f03434702f7a4881de106e5bd76e3f3bd6ee1f81fd1551c47bc898d
Model config RobertaConfig {
  "_name_or_path": "language_model/model/ClimateBERT_16102021_acc_grad_roberta/checkpoint-7908",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4
  },
  "layer_no

In [85]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=params['num_epochs'],
    per_device_train_batch_size=params['batch_size'],
    learning_rate=params['learning_rate'],
    weight_decay=params['weight_decay'],
    gradient_checkpointing=True
)



PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [86]:
logits = torch.tensor([0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
logits.view(-1)

tensor([0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [87]:
class CWTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        if "sample_weight" in inputs.keys():
            sample_weight = inputs.pop("sample_weight")
        else:
            sample_weight = None
        outputs = model(**inputs)
        logits = outputs.logits
        print(logits)
        print(labels)
        cw = torch.tensor(list(self.class_weight.values()))
        loss_fct = torch.nn.BCEWithLogitsLoss(pos_weight=cw,reduction='none')
        loss = loss_fct(logits.view(-1, model.num_labels),
                        labels.float().view(-1, model.num_labels))
        print(labels)
        loss = loss_fct(logits, labels)
        print(labels)
        if sample_weight is not None:
            loss = (loss * sample_weight / sample_weight.sum()).sum().mean()
        else:
            loss = loss.mean()
        return (loss, outputs) if return_outputs else loss

trainer = CWTrainer(
    model=model,
    args = training_args, 
)
params['class_weight'] = class_weight
trainer.class_weight = params['class_weight']
#trainer.class_weight=torch.tensor(list(params["class_weight"].values()))
trainer.train_dataset = train_dataset
trainer.train()

***** Running training *****
  Num examples = 100
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 21


tensor([[ 0.0981,  0.1867,  0.1738, -0.0144, -0.1854],
        [ 0.0917,  0.1694,  0.1316,  0.0773, -0.3124],
        [ 0.0786,  0.1808,  0.1642,  0.0661, -0.2461],
        [ 0.1397,  0.1304,  0.0979,  0.1009, -0.2441],
        [ 0.1278,  0.1221,  0.2323,  0.0282, -0.2180],
        [ 0.3345,  0.4151,  0.3799,  0.2432, -0.4885],
        [ 0.1418,  0.1318,  0.1054,  0.0364, -0.2144],
        [ 0.0469,  0.2202,  0.2187, -0.0161, -0.3293],
        [ 0.1603,  0.1952,  0.1202,  0.1706, -0.2170],
        [ 0.0391,  0.0565,  0.0255,  0.0257,  0.0881],
        [ 0.1626,  0.1413,  0.2098,  0.0841, -0.1598],
        [ 0.1056,  0.0809,  0.1305,  0.1270, -0.2640],
        [ 0.3573,  0.4319,  0.5203,  0.0327, -0.4859],
        [ 0.1358,  0.1538,  0.1914,  0.0573, -0.1636],
        [ 0.0019,  0.0114, -0.0091,  0.0017,  0.0251],
        [ 0.3257,  0.4000,  0.4595,  0.1909, -0.5857]],
       grad_fn=<AddmmBackward>)
tensor([[1., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0.],
        [0., 0., 0., 1., 1.

Step,Training Loss


tensor([[ 0.0128,  0.1058,  0.2481,  0.0148, -0.2611],
        [ 0.0883,  0.1890,  0.1694,  0.1094, -0.2644],
        [ 0.1373,  0.1043,  0.1184,  0.0200, -0.1076],
        [ 0.0508,  0.0687,  0.1614, -0.0065, -0.1978],
        [ 0.1065,  0.0656,  0.1843, -0.0047, -0.1929],
        [ 0.1879,  0.4062,  0.3859,  0.1398, -0.4533],
        [ 0.1026,  0.1239,  0.1939,  0.0382, -0.1998],
        [ 0.0474,  0.0643,  0.1867,  0.0243, -0.2314],
        [ 0.0707,  0.0893,  0.1652,  0.0524, -0.1522],
        [ 0.1167,  0.1184,  0.1549,  0.0893, -0.2207],
        [ 0.1674,  0.1340,  0.1792,  0.0121, -0.1788],
        [ 0.3067,  0.1844,  0.1396,  0.1257, -0.4648],
        [ 0.1159,  0.0674,  0.0706,  0.0231, -0.1732],
        [ 0.0815,  0.1608,  0.1209,  0.0745, -0.2319],
        [ 0.1771,  0.2354,  0.1507,  0.0516, -0.2239],
        [ 0.1500,  0.1713,  0.0855,  0.0747, -0.2229]],
       grad_fn=<AddmmBackward>)
tensor([[0., 1., 0., 0., 0.],
        [0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 1.

KeyboardInterrupt: 

In [None]:
class_weight.values()