In [1]:
#!/usr/bin/env python
# coding: utf-8
import os

from sklearn.model_selection import StratifiedKFold, train_test_split
from config import LSHS_DATAFILE, gpt_filtered_rephrase_lshs_file, HEVAL_DATAFILE
import numpy as np
import time
import pandas as pd
from tqdm import tqdm

In [2]:
import json
gpt_filtered_rephrase_tweets_file = gpt_filtered_rephrase_lshs_file

out_file = open(gpt_filtered_rephrase_tweets_file, "r")
filtered_rephrase_tweet_gpt = json.load(out_file)
out_file.close()

In [3]:
df = pd.read_csv(LSHS_DATAFILE)
domains = df['Domain'].unique().tolist()
for d in domains:
    print(d, df[df['Domain'] == d].shape)

Gender (9454, 4)
Religion (10869, 4)
Race (12013, 4)
Politics (11018, 4)
Sports (12306, 4)


In [4]:
# HateLabel	Final hate label decision 0-Normal, 1-Offensive, 2-Hate

In [5]:
# Convert to binary labels, combine Offensive and Hate as one class.
NUM_LABELS = 2
df['Label'] = df['Label'].replace(2, 1)


In [6]:
df

Unnamed: 0,Domain,TweetID,Tweet,Label
0,Gender,1344706773245038592,WATCH: Video previews #SurreyBC-shot film to f...,0
1,Gender,1344706877217792005,Men and women don’t have to solve their proble...,0
2,Gender,1344707261155962880,"At last I awake, very queer about the head, as...",0
3,Gender,1344707529213792256,WATCH: Video previews Surrey-shot film to focu...,0
4,Gender,1344709019865403394,heteronormativity is killing my people. how so...,1
...,...,...,...,...
55655,Sports,1277315350254751747,"Fuck off Gayle, professional footballer and yo...",1
55656,Sports,1277319456071581698,Omo I hate mancity abeg. What is this fluid fo...,1
55657,Sports,1277316487271854082,I hate playing Manchester United again,1
55658,Sports,1277319975305445381,I'll get trolled to fuck but I'd give anything...,1


In [7]:
# Labels:0 (nonhate) 1 (hate)
df_test = pd.read_csv(HEVAL_DATAFILE)
df_test = df_test.rename(columns={'text': 'Tweet', 'HS': 'Label', 'id': 'TweetID'})

In [8]:
df_test

Unnamed: 0,TweetID,Tweet,Label,TR,AG
0,34243,"@local1025 @njdotcom @GovMurphy Oh, I could ha...",0,0,0
1,30593,Several of the wild fires in #california and #...,0,0,0
2,31427,@JudicialWatch My question is how do you reset...,0,0,0
3,31694,"#Europe, you've got a problem! We must hurry...",1,0,0
4,31865,This is outrageous! #StopIllegalImmigration #...,1,0,0
...,...,...,...,...,...
2995,31368,you can never take a L off a real bitch😩 im ho...,1,1,0
2996,30104,@Brian_202 likes to call me a cunt & a bitch b...,1,1,0
2997,31912,@kusha1a @Camio_the_wise @shoe0nhead 1. Never ...,1,1,0
2998,31000,If i see and know you a hoe why would i hit yo...,1,1,0


# Setup for PyTorch-Lightning

In [9]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
from torch.utils.data import DataLoader, Dataset
import torchmetrics
import torch
import pytorch_lightning as pl

  from .autonotebook import tqdm as notebook_tqdm
2023-06-23 19:01:05.956144: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-06-23 19:01:05.985173: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [10]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# https://wandb.ai/jack-morris/david-vs-goliath/reports/Does-Model-Size-Matter-A-Comparison-of-BERT-and-DistilBERT--VmlldzoxMDUxNzU
MAX_EPOCHS = 5 #5
BATCH_SIZE = 16*2 #+ int(55 * 0.9*0.5)
LEARNING_RATE = 1e-5
# MODEL_LLM = 'distilbert-base-uncased'
MODEL_LLM = 'bert-base-uncased'

# Setting the seed
pl.seed_everything(42, workers=True)

Global seed set to 42


42

In [11]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_LLM)
print("Tokenizer input max length:", tokenizer.model_max_length)
print("Tokenizer vocabulary size:", tokenizer.vocab_size)

Tokenizer input max length: 512
Tokenizer vocabulary size: 30522


In [12]:
class MyDataset(Dataset):
  def __init__(self, encodings, labels):
    self.encodings = encodings
    self.labels = labels

  def __getitem__(self, idx):
    '''
    encoding.items() -> 
      -> input_ids : [1,34, 32, 67,...]
      -> attention_mask : [1,1,1,1,1,....]
    '''
    item = {key:torch.tensor(val[idx]) for key, val in self.encodings.items()}
    item['labels'] = torch.tensor(self.labels[idx])
    return item

  def __len__(self):
    return len((self.labels))

In [13]:
class LightningModel(pl.LightningModule):
    def __init__(self, model_name_or_path, num_labels, learning_rate=LEARNING_RATE):
        super().__init__()

        self.learning_rate = learning_rate
        self.config = AutoConfig.from_pretrained(model_name_or_path, num_labels=num_labels)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path, config=self.config)

        # self.val_conf_mat = torchmetrics.ConfusionMatrix(task="multiclass", num_classes=NUM_LABELS)
        self.val_f1_macro_score = torchmetrics.classification.MulticlassF1Score(average="macro", num_classes=NUM_LABELS)
        self.val_f1_weighted_score = torchmetrics.classification.MulticlassF1Score(average="weighted", num_classes=NUM_LABELS)
        self.val_f1_non_avg_score = torchmetrics.classification.MulticlassF1Score(average="none", num_classes=NUM_LABELS)
        self.val_acc = torchmetrics.Accuracy(task="multiclass", num_classes=NUM_LABELS)
        
        # self.test_conf_mat = torchmetrics.ConfusionMatrix(task="multiclass", num_classes=NUM_LABELS)
        self.test_f1_macro_score = torchmetrics.classification.MulticlassF1Score(average="macro", num_classes=NUM_LABELS)
        self.test_f1_weighted_score = torchmetrics.classification.MulticlassF1Score(average="weighted", num_classes=NUM_LABELS)
        self.test_f1_non_avg_score = torchmetrics.classification.MulticlassF1Score(average="none", num_classes=NUM_LABELS)
        self.test_acc = torchmetrics.Accuracy(task="multiclass", num_classes=NUM_LABELS)
        
        self.metrics = {'val': [('val_f1_macro_score', self.val_f1_macro_score), ('val_f1_weighted_score', self.val_f1_weighted_score), ('val_acc', self.val_acc)],
                         'test': [('F1-Macro', self.test_f1_macro_score), ('F1-Weighted', self.test_f1_weighted_score),
                                  ('F1_Class 0', self.test_f1_non_avg_score.cpu()[0], 'test_f1_non_avg_score'), 
                                  ('F1_Class 1', self.test_f1_non_avg_score.cpu()[1], 'test_f1_non_avg_score'),
                                  ('Accuracy', self.test_acc)
                                 ]
                        }
        

    def forward(self, input_ids, attention_mask, labels):
        return self.model(input_ids, attention_mask=attention_mask, labels=labels)
        
    def training_step(self, batch, batch_idx):
        outputs = self(batch["input_ids"], attention_mask=batch["attention_mask"],
                       labels=batch["labels"])        
        self.log("train_loss", outputs["loss"])
        return outputs["loss"]  # this is passed to the optimizer for training

    def echo_metrics(self, key, predicted_labels, batch_labels):
        for itm in self.metrics[key]:
            if len(itm) == 2:
                mt_str, met = itm
                metric_attribute = None
            else:
                mt_str, met, metric_attribute = itm
                # print('this', mt_str, met, metric_attribute)
            met(predicted_labels, batch_labels)
            self.log(mt_str, met, prog_bar=True, metric_attribute=metric_attribute)
        
        
    
    def validation_step(self, batch, batch_idx):
        outputs = self(batch["input_ids"], attention_mask=batch["attention_mask"],
                       labels=batch["labels"])        
        self.log("val_loss", outputs["loss"], prog_bar=True)
        
        logits = outputs["logits"]
        predicted_labels = torch.argmax(logits, 1)

        self.echo_metrics('val', predicted_labels, batch["labels"])
        # print('#n here->',self.val_f1_non_avg_score(predicted_labels, batch["labels"]).cpu()[0])
        
        
    def test_step(self, batch, batch_idx):
        outputs = self(batch["input_ids"], attention_mask=batch["attention_mask"],
                       labels=batch["labels"])        
        
        logits = outputs["logits"]
        predicted_labels = torch.argmax(logits, 1)
        self.echo_metrics('test', predicted_labels, batch["labels"])

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
        return optimizer

# Experiment

In [14]:
CF_LABEL = 0
import random
random.seed(42)

In [15]:
print(df['Label'].value_counts())
problematic_df = df[df['Label']>0]
problematic_df.shape

Label
0    44874
1    10786
Name: count, dtype: int64


(10786, 4)

In [16]:
def get_tweets():
    gpt_counterfactual_tweets = {}
    tot = problematic_df.shape[0]
    # print(tot)
    for i in range(0, tot):
        idx = problematic_df.iloc[i].name
        if str(i) in filtered_rephrase_tweet_gpt:
            gpt_counterfactual_tweets[idx] = filtered_rephrase_tweet_gpt[str(i)]
    return gpt_counterfactual_tweets
    
gpt_counterfactual_tweets = get_tweets()

In [17]:
import config
from nltk.tokenize import TweetTokenizer
import mosestokenizer
import numpy as np

def get_offensive_words():
    _df = pd.read_csv(config.en_swear_words_datafile, index_col=0)
    
    s = np.logical_or(_df['Level of offensiveness']=='Strongest words', _df['Level of offensiveness']=='Strong words')
    # display(_df[s]['Word'].to_list())
    wd_list = _df['Word'].to_list()
    
    _df = pd.read_csv(config.en_profanity_datafile, index_col=None)
    s = _df['severity_description'] == 'Severe'
    # wd_list.extend(_df[s]['text'].to_list())
    wd_list.extend(_df['text'].to_list())
    wd_list = set(map(str.lower, wd_list))
    return wd_list

offensive_wd_list = get_offensive_words()

In [18]:
def find_phrases(tokens, phrases):
    tokens = list(map(str.lower, tokens))
    """
    Find phrases in a list of sequential tokens.
    
    Args:
        tokens (list): List of sequential tokens.
        phrases (list): List of phrases to search for.
        
    Returns:
        A list of tuples containing the start and end index of each found phrase.
    """
    found_phrases = []
    
    for i in range(len(tokens)):
        for phrase in phrases:
            if tokens[i:i+len(phrase)] == phrase:
                found_phrases.append((i, i+len(phrase)-1))
    
    return found_phrases

def offensive_lexicon_used(t):
    tk = TweetTokenizer()
    detk = mosestokenizer.MosesDetokenizer('en')
    tk = tk.tokenize(t)
    # print(tk)
    phrase_index = find_phrases(tk, list(map(str.split, offensive_wd_list)))
    return len(phrase_index)

In [19]:
def get_counterfactual_tweets(data, labels, cf_label, single_cf_per_tweet=False, cf_size_prop_to_data=1.0, only_tweets_with_offensive_lexicon=True):
    tweets = []
    cnt =0 
    for idx in data.index:
        if idx in gpt_counterfactual_tweets:
            if (not only_tweets_with_offensive_lexicon) or offensive_lexicon_used(X[idx]):
                cnt += 1
                if not single_cf_per_tweet:
                    tweets.extend(gpt_counterfactual_tweets[idx])
                else:
                    tweets.append(gpt_counterfactual_tweets[idx][0])
    print('> Total Tweets used to generate counterfactuals ' + str(cnt))
    print('> Total counterfactuals added ' + str(len(tweets)))
    k = round(cf_size_prop_to_data * len(tweets))
    
    tweets = random.sample(tweets, k=k)
    print('> Counterfactual size ' + str(k) + ' at rate ' + str(cf_size_prop_to_data))
    cf_target = k*[cf_label]
    return pd.concat([data, pd.Series(tweets)], axis=0), pd.concat([labels, pd.Series(cf_target)], axis=0)

In [20]:
only_tweets_with_offensive_lexicon = False

In [21]:
domains

['Gender', 'Religion', 'Race', 'Politics', 'Sports']

In [22]:
# # only selected domains for this experiment
# domains = ['Gender', 'Race']

In [23]:
for d in domains:
    sel_df = df[df['Domain'] == d]
    print(d, sel_df.shape)
    X, y = sel_df['Tweet'], sel_df['Label'].astype(int)
    skf = StratifiedKFold(n_splits=2, random_state=None)
    for splt_idx, (train_index , test_index) in enumerate(skf.split(X, y)):
        print(splt_idx)
        X_train , X_test = X.iloc[train_index], X.iloc[test_index]
        y_train , y_test = y.iloc[train_index] , y.iloc[train_index]
        _,_ = get_counterfactual_tweets(
                X_train, y_train, cf_label=CF_LABEL, single_cf_per_tweet=True, cf_size_prop_to_data=.1, only_tweets_with_offensive_lexicon=only_tweets_with_offensive_lexicon)

Gender (9454, 4)
0
> Total Tweets used to generate counterfactuals 843
> Total counterfactuals added 843
> Counterfactual size 84 at rate 0.1
1
> Total Tweets used to generate counterfactuals 769
> Total counterfactuals added 769
> Counterfactual size 77 at rate 0.1
Religion (10869, 4)
0
> Total Tweets used to generate counterfactuals 766
> Total counterfactuals added 766
> Counterfactual size 77 at rate 0.1
1
> Total Tweets used to generate counterfactuals 767
> Total counterfactuals added 767
> Counterfactual size 77 at rate 0.1
Race (12013, 4)
0
> Total Tweets used to generate counterfactuals 564
> Total counterfactuals added 564
> Counterfactual size 56 at rate 0.1
1
> Total Tweets used to generate counterfactuals 526
> Total counterfactuals added 526
> Counterfactual size 53 at rate 0.1
Politics (11018, 4)
0
> Total Tweets used to generate counterfactuals 994
> Total counterfactuals added 994
> Counterfactual size 99 at rate 0.1
1
> Total Tweets used to generate counterfactuals 10

In [24]:
def __exp__(train_texts, train_labels, valid_texts, valid_labels, test_texts, test_labels, CF=False):
   
    if not CF:
        print('> Train samples', len(train_texts))
    else:
        print('> Train with CF samples', len(train_texts))
    
    train_encodings = tokenizer(list(train_texts), truncation = True, padding = True)
    valid_encodings = tokenizer(list(valid_texts), truncation = True, padding = True)
    test_encodings = tokenizer(list(test_texts), truncation = True, padding = True)

    #datasets
    train_dataset = MyDataset(train_encodings, train_labels)
    valid_dataset = MyDataset(valid_encodings, valid_labels)
    test_dataset = MyDataset(test_encodings, test_labels)
    
    #dataloaders
    bs = BATCH_SIZE
    train_loader = DataLoader(train_dataset, batch_size = bs, shuffle = True, num_workers=4)
    valid_loader = DataLoader(valid_dataset, batch_size = bs, shuffle = True, num_workers=4)
    test_loader = DataLoader(test_dataset, batch_size = bs, shuffle = True, num_workers=4)
    print(len(train_loader), len(valid_loader))

    # Setting the seed
    pl.seed_everything(42, workers=True)
    lightning_model = LightningModel(MODEL_LLM, NUM_LABELS)

    trainer = pl.Trainer(
        max_epochs=MAX_EPOCHS,
        accelerator="gpu",
        devices=1,
        deterministic=True,
        # log_every_n_steps=30,
        enable_checkpointing=True,  
        logger=False
    )
    
    trainer.fit(model=lightning_model,
                train_dataloaders=train_loader,
                val_dataloaders=valid_loader)
    
    r = trainer.test(lightning_model, dataloaders=test_loader, ckpt_path="best")
    del lightning_model
    del trainer
    return r



In [25]:
def get_splits(_X, _y, _X_test, _y_test, train_size=0.8):
    # 80-20 train-val size
    x_train, x_val, y_train, y_val = \
                    train_test_split(_X, _y, train_size=train_size)

    train_texts = x_train.values
    train_labels = y_train.values
    
    valid_texts = x_val.values
    valid_labels = y_val.values
    
    test_texts = _X_test.values
    test_labels = _y_test.values

    return x_train, y_train, train_texts, train_labels, valid_texts, valid_labels, test_texts, test_labels

def run_experiment_org(_X, _y, _X_test, _y_test):
    out_lst = []
    
    x_train, y_train, train_texts, train_labels, valid_texts, valid_labels, test_texts, test_labels = get_splits(_X, _y, _X_test, _y_test)
    org = __exp__(train_texts, train_labels, valid_texts, valid_labels, test_texts, test_labels, CF=False)

    l = [('splt_idx', -1, len(test_labels)),  ('train', len(train_labels)), {'Org': org}]
    out_lst.append(l)
    return out_lst

def run_experiment_counter_factuals(_X, _y, _X_test, _y_test, cf_size_prop_to_data=0.1):
    out_lst = []
    
    x_train, y_train, train_texts, train_labels, valid_texts, valid_labels, test_texts, test_labels = get_splits(_X, _y, _X_test, _y_test)
    x_train_with_cf, y_training_with_cf = get_counterfactual_tweets(
        x_train, y_train, cf_label=CF_LABEL, single_cf_per_tweet=True, cf_size_prop_to_data=cf_size_prop_to_data, only_tweets_with_offensive_lexicon=only_tweets_with_offensive_lexicon)
    train_texts_cf = x_train_with_cf.values
    train_labels_cf = y_training_with_cf.values

    cf = __exp__(train_texts_cf, train_labels_cf, valid_texts, valid_labels, test_texts, test_labels, CF=True)
    l = [('splt_idx', -1, len(test_labels)),  ('train', len(y_training_with_cf)), {'CF': cf}]
    out_lst.append(l)
    return out_lst

In [26]:
cf_size_prop_to_data_lst = list(np.arange(0.1, 1.1, 0.1))
try:
    del X, y
except:
    pass

In [None]:
def run_now():
    complete_result = []
    out_dict = {}
    for d in domains:
        out_dict[d] = {}
        sel_df = df[df['Domain'] == d]
        print(d, sel_df.shape)
        X, y = sel_df['Tweet'], sel_df['Label'].astype(int)

        X_test, y_test = df_test['Tweet'], df_test['Label'].astype(int)
        res_bert_lst = run_experiment_org(X, y, X_test, y_test)
        out_dict[d]['Org'] = res_bert_lst
        out_dict[d]['CF'] = {}
        for cf_size_prop_to_data in cf_size_prop_to_data_lst:
            res_bert_lst = run_experiment_counter_factuals(X, y, X_test, y_test, cf_size_prop_to_data=cf_size_prop_to_data)
            out_dict[d]['CF'][cf_size_prop_to_data] = res_bert_lst
        complete_result.append(out_dict)
        json.dump(out_dict, open('out/'+ MODEL_LLM + '-EP_'+ str(MAX_EPOCHS) + '-ft-lshd22-heval19-ood.json', 'w'))
    return complete_result

start = time.time()
complete_result = run_now()
end = time.time()
elapsed = end - start
print(f"Time elapsed {elapsed/60:.2f} min")

Gender (9454, 4)
> Train samples 7563


Global seed set to 42


237 60


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(


Epoch 0: 100%|████████████████████████████████| 237/237 [00:56<00:00,  4.20it/s]
Validation: 0it [00:00, ?it/s][A
Validation:   0%|                                        | 0/60 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                           | 0/60 [00:00<?, ?it/s][A
Validation DataLoader 0:   2%|▎                  | 1/60 [00:00<00:03, 17.68it/s][A
Validation DataLoader 0:   3%|▋                  | 2/60 [00:00<00:03, 17.32it/s][A
Validation DataLoader 0:   5%|▉                  | 3/60 [00:00<00:03, 17.24it/s][A
Validation DataLoader 0:   7%|█▎                 | 4/60 [00:00<00:03, 17.17it/s][A
Validation DataLoader 0:   8%|█▌                 | 5/60 [00:00<00:03, 17.21it/s][A
Validation DataLoader 0:  10%|█▉                 | 6/60 [00:00<00:03, 17.18it/s][A
Validation DataLoader 0:  12%|██▏                | 7/60 [00:00<00:03, 17.23it/s][A
Validation DataLoader 0:  13%|██▌                | 8/60 [00:00<00:03, 17.21it/s][A
Validation DataLoader 0:  15%|██▊            

`Trainer.fit` stopped: `max_epochs=5` reached.


Epoch 4: 100%|█| 237/237 [01:02<00:00,  3.81it/s, val_loss=0.246, val_f1_macro_s


You are using a CUDA device ('NVIDIA GeForce RTX 3080') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
Restoring states from the checkpoint path at /home/atif/work/notebook-data/chatgpt/checkpoints/epoch=4-step=1185-v7.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from the checkpoint at /home/atif/work/notebook-data/chatgpt/checkpoints/epoch=4-step=1185-v7.ckpt
  rank_zero_warn(


Testing DataLoader 0: 100%|█████████████████████| 94/94 [00:12<00:00,  7.24it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        Accuracy            0.5183333158493042
        F1-Macro            0.5160736441612244
       F1-Weighted          0.5213645696640015
       F1_Class 0           0.5491419434547424
       F1_Class 1           0.4830053746700287
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
> Total Tweets used to generate counterfactuals 1300
> Total counterfactuals added 1300
> Counterfactual size 130 at rate 0.1
> Train with CF samples 7693


Global seed set to 42


241 60


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(


Epoch 0: 100%|████████████████████████████████| 241/241 [00:57<00:00,  4.18it/s]
Validation: 0it [00:00, ?it/s][A
Validation:   0%|                                        | 0/60 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                           | 0/60 [00:00<?, ?it/s][A
Validation DataLoader 0:   2%|▎                  | 1/60 [00:00<00:03, 17.85it/s][A
Validation DataLoader 0:   3%|▋                  | 2/60 [00:00<00:03, 17.26it/s][A
Validation DataLoader 0:   5%|▉                  | 3/60 [00:00<00:03, 17.34it/s][A
Validation DataLoader 0:   7%|█▎                 | 4/60 [00:00<00:03, 17.18it/s][A
Validation DataLoader 0:   8%|█▌                 | 5/60 [00:00<00:03, 17.20it/s][A
Validation DataLoader 0:  10%|█▉                 | 6/60 [00:00<00:03, 17.24it/s][A
Validation DataLoader 0:  12%|██▏                | 7/60 [00:00<00:03, 17.25it/s][A
Validation DataLoader 0:  13%|██▌                | 8/60 [00:00<00:03, 17.23it/s][A
Validation DataLoader 0:  15%|██▊            

`Trainer.fit` stopped: `max_epochs=5` reached.


Epoch 4: 100%|█| 241/241 [01:03<00:00,  3.81it/s, val_loss=0.189, val_f1_macro_s


You are using a CUDA device ('NVIDIA GeForce RTX 3080') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
Restoring states from the checkpoint path at /home/atif/work/notebook-data/chatgpt/checkpoints/epoch=4-step=1205-v5.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from the checkpoint at /home/atif/work/notebook-data/chatgpt/checkpoints/epoch=4-step=1205-v5.ckpt
  rank_zero_warn(


Testing DataLoader 0: 100%|█████████████████████| 94/94 [00:12<00:00,  7.23it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        Accuracy            0.5189999938011169
        F1-Macro            0.5179046392440796
       F1-Weighted          0.5215814113616943
       F1_Class 0           0.5408844947814941
       F1_Class 1           0.49492475390434265
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
> Total Tweets used to generate counterfactuals 1300
> Total counterfactuals added 1300
> Counterfactual size 260 at rate 0.2
> Train with CF samples 7823


Global seed set to 42


245 60


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(


Epoch 0: 100%|████████████████████████████████| 245/245 [00:58<00:00,  4.18it/s]
Validation: 0it [00:00, ?it/s][A
Validation:   0%|                                        | 0/60 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                           | 0/60 [00:00<?, ?it/s][A
Validation DataLoader 0:   2%|▎                  | 1/60 [00:00<00:03, 18.20it/s][A
Validation DataLoader 0:   3%|▋                  | 2/60 [00:00<00:03, 17.53it/s][A
Validation DataLoader 0:   5%|▉                  | 3/60 [00:00<00:03, 17.29it/s][A
Validation DataLoader 0:   7%|█▎                 | 4/60 [00:00<00:03, 17.24it/s][A
Validation DataLoader 0:   8%|█▌                 | 5/60 [00:00<00:03, 17.31it/s][A
Validation DataLoader 0:  10%|█▉                 | 6/60 [00:00<00:03, 17.35it/s][A
Validation DataLoader 0:  12%|██▏                | 7/60 [00:00<00:03, 17.38it/s][A
Validation DataLoader 0:  13%|██▌                | 8/60 [00:00<00:02, 17.37it/s][A
Validation DataLoader 0:  15%|██▊            

`Trainer.fit` stopped: `max_epochs=5` reached.


Epoch 4: 100%|█| 245/245 [01:04<00:00,  3.80it/s, val_loss=0.277, val_f1_macro_s


You are using a CUDA device ('NVIDIA GeForce RTX 3080') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
Restoring states from the checkpoint path at /home/atif/work/notebook-data/chatgpt/checkpoints/epoch=4-step=1225-v5.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from the checkpoint at /home/atif/work/notebook-data/chatgpt/checkpoints/epoch=4-step=1225-v5.ckpt
  rank_zero_warn(


Testing DataLoader 0: 100%|█████████████████████| 94/94 [00:12<00:00,  7.24it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        Accuracy            0.5293333530426025
        F1-Macro            0.5249553322792053
       F1-Weighted          0.5322520136833191
       F1_Class 0           0.5705596208572388
       F1_Class 1           0.4793510437011719
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
> Total Tweets used to generate counterfactuals 1300
> Total counterfactuals added 1300
> Counterfactual size 390 at rate 0.30000000000000004
> Train with CF samples 7953


Global seed set to 42


249 60


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(


Epoch 0: 100%|████████████████████████████████| 249/249 [00:59<00:00,  4.18it/s]
Validation: 0it [00:00, ?it/s][A
Validation:   0%|                                        | 0/60 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                           | 0/60 [00:00<?, ?it/s][A
Validation DataLoader 0:   2%|▎                  | 1/60 [00:00<00:03, 17.53it/s][A
Validation DataLoader 0:   3%|▋                  | 2/60 [00:00<00:03, 17.29it/s][A
Validation DataLoader 0:   5%|▉                  | 3/60 [00:00<00:03, 17.11it/s][A
Validation DataLoader 0:   7%|█▎                 | 4/60 [00:00<00:03, 17.12it/s][A
Validation DataLoader 0:   8%|█▌                 | 5/60 [00:00<00:03, 17.22it/s][A
Validation DataLoader 0:  10%|█▉                 | 6/60 [00:00<00:03, 17.26it/s][A
Validation DataLoader 0:  12%|██▏                | 7/60 [00:00<00:03, 17.29it/s][A
Validation DataLoader 0:  13%|██▌                | 8/60 [00:00<00:03, 17.30it/s][A
Validation DataLoader 0:  15%|██▊            

`Trainer.fit` stopped: `max_epochs=5` reached.


Epoch 4: 100%|█| 249/249 [01:05<00:00,  3.81it/s, val_loss=0.218, val_f1_macro_s


You are using a CUDA device ('NVIDIA GeForce RTX 3080') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
Restoring states from the checkpoint path at /home/atif/work/notebook-data/chatgpt/checkpoints/epoch=4-step=1245-v5.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from the checkpoint at /home/atif/work/notebook-data/chatgpt/checkpoints/epoch=4-step=1245-v5.ckpt
  rank_zero_warn(


Testing DataLoader 0: 100%|█████████████████████| 94/94 [00:12<00:00,  7.23it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        Accuracy            0.5263333320617676
        F1-Macro            0.5249651670455933
       F1-Weighted          0.5290441513061523
       F1_Class 0           0.5504587292671204
       F1_Class 1           0.49947163462638855
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
> Total Tweets used to generate counterfactuals 1300
> Total counterfactuals added 1300
> Counterfactual size 520 at rate 0.4
> Train with CF samples 8083


Global seed set to 42


253 60


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(


Epoch 0: 100%|████████████████████████████████| 253/253 [01:00<00:00,  4.17it/s]
Validation: 0it [00:00, ?it/s][A
Validation:   0%|                                        | 0/60 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                           | 0/60 [00:00<?, ?it/s][A
Validation DataLoader 0:   2%|▎                  | 1/60 [00:00<00:03, 18.12it/s][A
Validation DataLoader 0:   3%|▋                  | 2/60 [00:00<00:03, 17.79it/s][A
Validation DataLoader 0:   5%|▉                  | 3/60 [00:00<00:03, 17.72it/s][A
Validation DataLoader 0:   7%|█▎                 | 4/60 [00:00<00:03, 17.58it/s][A
Validation DataLoader 0:   8%|█▌                 | 5/60 [00:00<00:03, 17.58it/s][A
Validation DataLoader 0:  10%|█▉                 | 6/60 [00:00<00:03, 17.58it/s][A
Validation DataLoader 0:  12%|██▏                | 7/60 [00:00<00:03, 17.52it/s][A
Validation DataLoader 0:  13%|██▌                | 8/60 [00:00<00:02, 17.50it/s][A
Validation DataLoader 0:  15%|██▊            

`Trainer.fit` stopped: `max_epochs=5` reached.


Epoch 4: 100%|█| 253/253 [01:06<00:00,  3.80it/s, val_loss=0.197, val_f1_macro_s


You are using a CUDA device ('NVIDIA GeForce RTX 3080') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
Restoring states from the checkpoint path at /home/atif/work/notebook-data/chatgpt/checkpoints/epoch=4-step=1265-v5.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from the checkpoint at /home/atif/work/notebook-data/chatgpt/checkpoints/epoch=4-step=1265-v5.ckpt
  rank_zero_warn(


Testing DataLoader 0: 100%|█████████████████████| 94/94 [00:12<00:00,  7.24it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        Accuracy            0.5199999809265137
        F1-Macro            0.5185258388519287
       F1-Weighted          0.5227884650230408
       F1_Class 0           0.5451673865318298
       F1_Class 1           0.4918842613697052
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
> Total Tweets used to generate counterfactuals 1300
> Total counterfactuals added 1300
> Counterfactual size 650 at rate 0.5
> Train with CF samples 8213


Global seed set to 42


257 60


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(


Epoch 0: 100%|████████████████████████████████| 257/257 [01:01<00:00,  4.17it/s]
Validation: 0it [00:00, ?it/s][A
Validation:   0%|                                        | 0/60 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                           | 0/60 [00:00<?, ?it/s][A
Validation DataLoader 0:   2%|▎                  | 1/60 [00:00<00:03, 18.31it/s][A
Validation DataLoader 0:   3%|▋                  | 2/60 [00:00<00:03, 17.37it/s][A
Validation DataLoader 0:   5%|▉                  | 3/60 [00:00<00:03, 17.42it/s][A
Validation DataLoader 0:   7%|█▎                 | 4/60 [00:00<00:03, 17.29it/s][A
Validation DataLoader 0:   8%|█▌                 | 5/60 [00:00<00:03, 17.38it/s][A
Validation DataLoader 0:  10%|█▉                 | 6/60 [00:00<00:03, 17.45it/s][A
Validation DataLoader 0:  12%|██▏                | 7/60 [00:00<00:03, 17.50it/s][A
Validation DataLoader 0:  13%|██▌                | 8/60 [00:00<00:02, 17.47it/s][A
Validation DataLoader 0:  15%|██▊            

`Trainer.fit` stopped: `max_epochs=5` reached.


Epoch 4: 100%|█| 257/257 [01:07<00:00,  3.80it/s, val_loss=0.220, val_f1_macro_s


You are using a CUDA device ('NVIDIA GeForce RTX 3080') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
Restoring states from the checkpoint path at /home/atif/work/notebook-data/chatgpt/checkpoints/epoch=4-step=1285-v4.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from the checkpoint at /home/atif/work/notebook-data/chatgpt/checkpoints/epoch=4-step=1285-v4.ckpt
  rank_zero_warn(


Testing DataLoader 0: 100%|█████████████████████| 94/94 [00:12<00:00,  7.24it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        Accuracy            0.5196666717529297
        F1-Macro            0.5176703929901123
       F1-Weighted          0.5226352214813232
       F1_Class 0           0.5487002730369568
       F1_Class 1           0.4866405427455902
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
> Total Tweets used to generate counterfactuals 1300
> Total counterfactuals added 1300
> Counterfactual size 780 at rate 0.6
> Train with CF samples 8343


Global seed set to 42


261 60


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(


Epoch 0: 100%|████████████████████████████████| 261/261 [01:02<00:00,  4.16it/s]
Validation: 0it [00:00, ?it/s][A
Validation:   0%|                                        | 0/60 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                           | 0/60 [00:00<?, ?it/s][A
Validation DataLoader 0:   2%|▎                  | 1/60 [00:00<00:03, 18.30it/s][A
Validation DataLoader 0:   3%|▋                  | 2/60 [00:00<00:03, 17.38it/s][A
Validation DataLoader 0:   5%|▉                  | 3/60 [00:00<00:03, 17.46it/s][A
Validation DataLoader 0:   7%|█▎                 | 4/60 [00:00<00:03, 17.42it/s][A
Validation DataLoader 0:   8%|█▌                 | 5/60 [00:00<00:03, 17.47it/s][A
Validation DataLoader 0:  10%|█▉                 | 6/60 [00:00<00:03, 17.51it/s][A
Validation DataLoader 0:  12%|██▏                | 7/60 [00:00<00:03, 17.54it/s][A
Validation DataLoader 0:  13%|██▌                | 8/60 [00:00<00:02, 17.51it/s][A
Validation DataLoader 0:  15%|██▊            

`Trainer.fit` stopped: `max_epochs=5` reached.


Epoch 4: 100%|█| 261/261 [01:08<00:00,  3.81it/s, val_loss=0.236, val_f1_macro_s


You are using a CUDA device ('NVIDIA GeForce RTX 3080') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
Restoring states from the checkpoint path at /home/atif/work/notebook-data/chatgpt/checkpoints/epoch=4-step=1305-v4.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from the checkpoint at /home/atif/work/notebook-data/chatgpt/checkpoints/epoch=4-step=1305-v4.ckpt
  rank_zero_warn(


Testing DataLoader 0: 100%|█████████████████████| 94/94 [00:12<00:00,  7.24it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        Accuracy            0.5223333239555359
        F1-Macro            0.5197269916534424
       F1-Weighted          0.5253878235816956
       F1_Class 0           0.5551071166992188
       F1_Class 1           0.4843468964099884
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
> Total Tweets used to generate counterfactuals 1300
> Total counterfactuals added 1300
> Counterfactual size 910 at rate 0.7000000000000001
> Train with CF samples 8473


Global seed set to 42


265 60


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(


Epoch 0: 100%|████████████████████████████████| 265/265 [01:03<00:00,  4.16it/s]
Validation: 0it [00:00, ?it/s][A
Validation:   0%|                                        | 0/60 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                           | 0/60 [00:00<?, ?it/s][A
Validation DataLoader 0:   2%|▎                  | 1/60 [00:00<00:03, 18.35it/s][A
Validation DataLoader 0:   3%|▋                  | 2/60 [00:00<00:03, 17.48it/s][A
Validation DataLoader 0:   5%|▉                  | 3/60 [00:00<00:03, 17.54it/s][A
Validation DataLoader 0:   7%|█▎                 | 4/60 [00:00<00:03, 17.48it/s][A
Validation DataLoader 0:   8%|█▌                 | 5/60 [00:00<00:03, 17.48it/s][A
Validation DataLoader 0:  10%|█▉                 | 6/60 [00:00<00:03, 17.49it/s][A
Validation DataLoader 0:  12%|██▏                | 7/60 [00:00<00:03, 17.53it/s][A
Validation DataLoader 0:  13%|██▌                | 8/60 [00:00<00:02, 17.51it/s][A
Validation DataLoader 0:  15%|██▊            

`Trainer.fit` stopped: `max_epochs=5` reached.


Epoch 4: 100%|█| 265/265 [01:09<00:00,  3.82it/s, val_loss=0.208, val_f1_macro_s


You are using a CUDA device ('NVIDIA GeForce RTX 3080') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
Restoring states from the checkpoint path at /home/atif/work/notebook-data/chatgpt/checkpoints/epoch=4-step=1325-v2.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from the checkpoint at /home/atif/work/notebook-data/chatgpt/checkpoints/epoch=4-step=1325-v2.ckpt
  rank_zero_warn(


Testing DataLoader 0: 100%|█████████████████████| 94/94 [00:12<00:00,  7.24it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        Accuracy            0.5176666378974915
        F1-Macro            0.5144281983375549
       F1-Weighted          0.5207729935646057
       F1_Class 0           0.5540832281112671
       F1_Class 1           0.4747731387615204
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
> Total Tweets used to generate counterfactuals 1300
> Total counterfactuals added 1300
> Counterfactual size 1040 at rate 0.8
> Train with CF samples 8603


Global seed set to 42


269 60


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(


Epoch 0: 100%|████████████████████████████████| 269/269 [01:04<00:00,  4.17it/s]
Validation: 0it [00:00, ?it/s][A
Validation:   0%|                                        | 0/60 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                           | 0/60 [00:00<?, ?it/s][A
Validation DataLoader 0:   2%|▎                  | 1/60 [00:00<00:03, 18.27it/s][A
Validation DataLoader 0:   3%|▋                  | 2/60 [00:00<00:03, 17.70it/s][A
Validation DataLoader 0:   5%|▉                  | 3/60 [00:00<00:03, 17.60it/s][A
Validation DataLoader 0:   7%|█▎                 | 4/60 [00:00<00:03, 17.38it/s][A
Validation DataLoader 0:   8%|█▌                 | 5/60 [00:00<00:03, 17.43it/s][A
Validation DataLoader 0:  10%|█▉                 | 6/60 [00:00<00:03, 17.47it/s][A
Validation DataLoader 0:  12%|██▏                | 7/60 [00:00<00:03, 17.48it/s][A
Validation DataLoader 0:  13%|██▌                | 8/60 [00:00<00:02, 17.42it/s][A
Validation DataLoader 0:  15%|██▊            

`Trainer.fit` stopped: `max_epochs=5` reached.


Epoch 4: 100%|█| 269/269 [01:10<00:00,  3.82it/s, val_loss=0.205, val_f1_macro_s


You are using a CUDA device ('NVIDIA GeForce RTX 3080') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
Restoring states from the checkpoint path at /home/atif/work/notebook-data/chatgpt/checkpoints/epoch=4-step=1345-v2.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from the checkpoint at /home/atif/work/notebook-data/chatgpt/checkpoints/epoch=4-step=1345-v2.ckpt
  rank_zero_warn(


Testing DataLoader 0: 100%|█████████████████████| 94/94 [00:12<00:00,  7.24it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        Accuracy            0.5270000100135803
        F1-Macro            0.5254217982292175
       F1-Weighted          0.5298005938529968
       F1_Class 0           0.5527891516685486
       F1_Class 1           0.49805447459220886
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
> Total Tweets used to generate counterfactuals 1300
> Total counterfactuals added 1300
> Counterfactual size 1170 at rate 0.9
> Train with CF samples 8733


Global seed set to 42


273 60


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(


Epoch 0: 100%|████████████████████████████████| 273/273 [01:05<00:00,  4.17it/s]
Validation: 0it [00:00, ?it/s][A
Validation:   0%|                                        | 0/60 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                           | 0/60 [00:00<?, ?it/s][A
Validation DataLoader 0:   2%|▎                  | 1/60 [00:00<00:03, 17.74it/s][A
Validation DataLoader 0:   3%|▋                  | 2/60 [00:00<00:03, 17.67it/s][A
Validation DataLoader 0:   5%|▉                  | 3/60 [00:00<00:03, 17.48it/s][A
Validation DataLoader 0:   7%|█▎                 | 4/60 [00:00<00:03, 17.32it/s][A
Validation DataLoader 0:   8%|█▌                 | 5/60 [00:00<00:03, 17.39it/s][A
Validation DataLoader 0:  10%|█▉                 | 6/60 [00:00<00:03, 17.45it/s][A
Validation DataLoader 0:  12%|██▏                | 7/60 [00:00<00:03, 17.49it/s][A
Validation DataLoader 0:  13%|██▌                | 8/60 [00:00<00:02, 17.48it/s][A
Validation DataLoader 0:  15%|██▊            

`Trainer.fit` stopped: `max_epochs=5` reached.


Epoch 4: 100%|█| 273/273 [01:11<00:00,  3.81it/s, val_loss=0.207, val_f1_macro_s


You are using a CUDA device ('NVIDIA GeForce RTX 3080') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
Restoring states from the checkpoint path at /home/atif/work/notebook-data/chatgpt/checkpoints/epoch=4-step=1365-v2.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from the checkpoint at /home/atif/work/notebook-data/chatgpt/checkpoints/epoch=4-step=1365-v2.ckpt
  rank_zero_warn(


Testing DataLoader 0: 100%|█████████████████████| 94/94 [00:12<00:00,  7.23it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        Accuracy            0.5263333320617676
        F1-Macro            0.5233079195022583
       F1-Weighted          0.5293841361999512
       F1_Class 0           0.5612843632698059
       F1_Class 1           0.4853314161300659
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
> Total Tweets used to generate counterfactuals 1300
> Total counterfactuals added 1300
> Counterfactual size 1300 at rate 1.0
> Train with CF samples 8863


Global seed set to 42


277 60


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(


Epoch 0: 100%|████████████████████████████████| 277/277 [01:06<00:00,  4.17it/s]
Validation: 0it [00:00, ?it/s][A
Validation:   0%|                                        | 0/60 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                           | 0/60 [00:00<?, ?it/s][A
Validation DataLoader 0:   2%|▎                  | 1/60 [00:00<00:03, 18.03it/s][A
Validation DataLoader 0:   3%|▋                  | 2/60 [00:00<00:03, 17.73it/s][A
Validation DataLoader 0:   5%|▉                  | 3/60 [00:00<00:03, 17.36it/s][A
Validation DataLoader 0:   7%|█▎                 | 4/60 [00:00<00:03, 17.22it/s][A
Validation DataLoader 0:   8%|█▌                 | 5/60 [00:00<00:03, 17.31it/s][A
Validation DataLoader 0:  10%|█▉                 | 6/60 [00:00<00:03, 17.36it/s][A
Validation DataLoader 0:  12%|██▏                | 7/60 [00:00<00:03, 17.40it/s][A
Validation DataLoader 0:  13%|██▌                | 8/60 [00:00<00:02, 17.41it/s][A
Validation DataLoader 0:  15%|██▊            

`Trainer.fit` stopped: `max_epochs=5` reached.


Epoch 4: 100%|█| 277/277 [01:12<00:00,  3.82it/s, val_loss=0.232, val_f1_macro_s


You are using a CUDA device ('NVIDIA GeForce RTX 3080') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
Restoring states from the checkpoint path at /home/atif/work/notebook-data/chatgpt/checkpoints/epoch=4-step=1385-v2.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from the checkpoint at /home/atif/work/notebook-data/chatgpt/checkpoints/epoch=4-step=1385-v2.ckpt
  rank_zero_warn(


Testing DataLoader 0: 100%|█████████████████████| 94/94 [00:12<00:00,  7.23it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        Accuracy            0.5199999809265137
        F1-Macro            0.5172114372253418
       F1-Weighted          0.5230821371078491
       F1_Class 0            0.553903341293335
       F1_Class 1           0.48051947355270386
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Religion (10869, 4)
> Train samples 8695


Global seed set to 42


272 68


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(


Epoch 0: 100%|████████████████████████████████| 272/272 [00:51<00:00,  5.29it/s]
Validation: 0it [00:00, ?it/s][A
Validation:   0%|                                        | 0/68 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                           | 0/68 [00:00<?, ?it/s][A
Validation DataLoader 0:   1%|▎                  | 1/68 [00:00<00:03, 18.66it/s][A
Validation DataLoader 0:   3%|▌                  | 2/68 [00:00<00:03, 17.99it/s][A
Validation DataLoader 0:   4%|▊                  | 3/68 [00:00<00:03, 17.90it/s][A
Validation DataLoader 0:   6%|█                  | 4/68 [00:00<00:03, 17.81it/s][A
Validation DataLoader 0:   7%|█▍                 | 5/68 [00:00<00:03, 17.85it/s][A
Validation DataLoader 0:   9%|█▋                 | 6/68 [00:00<00:03, 17.91it/s][A
Validation DataLoader 0:  10%|█▉                 | 7/68 [00:00<00:03, 17.96it/s][A
Validation DataLoader 0:  12%|██▏                | 8/68 [00:00<00:03, 17.95it/s][A
Validation DataLoader 0:  13%|██▌            

`Trainer.fit` stopped: `max_epochs=5` reached.


Epoch 4: 100%|█| 272/272 [00:57<00:00,  4.70it/s, val_loss=0.230, val_f1_macro_s


You are using a CUDA device ('NVIDIA GeForce RTX 3080') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
Restoring states from the checkpoint path at /home/atif/work/notebook-data/chatgpt/checkpoints/epoch=4-step=1360-v6.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from the checkpoint at /home/atif/work/notebook-data/chatgpt/checkpoints/epoch=4-step=1360-v6.ckpt
  rank_zero_warn(


Testing DataLoader 0: 100%|█████████████████████| 94/94 [00:12<00:00,  7.24it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        Accuracy            0.5236666798591614
        F1-Macro            0.5226120948791504
       F1-Weighted          0.5262020826339722
       F1_Class 0           0.5450493693351746
       F1_Class 1            0.500174880027771
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
> Total Tweets used to generate counterfactuals 1235
> Total counterfactuals added 1235
> Counterfactual size 124 at rate 0.1
> Train with CF samples 8819


Global seed set to 42


276 68


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(


Epoch 0: 100%|████████████████████████████████| 276/276 [00:52<00:00,  5.30it/s]
Validation: 0it [00:00, ?it/s][A
Validation:   0%|                                        | 0/68 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                           | 0/68 [00:00<?, ?it/s][A
Validation DataLoader 0:   1%|▎                  | 1/68 [00:00<00:03, 18.48it/s][A
Validation DataLoader 0:   3%|▌                  | 2/68 [00:00<00:03, 18.09it/s][A
Validation DataLoader 0:   4%|▊                  | 3/68 [00:00<00:03, 18.01it/s][A
Validation DataLoader 0:   6%|█                  | 4/68 [00:00<00:03, 17.84it/s][A
Validation DataLoader 0:   7%|█▍                 | 5/68 [00:00<00:03, 17.93it/s][A
Validation DataLoader 0:   9%|█▋                 | 6/68 [00:00<00:03, 17.99it/s][A
Validation DataLoader 0:  10%|█▉                 | 7/68 [00:00<00:03, 18.03it/s][A
Validation DataLoader 0:  12%|██▏                | 8/68 [00:00<00:03, 18.04it/s][A
Validation DataLoader 0:  13%|██▌            

`Trainer.fit` stopped: `max_epochs=5` reached.


Epoch 4: 100%|█| 276/276 [00:58<00:00,  4.72it/s, val_loss=0.218, val_f1_macro_s


You are using a CUDA device ('NVIDIA GeForce RTX 3080') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
Restoring states from the checkpoint path at /home/atif/work/notebook-data/chatgpt/checkpoints/epoch=4-step=1380-v11.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from the checkpoint at /home/atif/work/notebook-data/chatgpt/checkpoints/epoch=4-step=1380-v11.ckpt
  rank_zero_warn(


Testing DataLoader 0: 100%|█████████████████████| 94/94 [00:13<00:00,  7.21it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        Accuracy            0.5203333497047424
        F1-Macro            0.5200644731521606
       F1-Weighted          0.5218818783760071
       F1_Class 0           0.5314229726791382
       F1_Class 1           0.5087060332298279
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
> Total Tweets used to generate counterfactuals 1235
> Total counterfactuals added 1235
> Counterfactual size 247 at rate 0.2
> Train with CF samples 8942


Global seed set to 42


280 68


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Sanity Checking: 0it [00:00, ?it/s]

Traceback (most recent call last):
  File "/usr/lib/python3.10/multiprocessing/util.py", line 300, in _run_finalizers
    finalizer()
  File "/usr/lib/python3.10/multiprocessing/util.py", line 224, in __call__
    res = self._callback(*self._args, **self._kwargs)
  File "/usr/lib/python3.10/multiprocessing/util.py", line 133, in _remove_temp_dir
    rmtree(tempdir)
  File "/usr/lib/python3.10/shutil.py", line 730, in rmtree
    onerror(os.rmdir, path, sys.exc_info())
  File "/usr/lib/python3.10/shutil.py", line 728, in rmtree
    os.rmdir(path)
OSError: [Errno 39] Directory not empty: '/tmp/pymp-r5y6wzjt'
  rank_zero_warn(


Epoch 0: 100%|████████████████████████████████| 280/280 [00:52<00:00,  5.30it/s]
Validation: 0it [00:00, ?it/s][A
Validation:   0%|                                        | 0/68 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                           | 0/68 [00:00<?, ?it/s][A
Validation DataLoader 0:   1%|▎                  | 1/68 [00:00<00:03, 19.05it/s][A
Validation DataLoader 0:   3%|▌                  | 2/68 [00:00<00:03, 18.36it/s][A
Validation DataLoader 0:   4%|▊                  | 3/68 [00:00<00:03, 18.29it/s][A
Validation DataLoader 0:   6%|█                  | 4/68 [00:00<00:03, 18.06it/s][A
Validation DataLoader 0:   7%|█▍                 | 5/68 [00:00<00:03, 18.10it/s][A
Validation DataLoader 0:   9%|█▋                 | 6/68 [00:00<00:03, 18.06it/s][A
Validation DataLoader 0:  10%|█▉                 | 7/68 [00:00<00:03, 18.03it/s][A
Validation DataLoader 0:  12%|██▏                | 8/68 [00:00<00:03, 18.00it/s][A
Validation DataLoader 0:  13%|██▌            

`Trainer.fit` stopped: `max_epochs=5` reached.


Epoch 4: 100%|█| 280/280 [00:59<00:00,  4.74it/s, val_loss=0.229, val_f1_macro_s


You are using a CUDA device ('NVIDIA GeForce RTX 3080') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
Restoring states from the checkpoint path at /home/atif/work/notebook-data/chatgpt/checkpoints/epoch=4-step=1400-v5.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from the checkpoint at /home/atif/work/notebook-data/chatgpt/checkpoints/epoch=4-step=1400-v5.ckpt
  rank_zero_warn(


Testing DataLoader 0: 100%|█████████████████████| 94/94 [00:12<00:00,  7.24it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        Accuracy            0.5216666460037231
        F1-Macro            0.5215283632278442
       F1-Weighted          0.5228298306465149
       F1_Class 0           0.5296624302864075
       F1_Class 1           0.5133943557739258
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
> Total Tweets used to generate counterfactuals 1235
> Total counterfactuals added 1235
> Counterfactual size 371 at rate 0.30000000000000004
> Train with CF samples 9066


Global seed set to 42


284 68


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(


Epoch 0: 100%|████████████████████████████████| 284/284 [00:53<00:00,  5.30it/s]
Validation: 0it [00:00, ?it/s][A
Validation:   0%|                                        | 0/68 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                           | 0/68 [00:00<?, ?it/s][A
Validation DataLoader 0:   1%|▎                  | 1/68 [00:00<00:03, 18.46it/s][A
Validation DataLoader 0:   3%|▌                  | 2/68 [00:00<00:03, 18.20it/s][A
Validation DataLoader 0:   4%|▊                  | 3/68 [00:00<00:03, 18.02it/s][A
Validation DataLoader 0:   6%|█                  | 4/68 [00:00<00:03, 17.98it/s][A
Validation DataLoader 0:   7%|█▍                 | 5/68 [00:00<00:03, 18.00it/s][A
Validation DataLoader 0:   9%|█▋                 | 6/68 [00:00<00:03, 18.00it/s][A
Validation DataLoader 0:  10%|█▉                 | 7/68 [00:00<00:03, 18.03it/s][A
Validation DataLoader 0:  12%|██▏                | 8/68 [00:00<00:03, 18.04it/s][A
Validation DataLoader 0:  13%|██▌            

In [28]:
def run_now_full():
    complete_result = []
    out_dict = {}
    d = 'Complete'
    out_dict[d] = {}
    sel_df = df
    print(d, sel_df.shape)
    X, y = sel_df['Tweet'], sel_df['Label'].astype(int)

    X_test, y_test = df_test['Tweet'], df_test['Label'].astype(int)
    res_bert_lst = run_experiment_org(X, y, X_test, y_test)
    out_dict[d]['Org'] = res_bert_lst
    out_dict[d]['CF'] = {}
    for cf_size_prop_to_data in cf_size_prop_to_data_lst:
        res_bert_lst = run_experiment_counter_factuals(X, y, X_test, y_test, cf_size_prop_to_data=cf_size_prop_to_data)
        out_dict[d]['CF'][cf_size_prop_to_data] = res_bert_lst
    complete_result.append(out_dict)
    json.dump(out_dict, open('out/'+ MODEL_LLM + '-EP_'+ str(MAX_EPOCHS) + '-ft-lshd22-heval19-ood-full.json', 'w'))
    return complete_result


start = time.time()
complete_result = run_now_full()
end = time.time()
elapsed = end - start
print(f"Time elapsed {elapsed/60:.2f} min")

Complete (55660, 4)
> Train samples 44528


Global seed set to 42


1392 348


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(


Epoch 0: 100%|██████████████████████████████| 1392/1392 [05:36<00:00,  4.14it/s]
Validation: 0it [00:00, ?it/s][A
Validation:   0%|                                       | 0/348 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                          | 0/348 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                  | 1/348 [00:00<00:24, 14.37it/s][A
Validation DataLoader 0:   1%|                  | 2/348 [00:00<00:24, 14.00it/s][A
Validation DataLoader 0:   1%|▏                 | 3/348 [00:00<00:24, 13.82it/s][A
Validation DataLoader 0:   1%|▏                 | 4/348 [00:00<00:24, 13.77it/s][A
Validation DataLoader 0:   1%|▎                 | 5/348 [00:00<00:24, 13.80it/s][A
Validation DataLoader 0:   2%|▎                 | 6/348 [00:00<00:24, 13.82it/s][A
Validation DataLoader 0:   2%|▎                 | 7/348 [00:00<00:24, 13.84it/s][A
Validation DataLoader 0:   2%|▍                 | 8/348 [00:00<00:24, 13.82it/s][A
Validation DataLoader 0:   3%|▍              

`Trainer.fit` stopped: `max_epochs=5` reached.


Epoch 4: 100%|█| 1392/1392 [06:03<00:00,  3.83it/s, val_loss=0.216, val_f1_macro


You are using a CUDA device ('NVIDIA GeForce RTX 3080') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
Restoring states from the checkpoint path at /home/atif/work/notebook-data/chatgpt/checkpoints/epoch=4-step=6960.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from the checkpoint at /home/atif/work/notebook-data/chatgpt/checkpoints/epoch=4-step=6960.ckpt
  rank_zero_warn(


Testing DataLoader 0: 100%|█████████████████████| 94/94 [00:12<00:00,  7.26it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        Accuracy            0.5223333239555359
        F1-Macro            0.5217249393463135
       F1-Weighted          0.5244542360305786
       F1_Class 0           0.5387833714485168
       F1_Class 1           0.5046664476394653
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
> Total Tweets used to generate counterfactuals 6549
> Total counterfactuals added 6549
> Counterfactual size 655 at rate 0.1
> Train with CF samples 45183


Global seed set to 42


1412 348


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(


Epoch 0: 100%|██████████████████████████████| 1412/1412 [05:41<00:00,  4.14it/s]
Validation: 0it [00:00, ?it/s][A
Validation:   0%|                                       | 0/348 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                          | 0/348 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                  | 1/348 [00:00<00:24, 14.32it/s][A
Validation DataLoader 0:   1%|                  | 2/348 [00:00<00:24, 13.94it/s][A
Validation DataLoader 0:   1%|▏                 | 3/348 [00:00<00:25, 13.77it/s][A
Validation DataLoader 0:   1%|▏                 | 4/348 [00:00<00:25, 13.70it/s][A
Validation DataLoader 0:   1%|▎                 | 5/348 [00:00<00:24, 13.75it/s][A
Validation DataLoader 0:   2%|▎                 | 6/348 [00:00<00:24, 13.79it/s][A
Validation DataLoader 0:   2%|▎                 | 7/348 [00:00<00:24, 13.81it/s][A
Validation DataLoader 0:   2%|▍                 | 8/348 [00:00<00:24, 13.81it/s][A
Validation DataLoader 0:   3%|▍              

`Trainer.fit` stopped: `max_epochs=5` reached.


Epoch 4: 100%|█| 1412/1412 [06:08<00:00,  3.83it/s, val_loss=0.215, val_f1_macro


You are using a CUDA device ('NVIDIA GeForce RTX 3080') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
Restoring states from the checkpoint path at /home/atif/work/notebook-data/chatgpt/checkpoints/epoch=4-step=7060.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from the checkpoint at /home/atif/work/notebook-data/chatgpt/checkpoints/epoch=4-step=7060.ckpt
  rank_zero_warn(


Testing DataLoader 0: 100%|█████████████████████| 94/94 [00:12<00:00,  7.24it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        Accuracy            0.5176666378974915
        F1-Macro            0.5173147916793823
       F1-Weighted          0.5194000005722046
       F1_Class 0           0.5303472876548767
       F1_Class 1           0.5042822957038879
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
> Total Tweets used to generate counterfactuals 6549
> Total counterfactuals added 6549
> Counterfactual size 1310 at rate 0.2
> Train with CF samples 45838


Global seed set to 42


1433 348


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(


Epoch 0: 100%|██████████████████████████████| 1433/1433 [05:45<00:00,  4.14it/s]
Validation: 0it [00:00, ?it/s][A
Validation:   0%|                                       | 0/348 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                          | 0/348 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                  | 1/348 [00:00<00:24, 14.11it/s][A
Validation DataLoader 0:   1%|                  | 2/348 [00:00<00:24, 13.90it/s][A
Validation DataLoader 0:   1%|▏                 | 3/348 [00:00<00:24, 13.88it/s][A
Validation DataLoader 0:   1%|▏                 | 4/348 [00:00<00:25, 13.76it/s][A
Validation DataLoader 0:   1%|▎                 | 5/348 [00:00<00:24, 13.77it/s][A
Validation DataLoader 0:   2%|▎                 | 6/348 [00:00<00:24, 13.77it/s][A
Validation DataLoader 0:   2%|▎                 | 7/348 [00:00<00:24, 13.77it/s][A
Validation DataLoader 0:   2%|▍                 | 8/348 [00:00<00:24, 13.76it/s][A
Validation DataLoader 0:   3%|▍              

`Trainer.fit` stopped: `max_epochs=5` reached.


Epoch 4: 100%|█| 1433/1433 [06:13<00:00,  3.83it/s, val_loss=0.188, val_f1_macro


You are using a CUDA device ('NVIDIA GeForce RTX 3080') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
Restoring states from the checkpoint path at /home/atif/work/notebook-data/chatgpt/checkpoints/epoch=4-step=7165.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from the checkpoint at /home/atif/work/notebook-data/chatgpt/checkpoints/epoch=4-step=7165.ckpt
  rank_zero_warn(


Testing DataLoader 0: 100%|█████████████████████| 94/94 [00:12<00:00,  7.24it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        Accuracy            0.5213333368301392
        F1-Macro            0.5201336741447449
       F1-Weighted          0.5239726305007935
       F1_Class 0           0.5441269874572754
       F1_Class 1           0.49614036083221436
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
> Total Tweets used to generate counterfactuals 6549
> Total counterfactuals added 6549
> Counterfactual size 1965 at rate 0.30000000000000004
> Train with CF samples 46493


Global seed set to 42


1453 348


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(


Epoch 0: 100%|██████████████████████████████| 1453/1453 [05:51<00:00,  4.14it/s]
Validation: 0it [00:00, ?it/s][A
Validation:   0%|                                       | 0/348 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                          | 0/348 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                  | 1/348 [00:00<00:24, 14.31it/s][A
Validation DataLoader 0:   1%|                  | 2/348 [00:00<00:24, 14.10it/s][A
Validation DataLoader 0:   1%|▏                 | 3/348 [00:00<00:24, 13.93it/s][A
Validation DataLoader 0:   1%|▏                 | 4/348 [00:00<00:24, 13.82it/s][A
Validation DataLoader 0:   1%|▎                 | 5/348 [00:00<00:24, 13.84it/s][A
Validation DataLoader 0:   2%|▎                 | 6/348 [00:00<00:24, 13.86it/s][A
Validation DataLoader 0:   2%|▎                 | 7/348 [00:00<00:24, 13.86it/s][A
Validation DataLoader 0:   2%|▍                 | 8/348 [00:00<00:24, 13.83it/s][A
Validation DataLoader 0:   3%|▍              

`Trainer.fit` stopped: `max_epochs=5` reached.


Epoch 4: 100%|█| 1453/1453 [06:18<00:00,  3.83it/s, val_loss=0.214, val_f1_macro


You are using a CUDA device ('NVIDIA GeForce RTX 3080') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
Restoring states from the checkpoint path at /home/atif/work/notebook-data/chatgpt/checkpoints/epoch=4-step=7265.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from the checkpoint at /home/atif/work/notebook-data/chatgpt/checkpoints/epoch=4-step=7265.ckpt
  rank_zero_warn(


Testing DataLoader 0: 100%|█████████████████████| 94/94 [00:12<00:00,  7.24it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        Accuracy            0.5243333578109741
        F1-Macro            0.5243053436279297
       F1-Weighted          0.5248888731002808
       F1_Class 0           0.5279523730278015
       F1_Class 1           0.5206583738327026
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
> Total Tweets used to generate counterfactuals 6549
> Total counterfactuals added 6549
> Counterfactual size 2620 at rate 0.4
> Train with CF samples 47148


Global seed set to 42


1474 348


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(


Epoch 0: 100%|██████████████████████████████| 1474/1474 [05:55<00:00,  4.14it/s]
Validation: 0it [00:00, ?it/s][A
Validation:   0%|                                       | 0/348 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                          | 0/348 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                  | 1/348 [00:00<00:24, 14.20it/s][A
Validation DataLoader 0:   1%|                  | 2/348 [00:00<00:25, 13.72it/s][A
Validation DataLoader 0:   1%|▏                 | 3/348 [00:00<00:25, 13.78it/s][A
Validation DataLoader 0:   1%|▏                 | 4/348 [00:00<00:25, 13.73it/s][A
Validation DataLoader 0:   1%|▎                 | 5/348 [00:00<00:24, 13.76it/s][A
Validation DataLoader 0:   2%|▎                 | 6/348 [00:00<00:24, 13.79it/s][A
Validation DataLoader 0:   2%|▎                 | 7/348 [00:00<00:24, 13.79it/s][A
Validation DataLoader 0:   2%|▍                 | 8/348 [00:00<00:24, 13.79it/s][A
Validation DataLoader 0:   3%|▍              

`Trainer.fit` stopped: `max_epochs=5` reached.


Epoch 4: 100%|█| 1474/1474 [06:23<00:00,  3.84it/s, val_loss=0.208, val_f1_macro


You are using a CUDA device ('NVIDIA GeForce RTX 3080') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
Restoring states from the checkpoint path at /home/atif/work/notebook-data/chatgpt/checkpoints/epoch=4-step=7370.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from the checkpoint at /home/atif/work/notebook-data/chatgpt/checkpoints/epoch=4-step=7370.ckpt
  rank_zero_warn(


Testing DataLoader 0: 100%|█████████████████████| 94/94 [00:12<00:00,  7.24it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        Accuracy             0.527999997138977
        F1-Macro            0.5264068245887756
       F1-Weighted          0.5308017730712891
       F1_Class 0            0.553875207901001
       F1_Class 1           0.4989384412765503
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
> Total Tweets used to generate counterfactuals 6549
> Total counterfactuals added 6549
> Counterfactual size 3274 at rate 0.5
> Train with CF samples 47802


Global seed set to 42


1494 348


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(


Epoch 0: 100%|██████████████████████████████| 1494/1494 [06:00<00:00,  4.14it/s]
Validation: 0it [00:00, ?it/s][A
Validation:   0%|                                       | 0/348 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                          | 0/348 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                  | 1/348 [00:00<00:24, 14.06it/s][A
Validation DataLoader 0:   1%|                  | 2/348 [00:00<00:24, 13.93it/s][A
Validation DataLoader 0:   1%|▏                 | 3/348 [00:00<00:24, 13.80it/s][A
Validation DataLoader 0:   1%|▏                 | 4/348 [00:00<00:25, 13.71it/s][A
Validation DataLoader 0:   1%|▎                 | 5/348 [00:00<00:24, 13.73it/s][A
Validation DataLoader 0:   2%|▎                 | 6/348 [00:00<00:24, 13.74it/s][A
Validation DataLoader 0:   2%|▎                 | 7/348 [00:00<00:24, 13.74it/s][A
Validation DataLoader 0:   2%|▍                 | 8/348 [00:00<00:24, 13.73it/s][A
Validation DataLoader 0:   3%|▍              

`Trainer.fit` stopped: `max_epochs=5` reached.


Epoch 4: 100%|█| 1494/1494 [06:28<00:00,  3.85it/s, val_loss=0.193, val_f1_macro


You are using a CUDA device ('NVIDIA GeForce RTX 3080') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
Restoring states from the checkpoint path at /home/atif/work/notebook-data/chatgpt/checkpoints/epoch=4-step=7470.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from the checkpoint at /home/atif/work/notebook-data/chatgpt/checkpoints/epoch=4-step=7470.ckpt
  rank_zero_warn(


Testing DataLoader 0: 100%|█████████████████████| 94/94 [00:12<00:00,  7.23it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        Accuracy            0.5163333415985107
        F1-Macro            0.5144870281219482
       F1-Weighted          0.5192773938179016
       F1_Class 0           0.5444269776344299
       F1_Class 1           0.48454707860946655
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
> Total Tweets used to generate counterfactuals 6549
> Total counterfactuals added 6549
> Counterfactual size 3929 at rate 0.6
> Train with CF samples 48457


Global seed set to 42


1515 348


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(


Epoch 0: 100%|██████████████████████████████| 1515/1515 [06:05<00:00,  4.14it/s]
Validation: 0it [00:00, ?it/s][A
Validation:   0%|                                       | 0/348 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                          | 0/348 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                  | 1/348 [00:00<00:24, 14.37it/s][A
Validation DataLoader 0:   1%|                  | 2/348 [00:00<00:25, 13.83it/s][A
Validation DataLoader 0:   1%|▏                 | 3/348 [00:00<00:24, 13.87it/s][A
Validation DataLoader 0:   1%|▏                 | 4/348 [00:00<00:24, 13.83it/s][A
Validation DataLoader 0:   1%|▎                 | 5/348 [00:00<00:24, 13.84it/s][A
Validation DataLoader 0:   2%|▎                 | 6/348 [00:00<00:24, 13.85it/s][A
Validation DataLoader 0:   2%|▎                 | 7/348 [00:00<00:24, 13.86it/s][A
Validation DataLoader 0:   2%|▍                 | 8/348 [00:00<00:24, 13.85it/s][A
Validation DataLoader 0:   3%|▍              

`Trainer.fit` stopped: `max_epochs=5` reached.


Epoch 4: 100%|█| 1515/1515 [06:33<00:00,  3.85it/s, val_loss=0.213, val_f1_macro


You are using a CUDA device ('NVIDIA GeForce RTX 3080') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
Restoring states from the checkpoint path at /home/atif/work/notebook-data/chatgpt/checkpoints/epoch=4-step=7575.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from the checkpoint at /home/atif/work/notebook-data/chatgpt/checkpoints/epoch=4-step=7575.ckpt
  rank_zero_warn(


Testing DataLoader 0: 100%|█████████████████████| 94/94 [00:12<00:00,  7.23it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        Accuracy            0.5246666669845581
        F1-Macro            0.5230990648269653
       F1-Weighted          0.5274738073348999
       F1_Class 0           0.5504413843154907
       F1_Class 1           0.49575671553611755
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
> Total Tweets used to generate counterfactuals 6549
> Total counterfactuals added 6549
> Counterfactual size 4584 at rate 0.7000000000000001
> Train with CF samples 49112


Global seed set to 42


1535 348


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(


Epoch 0: 100%|██████████████████████████████| 1535/1535 [06:10<00:00,  4.14it/s]
Validation: 0it [00:00, ?it/s][A
Validation:   0%|                                       | 0/348 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                          | 0/348 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                  | 1/348 [00:00<00:24, 14.19it/s][A
Validation DataLoader 0:   1%|                  | 2/348 [00:00<00:25, 13.79it/s][A
Validation DataLoader 0:   1%|▏                 | 3/348 [00:00<00:25, 13.60it/s][A
Validation DataLoader 0:   1%|▏                 | 4/348 [00:00<00:25, 13.51it/s][A
Validation DataLoader 0:   1%|▎                 | 5/348 [00:00<00:25, 13.58it/s][A
Validation DataLoader 0:   2%|▎                 | 6/348 [00:00<00:25, 13.62it/s][A
Validation DataLoader 0:   2%|▎                 | 7/348 [00:00<00:24, 13.66it/s][A
Validation DataLoader 0:   2%|▍                 | 8/348 [00:00<00:24, 13.67it/s][A
Validation DataLoader 0:   3%|▍              

`Trainer.fit` stopped: `max_epochs=5` reached.


Epoch 4: 100%|█| 1535/1535 [06:38<00:00,  3.85it/s, val_loss=0.209, val_f1_macro


You are using a CUDA device ('NVIDIA GeForce RTX 3080') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
Restoring states from the checkpoint path at /home/atif/work/notebook-data/chatgpt/checkpoints/epoch=4-step=7675.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from the checkpoint at /home/atif/work/notebook-data/chatgpt/checkpoints/epoch=4-step=7675.ckpt
  rank_zero_warn(


Testing DataLoader 0: 100%|█████████████████████| 94/94 [00:12<00:00,  7.24it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        Accuracy            0.5233333110809326
        F1-Macro             0.521869421005249
       F1-Weighted          0.5261024236679077
       F1_Class 0           0.5483259558677673
       F1_Class 1           0.4954128563404083
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
> Total Tweets used to generate counterfactuals 6549
> Total counterfactuals added 6549
> Counterfactual size 5239 at rate 0.8
> Train with CF samples 49767


Global seed set to 42


1556 348


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(


Epoch 0: 100%|██████████████████████████████| 1556/1556 [06:15<00:00,  4.14it/s]
Validation: 0it [00:00, ?it/s][A
Validation:   0%|                                       | 0/348 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                          | 0/348 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                  | 1/348 [00:00<00:24, 14.26it/s][A
Validation DataLoader 0:   1%|                  | 2/348 [00:00<00:24, 14.01it/s][A
Validation DataLoader 0:   1%|▏                 | 3/348 [00:00<00:24, 13.85it/s][A
Validation DataLoader 0:   1%|▏                 | 4/348 [00:00<00:24, 13.79it/s][A
Validation DataLoader 0:   1%|▎                 | 5/348 [00:00<00:24, 13.83it/s][A
Validation DataLoader 0:   2%|▎                 | 6/348 [00:00<00:24, 13.84it/s][A
Validation DataLoader 0:   2%|▎                 | 7/348 [00:00<00:24, 13.81it/s][A
Validation DataLoader 0:   2%|▍                 | 8/348 [00:00<00:24, 13.79it/s][A
Validation DataLoader 0:   3%|▍              

`Trainer.fit` stopped: `max_epochs=5` reached.


Epoch 4: 100%|█| 1556/1556 [06:43<00:00,  3.86it/s, val_loss=0.202, val_f1_macro


You are using a CUDA device ('NVIDIA GeForce RTX 3080') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
Restoring states from the checkpoint path at /home/atif/work/notebook-data/chatgpt/checkpoints/epoch=4-step=7780.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from the checkpoint at /home/atif/work/notebook-data/chatgpt/checkpoints/epoch=4-step=7780.ckpt
  rank_zero_warn(


Testing DataLoader 0: 100%|█████████████████████| 94/94 [00:13<00:00,  7.23it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        Accuracy            0.5243333578109741
        F1-Macro            0.5234521627426147
       F1-Weighted          0.5267308354377747
       F1_Class 0           0.5439437627792358
       F1_Class 1           0.5029606223106384
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
> Total Tweets used to generate counterfactuals 6549
> Total counterfactuals added 6549
> Counterfactual size 5894 at rate 0.9
> Train with CF samples 50422


Global seed set to 42


1576 348


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(


Epoch 0: 100%|██████████████████████████████| 1576/1576 [06:20<00:00,  4.14it/s]
Validation: 0it [00:00, ?it/s][A
Validation:   0%|                                       | 0/348 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                          | 0/348 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                  | 1/348 [00:00<00:24, 13.97it/s][A
Validation DataLoader 0:   1%|                  | 2/348 [00:00<00:25, 13.60it/s][A
Validation DataLoader 0:   1%|▏                 | 3/348 [00:00<00:25, 13.69it/s][A
Validation DataLoader 0:   1%|▏                 | 4/348 [00:00<00:25, 13.56it/s][A
Validation DataLoader 0:   1%|▎                 | 5/348 [00:00<00:25, 13.62it/s][A
Validation DataLoader 0:   2%|▎                 | 6/348 [00:00<00:25, 13.66it/s][A
Validation DataLoader 0:   2%|▎                 | 7/348 [00:00<00:24, 13.69it/s][A
Validation DataLoader 0:   2%|▍                 | 8/348 [00:00<00:24, 13.69it/s][A
Validation DataLoader 0:   3%|▍              

`Trainer.fit` stopped: `max_epochs=5` reached.


Epoch 4: 100%|█| 1576/1576 [06:48<00:00,  3.86it/s, val_loss=0.199, val_f1_macro


You are using a CUDA device ('NVIDIA GeForce RTX 3080') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
Restoring states from the checkpoint path at /home/atif/work/notebook-data/chatgpt/checkpoints/epoch=4-step=7880.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from the checkpoint at /home/atif/work/notebook-data/chatgpt/checkpoints/epoch=4-step=7880.ckpt
  rank_zero_warn(


Testing DataLoader 0: 100%|█████████████████████| 94/94 [00:13<00:00,  7.23it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        Accuracy            0.5203333497047424
        F1-Macro            0.5183398723602295
       F1-Weighted          0.5232977271080017
       F1_Class 0           0.5493266582489014
       F1_Class 1           0.48735305666923523
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
> Total Tweets used to generate counterfactuals 6549
> Total counterfactuals added 6549
> Counterfactual size 6549 at rate 1.0
> Train with CF samples 51077


Global seed set to 42


1597 348


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(


Epoch 0: 100%|██████████████████████████████| 1597/1597 [08:33<00:00,  3.11it/s]
Validation: 0it [00:00, ?it/s][A
Validation:   0%|                                       | 0/348 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                          | 0/348 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                  | 1/348 [00:00<00:24, 13.95it/s][A
Validation DataLoader 0:   1%|                  | 2/348 [00:00<00:25, 13.84it/s][A
Validation DataLoader 0:   1%|▏                 | 3/348 [00:00<00:25, 13.70it/s][A
Validation DataLoader 0:   1%|▏                 | 4/348 [00:00<00:25, 13.66it/s][A
Validation DataLoader 0:   1%|▎                 | 5/348 [00:00<00:25, 13.71it/s][A
Validation DataLoader 0:   2%|▎                 | 6/348 [00:00<00:24, 13.74it/s][A
Validation DataLoader 0:   2%|▎                 | 7/348 [00:00<00:24, 13.74it/s][A
Validation DataLoader 0:   2%|▍                 | 8/348 [00:00<00:24, 13.74it/s][A
Validation DataLoader 0:   3%|▍              

`Trainer.fit` stopped: `max_epochs=5` reached.


Epoch 4: 100%|█| 1597/1597 [09:01<00:00,  2.95it/s, val_loss=0.203, val_f1_macro


You are using a CUDA device ('NVIDIA GeForce RTX 3080') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
Restoring states from the checkpoint path at /home/atif/work/notebook-data/chatgpt/checkpoints/epoch=4-step=7985.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from the checkpoint at /home/atif/work/notebook-data/chatgpt/checkpoints/epoch=4-step=7985.ckpt
  rank_zero_warn(


Testing DataLoader 0: 100%|█████████████████████| 94/94 [00:13<00:00,  7.21it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        Accuracy            0.5356666445732117
        F1-Macro            0.5354746580123901
       F1-Weighted          0.5369858741760254
       F1_Class 0           0.5449199676513672
       F1_Class 1           0.5260292887687683
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Time elapsed 371.38 min
