In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
from pathlib import Path
from datasets import load_from_disk
from sentence_transformers import SentenceTransformer
import numpy as np
import srsly
import hnswlib as hb
from src.data.datamodule import DataModule
from src.huggingface.datamodule import ClassificationDataModule, ClassificationActiveDataModule
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from src.huggingface.estimators import EstimatorForSequenceClassification

In [3]:
data_path = Path("../data/prepared/ag_news/")
meta = srsly.read_yaml(data_path / "metadata.yaml")
dataset_dict = load_from_disk(data_path)



In [4]:
tokenizer = AutoTokenizer.from_pretrained(meta["name_or_path"])

In [5]:
dm = ClassificationDataModule.from_dataset_dict(dataset_dict, tokenizer)
dm.hparams

"batch_size":             32
"drop_last":              False
"eval_batch_size":        32
"max_source_length":      128
"num_workers":            0
"persistent_workers":     False
"pin_memory":             True
"replacement":            False
"seed":                   42
"shuffle":                True
"tokenizer_name_or_path": google/bert_uncased_L-2_H-128_A-2

In [22]:
adm = ClassificationActiveDataModule.from_dataset_dict(dataset_dict, tokenizer)
adm.hparams

[<SpecialKeys.ID: 'unique_id'>]


"batch_size":             32
"drop_last":              False
"eval_batch_size":        32
"max_source_length":      128
"num_workers":            0
"persistent_workers":     False
"pin_memory":             True
"replacement":            False
"seed":                   42
"shuffle":                True
"tokenizer_name_or_path": google/bert_uncased_L-2_H-128_A-2

In [8]:
dm.show_batch("test")

{<InputKeys.INPUT_IDS: 'input_ids'>: tensor([[  101, 10069,  2005,  ...,     0,     0,     0],
         [  101,  1996,  2679,  ...,     0,     0,     0],
         [  101, 18712,  1012,  ...,     0,     0,     0],
         ...,
         [  101, 10478, 19439,  ...,     0,     0,     0],
         [  101,  2027,  1005,  ...,     0,     0,     0],
         [  101,  6505,  4057,  ...,     0,     0,     0]]),
 <InputKeys.ATT_MASK: 'attention_mask'>: tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 <InputKeys.TARGET: 'labels'>: tensor([2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
         3, 3, 1, 1, 1, 1, 1, 1]),
 <InputKeys.ON_CPU: 'on_cpu'>: {<SpecialKeys.ID: 'unique_id'>: [120000,
   120001,
   120002,
   120003,
   120004,
   120005,
   120006,
   120007,
   120008,
   120009,
   120010,


In [7]:
adm.show_batch("test")

{<InputKeys.INPUT_IDS: 'input_ids'>: tensor([[  101, 10069,  2005,  ...,     0,     0,     0],
         [  101,  1996,  2679,  ...,     0,     0,     0],
         [  101, 18712,  1012,  ...,     0,     0,     0],
         ...,
         [  101, 10478, 19439,  ...,     0,     0,     0],
         [  101,  2027,  1005,  ...,     0,     0,     0],
         [  101,  6505,  4057,  ...,     0,     0,     0]]),
 <InputKeys.ATT_MASK: 'attention_mask'>: tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 <InputKeys.TARGET: 'labels'>: tensor([2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
         3, 3, 1, 1, 1, 1, 1, 1])}

In [13]:
adm.test_dataset[0].keys()

dict_keys(['unique_id', 'labels', 'input_ids', 'attention_mask'])

In [19]:
SpecialKeys.ID in adm.test_dataset.features

True

In [18]:
from src.enums import SpecialKeys

In [15]:
adm.columns_on_cpu

[]

In [9]:
adm.label([0])

In [10]:
adm.train_size

1

In [11]:
from hydra.utils import instantiate

In [25]:
model = AutoModelForSequenceClassification.from_pretrained(meta["name_or_path"])

Some weights of the model checkpoint at google/bert_uncased_L-2_H-128_A-2 were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification w

In [29]:
from src.active_learning.strategies import RandomStrategy

In [31]:
r = RandomStrategy(model, 1)

In [34]:
r.loggers

AttributeError: 'RandomStrategy' object has no attribute 'loggers'

In [38]:
df = pd.read_parquet("/home/pl487/allset/outputs/dry_run/ag_news_2023-02-14T19-15-13/labelled_dataset.parquet")

In [43]:
adm.test_dataset[0].keys()

dict_keys(['unique_id', 'labels', 'input_ids', 'attention_mask'])

In [None]:
ad

In [39]:
df

Unnamed: 0,unique_id,labels,text,input_ids,token_type_ids,attention_mask,is_labelled,is_validation,labelling_round
0,201,3,Fake goods tempting young adults Young people ...,"[101, 8275, 5350, 23421, 2402, 6001, 2402, 211...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",True,False,0
1,1749,2,US Army to withhold portion of Halliburton pay...,"[101, 2149, 2390, 2000, 2007, 12640, 4664, 199...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",True,False,0
2,2619,0,Medical Experts Fear Charley's Aftermath PUNTA...,"[101, 2966, 8519, 3571, 20430, 1005, 1055, 105...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",True,False,0
3,2662,0,"Google Cuts Its IPO Price Range SAN JOSE, Cali...","[101, 8224, 7659, 2049, 12997, 2080, 3976, 284...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",True,True,0
4,7168,3,Flying the Sun to Safety When the Genesis caps...,"[101, 3909, 1996, 3103, 2000, 3808, 2043, 1996...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",True,False,0
5,11203,1,The U.S. Gives Mia Hamm a Golden Farewell ATH...,"[101, 1996, 1057, 1012, 1055, 1012, 3957, 8764...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",True,False,0
6,11980,2,Krispy Kreme #39;s hot streak cools The niche ...,"[101, 19031, 7685, 1047, 28578, 2063, 1001, 44...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",True,False,0
7,13664,0,Man dies in blast as Chechens go to polls OISK...,"[101, 2158, 8289, 1999, 8479, 2004, 18178, 866...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",True,False,0
8,13752,3,Alaska Brown Bears Gather for Annual Salmon Fe...,"[101, 7397, 2829, 6468, 8587, 2005, 3296, 1184...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",True,False,0
9,14005,1,"Hewitt Wins Long Island, Sends U.S. Open Warni...","[101, 19482, 5222, 2146, 2479, 1010, 10255, 10...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",True,False,0


In [18]:
random = {
    "_target_": "src.huggingface.estimators.RandomStrategyForSequenceClassification",
    "seed": 42,
}
instantiate(random, model=model, accelerator="cpu")

<src.huggingface.estimators.RandomStrategyForSequenceClassification at 0x7f672f8678b0>

In [35]:
estimator = EstimatorForSequenceClassification(model)

In [36]:
estimator.loggers

AttributeError: 'EstimatorForSequenceClassification' object has no attribute 'loggers'

In [None]:
model_name = "all-mpnet-base-v2" # "all-MiniLM-L6-v2"
sentence_encoder = SentenceTransformer(model_name)