In [1]:
import os
import pandas as pd
from torch import Generator
from torch.utils.data import DataLoader, RandomSampler
# transformer
from transformers.optimization import AdamW, SchedulerType
# peft
from peft import LoraConfig, TaskType
# native
from NlpAnalytics import *

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
USE_LORA = False
GENERATOR = Generator().manual_seed(42)
PATH = 'NlpAnalytics/data/dummy_data'

In [3]:
###  Data Loader
# get tokenizer
tokenizer = BertLoader(load_tokenizer=True).tokenizer
# create DatasetNLP
train_sup_ds = DatasetNLP(input_df=pd.read_csv(os.path.join(PATH, "sup_train.csv")), tokenizer=tokenizer, cols_label=['label'])
train_unsup_ds = DatasetNLP(input_df=pd.read_csv(os.path.join(PATH, "unsup_train.csv"))[:16], tokenizer=tokenizer, cols_to_tokenize=['orig_text', 'aug_text'])
test_ds = DatasetNLP(input_df=pd.read_csv(os.path.join(PATH, "sup_test.csv"))[:16], tokenizer=tokenizer, cols_to_tokenize=['text'], cols_label=['label'])
# assemble data loader
datamodeler = {
    DataLoaderType.TRAINING: DataLoader(train_sup_ds, sampler=RandomSampler(train_sup_ds, generator=GENERATOR), batch_size=8),
    DataLoaderType.TRAINING_UNLABELED: DataLoader(train_unsup_ds, sampler=RandomSampler(train_unsup_ds, generator=GENERATOR), batch_size=8),
    DataLoaderType.VALIDATION: DataLoader(test_ds, sampler=RandomSampler(test_ds, generator=GENERATOR), batch_size=8)}



In [4]:
### Model & Optimization
num_labels = 2
# it has to be NATIVE one not HF [TODO: make it compatible with _HF]
loader = BertClassifierLoader(ClassifierType.BERT_CLASSIFIER, "bert-base-uncased", num_labels, 0.1)

In [5]:
### Optimizer set up
if not USE_LORA:
    optimizer = AdamNLP.newNLPAdam(loader.model, {'embeddings':True, 'encoder': 9}, lr = 0.0005)
    model = optimizer.get_model_transformed()
else:
    lora_config = LoraConfig(task_type=TaskType.SEQ_CLS,target_modules=["query", "key", "value"], r=1, lora_alpha=1, lora_dropout=0.1)
    optimizer = AdamNLP.newNLPAdam_LORA(loader.model, lora_config)
    model = optimizer.get_model_transformed()
# aux model
aux_model = MultiLabelClassifier(model.bert.config.hidden_size, num_labels)
aux_optimizer = AdamW(aux_model.parameters(), lr=0.0005)



In [6]:
### Training
trainer = TrainerMixAndMatch(model, aux_model, datamodeler, optimizer, aux_optimizer)
trainer.train(2, schedule_type = SchedulerType.CONSTANT, save_model_freq=1)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

For epoch 1, the mean sup loss is: 0.6774942278862, and accuracy is: 0.625.
Validation accuracy is: 0.0.



Epoch:  50%|█████     | 1/2 [00:02<00:02,  2.09s/it]

For epoch 2, the mean sup loss is: 1.2225176095962524, and accuracy is: 0.5.
Validation accuracy is: 0.0.



Epoch: 100%|██████████| 2/2 [00:04<00:00,  2.07s/it]
