In [1]:
import os
import pandas as pd
# torch
from torch import Generator
from peft import LoraConfig, TaskType
from torch.utils.data import DataLoader, RandomSampler
# transformer
from transformers.optimization import SchedulerType
# native
from NlpAnalytics import *

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
### global var
USE_LORA = False
GENERATOR = Generator().manual_seed(42)
PATH = 'NlpAnalytics/data/dummy_data'

In [3]:
###  Data Loader
# get tokenizer
tokenizer = BertLoader(load_tokenizer=True).tokenizer
# create DatasetNLP
train_sup_ds = DatasetNLP(input_df=pd.read_csv(os.path.join(PATH, "sup_train.csv")), tokenizer=tokenizer, cols_label=['label'])
train_unsup_ds = DatasetNLP(input_df=pd.read_csv(os.path.join(PATH, "unsup_train.csv"))[:16], tokenizer=tokenizer, cols_to_tokenize=['orig_text', 'aug_text'])
test_ds = DatasetNLP(input_df=pd.read_csv(os.path.join(PATH, "sup_test.csv"))[:16], tokenizer=tokenizer, cols_to_tokenize=['text'], cols_label=['label'])
# assemble data loader
datamodeler = {
    DataLoaderType.TRAINING: DataLoader(train_sup_ds, sampler=RandomSampler(train_sup_ds, generator=GENERATOR), batch_size=8),
    DataLoaderType.TRAINING_UNLABELED: DataLoader(train_unsup_ds, sampler=RandomSampler(train_unsup_ds, generator=GENERATOR), batch_size=8),
    DataLoaderType.VALIDATION: DataLoader(test_ds, sampler=RandomSampler(test_ds, generator=GENERATOR), batch_size=8)}



In [4]:
### model loader
loader = BertClassifierLoader(ClassifierType.BERT_CLASSIFIER_HF, "bert-base-uncased", 2, 0.1, load_tokenizer=True)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
### loss functions
loss_dict = {
    'sup': get_loss_functions(LossFuncType.CROSS_ENTROPY, reduce='none'), 
    'unsup': get_loss_functions(LossFuncType.KL_DIV, reduce='none')}

In [6]:
### set up optimizer
if not USE_LORA:
    optimizer = AdamNLP.newNLPAdam(loader.model, {'embeddings':True, 'encoder': 9}, lr=2e-4)
    model = optimizer.get_model_transformed()
else:
    lora_config = LoraConfig(task_type=TaskType.SEQ_CLS, target_modules=["query", "key", "value"], r=1, lora_alpha=1, lora_dropout=0.1)
    optimizer = AdamNLP.newNLPAdam_LORA(loader.model, lora_config)
    model = optimizer.get_model_transformed()



In [7]:
### Training
trainer = TrainerUDA(model, datamodeler, loss_dict, optimizer)
trainer.train(2, schedule_type=SchedulerType.CONSTANT, save_model_freq=1)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

For epoch 1, the mean sup loss is: 0.7799562811851501, and accuracy is: 0.625.
Validation accuracy is: 0.0.



Epoch:  50%|█████     | 1/2 [00:01<00:01,  1.47s/it]

For epoch 2, the mean sup loss is: 0.8996076583862305, and accuracy is: 0.5.
Validation accuracy is: 0.0.



Epoch: 100%|██████████| 2/2 [00:02<00:00,  1.32s/it]
