In [1]:
import os
import pandas as pd
# torch
from torch.utils.data import DataLoader, RandomSampler
# peft
from peft import LoraConfig, TaskType
# native
from NlpAnalytics import *

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
### global vars
USE_LORA = True
ROOT_PATH = 'NlpAnalytics/data/dummy_data'

In [3]:
### load data
df_train = pd.read_csv(os.path.join(f'{ROOT_PATH}/amazon_train.csv')).drop(['id', 'label_text'], axis=1)
df_valid = pd.read_csv(os.path.join(f'{ROOT_PATH}/amazon_valid.csv')).drop(['id', 'label_text'], axis=1)
df_test = pd.read_csv(os.path.join(f'{ROOT_PATH}/amazon_test.csv')).drop(['id', 'label_text'], axis=1)

In [4]:
### assemble data modeler
tokenizer = BertLoader(load_tokenizer=True).tokenizer
train_dataset = DatasetNLP(input_df=df_train, tokenizer=tokenizer, cols_to_tokenize=['text'], cols_label=['label'])
valid_dataset = DatasetNLP(input_df=df_valid, tokenizer=tokenizer, cols_to_tokenize=['text'], cols_label=['label'])
test_dataset = DatasetNLP(input_df=df_test, tokenizer=tokenizer, cols_to_tokenize=['text'], cols_label=['label'])
datamodeler = {
    DataLoaderType.TRAINING: DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=32), 
    DataLoaderType.VALIDATION: DataLoader(valid_dataset, sampler=RandomSampler(valid_dataset), batch_size=32),
    DataLoaderType.TESTING: DataLoader(test_dataset, batch_size=32)}



In [5]:
### load model
num_labels = len(df_train['label'].unique())
loader = BertClassifierLoader(ClassifierType.BERT_CLASSIFIER, 'bert-base-uncased', num_labels=num_labels)

In [6]:
### loss func & optimizer
loss_func = get_loss_functions(LossFuncType.CROSS_ENTROPY)
if not USE_LORA:
    optimizer = AdamNLP.newNLPAdam(loader.model, {'embeddings' : True, 'encoder' : 9}, lr=2e-4)
else:
    lora_config = LoraConfig(task_type=TaskType.SEQ_CLS, target_modules=['query', 'key', 'value'], r=1, lora_alpha=1., lora_dropout=0.1)
    optimizer = AdamNLP.newNLPAdam_LORA(loader.model, lora_config)
model = optimizer.get_model_transformed()



In [7]:
### start training
trainer = Trainer(model, datamodeler, loss_func, optimizer)
trainer.train(5, schedule_type=SchedulerType.CONSTANT, save_model_freq=1)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

At step 100, the training (sup)loss is 3.6342701649665834.
At step 200, the training (sup)loss is 3.160478878617287.
At step 300, the training (sup)loss is 2.7086252494653067.
For epoch 1, the mean sup loss is: 2.5007002628511854, and accuracy is: 0.35591453313827515.


Epoch:  20%|██        | 1/5 [01:05<04:20, 65.24s/it]

Validation accuracy is: 0.7068371772766113.

At step 100, the training (sup)loss is 1.1758556139469147.
At step 200, the training (sup)loss is 1.0946638363599777.
At step 300, the training (sup)loss is 1.0428383094072342.
For epoch 2, the mean sup loss is: 1.0173466237054931, and accuracy is: 0.7344971299171448.


Epoch:  40%|████      | 2/5 [02:10<03:15, 65.26s/it]

Validation accuracy is: 0.8140679001808167.

At step 100, the training (sup)loss is 0.779270369708538.
At step 200, the training (sup)loss is 0.7670558834075928.
At step 300, the training (sup)loss is 0.7602354889114697.
For epoch 3, the mean sup loss is: 0.753211014635033, and accuracy is: 0.7978113889694214.


Epoch:  60%|██████    | 3/5 [03:15<02:10, 65.11s/it]

Validation accuracy is: 0.829316258430481.

At step 100, the training (sup)loss is 0.6606304344534873.
At step 200, the training (sup)loss is 0.6334922241419554.
At step 300, the training (sup)loss is 0.6410610896845659.
For epoch 4, the mean sup loss is: 0.6377354768001371, and accuracy is: 0.8277748823165894.


Epoch:  80%|████████  | 4/5 [04:20<01:04, 64.97s/it]

Validation accuracy is: 0.84456467628479.

At step 100, the training (sup)loss is 0.548561694920063.
At step 200, the training (sup)loss is 0.5736738024652004.
At step 300, the training (sup)loss is 0.5663673825562.
For epoch 5, the mean sup loss is: 0.5670469277434879, and accuracy is: 0.84392911195755.


Epoch: 100%|██████████| 5/5 [05:25<00:00, 65.10s/it]

Validation accuracy is: 0.84800785779953.




