In [1]:
import pandas as pd
import numpy as np
import torch
from torch import Generator
from peft import LoraConfig, TaskType
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, ConcatDataset, Subset, random_split, RandomSampler
# transformer
from transformers.optimization import AdamW, get_scheduler, SchedulerType
# native
from NlpAnalytics import *



  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package stopwords to /Users/lunli/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Read the three Amazon CSV splits (train / valid / test) into DataFrames.
# Paths are relative to the notebook's working directory.
df_train, df_valid, df_test = map(
    pd.read_csv,
    (
        './dataset/amazon_train.csv',
        './dataset/amazon_valid.csv',
        './dataset/amazon_test.csv',
    ),
)

In [3]:
# Remove the id and label_text columns.
# errors='ignore' keeps this cell idempotent: because it rebinds the same
# names, a second run would otherwise raise KeyError on the already-dropped
# columns.
_UNUSED_COLS = ['id', 'label_text']
df_train = df_train.drop(columns=_UNUSED_COLS, errors='ignore')
df_valid = df_valid.drop(columns=_UNUSED_COLS, errors='ignore')
df_test = df_test.drop(columns=_UNUSED_COLS, errors='ignore')

# Sanity check: the columns the rest of the notebook tokenizes / uses as the
# target must still be present in every split.
for _split_name, _split_df in (('train', df_train), ('valid', df_valid), ('test', df_test)):
    _missing = {'text', 'label'} - set(_split_df.columns)
    assert not _missing, f"{_split_name} split is missing columns: {sorted(_missing)}"

In [None]:
# Preview only the first rows instead of rendering the whole training frame.
df_train.head()

In [4]:
 ### Load tokenizer
tokenizer = BertLoader(load_tokenizer=True).tokenizer

# Wrap each split in a DatasetNLP using the same tokenizer and column roles:
# 'text' is the column to tokenize and 'label' is the label column.
# Splits are built in train, valid, test order.
df_train_, df_valid_, df_test_ = (
    DatasetNLP(
        input_df=split_df,
        tokenizer=tokenizer,
        cols_to_tokenize=['text'],
        cols_label=['label'],
    )
    for split_df in (df_train, df_valid, df_test)
)

In [5]:
# To dataloader
BATCH_SIZE = 32
SEED = 42

# Only the training split is shuffled, using its own seeded generator so the
# shuffle order is reproducible across kernel restarts.
generator = Generator().manual_seed(SEED)
train_dataloader = DataLoader(
    df_train_,
    sampler=RandomSampler(df_train_, generator=generator),
    batch_size=BATCH_SIZE,
)

# Validation and test are iterated sequentially: shuffling evaluation data
# gains nothing, and sharing the training generator with these loaders would
# make the training shuffle order depend on how often they are iterated.
valid_dataloader = DataLoader(df_valid_, batch_size=BATCH_SIZE, shuffle=False)
test_dataloader = DataLoader(df_test_, batch_size=BATCH_SIZE, shuffle=False)

In [10]:

#### trainer ####
# Number of output classes = count of distinct values in the training labels
# (dropna=False so a NaN label would be counted, matching len(unique())).
num_labels = df_train['label'].nunique(dropna=False)

# Load the HF BERT classifier on top of the "bert-base-uncased" checkpoint.
# NOTE(review): the positional 0.1 is presumably a dropout rate — confirm
# against the BertClassifierLoader signature.
loader = BertClassifierLoader(
    ClassifierType.BERT_CLASSIFIER,
    "bert-base-uncased",
    num_labels,
    0.1,
    load_tokenizer=True,
)


In [11]:
# Hyperparameters for this run.
NUM_EPOCHS = 5
LEARNING_RATE = 0.0005

# Map each split type to its DataLoader, and pick the loss function.
datamodeler = {
    DataLoaderType.TRAINING: train_dataloader,
    DataLoaderType.VALIDATION: valid_dataloader,
    DataLoaderType.TESTING: test_dataloader,
}
my_loss_func = get_loss_functions(LossFuncType.CROSS_ENTROPY)

##### no lora ####
# NOTE(review): the {'embeddings': True, 'encoder': 9} dict presumably selects
# which parts of BERT are frozen — confirm against AdamNLP.newNLPAdam.
optimizer = AdamNLP.newNLPAdam(
    loader.model,
    {'embeddings': True, 'encoder': 9},
    lr=LEARNING_RATE,
)
model = optimizer.get_model_transformed()

##### lora #####
# Alternative setup (disabled): swap the three lines above for these to train with LoRA.
# lora_config = LoraConfig(task_type=TaskType.SEQ_CLS,target_modules=["query", "key", "value"], r=1, lora_alpha=1, lora_dropout=0.1)
# optimizer = AdamNLP.newNLPAdam_LORA(loader.model, lora_config)
# model = optimizer.get_model_transformed()

# Train with a constant learning-rate schedule, passing save_model_freq=1.
trainer = Trainer(model, datamodeler, my_loss_func, optimizer)
trainer.train(
    NUM_EPOCHS,
    schedule_type=SchedulerType.CONSTANT,
    save_model_freq=1,
)

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

At step 100, the training loss is 2.4227107644081114.
At step 200, the training loss is 1.8437885788083077.
At step 300, the training loss is 1.5516526286800703.
For epoch 1, the mean sup loss is: 1.4600541857381661, and accuracy is: 0.6208094358444214.
Validation accuracy is: 0.7840629816055298.



Epoch:  20%|██        | 1/5 [00:44<02:57, 44.42s/it]

At step 100, the training loss is 0.743461574614048.
At step 200, the training loss is 0.7599235337972641.
At step 300, the training loss is 0.7798667089144389.
For epoch 2, the mean sup loss is: 0.7667223841779762, and accuracy is: 0.793034553527832.
Validation accuracy is: 0.812100350856781.



Epoch:  40%|████      | 2/5 [01:29<02:13, 44.52s/it]

At step 100, the training loss is 0.6850616621971131.
At step 200, the training loss is 0.6757959394901991.
At step 300, the training loss is 0.6902452560762564.
For epoch 3, the mean sup loss is: 0.6890239578568274, and accuracy is: 0.8064095973968506.
Validation accuracy is: 0.8278406262397766.



Epoch:  60%|██████    | 3/5 [02:12<01:28, 44.17s/it]

At step 100, the training loss is 0.5648182702064514.
At step 200, the training loss is 0.5726697836816311.
At step 300, the training loss is 0.5934335879981518.
For epoch 4, the mean sup loss is: 0.59676102163891, and accuracy is: 0.8343755602836609.
Validation accuracy is: 0.8057058453559875.



Epoch:  80%|████████  | 4/5 [02:56<00:43, 43.90s/it]

At step 100, the training loss is 0.5744256457686424.
At step 200, the training loss is 0.5780070266872644.
At step 300, the training loss is 0.5575636709233125.
For epoch 5, the mean sup loss is: 0.5617036698179112, and accuracy is: 0.8449713587760925.
Validation accuracy is: 0.8273487687110901.



Epoch: 100%|██████████| 5/5 [03:39<00:00, 43.94s/it]
