In [1]:
# Enable IPython's autoreload extension in mode 2, so that imported modules
# (here: the project-local nlp_models package) are reloaded before each cell
# executes and edits to them are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
from pathlib import Path
from torch import cuda
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from nlp_models.base.data import create_label_dict, label_to_id_int, CustomDataset
from nlp_models.base.metrics import classification_metrics
from nlp_models.base.io import (
    get_pretrained_tokenizer,
    get_pretrained_model,
    save_model,
    save_label_dict
)
from nlp_models.bert_classifier.bert import BertClass
from nlp_models.bert_classifier.train import custom_trainer, validate, optimizer_obj

In [3]:
# Select the compute device once: use the GPU when CUDA is available,
# otherwise fall back to the CPU.
if cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'

In [4]:
# Notebook configuration: every tunable value and path lives in this cell.
# CSV file that is read into the training DataFrame in the next cell.
TRAINING_DATA = Path('../data/0_external/data.csv')
# Root directory under which all model-related paths below are built.
MODEL_DIR = Path('../models/bert')
# Column names handed to create_label_dict / CustomDataset.
LABEL_COL = 'label'
DATA_COL = 'data'
# Passed to CustomDataset together with the tokenizer -- presumably the maximum
# token sequence length per example; confirm against CustomDataset.
MAX_LEN = 512
# Batch sizes used for the training and test DataLoaders.
TRAIN_BATCH_SIZE = 8
TEST_BATCH_SIZE = 4
# Training settings passed to custom_trainer (EPOCHS) and optimizer_obj (LEARNING_RATE).
EPOCHS = 5
LEARNING_RATE = 1e-05
# Pretrained model identifier and the paths given to the get_pretrained_* helpers.
MODEL_NAME = 'bert-base-uncased'
PRETRAINED_TOKENIZER = MODEL_DIR / 'pretrained/tokenizer-uncased'
PRETRAINED_MODEL = MODEL_DIR / 'pretrained/bert-base-uncased'
# NOTE(review): the fine-tuned / checkpoint paths below are not referenced by
# any of the cells visible in this notebook -- confirm whether save_model and
# custom_trainer rely on their own defaults or whether these should be passed.
FINETUNED_DIR = MODEL_DIR / 'fine-tuned'
FINETUNED_MODEL = FINETUNED_DIR / 'fine-tuned-uncased'
FINETUNED_MODEL_STATE = FINETUNED_DIR / 'model-state-dict'
FINETUNED_OPT_STATE = FINETUNED_DIR / 'opt-state-dict'
CHECKPOINT_DIR = MODEL_DIR / 'checkpoint'
# Obtain the tokenizer and base model for MODEL_NAME. The second argument is
# presumably a local cache/save location -- confirm in nlp_models.base.io.
pretrained_tokenizer = get_pretrained_tokenizer(MODEL_NAME, PRETRAINED_TOKENIZER)
pretrained_model = get_pretrained_model(MODEL_NAME, PRETRAINED_MODEL)

In [5]:
# Read the raw training table and derive the label dictionary from its label column.
data = pd.read_csv(TRAINING_DATA)
label_dict = create_label_dict(data, LABEL_COL)

# Replace every raw label by the value label_to_id_int returns for it;
# label_dict is forwarded as the second positional argument of each call.
encoded_labels = data[LABEL_COL].apply(label_to_id_int, args=(label_dict,))
data[LABEL_COL] = encoded_labels

In [6]:
# Stratified hold-out split: draw the same fraction of rows from every label so
# that both splits keep the label proportions of the full data set.
test_size = 0.2
split_seed = 42  # fixed seed so that Restart & Run All reproduces the same split

# GroupBy.sample draws per label group and keeps all columns (including the
# label column) as well as the original row index. The preserved index is what
# allows the training split to be defined as "every row not drawn for testing".
held_out = data.groupby(LABEL_COL).sample(frac=test_size, random_state=split_seed)

# Build both splits as new frames with a fresh 0..n-1 index instead of
# resetting the index in place on a frame derived from `data`.
df_train = data.loc[~data.index.isin(held_out.index)].reset_index(drop=True)
df_test = held_out.reset_index(drop=True)

In [18]:
def _build_dataset(frame):
    """Wrap a DataFrame split in a CustomDataset using the notebook-wide settings."""
    return CustomDataset(frame, DATA_COL, LABEL_COL, pretrained_tokenizer, MAX_LEN, int_labels=True)


training_set = _build_dataset(df_train)
testing_set = _build_dataset(df_test)

# Training batches are drawn in random order; test batches keep the data set order.
train_sampler = RandomSampler(training_set)
test_sampler = SequentialSampler(testing_set)

train_dataloader = DataLoader(training_set, sampler=train_sampler, batch_size=TRAIN_BATCH_SIZE)
test_dataloader = DataLoader(testing_set, sampler=test_sampler, batch_size=TEST_BATCH_SIZE)

In [21]:
# Build the classifier from the pretrained model. len(label_dict) is presumably
# the number of output classes -- confirm against BertClass.
model = BertClass(len(label_dict), pretrained_model)
# Optimizer for the model, created with the configured learning rate.
optimizer = optimizer_obj(model, LEARNING_RATE)
# Train on DEVICE; the recorded output shows one training and one evaluation
# report (time, loss, accuracy) per epoch for EPOCHS epochs.
# NOTE(review): test_dataloader is passed here and is also used for the final
# metrics in the next cell. If custom_trainer uses it for the per-epoch
# validation, the final metrics are not computed on untouched data -- confirm.
custom_trainer(model, optimizer, train_dataloader, test_dataloader, EPOCHS, DEVICE)


Total steps: 1 || Training in progress...


0it [00:00, ?it/s]

1it [00:45, 45.88s/it]



            Training time: 45.886006116867065 seconds ||
            Training loss: 0.6435834765434265 ||
            Training accuracy: 0.75
            
Evaluation in progress...

            Validation time: 48.50241994857788 seconds
            Validation loss: 0.6544889211654663 ||
            Validation accuracy: 0.5
            

Total steps: 1 || Training in progress...


1it [00:40, 40.69s/it]



            Training time: 40.690561056137085 seconds ||
            Training loss: 0.5998444557189941 ||
            Training accuracy: 0.875
            
Evaluation in progress...

            Validation time: 42.543277978897095 seconds
            Validation loss: 0.6307095289230347 ||
            Validation accuracy: 0.5
            

Total steps: 1 || Training in progress...


1it [00:31, 31.09s/it]



            Training time: 31.095382928848267 seconds ||
            Training loss: 0.5841808915138245 ||
            Training accuracy: 0.875
            
Evaluation in progress...

            Validation time: 33.1079638004303 seconds
            Validation loss: 0.603857696056366 ||
            Validation accuracy: 1.0
            

Total steps: 1 || Training in progress...


1it [00:28, 28.89s/it]



            Training time: 28.888659954071045 seconds ||
            Training loss: 0.4942058324813843 ||
            Training accuracy: 1.0
            
Evaluation in progress...

            Validation time: 30.851701021194458 seconds
            Validation loss: 0.5770883560180664 ||
            Validation accuracy: 1.0
            

Total steps: 1 || Training in progress...


1it [00:26, 26.57s/it]



            Training time: 26.574002027511597 seconds ||
            Training loss: 0.41605648398399353 ||
            Training accuracy: 1.0
            
Evaluation in progress...

            Validation time: 28.43067193031311 seconds
            Validation loss: 0.5508862137794495 ||
            Validation accuracy: 1.0
            


In [22]:
# Run the trained model over the test DataLoader. `loss` is returned as well
# but is not used in this cell.
outputs, targets, loss = validate(model, test_dataloader, DEVICE)
# Last expression of the cell, so the metrics it returns are shown as the cell output.
classification_metrics(outputs, targets)

{'n': 2,
 'accuracy': 1.0,
 'precision': 1.0,
 'recall': 1.0,
 'f1': 1.0,
 'confusion': array([[1, 0],
        [0, 1]])}

In [23]:
# Persist the trained model together with its optimizer, and the label
# dictionary that was created from the training data.
# NOTE(review): no target paths are passed to either helper, although the
# configuration cell defines FINETUNED_* paths -- presumably the helpers fall
# back to their own default locations; confirm in nlp_models.base.io.
save_model(model, optimizer)
save_label_dict(label_dict)