In [1]:
from pathlib import Path

import pandas as pd
from torch import cuda
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

from bert_classifier.bert import BertClass, bert_encoder
from bert_classifier.data import create_label_dict, label2id, CustomDataset
from bert_classifier.io import (
    get_pretrained_tokenizer,
    get_pretrained_model,
    save_model,
    save_label_dict
)
from bert_classifier.metrics import classification_metrics
from bert_classifier.train import custom_trainer, validate, optimizer
# Second handle on the optimizer factory: the notebook later rebinds the name
# `optimizer` to an optimizer instance, so the factory stays reachable here.
from bert_classifier.train import optimizer as build_optimizer

In [2]:
# Use the GPU when CUDA reports it is available; otherwise fall back to the CPU.
if cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'

In [3]:
# --- Input data ---------------------------------------------------------------
TRAINING_DATA = Path('../data/0_external/data.csv')
DATA_COL = 'data'
LABEL_COL = 'label'

# --- Hyperparameters ----------------------------------------------------------
MAX_LEN = 512
TRAIN_BATCH_SIZE = 8
TEST_BATCH_SIZE = 4
EPOCHS = 5
LEARNING_RATE = 1e-5

# --- Model locations (all relative to MODEL_DIR) ------------------------------
MODEL_NAME = 'bert-base-uncased'
MODEL_DIR = Path('../models/bert')
PRETRAINED_TOKENIZER = MODEL_DIR / 'pretrained' / 'tokenizer-uncased'
PRETRAINED_MODEL = MODEL_DIR / 'pretrained' / 'bert-base-uncased'
FINETUNED_DIR = MODEL_DIR / 'fine-tuned'
FINETUNED_MODEL = FINETUNED_DIR / 'fine-tuned-uncased'
FINETUNED_MODEL_STATE = FINETUNED_DIR / 'model-state-dict'
FINETUNED_OPT_STATE = FINETUNED_DIR / 'opt-state-dict'
CHECKPOINT_DIR = MODEL_DIR / 'checkpoint'

# --- Pretrained artifacts -----------------------------------------------------
# Both helpers receive the model name and a local path under MODEL_DIR.
# NOTE(review): presumably they download on first use and cache at that path — confirm.
pretrained_tokenizer = get_pretrained_tokenizer(MODEL_NAME, PRETRAINED_TOKENIZER)
pretrained_model = get_pretrained_model(MODEL_NAME, PRETRAINED_MODEL)

In [4]:
# Read the labelled examples from disk.
csv_frame = pd.read_csv(TRAINING_DATA)
# Derive the label dictionary from the label column, then apply it to that column.
# NOTE(review): presumably label2id maps label values to integer ids — confirm.
label_dict = create_label_dict(csv_frame, LABEL_COL)
data = label2id(csv_frame, LABEL_COL, label_dict)

In [5]:
# Stratified hold-out split: sample the same fraction of rows from every label.
test_size = 0.2
SPLIT_SEED = 42  # fixed seed so Restart & Run All reproduces the same split

# GroupBy.sample draws `frac` of each label group and returns the sampled rows
# with all columns (label column included) and their original index. This
# replaces groupby(...).apply(pd.DataFrame.sample, ...), which relies on apply
# passing the grouping column through (deprecated in recent pandas).
held_out = data.groupby(LABEL_COL).sample(frac=test_size, random_state=SPLIT_SEED)

# Rows that were not sampled form the training set. reset_index returns new
# frames, so nothing is mutated in place on a slice of `data`.
df_train = data[~data.index.isin(held_out.index)].reset_index(drop=True)
df_test = held_out.reset_index(drop=True)

In [6]:
# Both splits are wrapped with the same text column, label column, tokenizer
# and maximum length; only the source frame differs.
dataset_args = (DATA_COL, LABEL_COL, pretrained_tokenizer, MAX_LEN)
training_set = CustomDataset(df_train, *dataset_args)
testing_set = CustomDataset(df_test, *dataset_args)

# Training draws examples in random order; evaluation walks the set in order.
train_sampler = RandomSampler(training_set)
test_sampler = SequentialSampler(testing_set)

train_dataloader = DataLoader(training_set, sampler=train_sampler, batch_size=TRAIN_BATCH_SIZE)
test_dataloader = DataLoader(testing_set, sampler=test_sampler, batch_size=TEST_BATCH_SIZE)

In [7]:
# Instantiate the classifier from the number of labels and the pretrained model.
model = BertClass(len(label_dict), pretrained_model)
# Call the factory through its `build_optimizer` alias. This cell binds the name
# `optimizer` to the optimizer *instance* (the save cell passes it to
# save_model), so `optimizer(model, ...)` would raise TypeError on a second run
# of this cell because the name no longer refers to the factory.
optimizer = build_optimizer(model, LEARNING_RATE)
# Train for EPOCHS epochs on DEVICE, passing the test loader alongside the
# training loader (the captured output shows an evaluation pass after each epoch).
custom_trainer(model, optimizer, train_dataloader, test_dataloader, EPOCHS, DEVICE)


Total steps: 1 || Training in progress...


1it [01:04, 64.34s/it]



            Training time: 64.38544797897339 seconds ||
            Training loss: 0.7413671016693115 ||
            Training accuracy: 0.5
            
Evaluation in progress...

            Validation time: 68.7049651145935 seconds
            Validation loss: 0.5861829519271851 ||
            Validation accuracy: 1.0
            

Total steps: 1 || Training in progress...


1it [00:56, 56.14s/it]



            Training time: 56.15104913711548 seconds ||
            Training loss: 0.6296225190162659 ||
            Training accuracy: 0.75
            
Evaluation in progress...

            Validation time: 59.771759271621704 seconds
            Validation loss: 0.5529849529266357 ||
            Validation accuracy: 1.0
            

Total steps: 1 || Training in progress...


1it [01:09, 69.58s/it]



            Training time: 69.59554600715637 seconds ||
            Training loss: 0.5394025444984436 ||
            Training accuracy: 0.875
            
Evaluation in progress...

            Validation time: 75.20333790779114 seconds
            Validation loss: 0.5303083658218384 ||
            Validation accuracy: 1.0
            

Total steps: 1 || Training in progress...


1it [01:07, 67.89s/it]



            Training time: 67.90025019645691 seconds ||
            Training loss: 0.4971851408481598 ||
            Training accuracy: 1.0
            
Evaluation in progress...

            Validation time: 72.36053514480591 seconds
            Validation loss: 0.5115190744400024 ||
            Validation accuracy: 1.0
            

Total steps: 1 || Training in progress...


1it [01:02, 62.32s/it]



            Training time: 62.326988697052 seconds ||
            Training loss: 0.5546638369560242 ||
            Training accuracy: 0.75
            
Evaluation in progress...

            Validation time: 66.95899486541748 seconds
            Validation loss: 0.49639031291007996 ||
            Validation accuracy: 1.0
            


In [8]:
# Run one evaluation pass over the test loader, then summarise the predictions.
validation_result = validate(model, test_dataloader, DEVICE)
outputs, targets, loss = validation_result
# Last expression of the cell, so the metrics are rendered as the cell output.
classification_metrics(outputs, targets)

{'n': 2,
 'accuracy': 1.0,
 'precision': 1.0,
 'recall': 1.0,
 'f1': 1.0,
 'confusion': array([[1, 0],
        [0, 1]])}

In [9]:
# Persist the model together with its optimizer. No destination is passed here,
# so the output location is decided inside save_model.
# NOTE(review): `optimizer` must be the instance created in the training cell,
# not the imported factory function — this cell only works after training ran.
save_model(model, optimizer)
# Persist the label dictionary built from the label column of the loaded CSV.
# NOTE(review): presumably needed to decode predicted ids at inference time — confirm.
save_label_dict(label_dict)