In [1]:
import torch
import pandas as pd
from pathlib import Path
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from bert_classifier.data import create_label_dict, label2id, CustomDataset
from bert_classifier.bert import BertClass, bert_encoder
from bert_classifier.train import custom_trainer, validate
from bert_classifier.metrics import classification_metrics
from bert_classifier.io import (
    get_pretrained_tokenizer,
    get_pretrained_model,
    save_model,
    save_label_dict
)

In [2]:
# Pick the compute device name: 'cuda' when torch reports a usable GPU,
# otherwise fall back to 'cpu'.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'

In [3]:
# --- Input data --------------------------------------------------------------
TRAINING_DATA = Path('../data/0_external/data.csv')
DATA_COL = 'data'     # column handed to CustomDataset as the input field
LABEL_COL = 'label'   # column used for labels, grouping and the label dict

# --- Training hyperparameters ------------------------------------------------
MAX_LEN = 512         # forwarded to CustomDataset (presumably a token limit; confirm)
TRAIN_BATCH_SIZE, TEST_BATCH_SIZE = 8, 4
EPOCHS = 5
LEARNING_RATE = 1e-5

# --- Model locations ---------------------------------------------------------
MODEL_NAME = 'bert-base-uncased'
MODEL_DIR = Path('../models/bert')
CHECKPOINT_DIR = MODEL_DIR / 'checkpoint'

# Local copies of the pretrained tokenizer / weights.
PRETRAINED_TOKENIZER = MODEL_DIR / 'pretrained/tokenizer-uncased'
PRETRAINED_MODEL = MODEL_DIR / 'pretrained/bert-base-uncased'

# Destinations for fine-tuned artefacts.
FINETUNED_DIR = MODEL_DIR / 'fine-tuned'
FINETUNED_MODEL = FINETUNED_DIR / 'fine-tuned-uncased'
FINETUNED_MODEL_STATE = FINETUNED_DIR / 'model-state-dict'
FINETUNED_OPT_STATE = FINETUNED_DIR / 'opt-state-dict'

# --- Pretrained assets -------------------------------------------------------
# Both helpers receive the model name and the matching local path defined above.
pretrained_tokenizer = get_pretrained_tokenizer(MODEL_NAME, PRETRAINED_TOKENIZER)
pretrained_model = get_pretrained_model(MODEL_NAME, PRETRAINED_MODEL)

In [4]:
# Read the raw CSV, build the label dictionary from its label column, then
# pass the frame through label2id (presumably maps labels to ids; confirm)
# to obtain the `data` frame used by the following cells.
raw_frame = pd.read_csv(TRAINING_DATA)
label_dict = create_label_dict(raw_frame, LABEL_COL)
data = label2id(raw_frame, LABEL_COL, label_dict)

In [5]:
# Stratified hold-out split: draw `test_size` of the rows from every label
# group for testing; everything else is used for training.
test_size = 0.2
SPLIT_SEED = 42  # fixed so Restart & Run All reproduces the same partition

# GroupBy.sample keeps every column (including LABEL_COL) and the original
# index, unlike groupby(...).apply(DataFrame.sample), whose handling of the
# grouping column is deprecated in recent pandas releases.
df_test = data.groupby(LABEL_COL).sample(frac=test_size, random_state=SPLIT_SEED)

# Training rows are those whose index label was not drawn for the test set.
df_train = data.drop(index=df_test.index)

# Sanity check: every row must land in exactly one partition.
assert len(df_train) + len(df_test) == len(data), 'train/test split lost or duplicated rows'

# Re-number both partitions from 0 without mutating frames in place.
df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [6]:
# Wrap each partition in the project's CustomDataset, using the same columns,
# tokenizer and MAX_LEN for both.
training_set = CustomDataset(df_train, DATA_COL, LABEL_COL, pretrained_tokenizer, MAX_LEN)
testing_set = CustomDataset(df_test, DATA_COL, LABEL_COL, pretrained_tokenizer, MAX_LEN)

# Training batches are drawn in random order; evaluation walks the dataset
# in its stored order.
train_sampler = RandomSampler(training_set)
test_sampler = SequentialSampler(testing_set)

train_dataloader = DataLoader(training_set, batch_size=TRAIN_BATCH_SIZE, sampler=train_sampler)
test_dataloader = DataLoader(testing_set, batch_size=TEST_BATCH_SIZE, sampler=test_sampler)

In [7]:
# Seed torch's global RNG before anything stochastic happens in this cell, so
# the training run below is repeatable. This covers the shuffle order of a
# RandomSampler created without its own generator, plus any randomly
# initialised layers or dropout inside BertClass (assumed; confirm).
# NOTE: bit-exact results across different hardware or CUDA kernels are still
# not guaranteed by seeding alone.
TRAIN_SEED = 42
torch.manual_seed(TRAIN_SEED)

# Classifier sized to the number of distinct labels, built around the
# pretrained model loaded earlier.
model = BertClass(len(label_dict), pretrained_model)

# AdamW over every parameter of the wrapped model.
optimizer = torch.optim.AdamW(params=model.parameters(), lr=LEARNING_RATE)

# Trains for EPOCHS epochs; the trainer prints per-epoch training and
# validation statistics (see the cell output).
custom_trainer(model, optimizer, train_dataloader, test_dataloader, EPOCHS, DEVICE)


Total steps: 1 || Training in progress...


1it [00:46, 46.06s/it]



            Training time: 46.09762477874756 seconds ||
            Training loss: 0.7763466835021973 ||
            Training accuracy: 0.375
            
Evaluation in progress...

            Validation time: 49.28972792625427 seconds
            Validation loss: 0.7055730223655701 ||
            Validation accuracy: 0.5
            

Total steps: 1 || Training in progress...


1it [00:41, 41.82s/it]



            Training time: 41.818588733673096 seconds ||
            Training loss: 0.7088087797164917 ||
            Training accuracy: 0.5
            
Evaluation in progress...

            Validation time: 44.949063777923584 seconds
            Validation loss: 0.6958093643188477 ||
            Validation accuracy: 0.5
            

Total steps: 1 || Training in progress...


1it [00:41, 41.15s/it]



            Training time: 41.15194892883301 seconds ||
            Training loss: 0.6472956538200378 ||
            Training accuracy: 0.375
            
Evaluation in progress...

            Validation time: 44.188668966293335 seconds
            Validation loss: 0.6875118017196655 ||
            Validation accuracy: 0.5
            

Total steps: 1 || Training in progress...


1it [00:41, 41.19s/it]



            Training time: 41.192286014556885 seconds ||
            Training loss: 0.5874937772750854 ||
            Training accuracy: 0.75
            
Evaluation in progress...

            Validation time: 44.41243290901184 seconds
            Validation loss: 0.677262544631958 ||
            Validation accuracy: 0.5
            

Total steps: 1 || Training in progress...


1it [00:41, 41.65s/it]



            Training time: 41.6529221534729 seconds ||
            Training loss: 0.5252465605735779 ||
            Training accuracy: 1.0
            
Evaluation in progress...

            Validation time: 44.729873180389404 seconds
            Validation loss: 0.6648757457733154 ||
            Validation accuracy: 1.0
            


In [8]:
# Run one evaluation pass over the held-out loader and unpack the three
# values validate returns.
eval_result = validate(model, test_dataloader, DEVICE)
outputs, targets, loss = eval_result

# Kept as the cell's last expression so the notebook displays the metrics.
classification_metrics(outputs, targets)

{'n': 2,
 'accuracy': 1.0,
 'precision': 1.0,
 'recall': 1.0,
 'f1': 1.0,
 'confusion': array([[1, 0],
        [0, 1]])}

In [9]:
# Persist the trained model together with its optimizer, then the label
# dictionary created when the data was loaded.
# NOTE(review): no destination is passed to either helper, so the save
# location is decided inside bert_classifier.io — presumably related to the
# FINETUNED_* paths defined in the config cell; confirm.
save_model(model, optimizer)
save_label_dict(label_dict)