In [1]:
import torch
import pandas as pd
from pathlib import Path
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from bert_classifier.data import create_label_dict, label2id, CustomDataset
from bert_classifier.bert import BertClass, bert_encoder
from bert_classifier.train import custom_trainer, validate
from bert_classifier.metrics import classification_metrics
from bert_classifier.io import (
    get_pretrained_tokenizer,
    get_pretrained_model,
    save_model,
    save_label_dict
)

In [2]:
# Select the compute device: CUDA when PyTorch reports a usable GPU, CPU otherwise.
# To force CPU execution instead, set the CUDA_VISIBLE_DEVICES environment
# variable to '' and assign device = 'cpu' directly.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [3]:
# --- Input data: CSV location and the column names used by later cells ---
TRAINING_DATA = Path('../data/0_external/data.csv')
DATA_COL = 'data'
LABEL_COL = 'label'

# --- Dataset / training settings ---
# MAX_LEN is forwarded to CustomDataset (presumably a token-length cap — confirm).
MAX_LEN = 512
TRAIN_BATCH_SIZE = 8
TEST_BATCH_SIZE = 4
EPOCHS = 5
LEARNING_RATE = 1e-5

# --- Pretrained artefacts, all rooted at MODEL_DIR ---
MODEL_NAME = 'bert-base-uncased'
MODEL_DIR = Path('../models/bert')
PRETRAINED_TOKENIZER = MODEL_DIR / 'pretrained/tokenizer-uncased'
PRETRAINED_MODEL = MODEL_DIR / 'pretrained/bert-base-uncased'

# --- Fine-tuned outputs and checkpoints ---
FINETUNED_DIR = MODEL_DIR / 'fine-tuned'
FINETUNED_MODEL = FINETUNED_DIR / 'fine-tuned-uncased'
FINETUNED_MODEL_STATE = FINETUNED_DIR / 'model-state-dict'
FINETUNED_OPT_STATE = FINETUNED_DIR / 'opt-state-dict'
CHECKPOINT_DIR = MODEL_DIR / 'checkpoint'

# Obtain the tokenizer and base model for MODEL_NAME, handing each helper its
# target path (presumably a local cache location — confirm in bert_classifier.io).
pretrained_tokenizer = get_pretrained_tokenizer(MODEL_NAME, PRETRAINED_TOKENIZER)
pretrained_model = get_pretrained_model(MODEL_NAME, PRETRAINED_MODEL)

In [4]:
# Read the training CSV, derive the label dictionary from LABEL_COL, then pass
# the frame through label2id with that dictionary (presumably mapping labels to
# integer ids — confirm in bert_classifier.data). `data` is the encoded result.
raw_frame = pd.read_csv(TRAINING_DATA)
label_dict = create_label_dict(raw_frame, LABEL_COL)
data = label2id(raw_frame, LABEL_COL, label_dict)

In [5]:
# Stratified hold-out split: draw the same fraction of rows from every label
# group for the test set; everything that was not drawn becomes the training set.
test_size = 0.2
SPLIT_SEED = 42  # fixed so a Restart & Run All reproduces the same split

# GroupBy.sample (pandas >= 1.1) samples within each group while keeping every
# column, including LABEL_COL, and the original index. Unlike
# groupby(...).apply(DataFrame.sample), it does not depend on apply() passing
# the grouping column through, which recent pandas versions deprecate.
df_test = data.groupby(LABEL_COL).sample(frac=test_size, random_state=SPLIT_SEED)

# Rows whose index label was not selected for the test set form the training set.
df_train = data.loc[~data.index.isin(df_test.index)]

# Rebind to freshly re-indexed frames instead of mutating in place: df_train is
# derived from a boolean selection of `data`, and in-place edits on such a
# selection can trigger SettingWithCopyWarning.
df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [6]:
# Wrap each split in the project dataset class; both use the same columns,
# tokenizer and MAX_LEN. The training frame is wrapped first, then the test frame.
training_set, testing_set = (
    CustomDataset(frame, DATA_COL, LABEL_COL, pretrained_tokenizer, MAX_LEN)
    for frame in (df_train, df_test)
)

# Training batches are drawn in random order; evaluation batches follow
# the dataset order.
train_dataloader = DataLoader(
    training_set,
    batch_size=TRAIN_BATCH_SIZE,
    sampler=RandomSampler(training_set),
)
test_dataloader = DataLoader(
    testing_set,
    batch_size=TEST_BATCH_SIZE,
    sampler=SequentialSampler(testing_set),
)

In [7]:
# Seed PyTorch's global RNG before anything stochastic happens in this cell.
# RandomSampler (used by train_dataloader) draws its shuffle order from this
# RNG when no generator is supplied, and any random layer initialisation or
# dropout inside BertClass (not visible here) would use it as well.
TRAIN_SEED = 42
torch.manual_seed(TRAIN_SEED)

# Classifier sized to the number of distinct labels, built on the pretrained model.
model = BertClass(len(label_dict), pretrained_model)

# AdamW over all model parameters at the configured learning rate.
optimizer = torch.optim.AdamW(params=model.parameters(), lr=LEARNING_RATE)

# Run training for EPOCHS epochs; the test dataloader is passed alongside the
# training one, and the recorded output shows an evaluation pass after each epoch.
# NOTE(review): `device` is computed earlier in the notebook but is not passed
# here — confirm custom_trainer handles device placement itself.
custom_trainer(model, optimizer, train_dataloader, test_dataloader, EPOCHS)


Total steps: 1 || Training in progress...


1it [01:11, 71.89s/it]



            Training time: 71.91619825363159 seconds ||
            Training loss: 0.6141266822814941 ||
            Training accuracy: 0.75
            
Evaluation in progress...

            Validation time: 75.70688605308533 seconds
            Validation loss: 0.6468579173088074 ||
            Validation accuracy: 0.5
            

Total steps: 1 || Training in progress...


1it [00:50, 50.02s/it]



            Training time: 50.032525062561035 seconds ||
            Training loss: 0.6151097416877747 ||
            Training accuracy: 0.625
            
Evaluation in progress...

            Validation time: 54.74058389663696 seconds
            Validation loss: 0.6368004679679871 ||
            Validation accuracy: 0.5
            

Total steps: 1 || Training in progress...


1it [01:04, 64.95s/it]



            Training time: 64.95808410644531 seconds ||
            Training loss: 0.5412606000900269 ||
            Training accuracy: 1.0
            
Evaluation in progress...

            Validation time: 69.18830418586731 seconds
            Validation loss: 0.6256762146949768 ||
            Validation accuracy: 0.5
            

Total steps: 1 || Training in progress...


1it [00:59, 59.94s/it]



            Training time: 59.95087695121765 seconds ||
            Training loss: 0.4792165160179138 ||
            Training accuracy: 0.875
            
Evaluation in progress...

            Validation time: 64.63899302482605 seconds
            Validation loss: 0.614830493927002 ||
            Validation accuracy: 0.5
            

Total steps: 1 || Training in progress...


1it [01:02, 62.70s/it]



            Training time: 62.70720195770264 seconds ||
            Training loss: 0.4453120827674866 ||
            Training accuracy: 1.0
            
Evaluation in progress...

            Validation time: 66.76218485832214 seconds
            Validation loss: 0.6067008972167969 ||
            Validation accuracy: 0.5
            


In [8]:
# Run validation once more on the held-out loader and unpack its three results;
# only outputs and targets are fed to the metrics helper, whose return value is
# displayed as the cell output.
# NOTE(review): the warning in the recorded output looks like scikit-learn's
# undefined-metric warning (one class never predicted) — confirm.
eval_result = validate(model, test_dataloader)
outputs, targets, loss = eval_result
classification_metrics(outputs, targets)

  _warn_prf(average, modifier, msg_start, len(result))


{'n': 2,
 'accuracy': 0.5,
 'precision': 0.25,
 'recall': 0.5,
 'f1': 0.3333333333333333,
 'confusion': array([[1, 0],
        [1, 0]])}

In [9]:
# Persist the trained model together with its optimizer, then the label dictionary.
# NOTE(review): no destination paths are passed, so the FINETUNED_* constants
# defined in the configuration cell are not used here — presumably the helpers
# fall back to their own default locations; confirm in bert_classifier.io.
save_model(model, optimizer)
save_label_dict(label_dict)

In [10]:
from bert_classifier.io import load_model

In [11]:
# Call load_model() with its default arguments and rebind `model`, replacing the
# in-memory instance trained earlier in the notebook.
# NOTE(review): presumably this reads back what save_model() wrote — confirm the
# default paths of both helpers match.
model = load_model()