In [6]:
import pandas as pd
from pathlib import Path
from torch import cuda
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from bert_classifier.data import create_label_dict, label2id, CustomDataset
from bert_classifier.bert import BertClass
from bert_classifier.train import custom_trainer, validate, optimizer
from bert_classifier.metrics import classification_metrics
from bert_classifier.io import (
    get_pretrained_tokenizer,
    get_pretrained_model,
    save_model,
    save_label_dict
)

In [7]:
# Train on the GPU when torch reports one, otherwise fall back to the CPU.
if cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'

In [8]:
# --- Input data -------------------------------------------------------------
TRAINING_DATA = Path('../data/0_external/data.csv')
LABEL_COL = 'label'   # column holding the class label
DATA_COL = 'data'     # column handed to CustomDataset as the model input

# --- Training hyper-parameters ----------------------------------------------
MAX_LEN = 512          # passed to CustomDataset; presumably the max token length -- TODO confirm
TRAIN_BATCH_SIZE = 8
TEST_BATCH_SIZE = 4
EPOCHS = 5
LEARNING_RATE = 1e-5

# --- Model artefact locations -----------------------------------------------
MODEL_NAME = 'bert-base-uncased'
MODEL_DIR = Path('../models/bert')
CHECKPOINT_DIR = MODEL_DIR / 'checkpoint'

PRETRAINED_TOKENIZER = MODEL_DIR / 'pretrained/tokenizer-uncased'
PRETRAINED_MODEL = MODEL_DIR / 'pretrained/bert-base-uncased'

FINETUNED_DIR = MODEL_DIR / 'fine-tuned'
FINETUNED_MODEL = FINETUNED_DIR / 'fine-tuned-uncased'
FINETUNED_MODEL_STATE = FINETUNED_DIR / 'model-state-dict'
FINETUNED_OPT_STATE = FINETUNED_DIR / 'opt-state-dict'

# --- Pretrained artefacts ---------------------------------------------------
# Both helpers receive the hub model name and a local path; presumably they
# download on first use and cache under that path -- TODO confirm in
# bert_classifier.io.
pretrained_tokenizer = get_pretrained_tokenizer(MODEL_NAME, PRETRAINED_TOKENIZER)
pretrained_model = get_pretrained_model(MODEL_NAME, PRETRAINED_MODEL)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [9]:
# Read the raw training table, derive the label mapping from it, then produce
# the frame used downstream with labels converted through that mapping.
raw_data = pd.read_csv(TRAINING_DATA)
label_dict = create_label_dict(raw_data, LABEL_COL)
data = label2id(raw_data, LABEL_COL, label_dict)

In [10]:
# Stratified hold-out split: draw `test_size` of every label group for testing
# and keep the remaining rows for training.
test_size = 0.2
split_seed = 42  # fixed seed so Restart & Run All reproduces the same split

# GroupBy.sample draws per group and returns the sampled rows with all of
# their original columns and index labels. It replaces
# groupby(...).apply(pd.DataFrame.sample, ...), which was unseeded and relies
# on apply() operating on the grouping column (deprecated in pandas 2.2).
df_test = data.groupby(LABEL_COL).sample(frac=test_size, random_state=split_seed)

# Everything that was not sampled into the test set is training data.
df_train = data.loc[~data.index.isin(df_test.index)]

# The two parts must partition the input; this fails if index labels are not
# unique, because the mask above would then remove too many rows.
assert len(df_train) + len(df_test) == len(data), "train/test split does not partition the data"

# Fresh 0..n-1 indices for positional access by the datasets; reassign rather
# than mutate in place so the frames are not modified as slices of `data`.
df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [11]:
# Wrap both splits in the project dataset, sharing the tokenizer and length limit.
training_set = CustomDataset(df_train, DATA_COL, LABEL_COL, pretrained_tokenizer, MAX_LEN)
testing_set = CustomDataset(df_test, DATA_COL, LABEL_COL, pretrained_tokenizer, MAX_LEN)

# Training batches are drawn in random order; evaluation walks the set in order.
train_sampler = RandomSampler(training_set)
test_sampler = SequentialSampler(testing_set)

train_dataloader = DataLoader(training_set, sampler=train_sampler, batch_size=TRAIN_BATCH_SIZE)
test_dataloader = DataLoader(testing_set, sampler=test_sampler, batch_size=TEST_BATCH_SIZE)

In [12]:
# Build the classifier on top of the pretrained encoder and fine-tune it.
#
# The optimizer factory is resolved through its module instead of the bare
# imported name `optimizer`: this cell rebinds `optimizer` to the factory's
# return value (later cells use it under that name), so calling the bare name
# would invoke the optimizer instance rather than the factory whenever the
# cell is executed a second time in the same kernel.
import bert_classifier.train as bert_train

# NOTE(review): `pretrained_model` is reused as-is; if BertClass keeps a
# reference to it and training updates it in place, re-running this cell
# would start from already fine-tuned weights -- confirm in BertClass.
model = BertClass(len(label_dict), pretrained_model)
optimizer = bert_train.optimizer(model, LEARNING_RATE)
custom_trainer(model, optimizer, train_dataloader, test_dataloader, EPOCHS, DEVICE)

In [8]:
# Run the validation pass over the held-out loader, then summarise the
# predictions; the metrics call is left last so it is the cell's displayed value.
# NOTE(review): the saved output of this cell reports n = 2 (the confusion
# matrix sums to two samples), so the perfect scores carry little evidence.
validation_result = validate(model, test_dataloader, DEVICE)
outputs, targets, loss = validation_result
classification_metrics(outputs, targets)

{'n': 2,
 'accuracy': 1.0,
 'precision': 1.0,
 'recall': 1.0,
 'f1': 1.0,
 'confusion': array([[1, 0],
        [0, 1]])}

In [9]:
# Persist the fine-tuned model together with its optimizer, then the label
# mapping that was built from the training data.
# NOTE(review): no destination is passed to either helper, so the output
# locations are whatever save_model / save_label_dict default to; the
# FINETUNED_* paths defined in the config cell are not referenced by these
# calls -- confirm that is intended.
save_model(model, optimizer)
save_label_dict(label_dict)