In [4]:
from pathlib import Path

import pandas as pd
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

from bert_classifier.bert import BertClass, bert_encoder
from bert_classifier.data import create_label_dict, CustomDataset
from bert_classifier.io import get_pretrained_tokenizer, get_pretrained_model, save_model
from bert_classifier.metrics import classification_metrics
from bert_classifier.train import custom_trainer, validate

In [6]:
# Pick the compute device: use the GPU when PyTorch can see one, else the CPU.
# To force CPU execution, override `device = 'cpu'` here (and, if desired, hide
# the GPUs from PyTorch by setting the CUDA_VISIBLE_DEVICES environment
# variable to '' before CUDA is initialised).
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [3]:
# --- Input data -------------------------------------------------------------
TRAINING_DATA = Path('../data/processed/data.csv')
LABEL_COL = 'Category'

# --- Dataset / DataLoader settings ------------------------------------------
MAX_LEN = 512
TRAIN_BATCH_SIZE = 8
TEST_BATCH_SIZE = 4

# --- Optimisation settings --------------------------------------------------
EPOCHS = 5
LEARNING_RATE = 1e-5

# --- Model name and artefact locations (everything lives under MODEL_DIR) ---
MODEL_NAME = 'bert-base-uncased'
MODEL_DIR = Path('../models/bert')
PRETRAINED_TOKENIZER = MODEL_DIR / 'pretrained/tokenizer-uncased'
PRETRAINED_MODEL = MODEL_DIR / 'pretrained/bert-base-uncased'
FINETUNED_MODEL = MODEL_DIR / 'fine-tuned/fine-tuned-uncased'
FINETUNED_MODEL_STATE = MODEL_DIR / 'fine-tuned/model-state-dict'
FINETUNED_OPT_STATE = MODEL_DIR / 'fine-tuned/opt-state-dict'
CHECKPOINT_DIR = MODEL_DIR / 'checkpoint'

# --- Pretrained artefacts ---------------------------------------------------
# Both helpers receive a local path plus the model name.
# NOTE(review): presumably they load from the path when it exists and fall back
# to fetching MODEL_NAME otherwise -- confirm in bert_classifier.io.
pretrained_tokenizer = get_pretrained_tokenizer(PRETRAINED_TOKENIZER, MODEL_NAME)
pretrained_model = get_pretrained_model(PRETRAINED_MODEL, MODEL_NAME)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Load the labelled dataset and encode the label column.
#
# NOTE(review): `label2id` was called here without ever being imported in this
# notebook. It is assumed to live next to `create_label_dict` in
# `bert_classifier.data` -- confirm, then move this import to the top import cell.
from bert_classifier.data import label2id

# Keep the frame exactly as read from disk under its own name so the encoded
# frame below has an explicit lineage.
data_raw = pd.read_csv(TRAINING_DATA)

# Label mapping built from the values found in LABEL_COL.
label_dict = create_label_dict(data_raw, LABEL_COL)

# Fixed: the original passed the undefined lowercase name `label_col`, which
# raised NameError on a fresh kernel; the constant is LABEL_COL.
data = label2id(data_raw, LABEL_COL, label_dict)

In [None]:
# Stratified hold-out split: sample the same fraction of rows from every
# LABEL_COL group for the test set; everything else becomes the training set.
test_size = 0.1
SPLIT_SEED = 42  # fixed seed so Restart & Run All reproduces the same split

# GroupBy.sample keeps the original index and all columns (including LABEL_COL),
# so the sampled rows can be excluded from `data` by index below.
df_test = data.groupby(LABEL_COL).sample(frac=test_size, random_state=SPLIT_SEED)
df_train = data[~data.index.isin(df_test.index)]

# Every row must land in exactly one split; this fails if `data.index` is not
# unique, in which case the index-based exclusion above would drop extra rows.
assert len(df_train) + len(df_test) == len(data), "train/test split does not partition data"

# Re-number both splits from 0. Rebinding (rather than inplace=True) avoids
# mutating a filtered slice of `data`.
df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [None]:
# Wrap each split in the project dataset class, sharing the tokenizer and MAX_LEN.
training_set = CustomDataset(df_train, pretrained_tokenizer, MAX_LEN)
testing_set = CustomDataset(df_test, pretrained_tokenizer, MAX_LEN)

# Training batches are drawn in random order; evaluation batches keep the
# dataset order so results line up with df_test rows.
train_dataloader = DataLoader(
    training_set,
    batch_size=TRAIN_BATCH_SIZE,
    sampler=RandomSampler(training_set),
)
test_dataloader = DataLoader(
    testing_set,
    batch_size=TEST_BATCH_SIZE,
    sampler=SequentialSampler(testing_set),
)

In [None]:
# Build the classifier on top of the pretrained encoder.
# NOTE(review): the literal 4 is presumably the number of target classes --
# confirm against BertClass and consider deriving it from label_dict.
model = BertClass(pretrained_model, 4)

# AdamW over all model parameters at the configured learning rate.
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

# Fine-tune using the training loader; the test loader is passed alongside it.
# NOTE(review): EPOCHS, CHECKPOINT_DIR and `device` are defined above but not
# passed here -- verify how custom_trainer picks them up.
custom_trainer(model, optimizer, train_dataloader, test_dataloader)

In [None]:
# Run the fine-tuned model over the held-out loader. `validate` returns three
# values, unpacked here as outputs, targets and loss (`loss` is not used again
# in the cells shown).
outputs, targets, loss = validate(model, test_dataloader)
# Last expression of the cell, so the metrics result is rendered as its output.
classification_metrics(outputs, targets)

In [None]:
# Persist the fine-tuned model through the project helper. `save_model` is
# imported with the other bert_classifier.io helpers in the top import cell
# instead of mid-notebook.
# NOTE(review): the FINETUNED_MODEL / FINETUNED_MODEL_STATE / FINETUNED_OPT_STATE
# paths from the config cell are not passed here -- confirm where save_model
# writes by default.
save_model(model)