In [29]:
# Report GPU status when the NVIDIA command-line tools are installed. On
# machines without them (e.g. a CPU-only Windows box) print a short notice
# instead of letting the shell emit a "not recognized" error.
import shutil
import subprocess

if shutil.which("nvidia-smi") is None:
    print("nvidia-smi not found on PATH; skipping GPU status report.")
else:
    # Capture the output and print it so it shows up in the notebook cell.
    gpu_report = subprocess.run(["nvidia-smi"], capture_output=True, text=True, check=False)
    print(gpu_report.stdout or gpu_report.stderr)

'nvidia-smi' is not recognized as an internal or external command,
operable program or batch file.


# Entity-Marker tokenizer + [CLS] output model
## Imports

In [1]:
import numpy as np

import torch, gc
import torch.nn as nn

from transformers import BertConfig
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup

import collections

In [2]:
from persian_re.preprocess import PerlexData, create_data_loader
from persian_re.tokenizers import BertEntityMarkerTokenizer
from persian_re.settings import MODEL_NAME_OR_PATH, MAX_LEN, TRAIN_BATCH_SIZE, VALID_BATCH_SIZE, TEST_BATCH_SIZE,\
    INITIAL_LEARNING_RATE
from persian_re.models import CLSModel

## GPU configuration

In [3]:
# Query CUDA availability once and derive both the flag and the device from it.
train_on_gpu = torch.cuda.is_available()
device = torch.device("cuda:0") if train_on_gpu else torch.device("cpu")
print(f'device: {device}')

# Tell the reader which hardware the rest of the notebook will run on.
if train_on_gpu:
    print('CUDA is available!  Training on GPU ...')
else:
    print('CUDA is not available.  Training on CPU ...')

device: cpu
CUDA is not available.  Training on CPU ...


In [4]:
# Clear the CUDA cache: run Python's garbage collector first, then ask PyTorch
# to release its cached CUDA memory.
gc.collect()
torch.cuda.empty_cache()

## Load Data

In [5]:
# Obtain the PerLex dataset object through its get_instance() accessor. Later
# cells read the label maps (label2ids / id2labels), x_train and class_weights
# from it.
data = PerlexData.get_instance()

## Entity Marker tokenizer

In [6]:
# Load the entity-marker tokenizer from the MODEL_NAME_OR_PATH checkpoint.
# from_pretrained emits a class-mismatch warning here because the checkpoint
# declares its tokenizer class as 'BertTokenizer' rather than this class.
tokenizer: BertEntityMarkerTokenizer = BertEntityMarkerTokenizer.from_pretrained(MODEL_NAME_OR_PATH)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'BertEntityMarkerTokenizer'.


## Configuration

In [7]:
# Number of training epochs. Also used to compute the scheduler's total step
# count (len(train_data_loader) * EPOCHS) in the training setup cell.
EPOCHS = 10

In [8]:
# Start from the checkpoint's BERT configuration and override three fields:
# the two label maps taken from the dataset object, and the hidden-layer
# dropout probability.
config = BertConfig.from_pretrained(
    MODEL_NAME_OR_PATH,
    label2id=data.label2ids,
    id2label=data.id2labels,
    hidden_dropout_prob=0.2,
)

In [9]:
# Display the resolved configuration to check the label maps and the dropout
# override.
config

BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.2,
  "hidden_size": 768,
  "id2label": {
    "0": "Cause-Effect(e1,e2)",
    "1": "Cause-Effect(e2,e1)",
    "2": "Component-Whole(e1,e2)",
    "3": "Component-Whole(e2,e1)",
    "4": "Content-Container(e1,e2)",
    "5": "Content-Container(e2,e1)",
    "6": "Entity-Destination(e1,e2)",
    "7": "Entity-Origin(e1,e2)",
    "8": "Entity-Origin(e2,e1)",
    "9": "Instrument-Agency(e1,e2)",
    "10": "Instrument-Agency(e2,e1)",
    "11": "Member-Collection(e1,e2)",
    "12": "Member-Collection(e2,e1)",
    "13": "Message-Topic(e1,e2)",
    "14": "Message-Topic(e2,e1)",
    "15": "Other",
    "16": "Product-Producer(e1,e2)",
    "17": "Product-Producer(e2,e1)"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "Cause-Effect(e1,e2)": 0,
    "Cause-

## *DataLoader* objects

In [None]:
# Build the train / validation / test loaders. Each call passes, in order: the
# input texts, the labels (None for the test split), the tokenizer, MAX_LEN,
# the split's batch size and label_list.
# NOTE(review): `train`, `valid`, `test` and `label_list` are not defined in any
# visible cell, and the train texts come from `data.x_train` while the other
# splits index `valid` / `test` directly. Presumably all of these should be
# taken from `data` — confirm against PerlexData before running.
train_data_loader = create_data_loader(data.x_train, train['re_type'].to_numpy(), tokenizer, MAX_LEN,
                                       TRAIN_BATCH_SIZE, label_list)
valid_data_loader = create_data_loader(valid['text'].to_numpy(), valid['re_type'].to_numpy(), tokenizer, MAX_LEN,
                                       VALID_BATCH_SIZE, label_list)
test_data_loader = create_data_loader(test['text'].to_numpy(), None, tokenizer, MAX_LEN, TEST_BATCH_SIZE, label_list)

## [CLS] Relation Extraction Model

In [10]:
# Instantiate the [CLS] relation-extraction model from the configuration above.
# NOTE(review): the recorded output shows a 452M download starting here, so
# the constructor appears to fetch pretrained weights — confirm in CLSModel.
pt_model = CLSModel(config=config)
# considering special tokens: resize the embedding matrix to the tokenizer's
# length (presumably larger than the checkpoint vocabulary because of the
# entity-marker tokens — TODO confirm).
pt_model.resize_token_embeddings(len(tokenizer))
# Move the model to the device chosen in the GPU configuration cell.
pt_model = pt_model.to(device)

Downloading:   0%|          | 0.00/452M [00:00<?, ?B/s]

KeyboardInterrupt: 

## Training
The model is trained with:
- **AdamW** optimizer with initial learning rate `INITIAL_LEARNING_RATE`
- **Linear scheduler** with no warmup steps
- **CrossEntropyLoss** with class weights to compensate for class imbalance

In [None]:
# AdamW over every model parameter, starting at INITIAL_LEARNING_RATE.
# correct_bias=False is the transformers AdamW option that switches off
# Adam's bias correction.
optimizer = AdamW(pt_model.parameters(), lr=INITIAL_LEARNING_RATE, correct_bias=False)

# Schedule length: number of training batches times the number of epochs.
total_steps = len(train_data_loader) * EPOCHS
# Linear learning-rate schedule with no warmup phase.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

# Cross-entropy loss weighted per class with the weights from the dataset
# object, placed on the same device as the model.
loss_fn = nn.CrossEntropyLoss(weight=torch.FloatTensor(data.class_weights).to(device))

# Running state threaded through the training loop.
step = 0
# Use np.inf: the capitalised alias np.Inf was removed in NumPy 2.0 and raises
# AttributeError there.
eval_loss_min = np.inf
# Per-epoch metrics, keyed by metric name.
history = collections.defaultdict(list)

In [26]:
# Inspect the scheduler object.
# NOTE(review): the recorded output is a TypeError about a missing 'params'
# argument, which evaluating a bare name cannot raise. It is presumably stale
# output from an earlier version of this cell; re-run after the setup cell.
scheduler

TypeError: __init__() missing 1 required positional argument: 'params'

In [None]:
# Run EPOCHS training epochs. After each one, score the training predictions,
# evaluate on the validation loader, and record accuracy and loss in `history`.
# NOTE(review): `tqdm`, `train_op`, `eval_op`, `eval_callback`, `acc_and_f1`,
# `EEVERY_EPOCH`, `OUTPUT_PATH` and `CLIP` are not imported or defined in any
# visible cell. `EEVERY_EPOCH` also looks like a typo (perhaps `EVERY_EPOCH`).
# Confirm the names and add the missing imports before running.
for epoch in tqdm(range(1, EPOCHS + 1), desc="Epochs... "):
    # train_op returns this epoch's outputs and loss plus the updated running
    # `step` and `eval_loss_min`, which are passed back in on the next epoch.
    train_y, train_loss, step, eval_loss_min = train_op(
        model=pt_model,
        data_loader=train_data_loader,
        loss_fn=loss_fn,
        optimizer=optimizer,
        scheduler=scheduler,
        step=step,
        print_every_step=EEVERY_EPOCH,
        eval=True,
        eval_cb=eval_callback(epoch, EPOCHS, OUTPUT_PATH),
        eval_loss_min=eval_loss_min,
        eval_data_loader=valid_data_loader,
        clip=CLIP)

    # presumably train_y[0] / train_y[1] are true and predicted labels — verify
    # against train_op.
    train_score = acc_and_f1(train_y[0], train_y[1], average='weighted')

    # Separate validation pass at the end of the epoch.
    eval_y, eval_loss = eval_op(
        model=pt_model,
        data_loader=valid_data_loader,
        loss_fn=loss_fn)

    eval_score = acc_and_f1(eval_y[0], eval_y[1], average='weighted')

    # Append this epoch's metrics to the history lists.
    history['train_acc'].append(train_score['acc'])
    history['train_loss'].append(train_loss)
    history['val_acc'].append(eval_score['acc'])
    history['val_loss'].append(eval_loss)

## Model Evaluation

In [None]:
# Predict relation labels for the test texts.
# The sequence length is MAX_LEN, the same setting the data loaders are built
# with, so inference does not depend on a separate hard-coded 128.
# NOTE(review): `test` and `predict` are not defined in any visible cell —
# confirm where they come from before running.
test_comments = test['text'].to_numpy()
preds, probs = predict(pt_model, test_comments, tokenizer, max_len=MAX_LEN)

print(preds.shape, probs.shape)

In [None]:
# Map each gold relation label to its position in label_list so it can be
# compared with the predicted class ids.
y_test = [label_list.index(label) for label in test['re_type'].values]
y_pred = preds

# Weighted F1 first, then the per-class report.
weighted_f1 = f1_score(y_test, y_pred, average="weighted")
print(f'F1: {weighted_f1}')
print()
print(classification_report(y_test, y_pred, target_names=label_list))