In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from lr.text_processing.util import pre_process_nli_df
from lr.training.util import get_positive_labels, filter_df_by_label
from IPython.display import display, HTML

### Trasformers
ref https://github.com/huggingface/transformers/blob/master/examples/run_glue.py

## Loading data

In [2]:
train_path = "data/snli/train.csv"
dev_path = "data/snli/dev.csv"

train_path_mod = "data/snli/train_p_h_syn_noun.csv"
dev_path_mod = "data/snli/dev_p_h_syn_noun.csv"

train = pd.read_csv(train_path)
dev = pd.read_csv(dev_path)
train = filter_df_by_label(train.dropna()).reset_index(drop=True)
dev = filter_df_by_label(dev.dropna()).reset_index(drop=True)

train = train.head(10000)
dev = dev.head(1000)
pre_process_nli_df(train)
pre_process_nli_df(dev)


print(train.shape)
print(dev.shape)

(10000, 3)
(1000, 3)


In [3]:
import torch
from transformers import BertForSequenceClassification, BertTokenizer, BertConfig
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)


  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


### Example of loss generation

In [4]:
seed = 123 

torch.manual_seed(seed)
np.random.seed(seed)

pretrained_weights = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(pretrained_weights)
model = BertForSequenceClassification.from_pretrained(pretrained_weights, num_labels = 3)


batch_size = 3
sentences = (train.premise + ", " + train.hypothesis)
labels = get_positive_labels(train)
sentence_batch = sentences.sample(batch_size)
batch_id = sentence_batch.index.values
labels_batch = labels[batch_id]
sentence_batch = sentence_batch.values
sentence_batch_encoded = [tokenizer.encode(s, add_special_tokens=True) for s in sentence_batch]
len_max = np.max(list(map(lambda x: len(x),sentence_batch_encoded)))
pad_code = 0
for i,s in enumerate(sentence_batch_encoded):
    diff = len_max - len(s)
    if diff > 0:
        new_s = s + ([pad_code] * diff) 
        sentence_batch_encoded[i] = new_s

        
# to torch
input_ids = torch.tensor(sentence_batch_encoded)
labels_batch = torch.tensor(labels_batch).unsqueeze(0)  # Batch size 1

outputs = model(input_ids, labels=labels_batch)
loss = outputs[0].mean()
print(batch_id, loss)

[2656  445 9505] tensor(0.9323, grad_fn=<MeanBackward0>)


In [5]:
[2656,  445, 9505], 0.9323

([2656, 445, 9505], 0.9323)