# Use transfer learning with the Hugging Face Transformers library

## Load the data and make it accessible to the Hugging Face environment

In [1]:
%load_ext autoreload
%autoreload 2
from transformers import (AutoTokenizer, 
                          TFAutoModel,
                          TFAutoModelForSequenceClassification
                          )
from datasets import Dataset, DatasetDict # to use huggingface datasets
from detector.utils import load_data
import tensorflow as tf
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the project's data via detector.utils.load_data; the individual splits
# are looked up by key ("train", "valid", "test") in the next cell.
data = load_data()

In [3]:
# Pull the three splits out of the loaded data, in train / valid / test order.
train, val, test = (data[split_key] for split_key in ("train", "valid", "test"))

def remove_newline(text: str) -> str:
    """Return ``text`` with every newline character replaced by a single space.

    Only ``"\\n"`` is touched; all other characters (including ``"\\r"``) are
    left exactly as they are.
    """
    # Splitting on "\n" and re-joining with " " swaps each newline for one space.
    return " ".join(text.split("\n"))

# Flatten the "text" column of every split to single-line strings. The result is
# written back into the same frame objects, so train/val/test change in place.
for df in (train, val, test):
    df["text"] = df["text"].map(remove_newline)

In [4]:
# Convert each pandas split into a Hugging Face Dataset tagged with its split name.
ds_train, ds_val, ds_test = (
    Dataset.from_pandas(frame, split=split_name)
    for frame, split_name in ((train, "train"), (val, "valid"), (test, "test"))
)

In [5]:
# Bundle the three splits into one DatasetDict so that a single .map() call
# (see the tokenization cell below) is applied to every split.
ds_dict = DatasetDict({"train": ds_train, "valid": ds_val, "test": ds_test})

In [6]:
ds_dict

DatasetDict({
    train: Dataset({
        features: ['text', 'AI', '__index_level_0__'],
        num_rows: 500000
    })
    valid: Dataset({
        features: ['text', 'AI', '__index_level_0__'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['text', 'AI', '__index_level_0__'],
        num_rows: 10000
    })
})

## Create Tokenizer suitable for the model

In [7]:
# Checkpoint name shared by the tokenizer and the model loaded later in the notebook
model_ckpt = "roberta-large"
# Load the tokenizer that belongs to this checkpoint, i.e. the one the model was trained with
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [8]:
# Smoke-test the tokenizer on one sentence, asking for TensorFlow tensors; the
# result contains `input_ids` and `attention_mask` (see output below).
tokenizer("this is a test!", return_tensors="tf")

{'input_ids': <tf.Tensor: shape=(1, 7), dtype=int32, numpy=array([[   0, 9226,   16,   10, 1296,  328,    2]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(1, 7), dtype=int32, numpy=array([[1, 1, 1, 1, 1, 1, 1]], dtype=int32)>}

In [9]:
tokenizer.model_input_names

['input_ids', 'attention_mask']

In [10]:
def tokenize(batch):
    """Tokenize the ``"text"`` entries of ``batch`` in a single tokenizer call.

    Uses the notebook-level ``tokenizer`` with padding and truncation enabled
    and returns the tokenizer's output unchanged. Intended to be passed to
    ``map(..., batched=True)`` so that whole batches are encoded at once.
    """
    texts = batch["text"]
    encoded = tokenizer(texts, padding=True, truncation=True)
    return encoded

In [11]:
# Tokenize every split; tokenize() is called on batches of up to 10,000 rows.
# The resulting `input_ids` / `attention_mask` columns are added next to the
# existing ones (see `column_names` below).
ds_encoded = ds_dict.map(tokenize, batched=True, batch_size=10_000)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


                                                                     

In [12]:
ds_encoded["train"].column_names

['text', 'AI', '__index_level_0__', 'input_ids', 'attention_mask']

## Option 1: Use pre-trained model as feature extractor

With this approach, the weights of our RoBERTa model are frozen and the model only provides features for a separate classifier.

In [13]:
# Load the checkpoint as a TensorFlow model. `from_pt=True` initializes it from
# the PyTorch weights, which is why the `lm_head.*` weights are reported as
# unused in the message below.
model = TFAutoModel.from_pretrained(model_ckpt, from_pt=True)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing TFRobertaModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFRobertaModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


In [14]:
# Run a single sentence through the model to inspect the structure of its output.
text = "this is a test"
# Encode as TensorFlow tensors so the dict can be unpacked straight into the model call
inputs = tokenizer(text, return_tensors="tf")
outputs = model(**inputs)
# Display the full output object (last_hidden_state, pooler_output, ...)
outputs

TFBaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=<tf.Tensor: shape=(1, 6, 1024), dtype=float32, numpy=
array([[[ 0.04603523, -0.00462829,  0.01070012, ..., -0.05166839,
          0.10207295,  0.0616852 ],
        [ 0.16665444, -0.10331249, -0.35997286, ...,  0.06356604,
         -0.14786597,  0.08259281],
        [ 0.36435947, -0.06037569, -0.32145435, ...,  0.09165332,
          0.07294418,  0.12116893],
        [-0.088461  , -0.09831412,  0.01492168, ..., -0.04664345,
         -0.18895248,  0.17244472],
        [ 0.07978426, -0.15301386, -0.12229703, ..., -0.16046485,
          0.11752778,  0.00620476],
        [ 0.06696583, -0.00879123,  0.03385547, ..., -0.06855679,
          0.08994165,  0.0196259 ]]], dtype=float32)>, pooler_output=<tf.Tensor: shape=(1, 1024), dtype=float32, numpy=
array([[ 0.15439384,  0.71824986,  0.45034173, ..., -0.00599452,
         0.4090493 , -0.354411  ]], dtype=float32)>, past_key_values=None, hidden_states=None, attentions=None, cross_at

In [15]:
outputs.last_hidden_state.shape #output [batch_size, n_tokens, hidden_dim]

TensorShape([1, 6, 1024])

In [16]:
# For classification it is common practice to use the hidden state associated
# with the start-of-sequence token, i.e. position 0 along the token axis.
outputs.last_hidden_state[:, 0].shape

TensorShape([1, 1024])

In [22]:
def extract_hidden_states(batch):
    """Compute the first-token hidden state for every example in ``batch``.

    Only the entries whose key appears in ``tokenizer.model_input_names`` are
    forwarded to the notebook-level ``model``; any other columns are ignored.

    Returns a dict with the single key ``"hidden_state"`` holding position 0 of
    the model's ``last_hidden_state`` as a NumPy array. Written so it can be
    used as a batched ``map`` function over the encoded dataset.
    """
    # Keep only what the model accepts as input.
    model_inputs = {}
    for name, values in batch.items():
        if name in tokenizer.model_input_names:
            model_inputs[name] = values

    encoder_output = model(**model_inputs)
    # Position 0 on the token axis is the start-of-sequence token.
    first_token_state = encoder_output.last_hidden_state[:, 0]
    return {"hidden_state": first_token_state.numpy()}

In [23]:
# Switch ds_encoded (in place; the call's result is not assigned) to the
# "tensorflow" format, restricted to the model inputs and the "AI" column, so
# that slices of the dataset can be fed to the TF model directly.
ds_encoded.set_format("tensorflow", columns=["input_ids", "attention_mask", "AI"])

In [28]:
# Sanity check: run the extraction on the first two training rows only.
extract_hidden_states(ds_encoded["train"][:2])

{'hidden_state': array([[-0.27323678,  0.01947757,  0.01292926, ...,  0.23666379,
         -0.5726616 ,  0.55346173],
        [-0.36674288, -0.1628358 , -0.0582368 , ...,  0.20550966,
         -0.1738765 ,  0.40705544]], dtype=float32)}

In [30]:
# NOTE(review): the full-dataset extraction below is left disabled — presumably
# because running the model over every split is expensive; confirm before
# re-enabling.
#ds_hidden = ds_encoded.map(extract_hidden_states, batched=True, batch_size=8)