# Loading RoBERTa from local files 

This notebook shows how to load a RoBERTa tokenizer and model from a local directory, without downloading anything from the Hugging Face Hub or any other network location.

## References: 
1. [Kaggle - tokenizers cheat sheet](https://www.kaggle.com/code/debanga/huggingface-tokenizers-cheat-sheet)
2. [Kaggle dataset with saved roberta model](https://www.kaggle.com/datasets/abhishek/roberta-base)


In [1]:
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file under the input tree, then print each one.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/roberta-base/rust_model.ot
/kaggle/input/roberta-base/config.json
/kaggle/input/roberta-base/merges.txt
/kaggle/input/roberta-base/README.md
/kaggle/input/roberta-base/tokenizer.json
/kaggle/input/roberta-base/vocab.json
/kaggle/input/roberta-base/tf_model.h5
/kaggle/input/roberta-base/dict.txt
/kaggle/input/roberta-base/pytorch_model.bin
/kaggle/input/roberta-base/flax_model.msgpack


In [2]:
import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import tokenizers 

## Method 1 (not preferred)

In [3]:
# Directory holding the saved RoBERTa files (relative to the working directory).
ROBERTA_PATH = '../input/roberta-base'

# Build a byte-level BPE tokenizer straight from the vocabulary and merges files.
vocab_file = f'{ROBERTA_PATH}/vocab.json'
merges_file = f'{ROBERTA_PATH}/merges.txt'
tokenizer = tokenizers.ByteLevelBPETokenizer(vocab=vocab_file, merges=merges_file)

In [4]:
# Bare expression: displays the tokenizer's repr (vocabulary size and settings).
tokenizer

Tokenizer(vocabulary_size=50265, model=ByteLevelBPE, add_prefix_space=False, lowercase=False, dropout=None, unicode_normalizer=None, continuing_subword_prefix=None, end_of_word_suffix=None, trim_offsets=False)

In [None]:
# Encode one sample sentence with the Method 1 tokenizer and display the result.
# NOTE: `text` and `text_encoded` are reassigned later in the Method 2 cells.
text = 'Today is Monday. Long live Monday. Monday is before Tuesday. Tokenizing, eh?'
text_encoded = tokenizer.encode(text)
text_encoded


In [None]:
# Token strings of the encoding produced by `tokenizer.encode(text)` in Method 1.
# NOTE(review): presumably only valid while `text_encoded` still holds the
# Method 1 result (it is reassigned in Method 2) — verify run order.
text_encoded.tokens

In [None]:
# Vocabulary ids of the encoding produced by `tokenizer.encode(text)` in Method 1.
# NOTE(review): presumably only valid while `text_encoded` still holds the
# Method 1 result (it is reassigned in Method 2) — verify run order.
text_encoded.ids

## Method 2 (preferred)

### Loading tokenizer

In [5]:
# Load the tokenizer from the local directory. `local_files_only=True` enforces
# the notebook's goal: transformers will never attempt a network request and
# fails fast if the expected files are not present locally.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=ROBERTA_PATH,
    local_files_only=True,
)
# Bare expression: displays the loaded tokenizer's repr.
tokenizer

RobertaTokenizerFast(name_or_path='../input/roberta-base', vocab_size=50265, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False)})

In [6]:
# Two sample texts of different lengths, encoded as a single batch.
text = [
    'Today is Monday. Long live Monday. Monday is before Tuesday',
    'Tokenizing, eh?',
]
# padding=True pads the shorter sequence so both rows have the same length.
text_encoded = tokenizer(text, padding=True)

In [7]:
# Display the batch encoding: `input_ids` and `attention_mask` for each text.
text_encoded

{'input_ids': [[0, 5625, 16, 302, 4, 2597, 697, 302, 4, 302, 16, 137, 294, 2], [0, 45643, 2787, 6, 35670, 116, 2, 1, 1, 1, 1, 1, 1, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]]}

In [8]:
# Map every encoded sequence back to its token strings. Iterating over
# `input_ids` directly (instead of hard-coded indices 0 and 1) keeps this
# cell correct for any number of input texts.
for sequence_ids in text_encoded.input_ids:
    tokens = tokenizer.convert_ids_to_tokens(sequence_ids)
    print(tokens)

['<s>', 'Today', 'Ġis', 'ĠMonday', '.', 'ĠLong', 'Ġlive', 'ĠMonday', '.', 'ĠMonday', 'Ġis', 'Ġbefore', 'ĠTuesday', '</s>']
['<s>', 'Token', 'izing', ',', 'Ġeh', '?', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']


In [9]:
# Encode the same batch again, this time as padded PyTorch tensors that can be
# unpacked straight into the model's forward call.
model_inputs = tokenizer(text, padding=True, return_tensors='pt')
model_inputs

{'input_ids': tensor([[    0,  5625,    16,   302,     4,  2597,   697,   302,     4,   302,
            16,   137,   294,     2],
        [    0, 45643,  2787,     6, 35670,   116,     2,     1,     1,     1,
             1,     1,     1,     1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]])}

### Loading model 

In [22]:
# Load the pretrained weights from the local directory and attach a 4-label
# sequence-classification head. `local_files_only=True` enforces the notebook's
# goal: no network request is attempted, and loading fails fast if the files
# are missing locally.
model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path=ROBERTA_PATH,
    num_labels=4,
    local_files_only=True,
)

Some weights of the model checkpoint at ../input/roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at ../input/roberta-base and are newly initialized: ['classifier.out_pro

In [23]:
# Bare expression: displays the model's module tree (embeddings, encoder layers, ...).
model

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerN

In [24]:
# Inspect the configuration attached to the loaded model; because num_labels=4
# was passed at load time, id2label/label2id should list four labels.
model.config

RobertaConfig {
  "_name_or_path": "../input/roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.26.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

In [25]:
# Grab the model's state dict and list the parameter/buffer names it contains.
weights = model.state_dict()
weights.keys()

odict_keys(['roberta.embeddings.position_ids', 'roberta.embeddings.word_embeddings.weight', 'roberta.embeddings.position_embeddings.weight', 'roberta.embeddings.token_type_embeddings.weight', 'roberta.embeddings.LayerNorm.weight', 'roberta.embeddings.LayerNorm.bias', 'roberta.encoder.layer.0.attention.self.query.weight', 'roberta.encoder.layer.0.attention.self.query.bias', 'roberta.encoder.layer.0.attention.self.key.weight', 'roberta.encoder.layer.0.attention.self.key.bias', 'roberta.encoder.layer.0.attention.self.value.weight', 'roberta.encoder.layer.0.attention.self.value.bias', 'roberta.encoder.layer.0.attention.output.dense.weight', 'roberta.encoder.layer.0.attention.output.dense.bias', 'roberta.encoder.layer.0.attention.output.LayerNorm.weight', 'roberta.encoder.layer.0.attention.output.LayerNorm.bias', 'roberta.encoder.layer.0.intermediate.dense.weight', 'roberta.encoder.layer.0.intermediate.dense.bias', 'roberta.encoder.layer.0.output.dense.weight', 'roberta.encoder.layer.0.outp

In [26]:
# Forward pass on the padded batch: yields one row of 4 logits per input text.
# NOTE: the loading warning reports that classifier weights were newly
# initialized, so these logits are not meaningful until the model is fine-tuned.
model(**model_inputs)

SequenceClassifierOutput(loss=None, logits=tensor([[ 0.1283, -0.2068,  0.2273,  0.1487],
        [ 0.1159, -0.1957,  0.2251,  0.1513]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)