# Loading RoBERTa from local files 

This notebook shows how to load a RoBERTa tokenizer and model from a local directory (without downloading from the Hugging Face Hub or any other network location).

## References: 
1. [Kaggle - tokenizers cheat sheet](https://www.kaggle.com/code/debanga/huggingface-tokenizers-cheat-sheet)
2. [Kaggle dataset with saved roberta model](https://www.kaggle.com/datasets/abhishek/roberta-base)


In [1]:
# Kaggle exposes the input data files in the read-only "../input/" directory.
# Running this cell (click run or press Shift+Enter) prints every file found under the input directory.

import os

for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in file_names):
        print(full_path)

# Up to 20GB can be written to the current directory (/kaggle/working/); it is preserved as output when a version is created using "Save & Run All".
# Temporary files can also be written to /kaggle/temp/, but they won't be saved outside of the current session.

/kaggle/input/roberta-base/rust_model.ot
/kaggle/input/roberta-base/config.json
/kaggle/input/roberta-base/merges.txt
/kaggle/input/roberta-base/README.md
/kaggle/input/roberta-base/tokenizer.json
/kaggle/input/roberta-base/vocab.json
/kaggle/input/roberta-base/tf_model.h5
/kaggle/input/roberta-base/dict.txt
/kaggle/input/roberta-base/pytorch_model.bin
/kaggle/input/roberta-base/flax_model.msgpack


In [2]:
import transformers
from transformers import AutoTokenizer
import tokenizers 

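## Method 1 (Not preferred)

This builds a bare byte-level BPE tokenizer directly from `vocab.json` and `merges.txt`. As the outputs in this section show, it does not add RoBERTa's `<s>` / `</s>` special tokens, unlike Method 2.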

In [3]:
# Local folder that holds the saved roberta-base files (vocab.json, merges.txt, ...).
ROBERTA_PATH = '../input/roberta-base'

# Build a bare byte-level BPE tokenizer from the vocabulary and merge-rule files.
# `from_file` is the dedicated loader for on-disk files; handing file names straight
# to the constructor relies on a deprecated code path of the underlying BPE model.
tokenizer = tokenizers.ByteLevelBPETokenizer.from_file(
    f'{ROBERTA_PATH}/vocab.json',
    f'{ROBERTA_PATH}/merges.txt',
)

In [4]:
# Display the tokenizer's repr (vocabulary size and configured options).
tokenizer

Tokenizer(vocabulary_size=50265, model=ByteLevelBPE, add_prefix_space=False, lowercase=False, dropout=None, unicode_normalizer=None, continuing_subword_prefix=None, end_of_word_suffix=None, trim_offsets=False)

In [5]:
# Sample text to tokenize.
text = 'Today is Monday. Long live Monday. Monday is before Tuesday. Tokenizing, eh?'
# Encode the single string; the result is an Encoding object (ids, tokens, offsets, masks, ...).
text_encoded = tokenizer.encode(text)
# Bare last expression so the notebook displays the Encoding summary.
text_encoded


Encoding(num_tokens=18, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [6]:
# Token strings of the encoding; tokens that followed a space in `text` carry a leading 'Ġ'.
text_encoded.tokens

['Today',
 'Ġis',
 'ĠMonday',
 '.',
 'ĠLong',
 'Ġlive',
 'ĠMonday',
 '.',
 'ĠMonday',
 'Ġis',
 'Ġbefore',
 'ĠTuesday',
 '.',
 'ĠToken',
 'izing',
 ',',
 'Ġeh',
 '?']

In [7]:
# Integer vocabulary ids of the encoding, one per token.
text_encoded.ids

[5625,
 16,
 302,
 4,
 2597,
 697,
 302,
 4,
 302,
 16,
 137,
 294,
 4,
 29464,
 2787,
 6,
 35670,
 116]

## Method 2 (Preferred)

In [8]:
# Load the tokenizer from the local directory. `local_files_only=True` enforces the
# notebook's goal of never contacting the Hugging Face Hub: if the files are missing,
# loading fails instead of attempting a download.
# NOTE(review): the input folder listing shows no tokenizer_config.json, which is presumably
# why the repr reports a huge placeholder `model_max_length`; pass `model_max_length=512`
# explicitly before relying on `truncation=True` (confirm the limit against the model config).
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=ROBERTA_PATH,
    local_files_only=True,
)
tokenizer

RobertaTokenizerFast(name_or_path='../input/roberta-base', vocab_size=50265, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False)})

In [9]:
# A batch of two strings of different lengths.
text = ['Today is Monday. Long live Monday. Monday is before Tuesday', 'Tokenizing, eh?']
# padding=True pads the shorter sequence so every row in the batch has the same length.
text_encoded = tokenizer(text, padding=True)

In [10]:
# Display the batch encoding: `input_ids` plus the matching `attention_mask`.
text_encoded

{'input_ids': [[0, 5625, 16, 302, 4, 2597, 697, 302, 4, 302, 16, 137, 294, 2], [0, 45643, 2787, 6, 35670, 116, 2, 1, 1, 1, 1, 1, 1, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]]}

In [11]:
# Map each encoded sequence back to its token strings.
# Iterating over the rows directly (instead of the hard-coded indices 0 and 1) keeps
# this cell correct if the number of input texts changes.
for input_ids in text_encoded.input_ids:
    tokens = tokenizer.convert_ids_to_tokens(input_ids)
    print(tokens)

['<s>', 'Today', 'Ġis', 'ĠMonday', '.', 'ĠLong', 'Ġlive', 'ĠMonday', '.', 'ĠMonday', 'Ġis', 'Ġbefore', 'ĠTuesday', '</s>']
['<s>', 'Token', 'izing', ',', 'Ġeh', '?', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']


In [12]:
# Encode the `text` batch again; return_tensors='pt' yields PyTorch tensors instead of Python lists.
model_inputs = tokenizer(text, return_tensors='pt', padding=True)
# Bare last expression so the notebook displays the resulting tensors.
model_inputs

{'input_ids': tensor([[    0,  5625,    16,   302,     4,  2597,   697,   302,     4,   302,
            16,   137,   294,     2],
        [    0, 45643,  2787,     6, 35670,   116,     2,     1,     1,     1,
             1,     1,     1,     1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]])}