# About

This is an internal notebook used to create code snippets.

In [1]:
from transformers import BertTokenizer

In [2]:
# BERT uses a WordPiece tokenizer, which either keeps a word whole
# (one word becomes one token) or splits it into word pieces
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [3]:
# correctly spelled query: every word comes back as a single token
tokenizer.tokenize("cheap nike men running shoes")

['cheap', 'nike', 'men', 'running', 'shoes']

In [4]:
# "chep" and "runing" are misspelled, so they are split into word pieces
# (continuation pieces are prefixed with "##")
tokenizer.tokenize("chep nike men shoes runing under 100$ ")

['che', '##p', 'nike', 'men', 'shoes', 'run', '##ing', 'under', '100', '$']

In [5]:
# size of the tokenizer's vocabulary
tokenizer.vocab_size

30522

# Training Code

In [None]:
# hugging face library to load existing/custom datasets
import datasets
# hugging face library contains tokenizers / models 
import transformers

In [None]:
# load a dataset previously saved to disk; it has two columns: "text" and "label"
# NOTE(review): dataset_path is not defined in the cells shown here - set it before running
raw_datasets = datasets.load_from_disk(dataset_path)

In [None]:
# reuse the pretrained DistilBERT tokenizer
tokenizer = transformers.AutoTokenizer.from_pretrained("distilbert-base-uncased")


def tokenize_function(examples):
    """Tokenize the "text" column of a batch of examples.

    Sequences are padded to the maximum length and truncated when too long.
    """
    texts = examples["text"]
    return tokenizer(texts, padding="max_length", truncation=True)


# adds the 'input_ids' and 'attention_mask' columns
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

In [None]:
# start from the pretrained DistilBERT checkpoint, configured for sequence
# classification with one output per entry in `labels`
# NOTE(review): `labels` is not defined in the cells shown here - define it before running
model = transformers.AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(labels),
    # ... remaining model arguments elided
)

In [None]:
# training configuration: outputs go to the "trainer" directory, 5 epochs
training_args = transformers.TrainingArguments(
    output_dir="trainer",
    num_train_epochs=5,
    # ... remaining training arguments elided
)
# wire the model, the training arguments and the tokenized splits together
trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    # ... remaining trainer arguments elided
)

In [None]:
# train using the datasets/arguments passed to the trainer
trainer.train()


# Inference Code

In [None]:
# example query to classify
query = 'comfortable men sandals'

In [None]:
# compute the input ids / attention mask as PyTorch tensors;
# calling the tokenizer directly replaces the deprecated encode_plus
# and matches how tokenize_function invokes it
tokenized_res = tokenizer(query, return_tensors="pt")

In [None]:
# pass input to model; gradients are not needed for inference
with torch.no_grad():
    model_res = model(**tokenized_res)
# get softmax of logits across the label dimension
logits = model_res.logits
# Tensor.tolist() (all lowercase) returns nested Python lists;
# [0] selects the row for the single query in the batch
softmax_res = torch.softmax(logits, dim=1).tolist()[0]

In [None]:
# pair every label with its probability, highest probability first
predictions = sorted(
    zip(labels, softmax_res),
    key=lambda pair: pair[1],
    reverse=True,
)

# Torch Archiving

In [None]:
# name and version of the model (passed as --model-name / --version)
MODEL_NAME="pt_classifier"
MODEL_VERSION="1.0"

# folder the archive is exported to (--export-path)
MODEL_STORE="model_store"
# serialized PyTorch model file (--serialized-file)
MODEL_SERIALIZED_FILE="traced_model.pt"
# comma-separated extra files to include (--extra-files)
MODEL_EXTRA_FILES="index_to_name.json,setup_config.json"
# handler script containing the model code (--handler)
MODEL_CODE="handler.py"


# package the serialized model, its handler and the extra files into an
# archive exported to ${MODEL_STORE}; expansions are quoted so a value
# containing spaces stays a single argument, and the last line carries no
# trailing backslash so the command ends here
torch-model-archiver --model-name "${MODEL_NAME}" \
    --version "${MODEL_VERSION}" \
    --serialized-file "${MODEL_SERIALIZED_FILE}" \
    --export-path "${MODEL_STORE}" \
    --extra-files "${MODEL_EXTRA_FILES}" \
    --handler "${MODEL_CODE}"