In [1]:
# Transformers installation
#! pip install transformers datasets
# To install from source instead of the last release, comment the command above and uncomment the following one.
# ! pip install git+https://github.com/huggingface/transformers.git


In [2]:
import pathlib
import sklearn
import datasets
import pandas as pd
import torch

import numpy as np
import transformers
import os

In [3]:
 os.environ["WANDB_DISABLED"] = "true"

In [4]:
dataset_path = '../artifacts/dataset_processed/imdb'

In [5]:
raw_datasets = datasets.load_from_disk(dataset_path)
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 20000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    valid: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
})

In [6]:
raw_datasets['train'].features

{'text': Value(dtype='string', id=None),
 'label': ClassLabel(num_classes=2, names=['neg', 'pos'], names_file=None, id=None)}

In [7]:
raw_datasets['train'][0]

{'text': "King of Masks (Bian Lian in China) is a shockingly beautiful and profoundly touching film. Winner of 16 awards from around the world, this film based on a true story centers on Wang Bianlian, a street performer in 1930s China who is growing older but has no heir to pass on his art of face-change opera. He has a unique talent of quickly changing masks in performance, and no one knows how he does it. He has a longing desire to have a grandson, as his art is a family heirloom that can only be passed on to a male heir. We then go to the streets, and see that people are selling their children because they can't afford to take care of them: some are even begging to take their daughters for free, because daughters are not worth much in this society. Wang Bianlian's story goes on from there.<br /><br />The film was so astonishingly good, the acting was amazing, and the issues were so weighty and well-addressed. There is the gender inequality and the depressing fact that in this time 

In [8]:
labels = raw_datasets['train'].features['label'].names
labels

['neg', 'pos']

In [9]:
tokenizer = transformers.AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [10]:
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

Loading cached processed dataset at ../artifacts/dataset_processed/imdb/train/cache-c1218c477deccdf3.arrow
Loading cached processed dataset at ../artifacts/dataset_processed/imdb/test/cache-30625507ace4ca91.arrow


  0%|          | 0/25 [00:00<?, ?ba/s]

In [11]:
subset = 1_000
subset = 25_000
subset = 1_000


train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(subset)) 
eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(subset)) 


Loading cached shuffled indices for dataset at ../artifacts/dataset_processed/imdb/train/cache-e6d99e83bf751d8a.arrow
Loading cached shuffled indices for dataset at ../artifacts/dataset_processed/imdb/test/cache-5b82b302e16af136.arrow


In [12]:
model = transformers.AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=len(labels))

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classi

In [13]:

training_args = transformers.TrainingArguments("test-trainer"
                                  , evaluation_strategy="epoch"
                                  , save_strategy="epoch"
                                  , logging_steps=100
                                 # , eval_steps=100
                                  , load_best_model_at_end=True
                                 
                                 )

Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [14]:
metric = datasets.load_metric("accuracy")


In [15]:


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [16]:

trainer = transformers.Trainer(
    model=model, 
    args=training_args, 
    train_dataset=train_dataset, 
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
    

)




In [17]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text.
***** Running training *****
  Num examples = 1000
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 375


Epoch,Training Loss,Validation Loss,Accuracy
1,0.488,0.378699,0.843
2,0.257,0.519023,0.866
3,0.128,0.533083,0.88


The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 8
Saving model checkpoint to test-trainer/checkpoint-125
Configuration saved in test-trainer/checkpoint-125/config.json
Model weights saved in test-trainer/checkpoint-125/pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 8
Saving model checkpoint to test-trainer/checkpoint-250
Configuration saved in test-trainer/checkpoint-250/config.json
Model weights saved in test-trainer/checkpoint-250/pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text.
***** Run

TrainOutput(global_step=375, training_loss=0.24978535842895508, metrics={'train_runtime': 214.9867, 'train_samples_per_second': 13.954, 'train_steps_per_second': 1.744, 'total_flos': 397402195968000.0, 'train_loss': 0.24978535842895508, 'epoch': 3.0})

In [18]:
trainer.evaluate()

The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 8


{'eval_loss': 0.3786992132663727,
 'eval_accuracy': 0.843,
 'eval_runtime': 17.1231,
 'eval_samples_per_second': 58.401,
 'eval_steps_per_second': 7.3,
 'epoch': 3.0}

In [19]:
tokenizer.encode_plus('men shoes', return_token_type_ids = True, return_attention_mask=True)

{'input_ids': [101, 2273, 6007, 102], 'token_type_ids': [0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1]}

In [20]:
tokenizer.convert_ids_to_tokens([101, 2273, 6007, 102])

['[CLS]', 'men', 'shoes', '[SEP]']

# Saving artifacts

In [21]:
?model.save_pretrained

In [22]:
model_dir ='../artifacts/model/'

In [23]:
trainer.save_model(model_dir)

Saving model checkpoint to ../artifacts/model/
Configuration saved in ../artifacts/model/config.json
Model weights saved in ../artifacts/model/pytorch_model.bin


In [24]:
tokenizer.save_pretrained(model_dir)

tokenizer config file saved in ../artifacts/model/tokenizer_config.json
Special tokens file saved in ../artifacts/model/special_tokens_map.json


('../artifacts/model/tokenizer_config.json',
 '../artifacts/model/special_tokens_map.json',
 '../artifacts/model/vocab.txt',
 '../artifacts/model/added_tokens.json',
 '../artifacts/model/tokenizer.json')

In [25]:
model2 = transformers.AutoModelForSequenceClassification.from_pretrained(model_dir, num_labels=len(labels))

loading configuration file ../artifacts/model/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.11.1",
  "vocab_size": 30522
}

loading weights file ../artifacts/model/pytorch_model.bin
All model checkpoint weights were used when initializing DistilBertForSequenceClassification.

All the weights of DistilBertForSequenceClassification were initialized from the model checkpoint at ../artifacts/model/.
If your task is simil

# Loading Artifacts

In [26]:
tokenizer2 = transformers.AutoTokenizer.from_pretrained(
                model_dir
            )

Didn't find file ../artifacts/model/added_tokens.json. We won't load it.
loading file ../artifacts/model/vocab.txt
loading file ../artifacts/model/tokenizer.json
loading file None
loading file ../artifacts/model/special_tokens_map.json
loading file ../artifacts/model/tokenizer_config.json


# Predicting on new example

In [27]:
tokenizer2('men shoes')

{'input_ids': [101, 2273, 6007, 102], 'attention_mask': [1, 1, 1, 1]}

In [28]:
query = 'this movie sucks'
query = 'this movie is awesome'

In [29]:
res = tokenizer2.encode_plus(query, return_tensors="pt")

In [30]:
res

{'input_ids': tensor([[  101,  2023,  3185,  2003, 12476,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}

In [31]:
model_res = model2(**res)
model_res

SequenceClassifierOutput(loss=None, logits=tensor([[-1.2999,  0.8312]], grad_fn=<AddmmBackward>), hidden_states=None, attentions=None)

In [32]:
model_res[0]

tensor([[-1.2999,  0.8312]], grad_fn=<AddmmBackward>)

In [33]:
list ( zip (labels , torch.softmax(model_res.logits, dim=1).tolist()[0] ) )

[('neg', 0.10610862076282501), ('pos', 0.8938913345336914)]

# FIN 

<a id='additional-resources'></a>

## Additional resources

To look at more fine-tuning examples you can refer to:

- [🤗 Transformers Examples](https://github.com/huggingface/transformers/tree/master/examples) which includes scripts
  to train on all common NLP tasks in PyTorch and TensorFlow.

- [🤗 Transformers Notebooks](https://huggingface.co/transformers/notebooks.html) which contains various notebooks and in particular one per task (look
  for the *how to finetune a model on xxx*).