In [1]:
# Transformers installation
#! pip install transformers datasets
# To install from source instead of the last release, comment the command above and uncomment the following one.
# ! pip install git+https://github.com/huggingface/transformers.git


In [2]:
import pathlib
import sklearn
import datasets
import pandas as pd
import torch

import numpy as np
import transformers
import os

In [3]:
# Switch off the Weights & Biases logging integration for this session.
# NOTE(review): the TrainingArguments output further down reports this variable as
# deprecated (removal announced for v5) and recommends `report_to` instead.
os.environ["WANDB_DISABLED"] = "true"

In [4]:
# Directory holding the preprocessed dataset, given relative to the notebook's working directory.
dataset_path = '../artifacts/dataset_processed/amazon'

In [5]:
# Load the dataset previously saved to disk; the repr below lists its splits and columns.
raw_datasets = datasets.load_from_disk(dataset_path)
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'item_name', 'label', 'brand', 'item_id', 'main_image_id', 'node'],
        num_rows: 83031
    })
    test: Dataset({
        features: ['text', 'item_name', 'label', 'brand', 'item_id', 'main_image_id', 'node'],
        num_rows: 27677
    })
    valid: Dataset({
        features: ['text', 'item_name', 'label', 'brand', 'item_id', 'main_image_id', 'node'],
        num_rows: 27677
    })
})

In [6]:
raw_datasets['train'][0]

{'text': 'Amazon-merk - vinden. Dames Gladiator Sandalen,ZILVER,5 UK',
 'item_name': 'Amazon-merk - vinden. Dames Gladiator Sandalen,ZILVER,5 UK',
 'label': 83,
 'brand': 'find.',
 'item_id': 'B07ZW5GC82',
 'main_image_id': '617ttnFcuHL',
 'node': '/Categorieën/Dames/Schoenen/Modieuze sandalen'}

In [7]:
# Class names attached to the `label` feature of the training split.
# Presumably the position in this list is the integer class id stored in `label` — confirm.
labels = raw_datasets['train'].features['label'].names
labels

['ABIS_DRUGSTORE',
 'ABIS_LAWN_AND_GARDEN',
 'ACCESSORY',
 'ACCESSORY_OR_PART_OR_SUPPLY',
 'AUTO_ACCESSORY',
 'BABY_PRODUCT',
 'BACKPACK',
 'BATTERY',
 'BEAUTY',
 'BED',
 'BED_FRAME',
 'BENCH',
 'BISS',
 'BOOT',
 'BRACELET',
 'BREAD',
 'CABINET',
 'CELLULAR_PHONE_CASE',
 'CHAIR',
 'CHARGING_ADAPTER',
 'CLEANING_AGENT',
 'CLOCK',
 'CLOTHES_HANGER',
 'COFFEE',
 'COMPUTER_ADD_ON',
 'COMPUTER_COMPONENT',
 'DAIRY_BASED_DRINK',
 'DESK',
 'DRINKING_CUP',
 'EARRING',
 'EDIBLE_OIL_VEGETABLE',
 'ELECTRONIC_ADAPTER',
 'FILE_FOLDER',
 'FINEEARRING',
 'FINENECKLACEBRACELETANKLET',
 'FINEOTHER',
 'FINERING',
 'FLAT_SCREEN_DISPLAY_MOUNT',
 'FLAT_SHEET',
 'FOOD_SERVICE_SUPPLY',
 'FURNITURE',
 'FURNITURE_COVER',
 'GROCERY',
 'HANDBAG',
 'HARDWARE',
 'HARDWARE_HANDLE',
 'HAT',
 'HEADBOARD',
 'HEADPHONES',
 'HEALTH_PERSONAL_CARE',
 'HERB',
 'HOME',
 'HOME_BED_AND_BATH',
 'HOME_FURNITURE_AND_DECOR',
 'HOME_LIGHTING_AND_LAMPS',
 'HOME_MIRROR',
 'INSTRUMENT_PARTS_AND_ACCESSORIES',
 'JANITORIAL_SUPPLY',
 'JA

In [8]:
# Map every class name to its integer id (its position in `labels`).
# Built with a comprehension so no loop variables are left behind in the notebook namespace.
label2id = {label: idx for idx, label in enumerate(labels)}

In [9]:
# Tokenizer matching the checkpoint that is fine-tuned below.
# NOTE(review): the sample row printed above is Dutch, while the checkpoint name suggests an
# uncased English vocabulary — confirm this checkpoint is the intended choice.
tokenizer = transformers.AutoTokenizer.from_pretrained("distilbert-base-uncased" )

In [10]:
?transformers.AutoTokenizer.from_pretrained

In [11]:
def tokenize_function(examples):
    """Tokenize the ``text`` column of a batch of examples.

    Args:
        examples: Batch mapping that contains a ``"text"`` entry.

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts, with
        truncation enabled and ``padding="max_length"``.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True, padding="max_length")
    return encoded

# Tokenize every split in batches. The "Loading cached processed dataset" messages below
# show that previously computed results are reused from the on-disk cache.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

Loading cached processed dataset at ../artifacts/dataset_processed/amazon/train/cache-b6d3f4749c461a4d.arrow


Loading cached processed dataset at ../artifacts/dataset_processed/amazon/test/cache-a5b316bc084fad9c.arrow


Loading cached processed dataset at ../artifacts/dataset_processed/amazon/valid/cache-a5b316bc084fad9c.arrow


In [12]:
# Maximum number of rows to keep per split. Defaults to the full training split;
# assign a smaller value (for example `subset = 1_000`) for a quick smoke run.
subset = tokenized_datasets["train"].num_rows


# Shuffle with a fixed seed so repeated runs see the rows in the same order.
train_dataset = tokenized_datasets["train"].shuffle(seed=42)
# NOTE(review): evaluation during training uses the "test" split although the dataset also
# ships a "valid" split, and training reloads the best checkpoint based on this evaluation —
# confirm that selecting the model on "test" is intended.
eval_dataset = tokenized_datasets["test"].shuffle(seed=42)


# Truncate a split whenever `subset` is smaller than it. Comparing against the actual row
# count (instead of a fixed 5_000 threshold) means intermediate values such as 10_000 are
# honoured, and `select` is never asked for more rows than a split contains.
if subset < train_dataset.num_rows:
    train_dataset = train_dataset.select(range(subset))
if subset < eval_dataset.num_rows:
    eval_dataset = eval_dataset.select(range(subset))

Loading cached shuffled indices for dataset at ../artifacts/dataset_processed/amazon/train/cache-15da75d5ff4b6a86.arrow


Loading cached shuffled indices for dataset at ../artifacts/dataset_processed/amazon/test/cache-50c3e3ce001a0445.arrow


In [13]:
# Two-way lookup between integer class ids and class names, both derived from `labels`.
id2label = dict(enumerate(labels))
label2id = {name: class_id for class_id, name in id2label.items()}

In [14]:
# Pretrained DistilBERT encoder with a sequence-classification head sized for our classes.
# The label mappings are stored in the model config, so they are saved with the model.
model = transformers.AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(labels),
    label2id=label2id,
    id2label=id2label,
)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'pre_classifier.bias', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:

# Training configuration. Checkpoints are written below the "test-trainer" directory.
training_args = transformers.TrainingArguments(
    "test-trainer",
    evaluation_strategy="epoch",    # evaluate once per epoch
    save_strategy="epoch",          # checkpoint on the same schedule as evaluation
    logging_steps=100,
    # NOTE(review): no `metric_for_best_model` is set. The training log shows the reloaded
    # checkpoint is chosen by validation loss (epoch 4), while F1 keeps improving in later
    # epochs — confirm that loss is the intended selection criterion.
    load_best_model_at_end=True,
    num_train_epochs=10,            # total number of training epochs
    # Disable external logging integrations explicitly; this is the replacement the library
    # recommends for the deprecated WANDB_DISABLED environment variable.
    report_to="none",
)

Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [16]:

# Load the four evaluation metrics in a fixed order: f1, accuracy, precision, recall.
metric_f1, metric_accuracy, metric_precision, metric_recall = (
    datasets.load_metric(metric_name)
    for metric_name in ('f1', 'accuracy', 'precision', 'recall')
)

In [17]:



def compute_metrics(eval_pred):
    """Turn raw model outputs into a dict of evaluation scores.

    Args:
        eval_pred: Pair ``(scores, references)``; the predicted class of each row
            is the argmax of ``scores`` along axis 1.

    Returns:
        One dict merging the results of the f1, accuracy, precision and recall
        metrics, in that order. All except accuracy use ``average='macro'``.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)

    macro = {'average': 'macro'}
    # (metric, extra keyword arguments) in the order the results are merged.
    scorers = (
        (metric_f1, macro),
        (metric_accuracy, {}),
        (metric_precision, macro),
        (metric_recall, macro),
    )

    results = {}
    for scorer, extra_kwargs in scorers:
        results.update(
            scorer.compute(predictions=predicted_ids, references=references, **extra_kwargs)
        )
    return results

In [18]:

# Bundle the model, training configuration, data splits and metric callback into one Trainer.
trainer = transformers.Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)




In [19]:
# Run fine-tuning. With the arguments above, evaluation and checkpointing happen after every
# epoch, and the best checkpoint is reloaded once training completes (see the log below).
trainer.train()

The following columns in the training set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: item_id, brand, text, node, main_image_id, item_name.


***** Running training *****


  Num examples = 83031


  Num Epochs = 10


  Instantaneous batch size per device = 8


  Total train batch size (w. parallel, distributed & accumulation) = 32


  Gradient Accumulation steps = 1


  Total optimization steps = 25950




Epoch,Training Loss,Validation Loss,F1,Accuracy,Precision,Recall
1,0.5215,0.496482,0.470413,0.863172,0.589029,0.464116
2,0.3524,0.381476,0.651389,0.894425,0.751316,0.628433
3,0.2318,0.347246,0.702064,0.90624,0.750567,0.68462
4,0.1735,0.346559,0.734746,0.91296,0.766703,0.72497
5,0.1083,0.367227,0.747245,0.915453,0.766736,0.741852
6,0.0815,0.396619,0.760536,0.919139,0.77658,0.752885
7,0.0749,0.421265,0.758352,0.918235,0.766364,0.760141
8,0.0465,0.439155,0.761308,0.919717,0.768705,0.760465
9,0.0393,0.461953,0.765855,0.920475,0.778437,0.761033
10,0.0323,0.472356,0.765323,0.920331,0.77864,0.758738


The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: item_id, brand, text, node, main_image_id, item_name.


***** Running Evaluation *****


  Num examples = 27677


  Batch size = 32


  _warn_prf(average, modifier, msg_start, len(result))


Saving model checkpoint to test-trainer/checkpoint-2595


Configuration saved in test-trainer/checkpoint-2595/config.json


Model weights saved in test-trainer/checkpoint-2595/pytorch_model.bin




The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: item_id, brand, text, node, main_image_id, item_name.


***** Running Evaluation *****


  Num examples = 27677


  Batch size = 32


  _warn_prf(average, modifier, msg_start, len(result))


Saving model checkpoint to test-trainer/checkpoint-5190


Configuration saved in test-trainer/checkpoint-5190/config.json


Model weights saved in test-trainer/checkpoint-5190/pytorch_model.bin




The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: item_id, brand, text, node, main_image_id, item_name.


***** Running Evaluation *****


  Num examples = 27677


  Batch size = 32


  _warn_prf(average, modifier, msg_start, len(result))


Saving model checkpoint to test-trainer/checkpoint-7785


Configuration saved in test-trainer/checkpoint-7785/config.json


Model weights saved in test-trainer/checkpoint-7785/pytorch_model.bin




The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: item_id, brand, text, node, main_image_id, item_name.


***** Running Evaluation *****


  Num examples = 27677


  Batch size = 32


Saving model checkpoint to test-trainer/checkpoint-10380


Configuration saved in test-trainer/checkpoint-10380/config.json


Model weights saved in test-trainer/checkpoint-10380/pytorch_model.bin




The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: item_id, brand, text, node, main_image_id, item_name.


***** Running Evaluation *****


  Num examples = 27677


  Batch size = 32


Saving model checkpoint to test-trainer/checkpoint-12975


Configuration saved in test-trainer/checkpoint-12975/config.json


Model weights saved in test-trainer/checkpoint-12975/pytorch_model.bin




The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: item_id, brand, text, node, main_image_id, item_name.


***** Running Evaluation *****


  Num examples = 27677


  Batch size = 32


Saving model checkpoint to test-trainer/checkpoint-15570


Configuration saved in test-trainer/checkpoint-15570/config.json


Model weights saved in test-trainer/checkpoint-15570/pytorch_model.bin




The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: item_id, brand, text, node, main_image_id, item_name.


***** Running Evaluation *****


  Num examples = 27677


  Batch size = 32


Saving model checkpoint to test-trainer/checkpoint-18165


Configuration saved in test-trainer/checkpoint-18165/config.json


Model weights saved in test-trainer/checkpoint-18165/pytorch_model.bin




The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: item_id, brand, text, node, main_image_id, item_name.


***** Running Evaluation *****


  Num examples = 27677


  Batch size = 32


Saving model checkpoint to test-trainer/checkpoint-20760


Configuration saved in test-trainer/checkpoint-20760/config.json


Model weights saved in test-trainer/checkpoint-20760/pytorch_model.bin




The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: item_id, brand, text, node, main_image_id, item_name.


***** Running Evaluation *****


  Num examples = 27677


  Batch size = 32


Saving model checkpoint to test-trainer/checkpoint-23355


Configuration saved in test-trainer/checkpoint-23355/config.json


Model weights saved in test-trainer/checkpoint-23355/pytorch_model.bin




The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: item_id, brand, text, node, main_image_id, item_name.


***** Running Evaluation *****


  Num examples = 27677


  Batch size = 32


Saving model checkpoint to test-trainer/checkpoint-25950


Configuration saved in test-trainer/checkpoint-25950/config.json


Model weights saved in test-trainer/checkpoint-25950/pytorch_model.bin




Training completed. Do not forget to share your model on huggingface.co/models =)




Loading best model from test-trainer/checkpoint-10380 (score: 0.3465588390827179).


TrainOutput(global_step=25950, training_loss=0.20476344028649301, metrics={'train_runtime': 14713.3535, 'train_samples_per_second': 56.432, 'train_steps_per_second': 1.764, 'total_flos': 1.1020477053321216e+17, 'train_loss': 0.20476344028649301, 'epoch': 10.0})

In [20]:
# Evaluate once more on `eval_dataset`. The eval_loss reported below equals the score of the
# checkpoint reloaded at the end of training, not the loss of the final epoch.
trainer.evaluate()

The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: item_id, brand, text, node, main_image_id, item_name.


***** Running Evaluation *****


  Num examples = 27677


  Batch size = 32




{'eval_loss': 0.3465588390827179,
 'eval_f1': 0.7347462286125184,
 'eval_accuracy': 0.9129602196769881,
 'eval_precision': 0.7667029797387779,
 'eval_recall': 0.7249703024822686,
 'eval_runtime': 164.1857,
 'eval_samples_per_second': 168.571,
 'eval_steps_per_second': 5.268,
 'epoch': 10.0}

In [21]:
tokenizer.encode_plus('men shoes', return_token_type_ids = True, return_attention_mask=True)

{'input_ids': [101, 2273, 6007, 102], 'token_type_ids': [0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1]}

In [22]:
tokenizer.convert_ids_to_tokens([101, 2273, 6007, 102])

['[CLS]', 'men', 'shoes', '[SEP]']

# Saving artifacts

In [23]:
?model.save_pretrained

In [24]:
# Target directory for the exported model and tokenizer files.
model_dir ='../artifacts/model/amazon/'

In [25]:
# Export the trainer's current model; the log below shows config.json and
# pytorch_model.bin being written to `model_dir`.
trainer.save_model(model_dir)

Saving model checkpoint to ../artifacts/model/amazon/


Configuration saved in ../artifacts/model/amazon/config.json


Model weights saved in ../artifacts/model/amazon/pytorch_model.bin


In [26]:
# Store the tokenizer files next to the model so both can be reloaded from `model_dir`.
tokenizer.save_pretrained(model_dir)

tokenizer config file saved in ../artifacts/model/amazon/tokenizer_config.json


Special tokens file saved in ../artifacts/model/amazon/special_tokens_map.json


('../artifacts/model/amazon/tokenizer_config.json',
 '../artifacts/model/amazon/special_tokens_map.json',
 '../artifacts/model/amazon/vocab.txt',
 '../artifacts/model/amazon/added_tokens.json',
 '../artifacts/model/amazon/tokenizer.json')

In [27]:
# Reload the fine-tuned classifier from the saved artifacts alone. The number of classes and
# the id<->label mapping are read from the saved config.json, so this cell does not depend
# on the in-memory `labels` list created during training.
model2 = transformers.AutoModelForSequenceClassification.from_pretrained(model_dir)

loading configuration file ../artifacts/model/amazon/config.json


Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "ABIS_DRUGSTORE",
    "1": "ABIS_LAWN_AND_GARDEN",
    "2": "ACCESSORY",
    "3": "ACCESSORY_OR_PART_OR_SUPPLY",
    "4": "AUTO_ACCESSORY",
    "5": "BABY_PRODUCT",
    "6": "BACKPACK",
    "7": "BATTERY",
    "8": "BEAUTY",
    "9": "BED",
    "10": "BED_FRAME",
    "11": "BENCH",
    "12": "BISS",
    "13": "BOOT",
    "14": "BRACELET",
    "15": "BREAD",
    "16": "CABINET",
    "17": "CELLULAR_PHONE_CASE",
    "18": "CHAIR",
    "19": "CHARGING_ADAPTER",
    "20": "CLEANING_AGENT",
    "21": "CLOCK",
    "22": "CLOTHES_HANGER",
    "23": "COFFEE",
    "24": "COMPUTER_ADD_ON",
    "25": "COMPUTER_COMPONENT",
    "26": "DAIRY_BASED_DRINK",
    "27": "DESK",
    "28": "DRINKING_CUP",
    "29": "EARRING",
    "30"

loading weights file ../artifacts/model/amazon/pytorch_model.bin


All model checkpoint weights were used when initializing DistilBertForSequenceClassification.



All the weights of DistilBertForSequenceClassification were initialized from the model checkpoint at ../artifacts/model/amazon/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use DistilBertForSequenceClassification for predictions without further training.


# Loading artifacts

In [28]:
# Reload the tokenizer from the same directory the model was exported to.
tokenizer2 = transformers.AutoTokenizer.from_pretrained(model_dir)

Didn't find file ../artifacts/model/amazon/added_tokens.json. We won't load it.


loading file ../artifacts/model/amazon/vocab.txt


loading file ../artifacts/model/amazon/tokenizer.json


loading file None


loading file ../artifacts/model/amazon/special_tokens_map.json


loading file ../artifacts/model/amazon/tokenizer_config.json


# Predicting on a new example

In [29]:
query = 'comfortable men sandals'

In [30]:
tokenizer2(query)

{'input_ids': [101, 6625, 2273, 24617, 102], 'attention_mask': [1, 1, 1, 1, 1]}

In [31]:
# Encode the query as PyTorch tensors, using the same call form as the previous cell.
res = tokenizer2(query, return_tensors="pt")

In [32]:
res

{'input_ids': tensor([[  101,  6625,  2273, 24617,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1]])}

In [33]:
# Pure inference: disable gradient tracking so no autograd graph is built for the logits.
with torch.no_grad():
    model_res = model2(**res)
model_res

SequenceClassifierOutput(loss=None, logits=tensor([[ -7.9003,  -6.6868,  -0.9343,  -7.9616,  -7.8769,  -5.0596,  -8.1236,
          -8.5895,  -4.5717,  -8.4333, -11.7893,  -9.8893,  -5.9735,  -6.6080,
          -4.9853,  -5.2587,  -7.8393, -11.2992,  -8.1207,  -9.6255,  -9.3650,
          -8.2931,  -9.9519,  -5.1105,  -9.1959,  -7.9620, -10.6792, -11.4846,
          -9.5176,  -6.0061,  -7.9634,  -9.4303, -12.8301,  -6.8259,  -4.4912,
          -3.0508,  -9.6358,  -8.9353, -10.5833,  -9.1380,  -8.0518,  -8.2076,
          -0.8344,  -5.9430,  -9.0120, -10.7401,  -1.5854,  -6.4099,  -8.6923,
          -3.4950,  -3.7238,  -6.2071,  -7.4372,  -6.5529,  -5.5914,  -9.1043,
         -10.2239,  -8.4656, -12.2046,  -8.5575, -14.1078,  -5.0134,  -4.5873,
         -10.0449,  -8.2942,  -7.1035,  -5.0914,  -2.6161,  -5.9385, -13.9249,
          -8.8573, -11.1525,  -4.6429,  -3.9677,  -4.8164,  -9.4566,  -8.6195,
          -9.6696, -12.3765,  -6.4477,  -7.5252,  -6.8860,  -6.3124,  -0.8204,
         

In [34]:
model_res[0]

tensor([[ -7.9003,  -6.6868,  -0.9343,  -7.9616,  -7.8769,  -5.0596,  -8.1236,
          -8.5895,  -4.5717,  -8.4333, -11.7893,  -9.8893,  -5.9735,  -6.6080,
          -4.9853,  -5.2587,  -7.8393, -11.2992,  -8.1207,  -9.6255,  -9.3650,
          -8.2931,  -9.9519,  -5.1105,  -9.1959,  -7.9620, -10.6792, -11.4846,
          -9.5176,  -6.0061,  -7.9634,  -9.4303, -12.8301,  -6.8259,  -4.4912,
          -3.0508,  -9.6358,  -8.9353, -10.5833,  -9.1380,  -8.0518,  -8.2076,
          -0.8344,  -5.9430,  -9.0120, -10.7401,  -1.5854,  -6.4099,  -8.6923,
          -3.4950,  -3.7238,  -6.2071,  -7.4372,  -6.5529,  -5.5914,  -9.1043,
         -10.2239,  -8.4656, -12.2046,  -8.5575, -14.1078,  -5.0134,  -4.5873,
         -10.0449,  -8.2942,  -7.1035,  -5.0914,  -2.6161,  -5.9385, -13.9249,
          -8.8573, -11.1525,  -4.6429,  -3.9677,  -4.8164,  -9.4566,  -8.6195,
          -9.6696, -12.3765,  -6.4477,  -7.5252,  -6.8860,  -6.3124,  -0.8204,
          -7.1735,  -7.8208,  -9.4677,  -8.2990,  -4

In [35]:
# Pair every class name with its softmax probability for the single query,
# then rank the pairs from most to least likely.
predictions = sorted(
    zip(labels, torch.softmax(model_res.logits, dim=1)[0].tolist()),
    key=lambda pair: pair[1],
    reverse=True,
)

In [36]:
predictions

[('SANDAL', 0.2054256796836853),
 ('GROCERY', 0.2025657445192337),
 ('ACCESSORY', 0.1833166927099228),
 ('SPORTING_GOODS', 0.14686335623264313),
 ('HAT', 0.0955934226512909),
 ('NECKLACE', 0.034101832658052444),
 ('FINEOTHER', 0.02208072319626808),
 ('HEALTH_PERSONAL_CARE', 0.014161545783281326),
 ('HERB', 0.011264330707490444),
 ('PANTRY', 0.008826467208564281),
 ('SHOES', 0.005528533831238747),
 ('FINENECKLACEBRACELETANKLET', 0.005229537840932608),
 ('BEAUTY', 0.0048246257938444614),
 ('LEGUME', 0.00475027970969677),
 ('OUTDOOR_LIVING', 0.00449340371415019),
 ('PET_SUPPLIES', 0.0037776813842356205),
 ('BRACELET', 0.003190439660102129),
 ('LAMP', 0.003102256450802088),
 ('BABY_PRODUCT', 0.002961968770250678),
 ('MEDICATION', 0.00286941509693861),
 ('COFFEE', 0.0028150901198387146),
 ('TEA', 0.002760168630629778),
 ('BREAD', 0.0024273807648569345),
 ('TABLE', 0.0020274301059544086),
 ('WALL_ART', 0.002011240227147937),
 ('HOME_LIGHTING_AND_LAMPS', 0.0017404173268005252),
 ('WASTE_BAG',

# FIN 

<a id='additional-resources'></a>

## Additional resources

To look at more fine-tuning examples you can refer to:

- [🤗 Transformers Examples](https://github.com/huggingface/transformers/tree/master/examples) which includes scripts
  to train on all common NLP tasks in PyTorch and TensorFlow.

- [🤗 Transformers Notebooks](https://huggingface.co/transformers/notebooks.html) which contains various notebooks and in particular one per task (look
  for the *how to finetune a model on xxx*).