In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [2]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from datasets import Dataset
from datasets import load_metric

# Transformers Trainer Hello World
This notebook shows a minimal example of fine tuning a huggingface model for text classification using the Trainer class from the transformers library.

## The Dataset
We generate a toy datast as a pandas dataframe containing the columns 'text' and 'labels' (don't forget the s). The labels must be encoded by integers, therefore we also generate mappings to translate from int to str and viceversa.

In [4]:
positives = [['the food was delicious','pos'],
             ['the steak was delicious, best I have ever had','pos'],
             ['the chicken lasagna was delicious','pos'],
             ['everything was delicious','pos'],
             ['the vegan options were very convenient and delicious','pos']]
negatives = [['the food was terrible','neg'],
             ['most terrible ribs I have tasted in my life','neg'],
             ['the high prices do not match the terrible food','neg'],
             ['we had to wait a long time and the food was terrible','neg'],
             ['the lack of vegan or vegetarian options is terrible','neg']]
all = positives + negatives
df = pd.DataFrame(all,columns=['text','labels'])
df

Unnamed: 0,text,labels
0,the food was delicious,pos
1,"the steak was delicious, best I have ever had",pos
2,the chicken lasagna was delicious,pos
3,everything was delicious,pos
4,the vegan options were very convenient and del...,pos
5,the food was terrible,neg
6,most terrible ribs I have tasted in my life,neg
7,the high prices do not match the terrible food,neg
8,we had to wait a long time and the food was te...,neg
9,the lack of vegan or vegetarian options is ter...,neg


Encode the labels as integers:

In [5]:
le = LabelEncoder()
df['labels'] = le.fit_transform(df['labels'])
id2label = {i : le.classes_[i] for i in range(2)}
label2id = {label:code for code,label in id2label.items()}

In [6]:
df

Unnamed: 0,text,labels
0,the food was delicious,1
1,"the steak was delicious, best I have ever had",1
2,the chicken lasagna was delicious,1
3,everything was delicious,1
4,the vegan options were very convenient and del...,1
5,the food was terrible,0
6,most terrible ribs I have tasted in my life,0
7,the high prices do not match the terrible food,0
8,we had to wait a long time and the food was te...,0
9,the lack of vegan or vegetarian options is ter...,0


Split in train and test sets:

In [7]:
X = df['text']
y = df['labels']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=35)

In [8]:
train = pd.DataFrame(columns=['text','labels'])
test = pd.DataFrame(columns=['text','labels'])
train['text']=X_train
train['labels']=y_train
test['text']=X_test
test['labels']=y_test

# Define the tokenizer:
Choose a model from the huggingface hub https://huggingface.co/models?sort=downloads

In [9]:
model_ckpt = 'microsoft/MiniLM-L12-H384-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

# Convert to a 'datasets' library dataset and tokenize the data:
The trainer class is optimized to work with this type of dataset. In particular this prevents ram shortage issues.

In [10]:
train = Dataset.from_pandas(train)
test = Dataset.from_pandas(test)

In [11]:
def tokenize_function(examples):
    return tokenizer(examples["text"], max_length=64, padding="max_length", truncation=True)


train_tokenized = train.map(tokenize_function, batched=True)
test_tokenized = test.map(tokenize_function, batched=True)

100%|██████████| 1/1 [00:00<00:00, 63.65ba/s]
100%|██████████| 1/1 [00:00<00:00, 247.82ba/s]


In [12]:
train_tokenized

Dataset({
    features: ['text', 'labels', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 8
})

# Now import the model to fine tune and training utilities:

In [13]:
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt,
                                                           num_labels=2,
                                                           id2label=id2label,
                                                           label2id=label2id)

Downloading: 100%|██████████| 127M/127M [00:43<00:00, 3.09MB/s] 
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/MiniLM-L12-H384-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### We need a compute_metrics function to pass as an argument to the trainer

In [14]:
metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

Downloading builder script: 4.21kB [00:00, 1.58MB/s]                   


In [15]:
args = TrainingArguments(
    output_dir="nlp/transformers_model",
    evaluation_strategy="epoch",
    num_train_epochs=3,
)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_tokenized,
    eval_dataset=test_tokenized,
    compute_metrics=compute_metrics
)

In [16]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 8
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 3
 33%|███▎      | 1/3 [00:01<00:02,  1.00s/it]The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2
  Batch size = 8
                                             
 33%|███▎

{'eval_loss': 0.6923680305480957, 'eval_accuracy': 0.5, 'eval_runtime': 0.099, 'eval_samples_per_second': 20.21, 'eval_steps_per_second': 10.105, 'epoch': 1.0}


 67%|██████▋   | 2/3 [00:02<00:01,  1.02s/it]The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2
  Batch size = 8
                                             
 67%|██████▋   | 2/3 [00:02<00:01,  1.02s/it]

{'eval_loss': 0.6917973160743713, 'eval_accuracy': 0.5, 'eval_runtime': 0.0866, 'eval_samples_per_second': 23.088, 'eval_steps_per_second': 11.544, 'epoch': 2.0}


100%|██████████| 3/3 [00:03<00:00,  1.00it/s]The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2
  Batch size = 8
                                             
100%|██████████| 3/3 [00:03<00:00,  1.00it/s]

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 3/3 [00:03<00:00,  1.04s/it]

{'eval_loss': 0.6914076209068298, 'eval_accuracy': 0.5, 'eval_runtime': 0.0859, 'eval_samples_per_second': 23.281, 'eval_steps_per_second': 11.64, 'epoch': 3.0}
{'train_runtime': 3.1093, 'train_samples_per_second': 7.719, 'train_steps_per_second': 0.965, 'train_loss': 0.6922508875528971, 'epoch': 3.0}





TrainOutput(global_step=3, training_loss=0.6922508875528971, metrics={'train_runtime': 3.1093, 'train_samples_per_second': 7.719, 'train_steps_per_second': 0.965, 'train_loss': 0.6922508875528971, 'epoch': 3.0})

# Inference:
Create a pipeline object, specify the task, pass the trained model path and use the same tokenizer used for training (with the same kwargs).

In [17]:
pipe = pipeline('text-classification',model='/content/drive/MyDrive/transformers/toy_model',tokenizer = tokenizer)

tokenizer_kwargs = {'padding':True,'truncation':True,'max_length':64}

OSError: We couldn't connect to 'https://huggingface.co' to load this model, couldn't find it in the cached files and it looks like /content/drive/MyDrive/transformers/toy_model is not the path to a directory containing a config.json file.
Checkout your internet connection or see how to run the library in offline mode at 'https://huggingface.co/docs/transformers/installation#offline-mode'.

In [None]:
test_text = test['text'][0]

In [None]:
test_text

'worst ribs I have tasted in my life'

In [None]:
pipe(test_text,**tokenizer_kwargs)

[{'label': 'LABEL_1', 'score': 0.9832837581634521}]