In [1]:
%%capture
! pip install transformers datasets
# versions: https://pypi.org/project/transformers/

Read more about models and developers:

- Czert https://huggingface.co/UWB-AIR/Czert-B-base-cased
- Slavic-BERT https://huggingface.co/DeepPavlov/bert-base-bg-cs-pl-ru-cased
- cst5 https://huggingface.co/azizbarank/cst5-base

*Why only HuggingFace?* It's totally OK to use models hosted elsewhere (your own site, spaCy, or TFHub), but during a workshop it's easier to stick to one source.

## Understanding the tokenizer

The cells below show how this tokenizer breaks text into words (or sub-words).

Replace `bert-base-cased` in the next cell with the model you chose.

In [None]:
from transformers import AutoTokenizer

# Hub id of the checkpoint whose vocabulary is explored in the cells below.
tokenizer_checkpoint = 'bert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

In [None]:
# Recognizing words: check whether each word of a simple English sentence
# comes back as a single token.
tokenizer.tokenize('Hello my name is Nick.')

In [None]:
# Larger and less common words are split into several sub-word tokens
# (look at what happens to 'kolaches').
tokenizer.tokenize('At the bakery I bought kolaches.')

In [None]:
# Recognizing words (Czech): see how many pieces the Czech words, including
# those with diacritics, are broken into.
tokenizer.tokenize('Arthur koupil koláč s jablečnou náplní.')

In [None]:
# Calling the tokenizer directly (instead of .tokenize) encodes the text into
# token IDs, which the model will turn into vectors / embeddings.
tokenizer('Ahoj Artur.')

In [None]:
# Splitting up a word.
# NOTE(review): '?' looks like a placeholder left for the reader — replace it
# with a word you expect to be split into several tokens.
tokenizer.tokenize('?')

['?']

In [None]:
# Is it significant who has 1-token names in our model?
# NOTE(review): '?' looks like a placeholder left for the reader — replace it
# with a name and check how many tokens it produces.
tokenizer.tokenize('?')

['?']

## Dataset

In [None]:
import pandas as pd
import xml.etree.ElementTree as ET

# Review corpus; every child of the XML root is read for its `origRating`
# attribute and its <text> element.
REVIEWS_PATH = './csfd-90k-reviews-ranlp2013.xml'

# ET.parse opens and closes the file itself and lets the XML parser decode the
# bytes, so the Czech diacritics do not depend on the platform's default text
# encoding (which a bare open(..., 'r') would use) and no file handle is leaked.
root = ET.parse(REVIEWS_PATH).getroot()

# One [text, rating] row per review; the rating attribute is stored as a string
# in the XML, so convert it to int here.
rows = [
  [child.findtext('text'), int(child.attrib['origRating'])]
  for child in root
]

all_data = pd.DataFrame(rows, columns=['text', 'rating'])
all_data.head()

In [None]:
# Number of reviews loaded into the DataFrame.
len(all_data)

92398

Unfortunately, training on the full dataset takes ~2 hours per epoch even after the train-test split, and the workshop has limited time (and a Colab session only stays open for 12-24 hours), so we train on a smaller random sample.

In [None]:
# Ratings are cast to float so they can serve as regression targets.
all_data['label'] = all_data['rating'].astype('float')

# Work on a fixed-seed subset to keep training time workshop-friendly.
sample_data = all_data.sample(10_000, random_state=101)

# Hold out part of the sample for evaluation. Later cells read `train_df` and
# `test_df`, so they must be defined here for the notebook to run top to bottom.
TEST_FRACTION = 0.25
test_df = sample_data.sample(frac=TEST_FRACTION, random_state=101)
train_df = sample_data.drop(test_df.index)

# Discard the original row positions so they are not carried along as an extra
# index column when the frames are converted to other formats.
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

Check which classes (distinct label values) are present:

In [None]:
# Distinct label values present in the training split.
set(train_df.label.values)

{-1.0, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0}

Using HuggingFace Datasets format (from DataFrame)

In [None]:
from datasets import Dataset

def tokenize_function(examples):
  """Tokenize the "text" field of a batch, padding/truncating to 120 tokens."""
  encoding_options = dict(padding="max_length", truncation=True, max_length=120)
  return tokenizer(examples["text"], **encoding_options)

def _tokenized_dataset(frame):
  """Convert a DataFrame to a HuggingFace Dataset and tokenize it in batches."""
  return Dataset.from_pandas(frame).map(tokenize_function, batched=True)

train_dset = _tokenized_dataset(train_df)
test_dset = _tokenized_dataset(test_df)

## Downloading the model

In [None]:
from transformers import AutoModelForSequenceClassification

# Exactly one `num_labels` argument may be passed. Regression is the active
# choice here because the `label` column holds float ratings and the prediction
# cells read a single value per example; the classification alternative is kept
# as a comment.
model = AutoModelForSequenceClassification.from_pretrained(
    'bert-base-cased',

    # for regression: a single output value per example
    num_labels=1,
    ignore_mismatched_sizes=True,

    # for classification: use this INSTEAD of `num_labels=1` above
    # num_labels=len(set(train_df.label.values)),
    # NOTE(review): classification presumably also needs integer class ids
    # instead of the float ratings — confirm before switching.
)

## Fine-tuning

HuggingFace has some info about freezing layers in the model, if that sounds interesting https://discuss.huggingface.co/t/freeze-lower-layers-with-auto-classification-model/11386

In [None]:
from transformers import Trainer, TrainingArguments

# Two passes over the training data; evaluate and save a checkpoint under
# ./outputs once per epoch.
# NOTE(review): newer transformers releases appear to name the evaluation
# option `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    num_train_epochs=2,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    output_dir="./outputs",
)

# Trainer reference: https://huggingface.co/docs/transformers/main_classes/trainer
# Optional extras to experiment with (not enabled here):
#   compute_metrics=<function taking eval_pred and returning e.g. {"rmse": ...}>
#   optimizers=<(optimizer, learning-rate scheduler) pair>
trainer_options = dict(
    model=model,
    args=training_args,
    train_dataset=train_dset,
    eval_dataset=test_dset,
)
trainer = Trainer(**trainer_options)

In [None]:
# Run fine-tuning with the settings in `training_args`.
trainer.train()

## Predictions and Evaluation

In [None]:
# Run the fine-tuned model over the held-out test set.
predictions = trainer.predict(test_dset)

In [None]:
# Raw model outputs, one row per test example.
predictions.predictions

In [None]:
# Classification variant — take the index of the highest score in each row:
#   best = [p.argmax() for p in predictions.predictions]

# Regression variant — take the first value of each prediction row
# (with num_labels=1 that is the only value).
best = [row[0] for row in predictions.predictions]
best[:10]

## Exporting

In [None]:
# Downloading model
! ls outputs/*

In [None]:
import shutil
from transformers.trainer_utils import get_last_checkpoint

# Checkpoint folders are named after the global training step, which changes
# with the sample size, batch size and number of epochs — so look up the newest
# one instead of hard-coding a step number.
last_checkpoint = get_last_checkpoint('outputs')
if last_checkpoint is None:
  raise FileNotFoundError('No checkpoint-* folder found in outputs/ - run trainer.train() first.')

# Assumes Google Drive is already mounted at ./drive — TODO confirm.
shutil.copytree(last_checkpoint, './drive/MyDrive/czech-movie-rating-model', dirs_exist_ok=True)

Guide to sharing a model on the HuggingFace Hub: https://huggingface.co/docs/transformers/model_sharing

In [None]:
# Log in to the HuggingFace Hub from inside the notebook; needed before the
# push_to_hub calls in the next cell.
from huggingface_hub import notebook_login
notebook_login()

In [None]:
# Upload the tokenizer first, then the fine-tuned model, to the same Hub repo
# so they can be loaded together.
hub_repo_name = "czech-movie-rating"
for artifact in (tokenizer, trainer.model):
  artifact.push_to_hub(hub_repo_name)