In [None]:
%%capture
! pip install simpletransformers==0.63.6
# %%capture (first line of the cell) keeps the installer's output out of the notebook.
# The version is pinned to 0.63.6, the release current as of 7 May 2022.
# All released versions: https://pypi.org/project/simpletransformers/

Read more about models and developers:

- Czert https://huggingface.co/UWB-AIR
- Slavic-BERT https://huggingface.co/DeepPavlov/bert-base-bg-cs-pl-ru-cased
- cst5 https://huggingface.co/azizbarank/cst5-base

*Why only HuggingFace?* It's totally OK to use models hosted on your own site, on spaCy, or on TF Hub. During a workshop it's simply easier to stick to one source.

## Understanding the tokenizer

In this section we look at how the tokenizer breaks text into words (or sub-words).

Change the checkpoint name in the cell below to your model.

In [None]:
from transformers import AutoTokenizer

# Checkpoint whose tokenizer is explored in the next cells; swap in the
# name of the model you chose.
checkpoint = 'bert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
# Recognizing words: tokenize() returns the list of (sub-)word pieces the
# tokenizer splits the text into; continuation pieces are shown with a
# leading '##' in the output below.
tokenizer.tokenize('Ahoj Artur')

['Ah', '##o', '##j', 'Art', '##ur']

In [None]:
# Encoding into IDs (which the model will turn into vectors / embeddings).
# Calling the tokenizer returns input_ids, token_type_ids and attention_mask.
# NOTE(review): the empty string looks like a placeholder; with it only two IDs
# come back — presumably the special start/end tokens. Put your own text here.
tokenizer('')

{'input_ids': [101, 102], 'token_type_ids': [0, 0], 'attention_mask': [1, 1]}

In [None]:
# Splitting up a word
# NOTE(review): '?' looks like a placeholder — replace it with a word you
# want to see broken into sub-word pieces.
tokenizer.tokenize('?')

['?']

In [None]:
# Is it significant who has 1-token names in our model?
# NOTE(review): '?' looks like a placeholder — try several names and compare
# how many tokens each one is split into.
tokenizer.tokenize('?')

['?']

## Dataset

In [None]:
# Unpack the dataset archive (-x extract, -v list extracted files, -f archive).
# NOTE(review): assumes exactly one *.bz2 file is in the working directory; no
# download step is visible here, so fetch/upload the archive first.
! tar -xvf *.bz2

csfd-90k-reviews-ranlp2013.xml


In [None]:
import pandas as pd
import xml.etree.ElementTree as ET

# Let ElementTree open and decode the file itself: the handle is closed once
# parsing finishes, and the bytes are decoded by the XML parser instead of
# with whatever the platform's default text encoding happens to be.
root = ET.parse('./csfd-90k-reviews-ranlp2013.xml').getroot()

# One row per child element of the root: the content of its <text> element
# and its origRating attribute converted to an integer.
rows = [
    [review.findtext('text'), int(review.attrib['origRating'])]
    for review in root
]

all_data = pd.DataFrame(rows, columns=['text', 'rating'])
all_data.head()

In [None]:
# Number of rows in all_data (one per parsed XML child element)
len(all_data)

Unfortunately, training on the full dataset takes ~2 hours per epoch even after the train/test split, and the workshop has limited time (and a Colab session only stays open for 12–24 hours).

In [None]:
# Number of reviews to keep for the workshop run. Set to None to use every
# review (slow: roughly 2 hours per epoch according to the note above).
# Previously the sample was drawn and then immediately overwritten with the
# full dataset, so the subsampling never took effect.
N_SAMPLES = 8_000

if N_SAMPLES is None:
    sample_data = all_data
else:
    # Fixed random_state so every run draws the same subset.
    sample_data = all_data.sample(N_SAMPLES, random_state=101)

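**train/test split** — split `sample_data` into `train_df` and `test_df` here; the training and prediction cells below rely on those two names.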

## Reproducibility

In [None]:
import random
import torch

# Single seed value shared by every generator seeded in this cell.
seed = 1

# Seed Python's and PyTorch's random number generators.
# NumPy is not seeded here (np.random.seed(seed) is left commented out).
for seed_fn in (random.seed, torch.manual_seed):
    seed_fn(seed)

# Make cuDNN pick deterministic algorithms and switch off its benchmark mode.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

## Downloading and fine-tuning the model

SimpleTransformers on freezing layers: https://simpletransformers.ai/docs/tips-and-tricks/#train-custom-parameters-only

In [None]:
from simpletransformers.classification import ClassificationModel

# Choose how the rating is modelled:
#   True  -> regression (a single numeric output)
#   False -> classification (one output per distinct rating value)
# The previous version of this cell listed both variants inside one call
# (duplicate keyword arguments, a missing comma and a `?` placeholder), which
# is a syntax error; the flag keeps both options but makes the cell runnable.
USE_REGRESSION = True

# Settings passed to simpletransformers; identical for both tasks except for
# the 'regression' switch.
model_args = {
    'regression': USE_REGRESSION,
    'reprocess_input_data': True,
    'use_cached_eval_features': False,
    'overwrite_output_dir': True,
    'num_train_epochs': 1,
}

if USE_REGRESSION:
    num_labels = 1
else:
    # One label per distinct rating found in the data.
    # NOTE(review): assumes the ratings are the integers 0..n-1 with no gaps,
    # as classification labels usually must be — confirm against the dataset.
    num_labels = all_data['rating'].nunique()

model = ClassificationModel(
    'bert',
    'bert-base-uncased',  # placeholder checkpoint: change it to your model
    use_cuda=True,  # GPU
    num_labels=num_labels,
    args=model_args,
)

In [None]:
# Fine-tune the model on the training split.
# NOTE(review): train_df is expected to come from the train/test split step;
# no cell visible here defines it — create it before running this cell.
model.train_model(train_df)

## Predicting / evaluating

In [None]:
# Evaluation is left disabled: this line only really works on classification.
# Uncomment it when the model above is set up as a classifier.
# result, model_outputs, wrong_predictions = model.eval_model(test_df)

In [None]:
# Run the model over the review texts in test_df.
test_texts = test_df['text'].tolist()
predictions, raw_outputs = model.predict(test_texts)

# Show the first ten predictions.
predictions[:10]

## Exporting

In [None]:
# List the files in outputs/ so the model files can be downloaded.
# NOTE(review): presumably outputs/ is where training saved the model — confirm.
! ls outputs/*.*

In [None]:
# Log in to the Hugging Face Hub from inside the notebook; presumably required
# before the push_to_hub calls in the next cell.
# See https://huggingface.co/docs/transformers/model_sharing
from huggingface_hub import notebook_login
notebook_login()

In [None]:
# Upload the tokenizer and the fine-tuned weights to the same Hub repository.
# Push the tokenizer that belongs to the trained model (model.tokenizer), not
# the standalone `tokenizer` from the exploration cells: that one was loaded
# from a different checkpoint name, so publishing it next to these weights
# would ship a vocabulary the model was not trained with.
HUB_REPO = "czech-movie-rating"
model.tokenizer.push_to_hub(HUB_REPO)
model.model.push_to_hub(HUB_REPO)