In [1]:
%%capture
! pip install transformers datasets
# versions: https://pypi.org/project/transformers/

Read more about models and developers:

- Czert https://huggingface.co/UWB-AIR/Czert-B-base-cased
- Slavic-BERT https://huggingface.co/DeepPavlov/bert-base-bg-cs-pl-ru-cased
- cst5 https://huggingface.co/azizbarank/cst5-base

*Why only HuggingFace?* It's totally OK to use models hosted on your own site, spaCy, or TFHub. During a workshop it's simply easier to stick to one source.

## Understanding the tokenizer

First, see how this tokenizer splits text into words (or sub-words).

Then change it to your model.

In [6]:
from transformers import AutoTokenizer
import os
# NOTE(review): blanking CURL_CA_BUNDLE is a common workaround that bypasses TLS
# certificate checks on downloads — confirm it is really needed here, and do not
# carry it over to code that runs outside the workshop.
os.environ["CURL_CA_BUNDLE"]= ""
# Load the tokenizer of the chosen checkpoint; swap the name to try another model.
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')



In [7]:
# Recognizing words: each common English word comes back as a single token
tokenizer.tokenize('Hello my name is Nick.')

['Hello', 'my', 'name', 'is', 'Nick', '.']

In [8]:
# Larger and less common words are split into several sub-word tokens;
# continuation pieces carry a '##' prefix
tokenizer.tokenize('At the bakery I bought kolaches.')

['At', 'the', 'bakery', 'I', 'bought', 'k', '##ola', '##ches', '.']

In [9]:
# Recognizing words (Czech): with this tokenizer most Czech words are broken
# into many short sub-word pieces instead of being kept whole
tokenizer.tokenize('Arthur koupil koláč s jablečnou náplní.')

['Arthur',
 'k',
 '##ou',
 '##pi',
 '##l',
 'k',
 '##ol',
 '##á',
 '##č',
 's',
 'j',
 '##able',
 '##č',
 '##no',
 '##u',
 'n',
 '##á',
 '##p',
 '##ln',
 '##í',
 '.']

In [10]:
# Encoding into IDs (which the model will turn into vectors / embeddings).
# Calling the tokenizer directly returns input_ids together with
# token_type_ids and attention_mask.
tokenizer('Ahoj Artur.')

{'input_ids': [101, 7066, 1186, 3361, 2051, 2149, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}

In [11]:
# Splitting up a word
# NOTE(review): '?' looks like a fill-in placeholder — replace it with a word to inspect.
tokenizer.tokenize('?')

['?']

In [12]:
# Is it significant who has 1-token names in our model?
# NOTE(review): '?' looks like a fill-in placeholder — replace it with a name to inspect.
tokenizer.tokenize('?')

['?']

## Dataset

In [23]:
import pandas as pd
import xml.etree.ElementTree as ET

# Let the XML parser open the file itself: it closes the handle when done and
# decodes the text according to the XML declaration instead of the locale
# default, which matters for the Czech characters in the reviews.
root = ET.parse('./csfd-90k-reviews-ranlp2013.xml').getroot()

# One row per review element: its <text> child and its integer origRating attribute.
rows = [
    [child.findtext('text'), int(child.attrib['origRating'])]
    for child in root
]

all_data = pd.DataFrame(rows, columns=['text', 'rating'])
all_data.head()

Unnamed: 0,text,rating
0,Druhý film Angely Schanelec o počasí. :) Po zh...,5
1,U Pomalého života jsem strávil 80 minut a teď ...,2
2,"Nevím, jestli je to nepovedeným dabingem, otra...",2
3,"Upřímný a velice smutný film, chvílemi tak dep...",5
4,"Možná to bude trochu divný komentář, ale nemůž...",2


In [24]:
# Number of reviews loaded from the XML file
len(all_data)

92398

Unfortunately, training on all of this takes ~2 hours per epoch, even after the train-test split, and the workshop has limited time (and a Colab session only stays open for 12-24 hours), so we work with a smaller random sample.

In [28]:
from sklearn.model_selection import train_test_split

# The training target lives in a float 'label' column derived from the rating.
all_data['label'] = all_data['rating'].astype(float)

# Fixed seeds keep both the 10,000-review sample and the split reproducible.
sample_data = all_data.sample(n=10_000, random_state=101)
train_df, test_df = train_test_split(sample_data, random_state=404)

Know the number of classes

In [33]:
# Distinct label values present in the training split.
# NOTE(review): the recorded output includes -1.0 — presumably a "no rating"
# marker; confirm before treating it as a valid target.
set(train_df.label.values)

{-1.0, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0}

Using HuggingFace Datasets format (from DataFrame)

In [30]:
from datasets import Dataset


def tokenize_function(examples):
    """Tokenize the "text" field of a batch.

    Every review is truncated or padded to exactly 120 tokens so that all
    examples end up with the same length.
    """
    return tokenizer(
        examples["text"],
        max_length=120,
        truncation=True,
        padding="max_length",
    )


# Convert each DataFrame to a HuggingFace Dataset and tokenize it in batches.
train_dset, test_dset = (
    Dataset.from_pandas(frame).map(tokenize_function, batched=True)
    for frame in (train_df, test_df)
)

  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

## Downloading the model

In [35]:
from transformers import AutoModelForSequenceClassification

# Regression setup: the head produces a single value per review.
# ignore_mismatched_sizes allows loading even when the checkpoint's head has a
# different shape from the one requested here.
# For classification, pass the number of classes instead: num_labels=len(?)
model = AutoModelForSequenceClassification.from_pretrained(
    'bert-base-cased',
    num_labels=1,
    ignore_mismatched_sizes=True,
)



Downloading:   0%|          | 0.00/416M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

## Fine-tuning

HuggingFace has some info about freezing layers in the model, if that sounds interesting: https://discuss.huggingface.co/t/freeze-lower-layers-with-auto-classification-model/11386

In [36]:
from transformers import TrainingArguments, Trainer

# Train for two epochs, evaluating and saving a checkpoint under ./outputs
# at the end of each one.
# NOTE(review): newer transformers releases appear to rename `evaluation_strategy`
# to `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./outputs",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=2,
)


def compute_metrics(eval_pred):
    """Return the root-mean-squared error of regression predictions.

    Args:
        eval_pred: a (predictions, labels) pair. Predictions may carry a
            trailing axis of size one (one value per example); both sides are
            flattened before being compared.

    Returns:
        A dict with a single "rmse" entry (a Python float).

    Only meaningful for the regression setup (one predicted value per example).
    """
    import numpy as np  # local import so this definition works on its own

    predictions, labels = eval_pred
    predictions = np.asarray(predictions, dtype=float).reshape(-1)
    labels = np.asarray(labels, dtype=float).reshape(-1)
    rmse = float(np.sqrt(np.mean((predictions - labels) ** 2)))
    return {"rmse": rmse}

# https://huggingface.co/docs/transformers/main_classes/trainer
# Bundle the model, the training arguments and both tokenized datasets;
# test_dset doubles as the evaluation set during training.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dset,
    eval_dataset=test_dset,
    # compute_metrics=compute_metrics,
    # optimizers=, (set the optimizer and learning rate scheduler)
)

In [None]:
# Run fine-tuning with the arguments configured above
trainer.train()

## Predictions and Evaluation

In [None]:
# Run the fine-tuned model over the test set
predictions = trainer.predict(test_dset)

In [None]:
# Raw model outputs for the test set (presumably one row per review — confirm)
predictions.predictions

In [None]:
"""
# get best label out of classification probabilities
best = []
for p in predictions.predictions:
  best.append(p.argmax())
"""

# Regression: take the first (only) value of every prediction row.
best = [row[0] for row in predictions.predictions]
best[:10]

## Exporting

In [None]:
# Downloading model
! ls outputs/*

In [None]:
# Copy the most recent checkpoint to Google Drive.
# The checkpoint is looked up rather than hard-coded: its step number depends on
# the run configuration, so a fixed name such as checkpoint-2814 only matches
# one particular run.
import shutil
from pathlib import Path

# Sort numerically by step; a plain string sort would rank "938" after "1876".
checkpoints = sorted(
    Path("outputs").glob("checkpoint-*"),
    key=lambda path: int(path.name.rsplit("-", 1)[-1]),
)
if not checkpoints:
    raise FileNotFoundError(
        "no checkpoint-* directory found in outputs/; run trainer.train() first"
    )
shutil.copytree(
    checkpoints[-1],
    "./drive/MyDrive/czech-movie-rating-model",
    dirs_exist_ok=True,
)

https://huggingface.co/docs/transformers/model_sharing

In [None]:
# Log in to the Hugging Face Hub from inside the notebook before uploading
from huggingface_hub import notebook_login
notebook_login()

In [None]:
# Upload the tokenizer and the trained model under the same repository name
tokenizer.push_to_hub("czech-movie-rating")
trainer.model.push_to_hub("czech-movie-rating")