In [None]:
# Clone the adapter-transformers repository and install it from that checkout.
# Each `!` line runs in its own subshell, so the `cd` has to be chained with
# `&&` on the same line for `pip install .` to run inside the clone.
! git clone https://github.com/adapter-hub/adapter-transformers
! cd adapter-transformers && pip install .

# based on doc at https://docs.adapterhub.ml/training.html
# installs modded HuggingFace/transformers https://github.com/adapter-hub/adapter-transformers

In [None]:
%%capture
# Install the `datasets` package; the `%%capture` magic on the first line hides pip's output.
! pip install datasets

Read more about models and developers:

- Czert https://huggingface.co/UWB-AIR
- Slavic-BERT https://huggingface.co/DeepPavlov/bert-base-bg-cs-pl-ru-cased
- cst5 https://huggingface.co/azizbarank/cst5-base

(If you are making an Adapter, note that adapter-transformers is designed to work with HuggingFace models such as the ones listed above.)

## Understanding the tokenizer

First, see how this tokenizer splits text into words (or sub-words).

After that, change it to your own model.

In [None]:
# as of 8 May 2022, HF Transformers is at 4.18.0, and Adapter-Transformers is 4.17.0
# adapter-transformers is a modified HuggingFace transformers and is imported
# under the same `transformers` name, so displaying the version confirms which
# package this kernel is actually using.
import transformers
transformers.__version__

'4.17.0'

In [None]:
from transformers import AutoTokenizer
# Load the tokenizer of the 'bert-base-cased' checkpoint. Replace the name with
# your own model's to inspect how that model tokenizes text.
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

In [None]:
# Recognizing words: show the list of tokens the tokenizer produces for this text
tokenizer.tokenize('Ahoj Artur')

In [None]:
# Encoding into IDs: calling the tokenizer directly returns the encoded input
# (the IDs are what the model will turn into vectors / embeddings)
tokenizer('Ahoj Artur.')

In [None]:
# Splitting up a word
# Exercise: replace the '?' placeholder with a word and check whether the
# tokenizer keeps it whole or breaks it into several sub-word tokens.
tokenizer.tokenize('?')

['?']

In [None]:
# Is it significant who has 1-token names in our model?
# Exercise: replace the '?' placeholder with a name and count the tokens returned.
tokenizer.tokenize('?')

['?']

## Dataset

In [None]:
import pandas as pd
import xml.etree.ElementTree as ET

# Parse the reviews straight from the file. ET.parse opens and closes the file
# itself and decodes it from the XML's own encoding declaration, rather than
# leaving the handle open and relying on the platform's default text encoding
# (which matters for the Czech characters in the reviews).
root = ET.parse('./csfd-90k-reviews-ranlp2013.xml').getroot()

# One row per child element of the root: the content of its <text> element and
# its `origRating` attribute converted to an integer.
rows = [
    [child.findtext('text'), int(child.attrib['origRating'])]
    for child in root
]

all_data = pd.DataFrame(rows, columns=['text', 'rating'])
all_data.head()

Unnamed: 0,text,rating
0,Druhý film Angely Schanelec o počasí. :) Po zh...,5
1,U Pomalého života jsem strávil 80 minut a teď ...,2
2,"Nevím, jestli je to nepovedeným dabingem, otra...",2
3,"Upřímný a velice smutný film, chvílemi tak dep...",5
4,"Možná to bude trochu divný komentář, ale nemůž...",2


In [None]:
# Distinct rating values present in the data. `all_data` is the only frame
# defined at this point in the notebook (no train/test split exists yet).
set(all_data.rating.values)

In [None]:
# Exercise: replace `?` with the class label for every row — presumably derived
# from the `rating` values inspected in the previous cell.
all_data['label'] = ?

In [None]:
# Limit how much data to make training faster for the workshop
# Use random_state for reproducible work

sample_data = all_data.sample(?, random_state=?)

In [None]:
from datasets import Dataset

def tokenize_function(examples):
  """Tokenize the "text" field of `examples` with the notebook's `tokenizer`.

  Padding is set to "max_length" and truncation is enabled, so every encoded
  sequence comes out with the same length. The function is applied via
  `Dataset.map(..., batched=True)` below, so `examples` is presumably a batch
  of rows rather than a single row.
  """
  # Exercise: replace `?` with the maximum sequence length to pad/truncate to.
  return tokenizer(examples["text"], padding="max_length", truncation=True, max_length= ?)

# Convert each split into a Dataset and tokenize it in batches
# (the training split first, then the test split).
train_dset, test_dset = [
    Dataset.from_pandas(split_df).map(tokenize_function, batched=True)
    for split_df in (train_df, test_df)
]

## Download model, create the adapter

https://docs.adapterhub.ml/training.html#adaptertrainer

In [None]:
from transformers import AutoAdapterModel

# Presumably a placeholder: give the task a name. The same `task_name` labels
# the classification head here and the adapter in the next cell.
task_name = ""
# NOTE(review): the tokenizer earlier in the notebook is loaded from the same
# "bert-base-cased" name — keep the two in sync when switching models.
model = AutoAdapterModel.from_pretrained("bert-base-cased")
# add_classification_head
# Exercise: replace the `?` placeholders with the number of classes and the
# activation function for the head.
model.add_classification_head(task_name, num_labels=?, activation_function=?)

In [None]:
from transformers import AdapterConfig

# resolve the adapter config
# The empty dict is a placeholder: put the adapter configuration you want to
# use in its place.
adapter_config = AdapterConfig.load(
    {},
)

# Following the AdapterHub training guide, all three calls are used:
#   add_adapter         - creates the adapter; the resolved config is passed in
#                         so that the configuration chosen above takes effect
#   train_adapter       - puts the model in adapter-training mode (per the
#                         guide, weights outside the adapter are frozen)
#   set_active_adapters - selects the adapter to be used in the forward pass
model.add_adapter(task_name, config=adapter_config)
model.train_adapter(task_name)
model.set_active_adapters(task_name)

## Fine-tuning

In [None]:
from transformers import TrainingArguments, AdapterTrainer

# Training configuration: two epochs, with evaluation and a checkpoint at the
# end of each epoch; everything is written under ./outputs.
training_args = TrainingArguments(
    num_train_epochs=2,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    output_dir="./outputs",
    # logging_nan_inf_filter=False,
)

# Trainer for the adapter model, fed with the tokenized train/test datasets.
trainer = AdapterTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dset,
    eval_dataset=test_dset,
    # Optional extras left for the reader:
    # compute_metrics=?
    # data_collator=data_collator,
)

In [None]:
# Run the training configured above (2 epochs, evaluating and saving every epoch).
trainer.train()

## Predictions and Evaluation

In [None]:
# Run the trainer's model over the test dataset; the result is inspected in the next cell.
predictions = trainer.predict(test_dset)

In [None]:
# The raw model outputs for the test examples — presumably one row of class
# scores (logits) per example; confirm the shape before interpreting them.
predictions.predictions

## Exporting the Adapter

https://docs.adapterhub.ml/contributing.html

In [None]:
# Downloading model: list what the trainer wrote under ./outputs (its
# `output_dir`) to find the files to download.
! ls outputs/*