### Setup

In [1]:
import os
os.chdir('FARM')
print("Current working directory is {}".format(os.getcwd()))

Current working directory is /home/workdrive/mhecht/hatespeech/FARM


In [2]:
import torch
import pandas as pd
from farm.modeling.tokenization import Tokenizer
from farm.data_handler.processor import TextClassificationProcessor
from farm.data_handler.data_silo import DataSilo
from farm.modeling.language_model import LanguageModel
from farm.modeling.prediction_head import TextClassificationHead
from farm.modeling.adaptive_model import AdaptiveModel
from farm.modeling.optimization import initialize_optimizer
from farm.train import Trainer, EarlyStopping
from farm.utils import MLFlowLogger
from pathlib import Path

05/11/2022 15:26:55 - INFO - farm.modeling.prediction_head -   Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .


In [3]:
ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
ml_logger.init_experiment(experiment_name="Public_FARM", run_name="Tutorial1_Colab")


 __          __  _                            _        
 \ \        / / | |                          | |       
  \ \  /\  / /__| | ___ ___  _ __ ___   ___  | |_ ___  
   \ \/  \/ / _ \ |/ __/ _ \| '_ ` _ \ / _ \ | __/ _ \ 
    \  /\  /  __/ | (_| (_) | | | | | |  __/ | || (_) |
     \/  \/ \___|_|\___\___/|_| |_| |_|\___|  \__\___/ 
  ______      _____  __  __  
 |  ____/\   |  __ \|  \/  |              _.-^-._    .--.
 | |__ /  \  | |__) | \  / |           .-'   _   '-. |__|
 |  __/ /\ \ |  _  /| |\/| |          /     |_|     \|  |
 | | / ____ \| | \ \| |  | |         /               \  |
 |_|/_/    \_\_|  \_\_|  |_|        /|     _____     |\ |
                                     |    |==|==|    |  |
|---||---|---|---|---|---|---|---|---|    |--|--|    |  |
|---||---|---|---|---|---|---|---|---|    |==|==|    |  |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 


In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Devices available: {}".format(device))

Devices available: cuda


### Data Handling

In [5]:
tokenizer = Tokenizer.load(
    pretrained_model_name_or_path="xlm-roberta-base",
    do_lower_case=False)

05/11/2022 15:27:07 - INFO - farm.modeling.tokenization -   Loading tokenizer of type 'XLMRobertaTokenizer'


In [6]:
LABEL_LIST = ["0", "1"]

In [9]:
processor = TextClassificationProcessor(tokenizer=tokenizer,
                                        max_seq_len=256,
                                        data_dir="data/Version 1",
                                        train_filename="train.tsv",
                                        label_list=["0","1"],
                                        metric="acc",
                                        label_column_name="hate"
                                        )



In [10]:
dicts = processor.file_to_dicts(file="data/Version 1/train.tsv")
print(dicts[0])


{'text': 'RT  : Ein kluger und lesenswerter Beitrag von', 'text_classification_label': '1'}


In [11]:
BATCH_SIZE = 8

data_silo = DataSilo(
    processor=processor,
    batch_size=BATCH_SIZE,
    #caching=True
)

05/11/2022 15:27:35 - INFO - farm.data_handler.data_silo -   
Loading data into the data silo ... 
              ______
               |o  |   !
   __          |:`_|---'-.
  |__|______.-/ _ \-----.|       
 (o)(o)------'\ _ /     ( )      
 
05/11/2022 15:27:35 - INFO - farm.data_handler.data_silo -   LOADING TRAIN DATA
05/11/2022 15:27:35 - INFO - farm.data_handler.data_silo -   Loading train set from: data/Version 1/train.tsv 
05/11/2022 15:27:36 - INFO - farm.data_handler.data_silo -   Got ya 15 parallel workers to convert 40293 dictionaries to pytorch datasets (chunksize = 538)...
05/11/2022 15:27:36 - INFO - farm.data_handler.data_silo -    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0 
05/11/2022 15:27:36 - INFO - farm.data_handler.data_silo -   /w\  /|\  /w\  /|\  /w\  /w\  /|\  /w\  /w\  /w\  /|\  /|\  /|\  /w\  /|\
05/11/2022 15:27:36 - INFO - farm.data_handler.data_silo -   /'\  /'\  /'\  /'\  / \  /'\  /'\  /'\  /'\  /'\  /'\  /'\  /'\  / \  /'\
05/

Preprocessing Dataset data/Version 1/test.tsv: 100%|█| 4514/4514 [00:07<00:00, 634.
05/11/2022 15:27:52 - INFO - farm.data_handler.data_silo -   
05/11/2022 15:27:52 - INFO - farm.data_handler.data_silo -   DATASETS SUMMARY
05/11/2022 15:27:54 - INFO - farm.data_handler.data_silo -   Examples in train: 36263
05/11/2022 15:27:54 - INFO - farm.data_handler.data_silo -   Examples in dev  : 4030
05/11/2022 15:27:54 - INFO - farm.data_handler.data_silo -   Examples in test : 4514
05/11/2022 15:27:54 - INFO - farm.data_handler.data_silo -   Total examples   : 44807
05/11/2022 15:27:54 - INFO - farm.data_handler.data_silo -   
05/11/2022 15:27:54 - INFO - farm.data_handler.data_silo -   Longest sequence length observed after clipping:     256
05/11/2022 15:27:54 - INFO - farm.data_handler.data_silo -   Average sequence length after clipping: 33.67024239583046
05/11/2022 15:27:54 - INFO - farm.data_handler.data_silo -   Proportion clipped:      0.00022061053966853267


In [12]:
MODEL_NAME_OR_PATH = "xlm-roberta-base"

language_model = LanguageModel.load(MODEL_NAME_OR_PATH)

05/11/2022 15:27:54 - INFO - farm.modeling.language_model -   
05/11/2022 15:27:54 - INFO - farm.modeling.language_model -   LOADING MODEL
05/11/2022 15:27:54 - INFO - farm.modeling.language_model -   Could not find xlm-roberta-base locally.
05/11/2022 15:27:54 - INFO - farm.modeling.language_model -   Looking on Transformers Model Hub (in local cache and online)...
05/11/2022 15:28:04 - INFO - farm.modeling.language_model -   Loaded xlm-roberta-base


In [13]:
prediction_head = TextClassificationHead(num_labels=len(LABEL_LIST),
                                         class_weights=data_silo.calculate_class_weights(task_name="text_classification"),
)

05/11/2022 15:28:04 - INFO - farm.modeling.prediction_head -   Prediction head initialized with size [768, 2]
05/11/2022 15:28:05 - INFO - farm.modeling.prediction_head -   Using class weights for task 'text_classification': [1.3022479  0.81162435]


In [14]:
EMBEDS_DROPOUT_PROB = 0.1

model = AdaptiveModel(
    language_model=language_model,
    prediction_heads=[prediction_head],
    embeds_dropout_prob=EMBEDS_DROPOUT_PROB,
    lm_output_types=["per_sequence"],
    device=device)

In [15]:
earlystopping = EarlyStopping(
       metric="loss", mode="min",
    save_dir=Path("saved_models/xlm-roberta-classification"),
    patience=3  
)

In [16]:
LEARNING_RATE = 2e-5
N_EPOCHS = 10

model, optimizer, lr_schedule = initialize_optimizer(
    model=model,
    device=device,
    learning_rate=LEARNING_RATE,
    n_batches=len(data_silo.loaders["train"]),
    n_epochs=N_EPOCHS)

05/11/2022 15:28:23 - INFO - farm.modeling.optimization -   Loading optimizer `TransformersAdamW`: '{'correct_bias': False, 'weight_decay': 0.01, 'lr': 2e-05}'
05/11/2022 15:28:23 - INFO - farm.modeling.optimization -   Using scheduler 'get_linear_schedule_with_warmup'
05/11/2022 15:28:23 - INFO - farm.modeling.optimization -   Loading schedule `get_linear_schedule_with_warmup`: '{'num_warmup_steps': 4533.0, 'num_training_steps': 45330}'


In [17]:
N_GPU = 1

trainer = Trainer(
    model=model,
    optimizer=optimizer,
    data_silo=data_silo,
    epochs=N_EPOCHS,
    n_gpu=N_GPU,
    lr_schedule=lr_schedule,
    device=device,
    evaluate_every=2000
)

In [18]:
model = trainer.train()

05/11/2022 15:28:27 - INFO - farm.train -   
 

          &&& &&  & &&             _____                   _             
      && &\/&\|& ()|/ @, &&       / ____|                 (_)            
      &\/(/&/&||/& /_/)_&/_&     | |  __ _ __ _____      ___ _ __   __ _ 
   &() &\/&|()|/&\/ '%" & ()     | | |_ | '__/ _ \ \ /\ / / | '_ \ / _` |
  &_\_&&_\ |& |&&/&__%_/_& &&    | |__| | | | (_) \ V  V /| | | | | (_| |
&&   && & &| &| /& & % ()& /&&    \_____|_|  \___/ \_/\_/ |_|_| |_|\__, |
 ()&_---()&\&\|&&-&&--%---()~                                       __/ |
     &&     \|||                                                   |___/
             |||
             |||
             |||
       , -=-~  .-^- _
              `

Train epoch 0/9 (Cur. train loss: 0.3851):  44%|▍| 2000/4533 [13:32<16:45,  2.52it/
Evaluating:   0%|                                          | 0/504 [00:00<?, ?it/s][A
Evaluating:  21%|██████▊                         | 108/504 [00:10<00:36, 10.75it/s][A
Evaluating:  

05/11/2022 16:38:52 - INFO - farm.eval -   
 _________ text_classification _________
05/11/2022 16:38:52 - INFO - farm.eval -   loss: 0.6039568588340992
05/11/2022 16:38:52 - INFO - farm.eval -   task_name: text_classification
05/11/2022 16:38:52 - INFO - farm.eval -   acc: 0.792803970223325
05/11/2022 16:38:52 - INFO - farm.eval -   report: 
               precision    recall  f1-score   support

           0     0.7004    0.7850    0.7403      1516
           1     0.8601    0.7975    0.8277      2514

    accuracy                         0.7928      4030
   macro avg     0.7803    0.7912    0.7840      4030
weighted avg     0.8001    0.7928    0.7948      4030

Train epoch 2/9 (Cur. train loss: 0.3657):  65%|▋| 2934/4533 [20:11<10:18,  2.58it/
Evaluating:   0%|                                          | 0/504 [00:00<?, ?it/s][A
Evaluating:  21%|██████▊                         | 108/504 [00:10<00:36, 10.74it/s][A
Evaluating:  43%|█████████████▋                  | 216/504 [00:20<00:

05/11/2022 17:48:41 - INFO - farm.eval -   
 _________ text_classification _________
05/11/2022 17:48:41 - INFO - farm.eval -   loss: 1.1273973709381377
05/11/2022 17:48:41 - INFO - farm.eval -   task_name: text_classification
05/11/2022 17:48:41 - INFO - farm.eval -   acc: 0.7915632754342432
05/11/2022 17:48:41 - INFO - farm.eval -   report: 
               precision    recall  f1-score   support

           0     0.7063    0.7632    0.7337      1516
           1     0.8499    0.8087    0.8288      2514

    accuracy                         0.7916      4030
   macro avg     0.7781    0.7859    0.7812      4030
weighted avg     0.7959    0.7916    0.7930      4030

Train epoch 4/9 (Cur. train loss: 0.8448):  85%|▊| 3868/4533 [26:02<04:14,  2.61it/
Evaluating:   0%|                                          | 0/504 [00:00<?, ?it/s][A
Evaluating:  21%|██████▊                         | 108/504 [00:10<00:36, 10.72it/s][A
Evaluating:  43%|█████████████▋                  | 216/504 [00:20<00

05/11/2022 18:57:34 - INFO - farm.eval -   
 _________ text_classification _________
05/11/2022 18:57:34 - INFO - farm.eval -   loss: 2.049245309036284
05/11/2022 18:57:34 - INFO - farm.eval -   task_name: text_classification
05/11/2022 18:57:34 - INFO - farm.eval -   acc: 0.7861042183622828
05/11/2022 18:57:34 - INFO - farm.eval -   report: 
               precision    recall  f1-score   support

           0     0.7599    0.6306    0.6893      1516
           1     0.7980    0.8799    0.8369      2514

    accuracy                         0.7861      4030
   macro avg     0.7790    0.7552    0.7631      4030
weighted avg     0.7837    0.7861    0.7814      4030

Train epoch 6/9 (Cur. train loss: 0.0000): 100%|█| 4533/4533 [30:52<00:00,  2.45it/
Train epoch 7/9 (Cur. train loss: 0.0000):   6%| | 269/4533 [01:43<26:49,  2.65it/s
Evaluating:   0%|                                          | 0/504 [00:00<?, ?it/s][A
Evaluating:  21%|██████▊                         | 108/504 [00:10<00:36,

05/11/2022 20:06:10 - INFO - farm.eval -   
 _________ text_classification _________
05/11/2022 20:06:10 - INFO - farm.eval -   loss: 2.0425075289080197
05/11/2022 20:06:10 - INFO - farm.eval -   task_name: text_classification
05/11/2022 20:06:10 - INFO - farm.eval -   acc: 0.7935483870967742
05/11/2022 20:06:10 - INFO - farm.eval -   report: 
               precision    recall  f1-score   support

           0     0.7295    0.7170    0.7232      1516
           1     0.8311    0.8397    0.8354      2514

    accuracy                         0.7935      4030
   macro avg     0.7803    0.7784    0.7793      4030
weighted avg     0.7929    0.7935    0.7932      4030

Train epoch 8/9 (Cur. train loss: 0.0000): 100%|█| 4533/4533 [30:53<00:00,  2.45it/
Train epoch 9/9 (Cur. train loss: 0.0000):  27%|▎| 1203/4533 [07:47<20:55,  2.65it/
Evaluating:   0%|                                          | 0/504 [00:00<?, ?it/s][A
Evaluating:  21%|██████▊                         | 108/504 [00:10<00:36

In [19]:
save_dir = Path("saved_models/xlm_v1_r1_hyp2")
model.save(save_dir)
processor.save(save_dir)