# Environment Configuration

In [1]:
## run if training returns an error on apex
%%writefile setup.sh

export CUDA_HOME=/usr/local/cuda-10.1
git clone https://github.com/NVIDIA/apex
pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./apex --quiet

Writing setup.sh


In [2]:
## install apex

!sh setup.sh

Cloning into 'apex'...
remote: Enumerating objects: 38, done.[K
remote: Counting objects: 100% (38/38), done.[K
remote: Compressing objects: 100% (32/32), done.[K
remote: Total 7293 (delta 20), reused 19 (delta 6), pack-reused 7255[K
Receiving objects: 100% (7293/7293), 13.87 MiB | 12.99 MiB/s, done.
Resolving deltas: 100% (4921/4921), done.
  cmdoptions.check_install_build_global(options)
Processing ./apex
Skipping wheel build for apex, due to binaries being disabled for it.
Installing collected packages: apex
    Running setup.py install for apex ... [?25l[?25hdone
Successfully installed apex-0.1


In [3]:
## run if simpletransformers is not installed
!pip install simpletransformers --quiet
!pip install unidecode --quiet

[K     |████████████████████████████████| 194kB 5.7MB/s 
[K     |████████████████████████████████| 3.0MB 8.2MB/s 
[K     |████████████████████████████████| 757kB 13.7MB/s 
[K     |████████████████████████████████| 204kB 39.8MB/s 
[K     |████████████████████████████████| 1.1MB 39.7MB/s 
[K     |████████████████████████████████| 890kB 39.3MB/s 
[?25h  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
[31mERROR: transformers 3.0.0 has requirement tokenizers==0.8.0-rc4, but you'll have tokenizers 0.8.0 which is incompatible.[0m
[K     |████████████████████████████████| 245kB 6.4MB/s 
[?25h

In [5]:
!nvidia-smi

Tue Jun 30 16:15:45 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.36.06    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   35C    P8    29W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [8]:
from simpletransformers.classification import (ClassificationModel,
                                               MultiLabelClassificationModel)
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.metrics import (classification_report,
                             f1_score,
                             accuracy_score,
                             confusion_matrix,
                             recall_score,
                             precision_score,
                             roc_auc_score,
                             multilabel_confusion_matrix)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from google.colab import drive, files
import pandas as pd
import numpy as np
import json
import re
import nltk
import string
import logging
from unidecode import unidecode

nltk.download("punkt")
nltk.download("stopwords")

logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

drive.mount('/content/drive', force_remount=True)
datasets_path = "drive/My Drive/Colab Notebooks/toxic/datasets/"

SEED = 42

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


# Binary Classification

In [None]:
class Experiment():
  def __init__(
      self, bert_args=None, language="portuguese", n_annotators=1,
      balancing=None, do_preprocessing=False, train_amount=1,
      text_representation_model="bow"
  ):
    """
    language (str) -> portuguse or multilingual
    n_annotators -> minimum number of agreement between annotators to consider
                    an example as toxic
    bert_args (dict) -> custom arguments for the BERT model
    balancing (str or None) -> 'undersampling', 'oversampling' or None.
    do_preprocessing (bool) -> if true will remove stopwords, accents, numbers,
                               hashtags and punctuation.
    train_amount (float) -> value from (0,1] that will use this percentage of 
                            the data to train
    text_representation_model (str) -> "bert" or "bow". Bow implies using SVM.

    """
    self.language = language
    self.data_amount = data_amount
    self.datasets_path = "drive/My Drive/Colab Notebooks/toxic/datasets/"
    self.n_annotators = n_annotators
    self.__load_dataset(self.language)
    self.text_representation_model = text_representation_model

    if do_preprocessing:
      if self.language == "multilingual":
        stopwords = list(set([unidecode(w) for w in \
                              nltk.corpus.stopwords.words("portuguese")]))
        stopwords.extend(list(set([unidecode(w) for w in \
                              nltk.corpus.stopwords.words("english")])))
      else: 
        stopwords = list(set([unidecode(w) for w in \
                              nltk.corpus.stopwords.words(self.language)]))
        self.train_set = self.__preprocess(self.train_set, stopwords=stopwords)
        self.test_set = self.__preprocess(self.test_set, stopwords=stopwords)

    if balancing == "undersampling":
      negatives = self.train_set["toxic"] == 0
      positives = self.train_set["toxic"] == 1
      self.train_set = self.train_set[negatives].sample(
          len(self.train_set[positives])).append(
          self.train_set[positives],
          ignore_index=True
      )
    
    elif balancing == "oversampling":
      positives = self.train_set[self.train_set["toxic"] == 1]
      self.train_set = self.train_set.append(positives, ignore_index=True)

    elif balancing is None: pass

    else: raise(NotImplementedError)

    self.describe_data()
      
    if text_representation_model == "bert":
      # Define Bert Pretrained Model
      model_name = "distilbert"
      pretrained_name = "distilbert-base-multilingual-cased"

      if bert_args:
        self.model = ClassificationModel(
            model_name, pretrained_name, args=bert_args
        )
      else:
         self.model = ClassificationModel(
            model_name, pretrained_name
        )

  def __load_dataset(self, language):
    self.train_set = pd.DataFrame()
    self.test_set = pd.DataFrame()
    
    # Test Set
    self.test_set = pd.read_csv(
    self.datasets_path +
    f"{self.n_annotators}annotator/" +
    f"ptbr_test_{self.n_annotators}annotator.csv")

    if language == "portuguese" or language == "multilanguage":
      # Train Set
      train = pd.read_csv(
      self.datasets_path +
      f"{self.n_annotators}annotator/" +
      f"ptbr_train_{self.n_annotators}annotator.csv")

      # Dev Set
      dev_set = pd.read_csv(
      self.datasets_path +
      f"{self.n_annotators}annotator/" +
      f"ptbr_validation_{self.n_annotators}annotator.csv")
      self.train_set = train.append(dev_set, ignore_index=True)
        
    if language == "english" or language == "multilanguage":
      # Load OffenseEval2020 data
      eng_data = pd.read_csv(datasets_path+"olid-training-v1.0.tsv", sep="\t")
      eng_data = eng_data[["tweet", "subtask_a"]]
      eng_data["subtask_a"] = eng_data["subtask_a"].apply(
          lambda x: 1 if x == "OFF" else 0)
      eng_data.columns = ["text", "toxic"]
      eng_data["text"].apply(
        lambda x: x.replace("@USER", "@user")
      ) # normalize user mention token
      
      self.train_set = self.train_set.append(
          eng_data,
          ignore_index=True
      )

    self.train_set = self.train_set.sample(frac=self.data_amount)


  def __preprocess(self, data, stopwords):
    """Remove hashtaghs, numbers, punctuation, accents, links and stopwords.""""
    df = data

    df["text"] = \
      df["text"].apply(lambda x: re.sub("#[^ ]+", "", x)) # remove hashtags

    df["text"] = \
      df["text"].apply(lambda x: re.sub("\d+", "", x)) # remove numbers

    df["text"] = \
    df["text"].apply(lambda x: x.translate(
        str.maketrans("", "", string.punctuation))) # remove punctuation

    df["text"] = \
      df["text"].apply(lambda x: unidecode(x)) # remove accents

    df["text"] = \
     df["text"].apply(lambda x: re.sub("http[^ ]+", "", x)) # remove links

    df["text"] = \
      df["text"].apply(lambda x: " ".join(
          w.strip() for w in x.split() if w not in stopwords)) # remove stopword

    return df
  
  def describe_data(self):
    """
    Prints train set and test set counts and ratios.
    """
    negative_count, positive_count = self.train_set["toxic"].value_counts()
    total = negative_count + positive_count
    negative_ratio = negative_count/total
    positive_ratio = positive_count/total
    train_ratio = len(self.train_set)/(len(self.train_set) + len(self.test_set))
    print(
        f"""TRAIN SET ({train_ratio:.2f})
            Total: {total}
            -------Counts-------
            Negatives: {negative_count}
            Positives: {positive_count}
            -------Ratio--------
            Negatives: {negative_ratio:.2f}
            Positives: {positive_ratio:.2f}
        """
    )

    negative_count, positive_count = self.test_set["toxic"].value_counts()
    total = negative_count + positive_count
    negative_ratio = negative_count/total
    positive_ratio = positive_count/total
    test_ratio = len(self.test_set)/(len(self.train_set) + len(self.test_set))
    print(
        f"""TEST SET ({test_ratio:.2f})
            Total: {total}
            -------Counts-------
            Negatives: {negative_count}
            Positives: {positive_count}
            -------Ratio--------
            Negatives: {negative_ratio:.2f}
            Positives: {positive_ratio:.2f}
        """
    )


  def train(self):
    """
    Itinializes training.
    """
    if self.text_representation_model == "bow":
      bow = CountVectorizer()
      bow.fit(self.train_set["text"])
      self.__X_train = bow.transform(self.train_set["text"])
      self.__y_train = self.train_set["toxic"]
      self.__X_test = bow.transform(self.test_set["text"])
      self.__y_test = self.test_set["toxic"]

      self.model = SVC(verbose=True)
      self.model.fit(self.__X_train, self.__y_train)

    elif self.text_representation_model == "bert":
      self.model.train_model(self.train_set)

  def eval(self):
    """
    Prints accuracy, f1, classification report and (tp, fp, tn, fn) rate.
    Returns pd.DataFrame (text, y_true, y_pred, language) with results.
    """
    if self.text_representation_model == "bert":
      result, model_outputs, wrong_predictions = \
        self.model.eval_model(self.test_set, silent=True)
      
      y_pred, _ = self.model.predict(list(self.test_set.iloc[:, 0]))
      y_true = list(self.test_set.iloc[:, 1])

    else:
      y_pred = self.model.predict(self.__X_test)
      y_true = self.__y_test

      cm = confusion_matrix(y_true, y_pred)
      result = f"(tn: {cm[0,0]} fp: {cm[0,1]} fn: {cm[1,0]} tp: {cm[1,1]}"

    
    acc = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    print(f"Accuracy: {acc:.3f}\nF1: {f1:.3f}")
    print("----------------------------------------")
    print(classification_report(y_true, y_pred))
    print("----------------------------------------")
    print(result)

    results = pd.DataFrame(columns=["text", "y_true", "y_pred", "language"])
    results["y_true"] = y_true
    results["y_pred"] = y_pred
    results["text"] = self.test_set["text"].reset_index(drop=True)
    results["language"] = self.language

    self.results = results
    return self.results

  def download_results(self, filename):
    self.results.to_csv(f"{filename}.csv", index=False)
    files.download(f"{filename}.csv")

## BERT

## BoW + SVM (Baseline)

In [None]:
exp = Experiment(
    language="portuguese",
    n_annotators=1,
    do_preprocessing=False,
    balancing=None,
    data_amount=1.0,
    text_representation_model="bow"
)

In [None]:
exp.train()

[LibSVM]

In [None]:
results = exp.eval()

Accuracy: 0.736
F1: 0.692
----------------------------------------
              precision    recall  f1-score   support

           0       0.73      0.82      0.77      1128
           1       0.75      0.64      0.69       972

    accuracy                           0.74      2100
   macro avg       0.74      0.73      0.73      2100
weighted avg       0.74      0.74      0.73      2100

----------------------------------------



In [None]:
exp = Experiment(
    args={
      'num_train_epochs': 3,
      'evaluate_during_training': False,
      'overwrite_output_dir': True,
      'manual_seed': SEED,
      'do_lower_case': False,
      'save_steps': 100000,
      'no_cache': False,
      'n_gpu': 4,
      'train_batch_size': 50,
      'max_seq_len': 512,
    },
    language="portuguese",
    n_annotators=1,
    do_preprocessing=False,
    balancing=None,
    data_amount=1.0,
    text_representation_model="bert"
)

INFO:filelock:Lock 140403258468224 acquired on /root/.cache/torch/transformers/aee7490b1a48646df683dee12f25d9c63ebbf8dce1b7e1a656ce28830d9a7e86.bc76a47cb1c1c2984e48f23afbd3473a944ac1a2be9a8c8200092f5bf62153c9.lock


                                                    text  toxic
1455   @user que que tem o cu com as calças daniel? k...      1
18777  como que a menina vai pra escola parecendo a l...      1
10458  to chocado com a beleza desse menino, pena que...      0
3387        se vc pate palma pro uma merda dessas = lixo      0
10838  rt @user stf articula afastamento de deltan da...      0
TRAIN SET (0.90)
            Total: 18900
            -------Counts-------
            Negatives: 10617
            Positives: 8283
            -------Ratio--------
            Negatives: 0.56
            Positives: 0.44
        
TEST SET (0.10)
            Total: 2100
            -------Counts-------
            Negatives: 1128
            Positives: 972
            -------Ratio--------
            Negatives: 0.54
            Positives: 0.46
        


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466.0, style=ProgressStyle(description_…

INFO:filelock:Lock 140403258468224 released on /root/.cache/torch/transformers/aee7490b1a48646df683dee12f25d9c63ebbf8dce1b7e1a656ce28830d9a7e86.bc76a47cb1c1c2984e48f23afbd3473a944ac1a2be9a8c8200092f5bf62153c9.lock





INFO:filelock:Lock 140406201576752 acquired on /root/.cache/torch/transformers/72a6c787412704a6fa6f5d9e5ef7d33c5b80c787e2bbc7d9ad82d7f88fb8f802.89fad86febf14521569023d312560283a922c0884a52f412eef4e96f91513ab2.lock


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=541808922.0, style=ProgressStyle(descri…

INFO:filelock:Lock 140406201576752 released on /root/.cache/torch/transformers/72a6c787412704a6fa6f5d9e5ef7d33c5b80c787e2bbc7d9ad82d7f88fb8f802.89fad86febf14521569023d312560283a922c0884a52f412eef4e96f91513ab2.lock





INFO:filelock:Lock 140403255559112 acquired on /root/.cache/torch/transformers/96435fa287fbf7e469185f1062386e05a075cadbf6838b74da22bf64b080bc32.99bcd55fc66f4f3360bc49ba472b940b8dcf223ea6a345deb969d607ca900729.lock


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=995526.0, style=ProgressStyle(descripti…

INFO:filelock:Lock 140403255559112 released on /root/.cache/torch/transformers/96435fa287fbf7e469185f1062386e05a075cadbf6838b74da22bf64b080bc32.99bcd55fc66f4f3360bc49ba472b940b8dcf223ea6a345deb969d607ca900729.lock





In [None]:
exp.train()

In [None]:
results = exp.eval()

## Learning Curve Experiment

In [None]:
seeds = range(1,31)
train = pd.read_csv(
      datasets_path +
      "1annotator/" +
      "ptbr_train_1annotator.csv"
)
dev_set = pd.read_csv(
      datasets_path +
      "1annotator/" +
      "ptbr_validation_1annotator.csv"
)

train = train.append(dev_set, ignore_index=True)
test = pd.read_csv(
    datasets_path +
    "1annotator/" +
    "ptbr_test_1annotator.csv")

learning_curve = dict.fromkeys(range(1,11))
for i in range(1, 11):
  confusion = {1: None, 2: None, 3: None}
  f1_negative = []
  f1_positive = []
  f1_overall = []
 
  precision_negative = []
  precision_positive = []
  precision_overall = []

  recall_negative = []
  recall_positive = []
  recall_overall = []
  for j in range(1,4):
    print(f"-------------{it}/30-------------")
    model = ClassificationModel(
      "distilbert", "distilbert-base-multilingual-cased",
      args={
        'num_train_epochs': 1,
        'evaluate_during_training': False,
        'overwrite_output_dir': True,
        'do_lower_case': False,
        'save_steps': 100000,
        'no_cache': True,
        'n_gpu': 4,
        'train_batch_size': 50,
        'max_seq_len': 512,
        'silent': True,
        'manual_seed': SEED,
        "reprocess_input_data": True,
      },
    )
    np.random.seed(seeds[it])
    indices = np.random.randint(0,len(train), int(len(train)*i/10))
    print(indices)
    train_ij = train.iloc[indices, :]
    try:
      model.train_model(train_ij)
    except RuntimeError:
      with open(f"learning_curve{i}.json", "w") as f:
        json.dump(learning_curve, f)
        files.download(f"learning_curve{i}.json")
        print(f"OUT OF CUDA MEMORY, STOPPED AT {i}")
    y_pred, _ = model.predict(list(test.iloc[:, 0]))
    y_true = test.iloc[:, 1]
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    negatives, positives = train_ij["toxic"].value_counts()
    total = positives + negatives
    a, b = f1_score(y_true, y_pred, average=None)
    f1_overall.append(f1_score(y_true, y_pred))
    f1_negative.append(a)
    f1_positive.append(b)

    c, d = precision_score(y_true, y_pred, average=None)
    precision_overall.append(precision_score(y_true, y_pred))
    precision_negative.append(c)
    precision_positive.append(d)

    e, f = recall_score(y_true, y_pred, average=None)
    recall_overall.append(recall_score(y_true, y_pred))
    recall_negative.append(e)
    recall_positive.append(f)

    confusion[j] = {
      "fp": int(fp),
      "fn": int(fn),
      "tp": int(tp),
      "tn": int(tn),
      "positives": int(positives),
      "negatives": int(negatives),
      "total": int(total)
    }
    del indices
    del model
    it += 1

  learning_curve[i] = {
    "f1_overall": f1_overall,
    "f1_negative": f1_negative,
    "f1_positive": f1_positive,
    "precision_overall": precision_overall,
    "precision_positive": precision_positive,
    "precision_negative": precision_negative,
    "recall_overall": recall_overall,
    "recall_positive": recall_positive,
    "recall_negative": recall_negative,
    "confusion_matrix": confusion
  }

with open("learning_curve.json", "w") as f:
  json.dump(learning_curve, f)

files.download(f"learning_curve.json")
    

-------------13/30-------------
[13656  9484 18838 ... 15462  3375  6768]


  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."
INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.


Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic


INFO:simpletransformers.classification.classification_model: Training of distilbert model complete. Saved to outputs/.
INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.


-------------14/30-------------


INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.


[7624 8076 2693 ... 1950 1235 8554]
Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic


INFO:simpletransformers.classification.classification_model: Training of distilbert model complete. Saved to outputs/.
INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.


-------------15/30-------------


INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.


[ 6825 15598  2169 ...  9254  2231  3509]
Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic


INFO:simpletransformers.classification.classification_model: Training of distilbert model complete. Saved to outputs/.
INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.


-------------16/30-------------


INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.


[10863  2191 13702 ... 15521 18048  4996]
Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic


INFO:simpletransformers.classification.classification_model: Training of distilbert model complete. Saved to outputs/.
INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.


-------------17/30-------------


INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.


[ 2885  1726 16305 ... 11475 15194  7679]
Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic


INFO:simpletransformers.classification.classification_model: Training of distilbert model complete. Saved to outputs/.
INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.


-------------18/30-------------


INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.


[10862 17141  1378 ... 11715 13679 17923]
Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic


INFO:simpletransformers.classification.classification_model: Training of distilbert model complete. Saved to outputs/.
INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.


-------------19/30-------------


INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.


[15715  4367 14729 ... 18244 12034 16550]
Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic


INFO:simpletransformers.classification.classification_model: Training of distilbert model complete. Saved to outputs/.
INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.


-------------20/30-------------


INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.


[15305  5327  5944 ...  3162 12170 11528]
Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic


INFO:simpletransformers.classification.classification_model: Training of distilbert model complete. Saved to outputs/.
INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.


-------------21/30-------------


INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.


[11125 15956  9181 ...  4445 14159  4742]
Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic


INFO:simpletransformers.classification.classification_model: Training of distilbert model complete. Saved to outputs/.
INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.


-------------22/30-------------


INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.


[ 8787  9256 11190 ...  7453  1844 17659]
Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic


INFO:simpletransformers.classification.classification_model: Training of distilbert model complete. Saved to outputs/.
INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.


-------------23/30-------------


INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.


[12706   899 14528 ... 11480  1631 17580]
Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic


INFO:simpletransformers.classification.classification_model: Training of distilbert model complete. Saved to outputs/.
INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.


-------------24/30-------------


INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.


[ 6618  2934  1175 ...  1152 11825  2628]
Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic


INFO:simpletransformers.classification.classification_model: Training of distilbert model complete. Saved to outputs/.
INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.


-------------25/30-------------
[ 9648 10177 10202 ...  2678  2294  8471]


INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.


Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic


INFO:simpletransformers.classification.classification_model: Training of distilbert model complete. Saved to outputs/.
INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.


-------------26/30-------------


INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.


[ 5139  3912 14879 ... 16449   537  6833]
Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic


INFO:simpletransformers.classification.classification_model: Training of distilbert model complete. Saved to outputs/.
INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.


-------------27/30-------------


INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.


[9473 4089 7200 ... 8123 7501 7625]
Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic


INFO:simpletransformers.classification.classification_model: Training of distilbert model complete. Saved to outputs/.
INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.


-------------28/30-------------


INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.


[ 6380 10749   808 ... 16443  8883 12855]
Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic


INFO:simpletransformers.classification.classification_model: Training of distilbert model complete. Saved to outputs/.
INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.


-------------29/30-------------


INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.


[ 5925  4517 15277 ... 13814 16709  4818]
Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic


Exception ignored in: <generator object tqdm_notebook.__iter__ at 0x7ff699942728>
Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/tqdm/notebook.py", line 220, in __iter__
    self.sp(bar_style='danger')
AttributeError: 'tqdm_notebook' object has no attribute 'sp'
Exception ignored in: <generator object tqdm_notebook.__iter__ at 0x7ff699942888>
Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/tqdm/notebook.py", line 220, in __iter__
    self.sp(bar_style='danger')
AttributeError: 'tqdm_notebook' object has no attribute 'sp'


MessageError: ignored

# Multi-label Classification

In [9]:
multilabel_data = pd.read_csv(datasets_path+"1annotator/multilabel_grouped.csv")
multilabel_data = multilabel_data[["text", "homophobia", "obscene", "insult", "racism", "misogyny", "xenophobia"]]

## BoW + SVM (Baseline)


In [10]:
def get_dataset_from_label(df, label):
  return_df = pd.DataFrame(columns=["text", "toxic"])
  return_df["text"] = df["text"]
  return_df["toxic"] = df.apply(lambda x: 1 if x[label] == 1 else 0, axis=1)

  return return_df

In [12]:
multilabel_baseline = dict.fromkeys(multilabel_data.columns[1:])
scores = []
for label in multilabel_data.columns[1:]:
  print(label)
  df = get_dataset_from_label(multilabel_data, label)
  train, test = train_test_split(df, train_size=0.9, random_state=SEED)
  bow = CountVectorizer()
  train_bow = bow.fit_transform(train.iloc[:, 0])
  test_bow = bow.transform(test.iloc[:, 0])

  model = SVC()
  model.fit(train_bow, train.iloc[:, 1])
  y_pred = model.predict(test_bow)
  y_true = test.iloc[:, 1]

  score = roc_auc_score(y_true, y_pred)
  cm = confusion_matrix(y_true, y_pred)
  multilabel_baseline[label] = {
      "label": label,
      "model": model,
      "score": score,
      "tn": cm[0, 0],
      "fp": cm[0, 1],
      "fn": cm[1, 0],
      "tp": cm[1, 1]
  }
  scores.append(score)
  print(cm)
  
print(f"Overall ROC AUC: {np.mean(scores)}")

homophobia
[[2071    3]
 [  20    6]]
obscene
[[1301  167]
 [ 263  369]]
insult
[[1648   28]
 [ 322  102]]
racism
[[2089    0]
 [  11    0]]
misogyny
[[2056    1]
 [  34    9]]
xenophobia
[[2081    0]
 [  19    0]]
Overall ROC AUC: 0.5943415888816505


## BERT

In [None]:
multilabel_data["labels"] = multilabel_data.iloc[:, 1:].apply(lambda x: np.array(x), axis=1)

In [None]:
multilabel_data = multilabel_data[["text", "labels"]]

In [None]:
train, test = train_test_split(multilabel_data, train_size=0.9, random_state=SEED)

In [None]:
mclf = MultiLabelClassificationModel(model_type="bert", model_name="bert-base-multilingual-cased", num_labels=6)

INFO:filelock:Lock 140411740546104 acquired on /root/.cache/torch/transformers/45629519f3117b89d89fd9c740073d8e4c1f0a70f9842476185100a8afe715d1.65df3cef028a0c91a7b059e4c404a975ebe6843c71267b67019c0e9cfa8a88f0.lock


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=625.0, style=ProgressStyle(description_…

INFO:filelock:Lock 140411740546104 released on /root/.cache/torch/transformers/45629519f3117b89d89fd9c740073d8e4c1f0a70f9842476185100a8afe715d1.65df3cef028a0c91a7b059e4c404a975ebe6843c71267b67019c0e9cfa8a88f0.lock





INFO:filelock:Lock 140414651991600 acquired on /root/.cache/torch/transformers/3d1d2b2daef1e2b3ddc2180ddaae8b7a37d5f279babce0068361f71cd548f615.7131dcb754361639a7d5526985f880879c9bfd144b65a0bf50590bddb7de9059.lock


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=714314041.0, style=ProgressStyle(descri…

INFO:filelock:Lock 140414651991600 released on /root/.cache/torch/transformers/3d1d2b2daef1e2b3ddc2180ddaae8b7a37d5f279babce0068361f71cd548f615.7131dcb754361639a7d5526985f880879c9bfd144b65a0bf50590bddb7de9059.lock





- This IS expected if you are initializing BertForMultiLabelSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMultiLabelSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
INFO:filelock:Lock 140411790497888 acquired on /root/.cache/torch/transformers/96435fa287fbf7e469185f1062386e05a075cadbf6838b74da22bf64b080bc32.99bcd55fc66f4f3360bc49ba472b940b8dcf223ea6a345deb969d607ca900729.lock


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=995526.0, style=ProgressStyle(descripti…

INFO:filelock:Lock 140411790497888 released on /root/.cache/torch/transformers/96435fa287fbf7e469185f1062386e05a075cadbf6838b74da22bf64b080bc32.99bcd55fc66f4f3360bc49ba472b940b8dcf223ea6a345deb969d607ca900729.lock





In [None]:
mclf.train_model(train, args={
      'num_train_epochs': 3,
      'evaluate_during_training': False,
      'overwrite_output_dir': True,
      'manual_seed': SEED,
      'do_lower_case': False,
      'save_steps': 100000,
      'no_cache': False,
      'n_gpu': 4,
      'train_batch_size': 8,
      'max_seq_len': 512,
    })

INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.


HBox(children=(FloatProgress(value=0.0, max=18900.0), HTML(value='')))


Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic


HBox(children=(FloatProgress(value=0.0, description='Epoch', max=3.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0', max=2363.0, style=ProgressStyle(descrip…

Running loss: 0.674199



Running loss: 0.646596Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 32768.0
Running loss: 0.610434Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 16384.0
Running loss: 0.549291



Running loss: 0.295872




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1', max=2363.0, style=ProgressStyle(descrip…

Running loss: 0.170651


HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2', max=2363.0, style=ProgressStyle(descrip…

Running loss: 0.271613Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 65536.0
Running loss: 0.102064



INFO:simpletransformers.classification.classification_model: Training of bert model complete. Saved to outputs/.


In [None]:
result, model_outputs, wrong_predictions = mclf.eval_model(test)

HBox(children=(FloatProgress(value=0.0, max=2100.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=263.0, style=ProgressStyle(descr…




In [None]:
# "homophobia", "obscene", "insult", "racism", "misogyny", "xenophobia"
predictions, outputs = mclf.predict(test["text"])

INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.


HBox(children=(FloatProgress(value=0.0, max=2100.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=263.0), HTML(value='')))




In [None]:
y_true = np.array([np.array(x) for x in test["labels"]])
y_pred = np.array([np.array(x) for x in predictions])

In [None]:
roc_auc_score(y_true, y_pred, average='macro')

In [None]:
roc_auc_score(y_true, y_pred, average='micro')

In [None]:
multilabel_confusion_matrix(y_true, y_pred)

array([[[2072,    2],
        [  25,    1]],

       [[1430,   38],
        [ 427,  205]],

       [[1635,   41],
        [ 290,  134]],

       [[2089,    0],
        [  11,    0]],

       [[2057,    0],
        [  39,    4]],

       [[2081,    0],
        [  19,    0]]])

In [None]:
hamming_loss(y_true, y_pred)

0.14476190476190476