In [70]:
import tensorflow as tf
print(tf.__version__)
print(tf.test.gpu_device_name())

2.4.0
/device:GPU:0


In [71]:
!pip3 install -q ktrain

In [72]:
!pip3 install -q fasttext

In [73]:
!pip3 install -q imbalanced-learn

In [74]:
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.model_selection import train_test_split
import ktrain
import pandas as pd
import fasttext
import tempfile
from imblearn.over_sampling import RandomOverSampler
import numpy as np

## Task 1

In [75]:
with open('1_training_text.txt') as f:
  x_train = f.read().splitlines()
with open('1_training_tags.txt') as f:
  y_train = list(map(int, f.read().splitlines()))
with open('1_test_text.txt') as f:
  x_test = f.read().splitlines()
with open('1_test_tags.txt') as f:
  y_test = list(map(int, f.read().splitlines()))

### Over-sampling

Zbiór jest mocno niezbalansowany.
Over-samplingu istotnie poprawia wyniki w tym przypadku.

In [76]:
print(Counter(y_train))

oversample = RandomOverSampler(sampling_strategy='minority', random_state=13)
x_train_over, y_train_over = oversample.fit_resample(np.array(x_train).reshape((-1, 1)), y_train)
x_train_over = x_train_over.reshape((-1))

print(Counter(y_train_over))

Counter({0: 9190, 1: 851})
Counter({0: 9190, 1: 9190})


### TF-IDF & Bayes

In [77]:
def classify_bayessian_tfidf(x_train, y_train, x_test):
    vectorizer = TfidfVectorizer()
    x_train_tfidf = vectorizer.fit_transform(x_train)
    x_test_tfidf = vectorizer.transform(x_test)

    classifier = MultinomialNB()
    classifier.fit(x_train_tfidf, y_train)

    y_pred = classifier.predict(x_test_tfidf)
    return y_pred

In [136]:
def print_report(y_true, y_pred):
  print("Confusion matrix:\n%s\n" % metrics.confusion_matrix(y_true, y_pred))
  print(metrics.classification_report(y_true, y_pred))
  print("MCC: %.2f" % metrics.matthews_corrcoef(y_true, y_pred))

In [79]:
y_pred = classify_bayessian_tfidf(x_train_over, y_train_over, x_test)
print_report(y_test, y_pred)

Confusion matrix:
[[747 119]
 [ 56  78]]

              precision    recall  f1-score   support

           0       0.93      0.86      0.90       866
           1       0.40      0.58      0.47       134

    accuracy                           0.82      1000
   macro avg       0.66      0.72      0.68      1000
weighted avg       0.86      0.82      0.84      1000

MCC: 0.38


### fastText

In [80]:
def classify_fasttext(x_train, y_train, x_test):
    with tempfile.NamedTemporaryFile() as f:
        f.writelines([f"__label__{label} {sentence}".encode() for sentence, label in zip(x_train, y_train)])
        model = fasttext.train_supervised(f.name)

    y_pred, _ = model.predict(x_test)
    y_pred = [int(label.split("__label__")[1]) for (label,) in y_pred]
    return y_pred

In [81]:
y_pred = classify_fasttext(x_train_over, y_train_over, x_test)
print_report(y_test, y_pred)

Confusion matrix:
[[866   0]
 [134   0]]

              precision    recall  f1-score   support

           0       0.87      1.00      0.93       866
           1       0.00      0.00      0.00       134

    accuracy                           0.87      1000
   macro avg       0.43      0.50      0.46      1000
weighted avg       0.75      0.87      0.80      1000

MCC: 0.00


  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


### Transformers

In [82]:
def classify_transformer(x_train, y_train, x_test, model_name):
    t = ktrain.text.Transformer(model_name, maxlen=25, class_names=np.unique(y_train))
    x_train_train, x_val, y_train_train, y_val = \
        train_test_split(x_train, y_train, test_size=0.2, random_state=13)
    train = t.preprocess_train(x_train_train, y_train_train)
    val = t.preprocess_test(x_val, y_val)
    model = t.get_classifier()
    learner = ktrain.get_learner(model, train_data=train, val_data=val, batch_size=32)
    learner.fit_onecycle(5e-5, 2)
    predictor = ktrain.get_predictor(learner.model, preproc=t)
    y_pred = predictor.predict(x_test)
    return y_pred

##### BERT transformer

In [83]:
y_pred = classify_transformer(x_train_over, y_train_over, x_test, model_name='bert-base-multilingual-uncased')
print_report(y_test, y_pred)

preprocessing train...
language: pl
train sequence lengths:
	mean : 13
	95percentile : 20
	99percentile : 23


Is Multi-Label? False
preprocessing test...
language: pl
test sequence lengths:
	mean : 13
	95percentile : 21
	99percentile : 23




begin training using onecycle policy with max lr of 5e-05...
Epoch 1/2
Epoch 2/2
Confusion matrix:
[[805  61]
 [ 81  53]]

              precision    recall  f1-score   support

           0       0.91      0.93      0.92       866
           1       0.46      0.40      0.43       134

    accuracy                           0.86      1000
   macro avg       0.69      0.66      0.67      1000
weighted avg       0.85      0.86      0.85      1000

MCC: 0.35


##### DistilBERT transformer

In [84]:
y_pred = classify_transformer(x_train_over, y_train_over, x_test, model_name='distilbert-base-multilingual-cased')
print_report(y_test, y_pred)

preprocessing train...
language: pl
train sequence lengths:
	mean : 13
	95percentile : 20
	99percentile : 23


Is Multi-Label? False
preprocessing test...
language: pl
test sequence lengths:
	mean : 13
	95percentile : 21
	99percentile : 23




begin training using onecycle policy with max lr of 5e-05...
Epoch 1/2
Epoch 2/2
Confusion matrix:
[[813  53]
 [ 99  35]]

              precision    recall  f1-score   support

           0       0.89      0.94      0.91       866
           1       0.40      0.26      0.32       134

    accuracy                           0.85      1000
   macro avg       0.64      0.60      0.61      1000
weighted avg       0.83      0.85      0.83      1000

MCC: 0.24


##### XLM-RoBERTa transformer

In [90]:
y_pred = classify_transformer(x_train_over, y_train_over, x_test, model_name='xlm-roberta-base')
print_report(y_test, y_pred)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=512.0, style=ProgressStyle(description_…


preprocessing train...
language: pl
train sequence lengths:
	mean : 13
	95percentile : 20
	99percentile : 23


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=512.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=5069051.0, style=ProgressStyle(descript…




Is Multi-Label? False
preprocessing test...
language: pl
test sequence lengths:
	mean : 13
	95percentile : 21
	99percentile : 23


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1885418496.0, style=ProgressStyle(descr…




begin training using onecycle policy with max lr of 5e-05...
Epoch 1/2
Epoch 2/2
Confusion matrix:
[[712 154]
 [ 39  95]]

              precision    recall  f1-score   support

           0       0.95      0.82      0.88       866
           1       0.38      0.71      0.50       134

    accuracy                           0.81      1000
   macro avg       0.66      0.77      0.69      1000
weighted avg       0.87      0.81      0.83      1000

MCC: 0.42


##### Polish version of BERT transformer

In [91]:
y_pred = classify_transformer(x_train_over, y_train_over, x_test, model_name='dkleczek/bert-base-polish-uncased-v1')
print_report(y_test, y_pred)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=459.0, style=ProgressStyle(description_…


preprocessing train...
language: pl
train sequence lengths:
	mean : 13
	95percentile : 20
	99percentile : 23


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=494801.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2.0, style=ProgressStyle(description_wi…




Is Multi-Label? False
preprocessing test...
language: pl
test sequence lengths:
	mean : 13
	95percentile : 21
	99percentile : 23


404 Client Error: Not Found for url: https://huggingface.co/dkleczek/bert-base-polish-uncased-v1/resolve/main/tf_model.h5


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=531146902.0, style=ProgressStyle(descri…




begin training using onecycle policy with max lr of 5e-05...
Epoch 1/2
Epoch 2/2
Confusion matrix:
[[763 103]
 [ 79  55]]

              precision    recall  f1-score   support

           0       0.91      0.88      0.89       866
           1       0.35      0.41      0.38       134

    accuracy                           0.82      1000
   macro avg       0.63      0.65      0.64      1000
weighted avg       0.83      0.82      0.82      1000

MCC: 0.27


## Task 2

In [139]:
with open('2_training_text.txt') as f:
  x_train_2 = f.read().splitlines()
with open('2_training_tags.txt') as f:
  y_train_2 = list(map(int, f.read().splitlines()))
with open('2_test_text.txt') as f:
  x_test_2 = f.read().splitlines()
with open('2_test_tags.txt') as f:
  y_test_2 = list(map(int, f.read().splitlines()))

In [140]:
print(Counter(y_train_2))

oversample = RandomOverSampler(sampling_strategy='not majority', random_state=13)
x_train_over_2, y_train_over_2 = oversample.fit_resample(np.array(x_train_2).reshape((-1, 1)), y_train_2)
x_train_over_2 = x_train_over_2.reshape((-1))

print(Counter(y_train_over_2))

Counter({0: 9190, 2: 598, 1: 253})
Counter({0: 9190, 2: 9190, 1: 9190})


In [141]:
y_pred_2 = classify_bayessian_tfidf(x_train_over_2, y_train_over_2, x_test_2)
print_report(y_test_2, y_pred_2)

Confusion matrix:
[[710  73  83]
 [  9   9   7]
 [ 47  16  46]]

              precision    recall  f1-score   support

           0       0.93      0.82      0.87       866
           1       0.09      0.36      0.15        25
           2       0.34      0.42      0.38       109

    accuracy                           0.77      1000
   macro avg       0.45      0.53      0.46      1000
weighted avg       0.84      0.77      0.80      1000

MCC: 0.28


In [142]:
y_pred_2 = classify_fasttext(x_train_over_2, y_train_over_2, x_test_2)
print_report(y_test_2, y_pred_2)

Confusion matrix:
[[866   0   0]
 [ 25   0   0]
 [109   0   0]]

              precision    recall  f1-score   support

           0       0.87      1.00      0.93       866
           1       0.00      0.00      0.00        25
           2       0.00      0.00      0.00       109

    accuracy                           0.87      1000
   macro avg       0.29      0.33      0.31      1000
weighted avg       0.75      0.87      0.80      1000

MCC: 0.00


  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


In [143]:
y_pred_2 = classify_transformer(x_train_over_2, y_train_over_2, x_test_2,
                                model_name='bert-base-multilingual-uncased')
print_report(y_test_2, y_pred_2)

preprocessing train...
language: pl
train sequence lengths:
	mean : 13
	95percentile : 20
	99percentile : 23


Is Multi-Label? False
preprocessing test...
language: pl
test sequence lengths:
	mean : 13
	95percentile : 20
	99percentile : 23




begin training using onecycle policy with max lr of 5e-05...
Epoch 1/2
Epoch 2/2
Confusion matrix:
[[804  42  20]
 [ 19   3   3]
 [ 69  12  28]]

              precision    recall  f1-score   support

           0       0.90      0.93      0.91       866
           1       0.05      0.12      0.07        25
           2       0.55      0.26      0.35       109

    accuracy                           0.83      1000
   macro avg       0.50      0.44      0.45      1000
weighted avg       0.84      0.83      0.83      1000

MCC: 0.26


In [144]:
y_pred_2 = classify_transformer(x_train_over_2, y_train_over_2, x_test_2,
                                model_name='distilbert-base-multilingual-cased')
print_report(y_test_2, y_pred_2)

preprocessing train...
language: pl
train sequence lengths:
	mean : 13
	95percentile : 20
	99percentile : 23


Is Multi-Label? False
preprocessing test...
language: pl
test sequence lengths:
	mean : 13
	95percentile : 20
	99percentile : 23




begin training using onecycle policy with max lr of 5e-05...
Epoch 1/2
Epoch 2/2
Confusion matrix:
[[813  38  15]
 [ 19   2   4]
 [ 80   9  20]]

              precision    recall  f1-score   support

           0       0.89      0.94      0.91       866
           1       0.04      0.08      0.05        25
           2       0.51      0.18      0.27       109

    accuracy                           0.83      1000
   macro avg       0.48      0.40      0.41      1000
weighted avg       0.83      0.83      0.82      1000

MCC: 0.20


In [146]:
y_pred_2 = classify_transformer(x_train_over_2, y_train_over_2, x_test_2,
                                model_name='xlm-roberta-base')
print_report(y_test_2, y_pred_2)

preprocessing train...
language: pl
train sequence lengths:
	mean : 13
	95percentile : 20
	99percentile : 23


Is Multi-Label? False
preprocessing test...
language: pl
test sequence lengths:
	mean : 13
	95percentile : 20
	99percentile : 23




begin training using onecycle policy with max lr of 5e-05...
Epoch 1/2
Epoch 2/2
Confusion matrix:
[[800  45  21]
 [ 15   5   5]
 [ 69  15  25]]

              precision    recall  f1-score   support

           0       0.90      0.92      0.91       866
           1       0.08      0.20      0.11        25
           2       0.49      0.23      0.31       109

    accuracy                           0.83      1000
   macro avg       0.49      0.45      0.45      1000
weighted avg       0.84      0.83      0.83      1000

MCC: 0.26


In [145]:
y_pred_2 = classify_transformer(x_train_over_2, y_train_over_2, x_test_2,
                                model_name='dkleczek/bert-base-polish-uncased-v1')
print_report(y_test_2, y_pred_2)

preprocessing train...
language: pl
train sequence lengths:
	mean : 13
	95percentile : 20
	99percentile : 23


Is Multi-Label? False
preprocessing test...
language: pl
test sequence lengths:
	mean : 13
	95percentile : 20
	99percentile : 23


404 Client Error: Not Found for url: https://huggingface.co/dkleczek/bert-base-polish-uncased-v1/resolve/main/tf_model.h5




begin training using onecycle policy with max lr of 5e-05...
Epoch 1/2
Epoch 2/2
Confusion matrix:
[[761  97   8]
 [ 14   8   3]
 [ 71  24  14]]

              precision    recall  f1-score   support

           0       0.90      0.88      0.89       866
           1       0.06      0.32      0.10        25
           2       0.56      0.13      0.21       109

    accuracy                           0.78      1000
   macro avg       0.51      0.44      0.40      1000
weighted avg       0.84      0.78      0.80      1000

MCC: 0.18


## Podsumowanie

> 1. Which of the classifiers works the best for the task 1 and the task 2.

**Task 1**

|Model|Accuracy|Macro-avg F1|MCC|
|---|---:|---:|---:|
|TF-IDF Bayes|0.82|0.68|0.38|
|fastText|0.87|0.46|0.00|
|Transformer bert-base-multilingual-uncased|0.86|0.67|0.35|
|Transformer distilbert-base-multilingual-cased|0.85|0.61|0.24|
|Transformer xlm-roberta-base|0.81|0.69|0.42|
|Transformer bert-base-polish-uncased-v1|0.82|0.64|0.27|

Ponieważ zbiór danych jest mocno niezbalansowany wysokie wartości metryki Accuracy niekoniecznie świadczą o dobrych wynikach.

Wg metryk Macro-avg F1 i MCC najlepsze wyniki osiągnął transformer *xlm-roberta-base*
Dobrze sprawdziła się także klasyfikacja Bayesa z TF-IDF.

fastText przydzielił wszystkim danym jedną etykietę 0 ("non harmful"). Nie jestem pewien czy wynika to ze sposobu działania fastTexta czy z błędu w mojej implementacji.

**Task 2**

|Model|Accuracy|Macro-avg F1|MCC|
|---|---:|---:|---:|
|TF-IDF Bayes|0.77|0.46|0.28|
|fastText|0.87|0.31|0.00|
|Transformer bert-base-multilingual-uncased|0.83|0.45|0.26|
|Transformer distilbert-base-multilingual-cased|0.83|0.41|0.20|
|Transformer xlm-roberta-base|0.78|0.40|0.18|
|Transformer bert-base-polish-uncased-v1|0.78|0.40|0.18|

Najlepsze wyniki uzyskała klasyfikacja Bayesa z TF-IDF. Na drugim miejscu znalazł się transformer *bert-base-multilingual-uncased*

> 2. Did you achieve results comparable with the results of PolEval Task?

Dla Task 1 otrzymałem wyższy wynik F1 (ale niższe Accuracy).
Dla Task 2 otrzymałem gorsze wyniki.

> 3. Did you achieve results comparabie with the Klej leaderboard?

Najlepsze lub prawie najlepsze wyniki otrzymałem dla XLM-ROBERTa, tak samo jak w Klej leaderboard.
