In [107]:
! pip install fasttext sacremoses evaluate keras transformers lime

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [108]:
from datasets import load_dataset
from transformers import AutoTokenizer
import evaluate
import numpy as np
from transformers import TFAutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
from transformers import create_optimizer
import keras
import os 
import fasttext
from transformers.keras_callbacks import KerasMetricCallback
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn import metrics
import lime
from lime import lime_text
from lime.lime_text import LimeTextExplainer
from sklearn.pipeline import make_pipeline

In [109]:
import tensorflow as tf
print(tf.__version__)
tf.config.list_physical_devices('GPU')

2.9.2


[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [110]:
dataset1 = load_dataset('poleval2019_cyberbullying', 'task01')
dataset2 = load_dataset('poleval2019_cyberbullying', 'task02')



  0%|          | 0/2 [00:00<?, ?it/s]



  0%|          | 0/2 [00:00<?, ?it/s]

### Bayesian classifier with TF * IDF weighting.

Dataset 1

In [111]:
tf_vectorizer = CountVectorizer() # or term frequency

In [112]:
data_dict = dataset1['train'].to_dict()
data_test_dict = dataset1['test'].to_dict()

In [113]:
X_train_tf = tf_vectorizer.fit_transform(data_dict['text'])
X_test_tf = tf_vectorizer.transform(data_test_dict['text'])

In [114]:
classifier_bayes_data1 = MultinomialNB()
classifier_bayes_data1.fit(X_train_tf, data_dict['label'])


MultinomialNB()

In [115]:
y_pred = classifier_bayes_data1.predict(X_test_tf)

In [116]:
# score1 = metrics.accuracy_score(data_test_dict['label'], y_pred)
print(metrics.classification_report(data_test_dict['label'], y_pred,
                                            target_names=['Ok', 'Bullying']))


              precision    recall  f1-score   support

          Ok       0.87      1.00      0.93       866
    Bullying       0.73      0.06      0.11       134

    accuracy                           0.87      1000
   macro avg       0.80      0.53      0.52      1000
weighted avg       0.85      0.87      0.82      1000



Dataset 2

In [117]:
data_dict = dataset2['train'].to_dict()
data_test_dict = dataset2['test'].to_dict()

In [118]:
X_train_tf = tf_vectorizer.fit_transform(data_dict['text'])
X_test_tf = tf_vectorizer.transform(data_test_dict['text'])

In [119]:
classifier_bayes_data2 = MultinomialNB()
classifier_bayes_data2.fit(X_train_tf, data_dict['label'])


MultinomialNB()

In [120]:
y_pred = classifier_bayes_data2.predict(X_test_tf)

In [121]:
# score1 = metrics.accuracy_score(data_test_dict['label'], y_pred)
print(metrics.classification_report(data_test_dict['label'], y_pred,
                                            target_names=['Ok', 'Bullying', 'Hate Speech']))


              precision    recall  f1-score   support

          Ok       0.87      1.00      0.93       866
    Bullying       0.00      0.00      0.00        25
 Hate Speech       0.60      0.03      0.05       109

    accuracy                           0.87      1000
   macro avg       0.49      0.34      0.33      1000
weighted avg       0.82      0.87      0.81      1000



### Fasttext text classifier

In [122]:
data1_path = os.path.abspath('./data1.txt')
data1_path

'/content/data1.txt'

In [123]:
data2_path = os.path.abspath('./data2.txt')
data2_path

'/content/data2.txt'

In [124]:
def fasttext_write_data(data, data_path: str):
    with open(data_path, 'w+') as file:
        data_dict = dataset1['train'].to_dict()
        for text, label in zip(data_dict['text'], data_dict['label']):
            text = text.replace('\n', ' ')
            file.write(f'__label__{label} {text}\n')

In [125]:
fasttext_write_data(dataset1['train'], data1_path)

In [126]:
! head -n 5 data.txt

head: cannot open 'data.txt' for reading: No such file or directory


In [128]:
classifier_fasttext_data1 = fasttext.train_supervised(input=data1_path)

In [129]:
data_test_dict = dataset1['test'].to_dict()
y_pred = classifier_fasttext_data1.predict(data_test_dict['text'])

y_pred = y_pred[0]
y_pred = [int(y[0].replace('__label__', '')) for y in y_pred]

print(metrics.classification_report(data_test_dict['label'], y_pred,
                                        target_names=['Ok', 'Bullying']))

              precision    recall  f1-score   support

          Ok       0.88      0.98      0.93       866
    Bullying       0.59      0.16      0.26       134

    accuracy                           0.87      1000
   macro avg       0.74      0.57      0.59      1000
weighted avg       0.84      0.87      0.84      1000



In [130]:
fasttext_write_data(dataset2['train'], data2_path)

In [131]:
! head -n 5 data2.txt

__label__0 Dla mnie faworytem do tytułu będzie Cracovia. Zobaczymy, czy typ się sprawdzi.
__label__0 @anonymized_account @anonymized_account Brawo ty Daria kibic ma być na dobre i złe
__label__0 @anonymized_account @anonymized_account Super, polski premier składa kwiaty na grobach kolaborantów. Ale doczekaliśmy czasów.
__label__0 @anonymized_account @anonymized_account Musi. Innej drogi nie mamy.
__label__0 Odrzut natychmiastowy, kwaśna mina, mam problem


In [132]:
classifier_fasttext_data2 = fasttext.train_supervised(input=data2_path)

In [133]:
data_test_dict = dataset2['test'].to_dict()
y_pred = classifier_fasttext_data2.predict(data_test_dict['text'])

y_pred = y_pred[0]
y_pred = [int(y[0].replace('__label__', '')) for y in y_pred]

print(metrics.classification_report(data_test_dict['label'], y_pred,
                                target_names=['Ok', 'Bullying', 'Hate Speech']))

              precision    recall  f1-score   support

          Ok       0.88      0.98      0.93       866
    Bullying       0.11      0.16      0.13        25
 Hate Speech       0.00      0.00      0.00       109

    accuracy                           0.85      1000
   macro avg       0.33      0.38      0.35      1000
weighted avg       0.77      0.85      0.81      1000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Transformer classifier 

In [134]:
# (take into account that a number of experiments should be performed for this model).

In [135]:
model_name = 'allegro/herbert-base-cased'
# "distilbert-base-uncased"

In [136]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

tokenized_data1 = dataset1.map(preprocess_function, batched=True)
tokenized_data2 = dataset2.map(preprocess_function, batched=True)



  0%|          | 0/1 [00:00<?, ?ba/s]



In [137]:
accuracy = evaluate.load("accuracy")


In [138]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)


In [139]:
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {"NEGATIVE": 0, "POSITIVE": 1}

In [140]:
model = TFAutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=2, id2label=id2label, label2id=label2id
)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at allegro/herbert-base-cased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [141]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

Dataset 1

In [142]:
batch_size = 16
num_epochs = 5
batches_per_epoch = len(tokenized_data1["train"]) // batch_size
total_train_steps = int(batches_per_epoch * num_epochs)
optimizer, schedule = create_optimizer(init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps)

In [143]:
tf_train_set = model.prepare_tf_dataset(
    tokenized_data1["train"],
    shuffle=True,
    batch_size=16,
    collate_fn=data_collator,
)

tf_validation_set = model.prepare_tf_dataset(
    tokenized_data1["test"],
    shuffle=False,
    batch_size=16,
    collate_fn=data_collator,
)


You're using a HerbertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [144]:
model.compile(optimizer=optimizer)

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


In [145]:
metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set)

In [146]:
model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=3, callbacks=[metric_callback])

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f9ea4280d00>

In [147]:
classifier_transformers_data1 = model

In [148]:
data_test_dict = dataset1['test'].to_dict()
y_pred = classifier_transformers_data1.predict(tf_validation_set)



In [149]:
prediction = tf.nn.softmax(y_pred.logits)
prediction = np.argmax(prediction, 1)

In [150]:
print(metrics.classification_report(data_test_dict['label'], prediction,
                                        target_names=['Ok', 'Bullying']))

              precision    recall  f1-score   support

          Ok       0.92      0.98      0.95       866
    Bullying       0.76      0.47      0.58       134

    accuracy                           0.91      1000
   macro avg       0.84      0.72      0.76      1000
weighted avg       0.90      0.91      0.90      1000



Dataset 2

In [151]:
model = TFAutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=2, id2label=id2label, label2id=label2id
)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at allegro/herbert-base-cased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [152]:
batch_size = 16
num_epochs = 5
batches_per_epoch = len(tokenized_data2["train"]) // batch_size
total_train_steps = int(batches_per_epoch * num_epochs)
optimizer, schedule = create_optimizer(init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps)

In [153]:
tf_train_set = model.prepare_tf_dataset(
    tokenized_data2["train"],
    shuffle=True,
    batch_size=16,
    collate_fn=data_collator,
)

tf_validation_set = model.prepare_tf_dataset(
    tokenized_data2["test"],
    shuffle=False,
    batch_size=16,
    collate_fn=data_collator,
)


In [154]:
model.compile(optimizer=optimizer)

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


In [155]:
metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set)

In [None]:
model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=3, callbacks=[metric_callback])

Epoch 1/3
Epoch 2/3
Epoch 3/3

In [None]:
classifier_transformers_data2 = model

In [None]:
data_test_dict = dataset1['test'].to_dict()
y_pred = classifier_transformers_data1.predict(tf_validation_set)

In [None]:
prediction = tf.nn.softmax(y_pred.logits)
prediction = np.argmax(prediction, 1)

In [None]:
print(metrics.classification_report(data_test_dict['label'], prediction,
                                        target_names=['Ok', 'Bullying']))

## Lime

In [None]:
data = dataset1['test']['text']
data_label = dataset1['test']['label']
class_names = ['Ok', 'Bullying']

In [None]:
pipeline = make_pipeline(tf_vectorizer, classifier_bayes_data1)

In [None]:
explainer = LimeTextExplainer(class_names=class_names)

In [None]:
idx = 73
data[idx]

In [None]:
exp = explainer.explain_instance(data[idx], pipeline.predict_proba, num_features=6, top_labels=1)
exp.show_in_notebook(text=False)

In [None]:
idx = 83
data[idx]

In [None]:
exp = explainer.explain_instance(data[idx], pipeline.predict_proba, num_features=6, top_labels=1)
exp.show_in_notebook(text=False)