In [1]:
# The MIT License (MIT) Copyright (c) 2023 Emilio Morales
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of 
# this software and associated documentation files (the "Software"), to deal in the Software without 
# restriction, including without limitation the rights to use, copy, modify, merge, publish, 
# distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the 
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all copies or 
# substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 
# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES 
# OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

<table class="tfo-notebook-buttons" align="left">
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/milmor/NLP/blob/main/Notebooks/18_BERT_keras_cyberbullying.ipynb">
    <img src="https://www.tensorflow.org/images/colab_logo_32px.png" />
    Run in Google Colab</a>
  </td>
</table>

# Clasificación de cyberbullying con BERT

<img src="../img/bert.png" width="700"/>

__Imagen tomada de Devlin, J., Chang, M. W., Lee, K., & Toutanova, K. (2018). Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805.__
  
- Dataset: https://www.kaggle.com/datasets/andrewmvd/cyberbullying-classification

## 1.- Conjuntos de datos
- Partición de entrenamiento y validación (en este cuaderno no se crea una partición de prueba).

In [2]:
import os
# Disable TensorFlow debugging logs; set here, before TensorFlow is imported in a later cell.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import pandas as pd
# Select the PyTorch backend for Keras Core; assigned before `keras_core` is imported below.
os.environ["KERAS_BACKEND"] = "torch"
import keras_core as keras
import torch

# Display the installed PyTorch version.
torch.__version__

Using PyTorch backend.


'2.0.1+cu117'

In [3]:
# Seed PyTorch's global random number generator.
# NOTE(review): the tf.data shuffle used for `train_ds` is presumably not
# controlled by this seed — confirm if exact run-to-run reproducibility matters.
torch.manual_seed(77)

<torch._C.Generator at 0x7f2c44fb1610>

### 1.1.- Conjuntos de entrenamiento y validación

In [4]:
# Load the tweets CSV from the notebook's working directory.
df = pd.read_csv('./cyberbullying_tweets.csv')

In [5]:
# Preview the first rows of the raw data.
df.head()

Unnamed: 0,tweet_text,cyberbullying_type
0,"In other words #katandandre, your food was cra...",not_cyberbullying
1,Why is #aussietv so white? #MKR #theblock #ImA...,not_cyberbullying
2,@XochitlSuckkks a classy whore? Or more red ve...,not_cyberbullying
3,"@Jason_Gio meh. :P thanks for the heads up, b...",not_cyberbullying
4,@RudhoeEnglish This is an ISIS account pretend...,not_cyberbullying


In [6]:
# List the distinct label values present in the dataset.
df['cyberbullying_type'].unique()

array(['not_cyberbullying', 'gender', 'religion', 'other_cyberbullying',
       'age', 'ethnicity'], dtype=object)

In [7]:
# Count the number of rows per class.
count = df['cyberbullying_type'].value_counts()
count

cyberbullying_type
religion               7998
age                    7992
gender                 7973
ethnicity              7961
not_cyberbullying      7945
other_cyberbullying    7823
Name: count, dtype: int64

In [8]:
# Dictionary that maps each label name to an integer class id.
labels_dict = {
    'not_cyberbullying': 0,
    'gender': 1,
    'religion': 2,
    'other_cyberbullying': 3,
    'age': 4,
    'ethnicity': 5
}

# Encode the labels with an explicit lookup and an explicit integer dtype
# instead of relying on `replace` to downcast the column to integers.
# A label missing from `labels_dict` becomes NaN under `map`, so the int64
# cast raises instead of leaving a stray string in the column.
# The dtype guard keeps the cell safe to re-run once the column is encoded.
if not pd.api.types.is_integer_dtype(df['cyberbullying_type']):
    df['cyberbullying_type'] = df['cyberbullying_type'].map(labels_dict).astype('int64')
df.head()

Unnamed: 0,tweet_text,cyberbullying_type
0,"In other words #katandandre, your food was cra...",0
1,Why is #aussietv so white? #MKR #theblock #ImA...,0
2,@XochitlSuckkks a classy whore? Or more red ve...,0
3,"@Jason_Gio meh. :P thanks for the heads up, b...",0
4,@RudhoeEnglish This is an ISIS account pretend...,0


- Elimina la puntuación y convierte el texto a minúsculas.
- Se utiliza el método __str.translate()__ para eliminar todos los caracteres de puntuación mediante una tabla de traducción creada con el método __str.maketrans()__. La constante `string.punctuation` contiene todos los caracteres de puntuación ASCII, que se eliminan de los valores de la columna `tweet_text`.

In [9]:
import string
# Show the ASCII punctuation characters that the next cell strips from the tweets.
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [10]:
# Lowercase every tweet, then delete all ASCII punctuation characters.
punctuation_table = str.maketrans('', '', string.punctuation)
lowercased_tweets = df['tweet_text'].str.lower()
df['tweet_text'] = lowercased_tweets.str.translate(punctuation_table)
df.head()

Unnamed: 0,tweet_text,cyberbullying_type
0,in other words katandandre your food was crapi...,0
1,why is aussietv so white mkr theblock imaceleb...,0
2,xochitlsuckkks a classy whore or more red velv...,0
3,jasongio meh p thanks for the heads up but no...,0
4,rudhoeenglish this is an isis account pretendi...,0


In [11]:
from sklearn.model_selection import train_test_split

# Hold out 15% of the rows for validation; the fixed random_state keeps the
# split repeatable across runs.
train_df, val_df = train_test_split(
    df,
    test_size=0.15,
    random_state=123,
)

In [12]:
# Class distribution within the training split.
train_df['cyberbullying_type'].value_counts()

cyberbullying_type
0    6826
1    6821
5    6777
4    6763
2    6743
3    6608
Name: count, dtype: int64

## 2.- Pipeline

In [13]:
import tensorflow as tf

# Pull the text column and the label column out of each split as arrays.
X_train, y_train = train_df['tweet_text'].values, train_df['cyberbullying_type'].values
X_val, y_val = val_df['tweet_text'].values, val_df['cyberbullying_type'].values

In [14]:
# Number of training and validation examples.
len(X_train), len(X_val)

(40538, 7154)

In [15]:
# Build one (text, label) tf.data dataset per split.
raw_train_ds, raw_val_ds = (
    tf.data.Dataset.from_tensor_slices(split)
    for split in ((X_train, y_train), (X_val, y_val))
)

In [16]:
def convert_label_to_one_hot(text, label, num_classes=6):
    """Convert an integer class label into a one-hot vector.

    Args:
        text: The example's text, passed through unchanged.
        label: Integer class id to encode.
        num_classes: Length of the one-hot vector. Defaults to 6, the value
            this notebook previously hard-coded.

    Returns:
        A ``(text, one_hot_label)`` tuple, where ``one_hot_label`` is the
        result of ``tf.one_hot(label, num_classes)``.
    """
    one_hot_label = tf.one_hot(label, num_classes)
    return text, one_hot_label

# One-hot encode the labels of both splits.
# Note: the raw datasets are rebound to their mapped versions, so running this
# cell a second time would apply the one-hot conversion again.
raw_train_ds = raw_train_ds.map(convert_label_to_one_hot)
raw_val_ds = raw_val_ds.map(convert_label_to_one_hot)

batch_size = 32
# Size the shuffle buffer from the actual training set rather than a literal
# (previously 40538), so it keeps covering every example if the CSV or the
# split ratio changes.
train_ds = raw_train_ds.shuffle(len(X_train)).batch(batch_size)
val_ds = raw_val_ds.batch(batch_size)

In [17]:
# Pull a single batch and print its first example. `test_text` stays bound
# after the loop and is reused by a later cell as sample input for `predict`.
for test_text, test_target in train_ds.take(1):
    print(test_text[0], test_target[0])

tf.Tensor(b'nasty ass eating food jweave mcjdagreat httpstcoeqsazurrbh', shape=(), dtype=string) tf.Tensor([0. 0. 0. 1. 0. 0.], shape=(6,), dtype=float32)


## 3.- Modelo

In [18]:
import keras_nlp

In [None]:
# Load the "bert_tiny_en_uncased" preset as a classifier with 6 output classes
# (one per entry in `labels_dict`).
bert_tiny = keras_nlp.models.BertClassifier.from_preset(
    "bert_tiny_en_uncased", num_classes=6
)

In [20]:
# Mark the whole model as trainable; individual layers are frozen in a later cell.
bert_tiny.trainable = True

In [21]:
# Inspect the top-level layers (used to decide which ones to freeze).
bert_tiny.layers

[<InputLayer name=padding_mask, built=True>,
 <InputLayer name=segment_ids, built=True>,
 <InputLayer name=token_ids, built=True>,
 <BertBackbone name=bert_backbone, built=True>,
 <Dropout name=dropout_6, built=True>,
 <Dense name=logits, built=True>]

In [22]:
# Freeze the first five layers (the inputs, the backbone and the dropout) so
# that only the remaining classification layer is updated during training.
for frozen_layer in bert_tiny.layers[:5]:
    frozen_layer.trainable = False

- Muestra el tamaño del modelo y los parámetros entrenables.

In [23]:
# Print the model summary (size and trainable parameters).
bert_tiny.summary()

- Prueba de las salidas de BERT.

In [24]:
# Sanity check: run the classifier on the sample batch drawn earlier and
# display the shape of its output.
test_output = bert_tiny.predict(test_text)
test_output.shape

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 687ms/step


(32, 6)

## 4.- Entrenamiento (última capa)

In [25]:
# Adam optimizer with a small learning rate; the loss is configured with
# from_logits=True and the labels are the one-hot vectors built earlier.
lr = 1e-4
optimizer = keras.optimizers.Adam(learning_rate=lr)

bert_tiny.compile(
    optimizer=optimizer,
    loss=keras.losses.CategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

In [26]:
# Train the partially frozen model, validating on `val_ds` after each epoch.
epochs = 3
history = bert_tiny.fit(x=train_ds, validation_data=val_ds, epochs=epochs)

Epoch 1/3
[1m1267/1267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 22ms/step - accuracy: 0.3043 - loss: 1.7299 - val_accuracy: 0.5102 - val_loss: 1.5182
Epoch 2/3
[1m1267/1267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 22ms/step - accuracy: 0.4601 - loss: 1.5201 - val_accuracy: 0.5212 - val_loss: 1.3805
Epoch 3/3
[1m1267/1267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 22ms/step - accuracy: 0.4821 - loss: 1.4161 - val_accuracy: 0.5516 - val_loss: 1.2976


### Evaluación

In [27]:
# Evaluate on the validation set; the compiled metric is accuracy.
bert_tiny.evaluate(val_ds)

[1m224/224[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.5546 - loss: 1.3007


[1.2975798845291138, 0.5515795350074768]

## 5.- Entrenamiento (todas las capas)

In [28]:
# Load a second, fresh instance of the same preset; this one is trained
# without freezing any layer.
bert_tiny_full = keras_nlp.models.BertClassifier.from_preset(
    "bert_tiny_en_uncased", num_classes=6
)

In [29]:
# Keep the whole model trainable (no per-layer freezing is applied to it).
bert_tiny_full.trainable = True

In [30]:
# Print the model summary for the fully trainable model.
bert_tiny_full.summary()

In [31]:
# Fresh Adam optimizer for the second model, with the same learning rate and
# the same loss/metric configuration as the first training run.
lr = 1e-4
optimizer = keras.optimizers.Adam(learning_rate=lr)

bert_tiny_full.compile(
    optimizer=optimizer,
    loss=keras.losses.CategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

In [32]:
# Fine-tune every layer, validating on `val_ds` after each epoch.
# `history` is rebound here, replacing the object returned by the earlier fit.
epochs = 2
history = bert_tiny_full.fit(x=train_ds, validation_data=val_ds, epochs=epochs)

Epoch 1/2
[1m1267/1267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 24ms/step - accuracy: 0.6869 - loss: 0.9236 - val_accuracy: 0.8418 - val_loss: 0.3933
Epoch 2/2
[1m1267/1267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 23ms/step - accuracy: 0.8517 - loss: 0.3787 - val_accuracy: 0.8500 - val_loss: 0.3680


### Evaluación

In [33]:
# Evaluate the fully fine-tuned model on the validation set.
bert_tiny_full.evaluate(val_ds)

[1m224/224[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.8466 - loss: 0.3695


[0.3680413067340851, 0.8500139713287354]

## Ejercicio
- Modifica la arquitectura y el entrenamiento para mejorar los resultados.
- Prueba diferentes versiones de BERT: https://keras.io/api/keras_nlp/models/