In [1]:
# The MIT License (MIT) Copyright (c) 2023 milmor
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of 
# this software and associated documentation files (the "Software"), to deal in the Software without 
# restriction, including without limitation the rights to use, copy, modify, merge, publish, 
# distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the 
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all copies or 
# substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 
# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES 
# OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

# Clasificación de cyberbullying con BERT

<img src="../img/bert.png" width="700"/>

__Imagen tomada de Devlin, J., Chang, M. W., Lee, K., & Toutanova, K. (2018). BERT: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805.__

## 1.- Conjuntos de datos
- Partición de entrenamiento, validación y prueba.

In [2]:
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
# Read the tweet dataset from the CSV file located next to the notebook.
df = pd.read_csv(filepath_or_buffer='./cyberbullying_tweets.csv')

In [4]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,tweet_text,cyberbullying_type
0,"In other words #katandandre, your food was cra...",not_cyberbullying
1,Why is #aussietv so white? #MKR #theblock #ImA...,not_cyberbullying
2,@XochitlSuckkks a classy whore? Or more red ve...,not_cyberbullying
3,"@Jason_Gio meh. :P thanks for the heads up, b...",not_cyberbullying
4,@RudhoeEnglish This is an ISIS account pretend...,not_cyberbullying


In [5]:
# List the distinct class names present in the label column.
df['cyberbullying_type'].unique()

array(['not_cyberbullying', 'gender', 'religion', 'other_cyberbullying',
       'age', 'ethnicity'], dtype=object)

In [6]:
# Count how many tweets belong to each class
count = df.cyberbullying_type.value_counts()
count

religion               7998
age                    7992
gender                 7973
ethnicity              7961
not_cyberbullying      7945
other_cyberbullying    7823
Name: cyberbullying_type, dtype: int64

In [7]:
from sklearn.model_selection import train_test_split

# Keep 70% of the rows for training; the remaining 30% is stored in test_df for now.
train_df, test_df = train_test_split(df, test_size=0.30, random_state=123)

In [8]:
# Class distribution of the training split.
train_df['cyberbullying_type'].value_counts()

gender                 5655
not_cyberbullying      5610
ethnicity              5601
age                    5573
religion               5557
other_cyberbullying    5388
Name: cyberbullying_type, dtype: int64

In [9]:
# Split the held-out rows in half: one half for validation, one half for test.
# NOTE: this cell reads and overwrites test_df, so running it a second time
# would split the already-halved test set again.
val_df, test_df = train_test_split(test_df, test_size=0.50, random_state=123)

In [10]:
# Class distribution of the validation split.
val_df['cyberbullying_type'].value_counts()

religion               1250
other_cyberbullying    1219
age                    1198
gender                 1177
ethnicity              1164
not_cyberbullying      1146
Name: cyberbullying_type, dtype: int64

In [11]:
# Class distribution of the test split.
test_df['cyberbullying_type'].value_counts()

age                    1221
other_cyberbullying    1216
ethnicity              1196
religion               1191
not_cyberbullying      1189
gender                 1141
Name: cyberbullying_type, dtype: int64

## 2.- Preprocesamiento

In [12]:
# Map each class name to an integer id.
labels_dict = {
    'not_cyberbullying': 0,
    'gender': 1,
    'religion': 2,
    'other_cyberbullying': 3,
    'age': 4,
    'ethnicity': 5
}


def _encode_labels(split_df):
    """Return a new frame whose 'cyberbullying_type' column holds integer ids.

    Class names found in labels_dict are swapped for their id; values that are
    not keys of labels_dict (for example ids produced by a previous run of this
    cell) pass through unchanged, so re-running the cell is harmless. The final
    cast makes the column int64 explicitly and raises if a value cannot be
    converted, instead of letting an unknown label slip through silently.
    """
    encoded = (
        split_df['cyberbullying_type']
        .map(lambda name: labels_dict.get(name, name))
        .astype('int64')
    )
    # assign() builds a new DataFrame rather than writing into the frame handed
    # back by train_test_split, which can trigger pandas' SettingWithCopyWarning.
    return split_df.assign(cyberbullying_type=encoded)


# Encode the label column of every split.
train_df = _encode_labels(train_df)
val_df = _encode_labels(val_df)
test_df = _encode_labels(test_df)

train_df.head()

Unnamed: 0,tweet_text,cyberbullying_type
26651,@AntonSirius @erinspice @prpltnkr @ChiefElk oh...,3
4820,"Ladies ""Bedroom Bully"" The Mix Cd By @GappyRan...",0
1847,RT @_bobbidana: Never thought I'd say this but...,0
20653,She is intellectual terrorists and world suffe...,2
19195,You saudias are not friends of Muslim idiots c...,2


- Verifica cadenas vacías.

## 3.- Pipeline

In [13]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # Disable tensorflow debugging logs
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

# For each split, pull out the tweet text (inputs) and the label column
# (targets) as NumPy arrays.
X_train, y_train = train_df['tweet_text'].values, train_df['cyberbullying_type'].values

X_val, y_val = val_df['tweet_text'].values, val_df['cyberbullying_type'].values

X_test, y_test = test_df['tweet_text'].values, test_df['cyberbullying_type'].values

In [14]:
# Number of examples in the train, validation and test splits.
len(X_train), len(X_val), len(X_test)

(33384, 7154, 7154)

In [15]:
# Wrap each (texts, labels) pair of arrays in a tf.data.Dataset that yields
# one (text, label) element per tweet.
raw_train_ds = tf.data.Dataset.from_tensor_slices(tensors=(X_train, y_train))
raw_val_ds = tf.data.Dataset.from_tensor_slices(tensors=(X_val, y_val))
raw_test_ds = tf.data.Dataset.from_tensor_slices(tensors=(X_test, y_test))

In [16]:
# Define a function to convert the label to a one-hot encoding
def convert_label_to_one_hot(text, label, num_classes=6):
    """Turn an integer class id into a one-hot vector, leaving the text as is.

    Args:
        text: The tweet text; returned unchanged.
        label: Integer class id (or a tensor of ids) to encode.
        num_classes: Length of the one-hot vector. Defaults to 6, so existing
            two-argument callers such as ``Dataset.map`` behave as before.

    Returns:
        A ``(text, one_hot_label)`` tuple.
    """
    one_hot_label = tf.one_hot(label, depth=num_classes)
    return text, one_hot_label

# Replace the integer label of every element with its one-hot vector.
raw_train_ds = raw_train_ds.map(convert_label_to_one_hot)
raw_val_ds = raw_val_ds.map(convert_label_to_one_hot)
raw_test_ds = raw_test_ds.map(convert_label_to_one_hot)

batch_size = 32
# Shuffle with a buffer that holds the whole training set. The size is read
# from the dataset itself instead of a hard-coded count, so it stays correct
# if the split sizes change.
train_ds = raw_train_ds.shuffle(len(raw_train_ds)).batch(batch_size)
# Validation and test data are only batched; their order is left untouched.
val_ds = raw_val_ds.batch(batch_size)
test_ds = raw_test_ds.batch(batch_size)

In [17]:
# Take one batch from the shuffled training set and print its first tweet
# together with its one-hot label.
# NOTE: despite their names, test_text/test_target hold a *training* batch.
# As loop variables they stay defined after the loop, and test_text is read
# again by the later predict cell.
for test_text, test_target in train_ds.take(1):
    print(test_text[0], test_target[0])

tf.Tensor(b"@jritter33 can I have details? U fucking droopy eyed cock sucker nigger loving Cunt this Is y we're not home dumb fuck #obamalover", shape=(), dtype=string) tf.Tensor([0. 0. 0. 0. 0. 1.], shape=(6,), dtype=float32)


## 3.- Modelo

In [18]:
import keras_nlp

In [19]:
# Load the "bert_tiny_en_uncased" preset as a classifier with 6 output classes.
bert_tiny = keras_nlp.models.BertClassifier.from_preset(
    "bert_tiny_en_uncased",
    num_classes=6,
)

In [20]:
# Mark the whole classifier as trainable; individual layers are frozen in a later cell.
bert_tiny.trainable = True

In [21]:
# Show the top-level layers to find the indices used for freezing.
bert_tiny.layers

[<keras.engine.input_layer.InputLayer at 0x7f85bc14e550>,
 <keras.engine.input_layer.InputLayer at 0x7f85bc14e040>,
 <keras.engine.input_layer.InputLayer at 0x7f85bc3e18b0>,
 <keras_nlp.src.models.bert.bert_backbone.BertBackbone at 0x7f85bc3e1640>,
 <keras.layers.regularization.dropout.Dropout at 0x7f85b7f43940>,
 <keras.layers.core.dense.Dense at 0x7f85bc105be0>]

In [22]:
# Freeze the layers at indices 0 through 4; any layer after them stays trainable.
for frozen_layer in bert_tiny.layers[:5]:
    frozen_layer.trainable = False

- Muestra el tamaño del modelo y los parámetros entrenables.

In [23]:
# Print the architecture summary, including the parameter counts.
bert_tiny.summary()

Preprocessor: "bert_preprocessor"
__________________________________________________________________________________________________
 Tokenizer (type)                                    Vocab #     
 bert_tokenizer (BertTokenizer)                      30522       
__________________________________________________________________________________________________
                                                                                                  
Model: "bert_classifier"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 padding_mask (InputLayer)      [(None, None)]       0           []                               
                                                                                                  
 segment_ids (InputLayer)       [(None, None)]       0           []                               
                 

- Prueba de las salidas de BERT.

In [24]:
# Run the classifier on the batch of tweets left in test_text by the earlier
# preview loop; the output has one row per tweet and one column per class.
test_output = bert_tiny.predict(test_text)
test_output.shape



(32, 6)

## 4.- Entrenamiento (última capa)

In [25]:
# Adam optimizer with a small learning rate for this training stage.
lr = 1e-4
opt = tf.keras.optimizers.Adam(learning_rate=lr)

# Labels are one-hot vectors, hence categorical cross-entropy; from_logits=True
# tells the loss to treat the model outputs as raw scores, not probabilities.
bert_tiny.compile(
    optimizer=opt,
    loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

In [26]:
# Train for three epochs on train_ds, using val_ds as validation data.
epochs = 3
history = bert_tiny.fit(train_ds, validation_data=val_ds, epochs=epochs)

Epoch 1/3
Epoch 2/3
Epoch 3/3


### Evaluación

In [27]:
# Evaluate on the held-out test set; returns the loss followed by the accuracy metric.
bert_tiny.evaluate(test_ds)



[1.3602951765060425, 0.525020956993103]

## 5.- Entrenamiento (todas las capas)

In [28]:
# Load a second, fresh copy of the same preset for the run that trains all layers.
bert_tiny_full = keras_nlp.models.BertClassifier.from_preset(
    "bert_tiny_en_uncased",
    num_classes=6,
)

In [29]:
# Mark the whole model as trainable; unlike bert_tiny, no layers are frozen afterwards.
bert_tiny_full.trainable = True

In [30]:
# Print the architecture summary, including the parameter counts.
bert_tiny_full.summary()

Preprocessor: "bert_preprocessor_1"
__________________________________________________________________________________________________
 Tokenizer (type)                                    Vocab #     
 bert_tokenizer_1 (BertTokenizer)                    30522       
__________________________________________________________________________________________________
                                                                                                  
Model: "bert_classifier_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 padding_mask (InputLayer)      [(None, None)]       0           []                               
                                                                                                  
 segment_ids (InputLayer)       [(None, None)]       0           []                               
             

In [31]:
# A fresh Adam optimizer for the second model (rebinds lr and opt).
lr = 1e-4
opt = tf.keras.optimizers.Adam(learning_rate=lr)

# Same loss and metric as the first run: categorical cross-entropy on raw
# scores (from_logits=True) and accuracy.
bert_tiny_full.compile(
    optimizer=opt,
    loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

In [32]:
# Train every layer for two epochs on train_ds, using val_ds as validation data.
# NOTE: this rebinds `history`, replacing the object returned by the earlier
# bert_tiny.fit call.
epochs = 2
history = bert_tiny_full.fit(train_ds, validation_data=val_ds, epochs=epochs)

Epoch 1/2
Epoch 2/2


### Evaluación

In [33]:
# Evaluate the fully trained model on the test set; returns the loss followed by the accuracy metric.
bert_tiny_full.evaluate(test_ds)



[0.3975391685962677, 0.8405088186264038]

## Ejercicio
- Modifica la arquitectura y el entrenamiento para mejorar los resultados.
- Prueba diferentes versiones de BERT: https://keras.io/api/keras_nlp/models/