# BERT Fine-Tuning


<a name="sec:setup"></a>
### Instalación de librerías e importación de dependencias.


In [None]:
%%capture
!pip install transformers==4.26.0 tensorflow==2.11 pandas==1.3.5 plotly==5.5.0 scikit-learn==1.0.0

In [None]:
%reset -f

import os

import pandas as pd
from collections import Counter
from sklearn import preprocessing
from sklearn.utils.class_weight import compute_class_weight

# para evaluar los modelos
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay, roc_curve, auc
from sklearn.utils.multiclass import unique_labels

#  para construir gráficas y realizar análisis exploratorio de los datos
import plotly.graph_objects as go
import plotly.figure_factory as ff
import plotly.express as px
from tqdm import tqdm

# para guardar el modelo
import pickle
import tensorflow as tf

# algoritmos de clasificación, tokenizadores, etc.
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification, DistilBertConfig, TextClassificationPipeline

In [None]:
def get_model_inputs(cfg, data):
    """Tokenize ``data`` and keep only the tensors the model is fed with.

    Args:
        cfg: configuration dict; reads ``'tokenizer'`` (a callable),
            ``'max_length'`` and ``'framework'``.
        data: texts handed to the tokenizer as its first argument.

    Returns:
        dict with exactly two entries, ``'input_ids'`` and
        ``'attention_mask'``, taken from the tokenizer output.
    """
    tokenize = cfg['tokenizer']
    encoded = tokenize(
        data,
        truncation=True,
        padding='max_length',  # every sequence is padded up to max_length
        max_length=cfg['max_length'],
        return_tensors=cfg['framework'],
    )
    # Any other field produced by the tokenizer is deliberately dropped.
    return {key: encoded[key] for key in ('input_ids', 'attention_mask')}

def predict_model(model, cfg, data, pref='m'):
    """Score ``data`` with ``model`` and decode the predicted labels.

    Args:
        model: object whose ``predict(inputs)`` result exposes the raw
            scores under the ``'logits'`` key.
        cfg: configuration dict; reads ``'num_labels'`` and
            ``'label_binarizer'`` here, plus whatever
            ``get_model_inputs`` needs.
        data: texts to classify, forwarded to ``get_model_inputs``.
        pref: tag embedded in the names of the output columns.

    Returns:
        pandas.DataFrame with alphabetically sorted columns:
        ``scores_{pref}`` (single-output model) or one
        ``scores_{pref}_{class}`` column per class, plus ``labels_{pref}``.
    """
    inputs = get_model_inputs(cfg, data)
    # The model returns a TFSequenceClassifierOutput; the raw scores live
    # under the 'logits' key.
    scores = model.predict(inputs)['logits']

    binarizer = cfg['label_binarizer']
    if cfg['num_labels'] == 1:
        # Binary classification: the model emits a single score per instance.
        res = {f'scores_{pref}': scores[:, 0]}
    else:
        res = {
            f'scores_{pref}_{cls.lower()}': column
            for cls, column in zip(binarizer.classes_, scores.T)
        }

    # The scores are logits, not probabilities, so the decision boundary is 0.
    # The binarizer's default threshold (0.5) would mislabel every instance
    # whose logit falls in (0, 0.5]. The threshold is only used in the binary
    # and multilabel cases.
    res[f'labels_{pref}'] = binarizer.inverse_transform(scores, threshold=0)

    return pd.DataFrame(res, columns=sorted(res))


def evaluate_model(y_true, y_pred, y_score=None, pos_label='positive'):
    """Print classification metrics and plot the confusion matrix (and ROC).

    Args:
        y_true: ground-truth labels.
        y_pred: predicted labels.
        y_score: optional scores for the positive class; when given, a ROC
            curve with its AUC is also plotted.
        pos_label: label treated as positive by ``roc_curve``.

    Returns:
        None. Output is printed and shown as plotly figures.
    """
    print('==== Sumario de la clasificación ==== ')
    print(classification_report(y_true, y_pred))

    print('Accuracy -> {:.2%}\n'.format(accuracy_score(y_true, y_pred)))

    # Confusion matrix: rows are true labels, columns are predicted labels.
    display_labels = sorted(unique_labels(y_true, y_pred), reverse=True)
    cm = confusion_matrix(y_true, y_pred, labels=display_labels)

    # Rows (and their labels) are flipped together for display purposes.
    z = cm[::-1]
    x = display_labels
    y = x[::-1]
    z_text = [[str(count) for count in row] for row in z]

    fig_cm = ff.create_annotated_heatmap(z, x=x, y=y, annotation_text=z_text, colorscale='Viridis')

    # The heatmap x axis runs over the matrix columns (predictions) and the
    # y axis over its rows (true labels), so the axis titles must say so.
    fig_cm.update_layout(
        height=400, width=400,
        showlegend=True,
        margin={'t': 150, 'l': 0},
        title={'text': 'Matriz de Confusión', 'x': 0.5, 'y': 0.95, 'xanchor': 'center'},
        xaxis={'title_text': 'Valor Predicho', 'tickangle': 45, 'side': 'top'},
        yaxis={'title_text': 'Valor Real', 'tickmode': 'linear'},
    )
    fig_cm.show()

    if y_score is not None:
        fpr, tpr, _ = roc_curve(y_true, y_score, pos_label=pos_label)
        fig_roc = px.area(
            x=fpr, y=tpr,
            title=f'Curva ROC (AUC={auc(fpr, tpr):.4f})',
            labels=dict(x='Ratio Falsos Positivos', y='Ratio Verdaderos Positivos'),
            width=400, height=400,
        )
        # Diagonal reference line from (0, 0) to (1, 1).
        fig_roc.add_shape(type='line', line=dict(dash='dash'), x0=0, x1=1, y0=0, y1=1)

        fig_roc.update_yaxes(scaleanchor="x", scaleratio=1)
        fig_roc.update_xaxes(constrain='domain')

        fig_roc.show()

### Carga de datos



In [None]:
# Mount Google Drive inside the Colab runtime so the CSV files and the saved
# models under /content/drive are reachable; any previous mount is replaced.
from google.colab import drive

drive.mount(mountpoint='/content/drive', force_remount=True)

In [None]:
text_col = 'combined'
class_col = 'label'


def _pair_text(row):
    """Join the lower-cased author and candidate texts with a [SEP] marker."""
    return f"{row['author'].lower()} [SEP] {row['candidate'].lower()}"


# Training split: the first CSV column is used as the index.
train = pd.read_csv('/content/drive/MyDrive/TFM/v2/train_limpio_0_4.csv', index_col=0)
train[text_col] = train.apply(_pair_text, axis=1)
print(train.head())

# Validation split, built exactly like the training one.
val = pd.read_csv('/content/drive/MyDrive/TFM/v2/test_limpio_0_4.csv', index_col=0)
val[text_col] = val.apply(_pair_text, axis=1)
print(val.head())

### Modelo


In [None]:
# Settings shared by the tokenizer, the model and the helper functions.
cfg = {}
cfg['framework'] = 'tf'  # tensor type requested from the tokenizer
cfg['max_length'] = 512  # sequences are truncated / padded to this length
# The notebook builds DistilBERT classes (DistilBertTokenizer,
# TFDistilBertForSequenceClassification), so the checkpoint must be a
# DistilBERT one. Pointing them at 'bert-base-uncased' leaves the encoder
# weights unmatched and therefore randomly initialised.
cfg['transformer_model_name'] = 'distilbert-base-uncased'
cfg['num_labels'] = 1  # single logit: binary classification

#### Configuración del modelo

In [None]:
# Classification head with cfg['num_labels'] outputs and a dropout of 0.5
# before the classifier.
config = DistilBertConfig(num_labels=cfg['num_labels'], seq_classif_dropout=0.5)

model = TFDistilBertForSequenceClassification.from_pretrained(cfg['transformer_model_name'], config=config)

optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)

# The model emits logits (see from_logits=True above), so the accuracy metric
# must split predictions at 0 rather than at the 0.5 default, which assumes
# probabilities. The metric keeps the name 'binary_accuracy', so the history
# keys ('binary_accuracy', 'val_binary_accuracy') are unchanged.
binary_accuracy = tf.keras.metrics.BinaryAccuracy(name='binary_accuracy', threshold=0.0)

model.compile(optimizer=optimizer, loss=loss, metrics=[binary_accuracy])

model.summary()

### Pre-procesamiento de los datos

#### Instanciar tokenizador, etc.

In [None]:
# Register the tokenizer (loaded from the configured checkpoint) and a fresh,
# still unfitted label binarizer in the shared configuration.
cfg.update(
    tokenizer=DistilBertTokenizer.from_pretrained(cfg['transformer_model_name']),
    label_binarizer=preprocessing.LabelBinarizer(),
)

#### Pre-procesamiento

In [None]:
# Fit the binarizer on the training labels only; fit() returns the fitted
# binarizer itself, which is the same object stored in cfg.
_binarizer = cfg['label_binarizer'].fit(train[class_col])

# Persist the fitted binarizer so predictions can be decoded later.
with open('label_binarizer_reviews.pkl', 'wb') as f:
    pickle.dump(_binarizer, f)

# Encode the labels of both splits with the fitted binarizer.
train_blabels = _binarizer.transform(train[class_col])
val_blabes = _binarizer.transform(val[class_col])  # name kept as in the original notebook

train_blabels_t = tf.convert_to_tensor(train_blabels, dtype='int32')
val_blabels_t = tf.convert_to_tensor(val_blabes, dtype='int32')

# Tokenize the text column of both splits.
train_texts = train[text_col].to_list()
val_texts = val[text_col].to_list()
train_inputs = get_model_inputs(cfg, train_texts)
val_inputs = get_model_inputs(cfg, val_texts)

### Entrenamiento del modelo
Vamos a entrenar el modelo probando distintos valores de epochs, entre 1 y 10, y a comparar las métricas de entrenamiento y validación en cada uno.

In [None]:
# Settings
cfg['checkpoints_dir'] = 'checkpoints'  # directory receiving the checkpoints
cfg['model_name'] = 'bert-authors'  # identifier used in checkpoint names
cfg['trained_model_name'] = os.path.join(cfg['checkpoints_dir'], cfg['model_name'])

epochs_max = 10
epochs_to_save = 1
batch_size = 16
histories = pd.DataFrame(None, columns = ["epoch", "loss", "binary_accuracy",
                                          "val_loss", "val_binary_accuracy"])

# Create the output directory up front, so a missing folder cannot abort the
# run after an epoch has already been trained.
models_dir = "/content/drive/MyDrive/TFM/models"
os.makedirs(models_dir, exist_ok=True)

for epoch_current in range(0, epochs_max, epochs_to_save):
    epoch_from = epoch_current + 1
    # Never train past epochs_max when it is not a multiple of epochs_to_save.
    epoch_to = min(epoch_current + epochs_to_save, epochs_max)
    print(f'Training model, epochs {epoch_from} - {epoch_to}')

    history = model.fit(train_inputs, y=train_blabels_t, initial_epoch=epoch_current, epochs=epoch_to,
                        batch_size=batch_size, validation_data=(val_inputs, val_blabels_t))

    # Record the metrics of the LAST epoch in this chunk, which is the state
    # being checkpointed below (same as index 0 when epochs_to_save == 1).
    row = pd.DataFrame([[epoch_to,
                         history.history['loss'][-1],
                         history.history['binary_accuracy'][-1],
                         history.history['val_loss'][-1],
                         history.history['val_binary_accuracy'][-1]]],
                       columns=histories.columns)
    # The newest row goes first, as before.
    histories = pd.concat([row, histories], ignore_index=True)

    checkpoint_path = cfg['trained_model_name'] + f'-epochs-{epoch_from:03d}-{epoch_to:03d}'
    model.save_pretrained(checkpoint_path)
    cfg['tokenizer'].save_pretrained(checkpoint_path)

    # NOTE(review): the save_pretrained checkpoint above is the library's own
    # format; the pickle below is kept because a later cell loads
    # bert_v2_<epoch>.pkl. Consider loading from the checkpoint instead.
    model_pkl_file = os.path.join(models_dir, "bert_v2_" + str(epoch_to) + ".pkl")
    with open(model_pkl_file, 'wb') as file:
        pickle.dump(model, file)

In [None]:
import matplotlib.pyplot as plt

# Two side-by-side panels comparing training and validation metrics per epoch.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
fig.suptitle('Train VS Validation')

panels = (
    (ax1, 'Accuracy', ['binary_accuracy', 'val_binary_accuracy']),
    (ax2, 'Loss', ['loss', 'val_loss']),
)
for axis, panel_title, metric_cols in panels:
    # Training curve in purple, validation curve in teal.
    histories.plot.line(ax=axis, x='epoch', y=metric_cols, color=['rebeccapurple', 'teal'])
    axis.set_title(panel_title)

plt.show()

El mejor resultado se obtiene con 6 epochs.

In [None]:
# Settings
cfg['checkpoints_dir'] = 'checkpoints'  # directory where training checkpoints are written
cfg['model_name'] = 'bert-authors'  # identifier used when naming the checkpoints
cfg['trained_model_name'] = os.path.join(cfg['checkpoints_dir'], cfg['model_name'])

epochs_max = 10
epochs_to_save = 1
batch_size = 16

# NOTE(review): this loop keeps training whatever `model` holds when the cell
# runs and goes through epochs_max epochs again - confirm this is intended.
for epoch_current in range(0, epochs_max, epochs_to_save):
    epoch_from = epoch_current + 1
    epoch_to = epoch_current + epochs_to_save
    print(f'Training model, epochs {epoch_from} - {epoch_to}')

    model.fit(
        train_inputs,
        y=train_blabels_t,
        initial_epoch=epoch_current,
        epochs=epoch_to,
        batch_size=batch_size,
        validation_data=(val_inputs, val_blabels_t),
    )

    # Model and tokenizer are written to the same per-chunk checkpoint folder.
    checkpoint_path = cfg['trained_model_name'] + f'-epochs-{epoch_from:03d}-{epoch_to:03d}'
    model.save_pretrained(checkpoint_path)
    cfg['tokenizer'].save_pretrained(checkpoint_path)

### Métricas

In [None]:
# Restore the model pickled after epoch 6 of the training run above.
# Only unpickle files you produced yourself: pickle can execute arbitrary code.
model_pkl_file = "/content/drive/MyDrive/TFM/models/bert_v2_6.pkl"
with open(model_pkl_file, mode='rb') as pkl_stream:
    model = pickle.load(pkl_stream)

In [None]:
# Evaluate the loaded model on the validation split.
data = val
true_labels = data[class_col]

texts = data[text_col].to_list()
m_pred = predict_model(model, cfg, texts, pref='m')

# No scores are passed here, so only the report and confusion matrix are shown.
evaluate_model(true_labels, m_pred['labels_m'])

print('Done!')