<a href="https://colab.research.google.com/github/karengarm/NLP_Disaster_Tweets_Classification/blob/main/04_Fine_tuning_XLNET.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fine tuning XLNET

XLNet is one method that aims to overcome the limitations of the BERT model.
It does so by combining the bidirectional approach that BERT also uses with autoregressive training, which allows training without
masking any words and thus avoids removing the context that the masked words provide.

In this Jupyter notebook we fine-tune XLNet transformer models; to track our experiments we use MLflow Tracking. The XLNet-based model is composed of the XLNet main block and a classification layer. The model uses binary cross-entropy as its optimization criterion.

In [None]:
# Mount Google Drive under /content/drive (Colab-only import).
from google.colab import drive
drive.mount('/content/drive')
# Notebook shell commands: install the sentencepiece and transformers packages.
!pip install sentencepiece
!pip install transformers 
import pandas as pd
import numpy as np
import gc
import os
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix, roc_curve, auc
import re
import tensorflow as tf
from transformers import TFXLNetModel, XLNetTokenizer
from sklearn.utils.fixes import loguniform
from sklearn.model_selection import ParameterSampler
import json
import csv 

import seaborn as sn
import matplotlib.pyplot as plt

# Work from the Drive root so that the relative CSV paths read later in the
# notebook ('train_data.csv', 'test_data.csv') resolve against it.
dir_path = '/content/drive/MyDrive/'
os.chdir(dir_path)

In [None]:
# Install MLflow, then point the Databricks CLI at the Community Edition host.
# NOTE(review): `databricks configure` is expected to ask for credentials
# interactively - confirm before running this cell unattended.
!pip install mlflow
!databricks configure --host https://community.cloud.databricks.com/
import mlflow
import mlflow.keras

In [None]:
def evaluate(y_test, y_pred, mlflow):
    """
    Evaluation function. Scores the predictions against the true labels,
    prints a classification report and logs every metric through ``mlflow``.

    The ROC AUC is computed on the raw scores; all other metrics are computed
    on hard labels obtained by thresholding the scores at 0.5. Per-class
    results are indexed 0 = 'Not informative', 1 = 'Related and informative'.

    :param y_test: true binary labels (0 or 1), one per sample
    :param y_pred: predicted scores for class 1, one per sample; an (n, 1)
        array such as the output of a single sigmoid unit is flattened
    :param mlflow: object exposing ``log_metric(name, value)``; the callers in
        this notebook pass the ``mlflow`` module itself
    :return: tuple ``(accuracy, precision, recall, f1score, mauc)`` where
        precision, recall and f1score are length-2 arrays (one entry per class)
    """
    class_ids = [0, 1]
    class_names = ['Not informative', 'Related and informative']

    # Flatten both sides so an (n, 1) prediction array lines up with (n,) labels.
    y_true = np.asarray(y_test).ravel()
    scores = np.asarray(y_pred).ravel()

    # AUC is threshold-free, so it is computed before the scores are binarised.
    fpr, tpr, _ = roc_curve(y_true, scores, pos_label=1)
    mauc = auc(fpr, tpr)

    # Every remaining metric uses hard labels from a fixed 0.5 cut-off.
    y_hat = np.where(scores > 0.5, 1, 0)

    accuracy = accuracy_score(y_true, y_hat)
    # labels=class_ids guarantees one entry per class even when a class is
    # missing from both y_true and y_hat, so the [0]/[1] lookups below are safe.
    precision = precision_score(y_true, y_hat, labels=class_ids, average=None)
    recall = recall_score(y_true, y_hat, labels=class_ids, average=None)
    f1score = f1_score(y_true, y_hat, labels=class_ids, average=None)

    print(classification_report(y_true, y_hat, labels=class_ids, target_names=class_names))

    mlflow.log_metric("auc_test", mauc)
    mlflow.log_metric("accuracy_test", accuracy)
    mlflow.log_metric("precision_NI", precision[0])
    mlflow.log_metric("precision_I", precision[1])
    mlflow.log_metric("recall_NI", recall[0])
    mlflow.log_metric("recall_I", recall[1])
    mlflow.log_metric("f1score_NI", f1score[0])
    mlflow.log_metric("f1score_I", f1score[1])
    return accuracy, precision, recall, f1score, mauc

def mlflow_log_parameters(cat, parameter):
    """Log the crisis type and the sampled hyperparameters via ``mlflow.log_param``.

    :param cat: crisis type, stored under the ``crisis_type`` param
    :param parameter: mapping providing ``dropout``, ``learning_rate``,
        ``epochs`` and ``batch_size``
    """
    mlflow.log_param("crisis_type", cat)
    # Hyperparameters are logged under the same name as their key.
    for key in ("dropout", "learning_rate", "epochs", "batch_size"):
        mlflow.log_param(key, parameter[key])


def get_inputs(tweets, tokenizer, max_len=120):
    """Tokenize tweets into fixed-length arrays using the tokenizer provided.

    Each tweet is encoded on its own, padded up to ``max_len`` and truncated
    down to ``max_len``, so every row of the returned arrays has the same
    length.

    :param tweets: iterable of raw tweet strings
    :param tokenizer: object whose ``encode_plus`` returns a mapping with
        ``input_ids``, ``attention_mask`` and ``token_type_ids``
    :param max_len: number of tokens per encoded tweet
    :return: tuple ``(input_ids, attention_masks, token_type_ids)`` of numpy
        arrays, one row per tweet
    """
    encoded = [
        tokenizer.encode_plus(
            tweet,
            max_length=max_len,
            # Explicit padding/truncation replaces the deprecated
            # ``pad_to_max_length=True`` and makes the fixed row length
            # independent of the tokenizer's fallback behaviour.
            padding='max_length',
            truncation=True,
            add_special_tokens=True,
        )
        for tweet in tweets
    ]
    inp_tok = np.array([enc['input_ids'] for enc in encoded])
    attention_masks = np.array([enc['attention_mask'] for enc in encoded])
    segments = np.array([enc['token_type_ids'] for enc in encoded])
    return inp_tok, attention_masks, segments

def create_xlnet(mname, ndropout, nlr, max_len=120):
    """Build and compile the XLNet classifier: main block + dropout + sigmoid unit.

    :param mname: name or path of the pretrained XLNet weights passed to
        ``TFXLNetModel.from_pretrained``
    :param ndropout: dropout rate applied to the document encoding
    :param nlr: learning rate for the Adam optimizer
    :param max_len: number of token ids per example; must equal the length of
        the rows fed to the model (120 by default)
    :return: compiled ``tf.keras.Model`` mapping token ids to one probability
    """
    # Define token ids as inputs
    word_inputs = tf.keras.Input(shape=(max_len,), name='word_inputs', dtype='int32')

    # XLNet model; element 0 of its output is the per-token encoding
    xlnet = TFXLNetModel.from_pretrained(mname)
    xlnet_encodings = xlnet(word_inputs)[0]

    # Classification layer with dropout for regularization.
    # The encoding of the last sequence position is used as the document vector.
    doc_encoding = tf.squeeze(xlnet_encodings[:, -1:, :], axis=1)
    doc_encoding = tf.keras.layers.Dropout(ndropout)(doc_encoding)
    outputs = tf.keras.layers.Dense(1, activation='sigmoid', name='outputs')(doc_encoding)

    # Compile model
    model = tf.keras.Model(inputs=[word_inputs], outputs=[outputs])
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=nlr),
        loss='binary_crossentropy',
        metrics=[
            tf.keras.metrics.AUC(),
            'accuracy',
            tf.keras.metrics.Precision(),
            tf.keras.metrics.Recall(),
        ],
    )

    return model


def _plot_history_curves(mlflow, curves, title, xlabel, ylabel, artifact_file, legend_loc=None):
    """Draw ``(values, label)`` curves on a new figure, log it, show it, then close it."""
    fig, ax = plt.subplots()
    for values, label in curves:
        ax.plot(values, label=label)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    if legend_loc is None:
        ax.legend()
    else:
        ax.legend(loc=legend_loc)
    # Log before showing so the stored artifact never depends on what the
    # active backend does to the figure once it has been displayed.
    mlflow.log_figure(fig, artifact_file)
    plt.show()
    # Close explicitly: this function runs once per training run, and
    # unclosed figures would otherwise pile up in memory.
    plt.close(fig)


def training_test_mae(mlflow, history):
    """Plot training vs. validation accuracy and loss per epoch and log both figures.

    Despite its name, this function plots accuracy and loss, not MAE.

    :param mlflow: object exposing ``log_figure(figure, artifact_file)``; the
        callers in this notebook pass the ``mlflow`` module
    :param history: object whose ``history`` mapping holds the per-epoch
        series ``accuracy``, ``val_accuracy``, ``loss`` and ``val_loss``
    """
    metrics = history.history

    _plot_history_curves(
        mlflow,
        [(metrics['accuracy'], 'Training acc'), (metrics['val_accuracy'], 'Validation acc')],
        title='Training and validation accuracy',
        xlabel='Epochs',
        ylabel='accuracy',
        artifact_file="training_validation_accuracy.png",
    )

    # The second curve is the validation loss, so it is labelled as such.
    _plot_history_curves(
        mlflow,
        [(metrics['loss'], 'train'), (metrics['val_loss'], 'validation')],
        title='model loss',
        xlabel='epoch',
        ylabel='loss',
        artifact_file="training_validation_loss.png",
        legend_loc='upper left',
    )

def training_Xlnet(inp_tok, xlnet_model, ndropout, nlr, nepochs, nbatch_size, mlflow, labels=None):
    """Build, train and log an XLNet classifier, and return the trained model.

    :param inp_tok: array of token ids, one row per training example
    :param xlnet_model: name or path of the pretrained XLNet weights
    :param ndropout: dropout rate of the classification head
    :param nlr: learning rate
    :param nepochs: number of training epochs
    :param nbatch_size: training batch size
    :param mlflow: the ``mlflow`` module (used for ``mlflow.keras.log_model``
        and passed on for figure logging)
    :param labels: training targets aligned with ``inp_tok``; when omitted the
        module-level ``y_train`` is used, which is what existing callers rely on
    :return: the fitted Keras model
    """
    if labels is None:
        # Backward compatibility: earlier callers set a global ``y_train``
        # right before calling this function.
        labels = y_train

    xlnet = create_xlnet(xlnet_model, ndropout, nlr)
    # NOTE: Keras takes the validation split from the last 15% of the rows as
    # given (before shuffling); it is not a random sample.
    history = xlnet.fit(x=inp_tok, y=labels, epochs=nepochs, batch_size=nbatch_size, validation_split=.15)
    training_test_mae(mlflow, history)
    # Log the model only after fitting so the stored artifact holds the
    # fine-tuned weights rather than the freshly initialised classifier.
    mlflow.keras.log_model(xlnet, "Xlnet")
    return xlnet

## Scenario 1


In the first scenario, the goal is to test the effectiveness of the models by only training on data from
the same type of disaster. More specifically, each model will be trained and validated
on a specific disaster type and then tested on another crisis of the same type. 

In [None]:
# Send MLflow runs to the Databricks tracking server and group them under the
# Scenario 1 experiment.
# NOTE(review): the experiment path sits in one user's workspace folder -
# change it when running under a different account.
mlflow.set_tracking_uri("databricks")
mlflow.set_experiment("/Users/karengar@stanford.edu/Xlnet_Scenario1")

In [None]:
# Load the test and training sets. The separator must be the tab escape '\t':
# a bare 't' would split every field (and the header names) on the letter "t".
df_test = pd.read_csv('test_data.csv', header=0, sep='\t')
df_train = pd.read_csv('train_data.csv', header=0, sep='\t')

xlnet_model = 'xlnet-base-cased'
xlnet_tokenizer = XLNetTokenizer.from_pretrained(xlnet_model)

# Random search space. The learning rate is handed over as a distribution so
# that ParameterSampler draws it from the seeded `rng`; drawing candidates
# up front with an unseeded `.rvs(...)` would make the search irreproducible.
rng = np.random.RandomState(0)
param_dist = {
    'dropout': [0.1, 0.2, 0.3],
    'learning_rate': loguniform(1e-6, 1e-4),
    'epochs': [5, 7],
    'batch_size': [16, 32, 64],
}

dict_parameters = ParameterSampler(param_distributions=param_dist, n_iter=10, random_state=rng)

In [None]:
# Scenario 1: for every sampled hyperparameter set, train on one crisis type
# and test on held-out data of that same type, with one MLflow run per pair.
for parameter in dict_parameters:
    print(parameter)
    # Only the 2nd and 3rd crisis types (in order of first appearance in the
    # test set) are evaluated.
    for cat in list(df_test.categorization_type.unique())[1:3]:
        # The context manager ends the run on exit, so no explicit
        # mlflow.end_run() is needed.
        with mlflow.start_run():
            mlflow_log_parameters(cat, parameter)

            train_rows = df_train[df_train['categorization_type'] == cat]
            X_train = train_rows['tweettext_proc'].values
            # Must stay a module-level name: training_Xlnet reads `y_train`
            # from the global scope.
            y_train = np.where(train_rows.cat_informativeness == 'Related and informative', 1, 0)

            test_rows = df_test[df_test['categorization_type'] == cat]
            X_test = test_rows['tweettext_proc'].values
            y_test = np.where(test_rows.cat_informativeness == 'Related and informative', 1, 0)

            # Training (only the token ids are fed to the model)
            inp_tok = get_inputs(X_train, xlnet_tokenizer)[0]
            xlnet = training_Xlnet(
                inp_tok,
                xlnet_model,
                parameter['dropout'],
                parameter['learning_rate'],
                parameter['epochs'],
                parameter['batch_size'],
                mlflow,
            )

            # Testing
            inp_tok = get_inputs(X_test, xlnet_tokenizer)[0]
            y_pred = xlnet.predict(inp_tok, verbose=True)
            print(cat)
            evaluate(y_test, y_pred, mlflow)

        # A fresh XLNet is built on every iteration; drop the previous one and
        # reset Keras' global state so memory does not grow across runs.
        del xlnet
        tf.keras.backend.clear_session()
        gc.collect()

## Scenario 2

The second scenario aims to test the performance of the models in the transfer of information between disasters. In this case the models were trained using 22 293 tweets belonging to 22 different disasters and tested on the remaining 4 disasters. The same test sets as in scenario 1 were used, but the training set was not divided into different crisis types. Instead, all the data was used together, and it also included crisis types other than derailments, earthquakes and floods. As in scenario 1, the training and validation sets were split randomly, with 85% in the training set and 15% in the validation set.


In [None]:
# Send MLflow runs to the Databricks tracking server and group them under the
# Scenario 2 experiment.
# NOTE(review): the experiment path sits in one user's workspace folder -
# change it when running under a different account.
mlflow.set_tracking_uri("databricks")
mlflow.set_experiment("/Users/karengar@stanford.edu/Xlnet_Scenario2")

In [None]:
# Load the test and training sets. The separator must be the tab escape '\t':
# a bare 't' would split every field (and the header names) on the letter "t".
df_test = pd.read_csv('test_data.csv', header=0, sep='\t')
df_train = pd.read_csv('train_data.csv', header=0, sep='\t')

xlnet_model = 'xlnet-base-cased'
xlnet_tokenizer = XLNetTokenizer.from_pretrained(xlnet_model)

# Random search space. The learning rate is handed over as a distribution so
# that ParameterSampler draws it from the seeded `rng`; drawing candidates
# up front with an unseeded `.rvs(...)` would make the search irreproducible.
rng = np.random.RandomState(0)
param_dist = {
    'dropout': [0.1, 0.2, 0.3],
    'learning_rate': loguniform(1e-6, 1e-4),
    'epochs': [5, 7],
    'batch_size': [16, 32],
}
dict_parameters = ParameterSampler(param_distributions=param_dist, n_iter=1, random_state=rng)

# Scenario 2 trains on every crisis type at once, so no per-type filtering here.
X_train = df_train['tweettext_proc'].values
y_train = np.where(df_train.cat_informativeness == 'Related and informative', 1, 0)

# Scenario 2: train once per hyperparameter set on all training tweets, then
# evaluate the same model separately on every crisis type in the test set.

# The training tweets do not depend on the hyperparameters: tokenize them once.
train_tok = get_inputs(X_train, xlnet_tokenizer)[0]

for parameter in dict_parameters:
    print(parameter)
    # Training logs a model and figures to MLflow. Doing that inside an
    # explicit parent run stops MLflow from opening an implicit run that the
    # per-category start_run() calls below would then collide with.
    with mlflow.start_run():
        # Training
        xlnet = training_Xlnet(
            train_tok,
            xlnet_model,
            parameter['dropout'],
            parameter['learning_rate'],
            parameter['epochs'],
            parameter['batch_size'],
            mlflow,
        )

        for cat in df_test.categorization_type.unique():
            # One nested child run per crisis type; the context manager ends
            # it on exit, so no explicit mlflow.end_run() is needed.
            with mlflow.start_run(nested=True):
                mlflow_log_parameters(cat, parameter)
                test_rows = df_test[df_test['categorization_type'] == cat]
                X_test = test_rows['tweettext_proc'].values
                y_test = np.where(test_rows.cat_informativeness == 'Related and informative', 1, 0)

                # Testing
                inp_tok = get_inputs(X_test, xlnet_tokenizer)[0]
                y_pred = xlnet.predict(inp_tok, verbose=True)
                print(cat)
                evaluate(y_test, y_pred, mlflow)

    # Release the model before the next hyperparameter set builds a new one.
    del xlnet
    tf.keras.backend.clear_session()
    gc.collect()
    