In [None]:
%%writefile setup.sh
# Clone NVIDIA apex and install it into the current Python environment.
# NOTE(review): presumably required for the "fp16" / "fp16_opt_level" training
# options configured further down in this notebook - confirm.

# abort on the first failing command, so that a failed clone or cd can never
# lead to "pip install ./" being executed in the wrong directory
set -e

# re-running the notebook must not fail on an already existing checkout
[ -d apex ] || git clone https://github.com/NVIDIA/apex
cd apex
pip install -v --no-cache-dir ./

In [None]:
# run the setup.sh script written by the previous cell (clones apex and pip-installs it)
!sh setup.sh

In [None]:
# 0. import all necessary dependencies
print("[XLNet-MLL]: Starting model fitting and test-run on the 10 datasets.")
!nvidia-smi

# check if we are on a google COLAB notebook
try:
  import google.colab
  IN_COLAB = True
except:
  print("We are not in a Google COLAB setup, please consider adapting the code!")
  IN_COLAB = False

# load packages according to environment
if IN_COLAB == True:
  import os 
  import sys
  import datetime
  import time
  from multiprocessing import cpu_count
  # check if transformers available
  if 'transformers' in sys.modules:
    import transformers
  else:
    !pip install transformers
  # check if transformers available
  if 'simpletransformers' in sys.modules:
    import transformers
  else:
    !pip install simpletransformers
  if 'tensorboardX' in sys.modules:
    import tensorboardX
  else:
    !pip install tensorboardX
  # then import everything
  import tensorflow as tf 
  import torch 
  from torch.cuda import memory_allocated
  from torch.cuda import memory_cached
  import transformers
  import simpletransformers 
  from simpletransformers.classification import MultiLabelClassificationModel
  import pandas as pd
  import numpy as np 
  import scipy
  import sklearn 
  from sklearn.metrics import accuracy_score, f1_score, hamming_loss, label_ranking_loss, label_ranking_average_precision_score
  from sklearn.metrics import precision_score, recall_score, zero_one_loss 
  from sklearn.model_selection import train_test_split
  import torch 
  import seqeval
  import tensorboardX
  import tqdm
  import logging
  from google.colab import drive
  import matplotlib.pyplot as plt
  from matplotlib import rcParams
  rcParams['font.family'] = 'serif'
  rcParams['font.sans-serif'] = ['Verdana']
else:
  print("Not in COLAB environment!")
print("[XLNet-MLL]: basic dependencies loaded")

In [0]:
# 1. define helper functions
def mount_gpu():
  """Pick the torch device (GPU if usable, otherwise CPU) for the current machine.

  Inside Google COLAB the TensorFlow device name is checked first and a
  SystemError is raised when TensorFlow does not report '/device:GPU:0'.
  Outside COLAB a missing GPU is not an error, the CPU device is returned.

  :return: torch.device("cuda") if a GPU can be used by torch, else torch.device("cpu")
  :raises SystemError: in COLAB, if TensorFlow does not find the GPU device
  """
  # first check if we are in the COLAB environment; only a missing package
  # means "not in COLAB", other errors are not swallowed
  try:
    import google.colab
    in_colab = True
  except ImportError:
    in_colab = False

  # COLAB code:
  if in_colab:
    # get the GPU device name
    device_name = tf.test.gpu_device_name()

    # in COLAB, the device name should look like:
    if device_name == '/device:GPU:0':
      print('Found GPU at: {}'.format(device_name))
    else:
      raise SystemError('GPU device not found')

    # if torch can use the GPU as well:
    if torch.cuda.is_available():
      # switch torch to GPU
      device = torch.device("cuda")
      print('There are %d GPU(s) available.' % torch.cuda.device_count())
      print('We will use the GPU:', torch.cuda.get_device_name(0))
    else:
      # otherwise, we notify the user and switch torch to CPU
      print('No GPU available, using the CPU instead.')
      device = torch.device("cpu")
  else:
    print("No COLAB environment, getting GPU via TF")
    # the returned device is a torch device, so TensorFlow seeing a GPU is not
    # enough: torch itself has to be able to use CUDA, otherwise the
    # get_device_name() call below would fail
    if tf.test.is_gpu_available() and torch.cuda.is_available():
      # switch torch to GPU
      device = torch.device("cuda")
      print('There are %d GPU(s) available.' % torch.cuda.device_count())
      print('Found GPU at: {}'.format(tf.test.gpu_device_name()))
      print('We will use the GPU:', torch.cuda.get_device_name(0))
    else:
      print('No GPU available, using the CPU instead.')
      device = torch.device("cpu")
  # return the mounted device 
  return device

# some self-defined metrics for use with the scikit-learn metrics
def custom_acc(out, labels):
  """Element-wise accuracy of thresholded multi-label predictions.

  The prediction scores are converted to binary label markers with a
  threshold of 0.5 (score > 0.5 -> 1, everything else -> 0), flattened and
  compared against the flattened ground truth. The result is cross-checked
  against scikit-learn's accuracy_score and the comparison is printed.

  :param out: array-like with the prediction scores (is not modified)
  :param labels: array-like with the binary ground truth, same shape as out
  :return: share of label positions that were predicted correctly
  """
  # np.array() copies, so the caller's score array is left untouched
  scores = np.array(out, dtype=float)
  # a single comparison also covers scores of exactly 0.5, which two separate
  # ">0.5" / "<0.5" assignments would leave un-thresholded
  pred_flat = (scores > 0.5).astype(int).flatten()
  labels_flat = np.asarray(labels).flatten()
  # then calculate the accuracy against ground truth
  count_true = np.sum(pred_flat == labels_flat)
  acc_int = count_true / len(labels_flat)
  # crosscheck
  acc_scikit = accuracy_score(pred_flat, labels_flat)
  print("Accuracy self: {} vs. scikit: {}, check: {}".format(acc_int, acc_scikit, (acc_scikit==acc_int)))
  return acc_int

def f1_row_cardsmith(y_test_true, y_test_pred):
    """Calculates a row-wise / observations - based F1 Score just like in the paper by Card/Smith 2015.

    For every observation the score is 2 * <truth, pred> / (|truth|_1 + |pred|_1);
    observations where both vectors are all-zero contribute 0. The final score
    is the mean over all observations (Card/Smith, p. 4, formula 1).

    Rows are matched by position, so the index of a passed dataframe does not
    matter.

    :param y_test_true: array-like or dataframe containing the ground truth binary code vectors
    :param y_test_pred: array-like or dataframe containing the predicted binary code vectors
    :return: scalar, the F1 Score as calculated by Card/Smith, Formula 1, page 4
             (0.0 if there are no observations)
    :raises ValueError: if both inputs do not have the same number of rows
    """
    truth = np.asarray(y_test_true)
    pred = np.asarray(y_test_pred)
    if truth.shape[0] != pred.shape[0]:
        raise ValueError("ground truth has {} rows but predictions have {} rows".format(
            truth.shape[0], pred.shape[0]))
    # get number of rows in test set
    nrow = truth.shape[0]
    if nrow == 0:
        return 0.0

    f1_sum = 0
    for ground_truth, predvec in zip(truth, pred):
        # calculate the counter (dot vector product between ground truth and predvec)
        above = float(np.dot(ground_truth, predvec))
        # calculate the denominator (1-norm of ground truth and predvec)
        denom = float(np.linalg.norm(ground_truth, ord=1) + np.linalg.norm(predvec, ord=1))
        # both vectors empty -> denominator is 0 -> the row scores 0
        pre_score = (2 * (above / denom)) if denom != 0 else 0
        # aggregate the individual F1-values up
        f1_sum += pre_score
    # calculate the final, row-wise score
    return f1_sum / nrow

# 2. Main model function
def xlnet_multilabel_classification(dataset_filenamelist, xlnet_config, dump_pred_path, verbose=False):
  """Fine-tunes and evaluates a XLNet multi-label model on one fixed train/test split.

  Reads the four csv files of the split, brings them into the "text" / "labels"
  layout, fine-tunes 'xlnet-base-cased' on the train portion (10% of it is
  held out as evaluation data during training), evaluates on the test portion,
  dumps the thresholded (0.5) predictions as csv and calculates the metrics.

  :param dataset_filenamelist: csv paths in the order [X_train, X_test, y_train, y_test];
      the X files are read via their 'caseID' and 'verbatim' columns, the y
      files hold one column per label
  :param xlnet_config: model arguments handed to MultiLabelClassificationModel
  :param dump_pred_path: folder the predictions csv is written to
  :param verbose: if True, print dataset heads and overview statistics
  :return: None if one of the arguments is missing / empty, otherwise the list
      [test_f1_macro, test_f1_micro, test_precision_macro, test_precision_micro,
      test_recall_macro, test_recall_micro, test_acc_score, test_zero_one_loss,
      test_hamloss, test_f1_samples, test_cardsmith, test_rankloss, test_LRAP,
      mod_elapsed_time (seconds for model setup, training and evaluation)]
  """
  print("=====================================================================")
  # 1. perform some input - checks (before any GPU work is started)
  if dataset_filenamelist is None or len(dataset_filenamelist)==0:
    print("[XLNet-MLL]: No filename-list given for the fixed train/test splits - aborting!")
    return
    
  if xlnet_config is None:
    print("[XLNet-MLL]: No XLNet-Model configuration given - aborting!")
    return

  if dump_pred_path is None or len(dump_pred_path)==0:
    print("[XLNet-MLL]: No folder is given for the predictions dumps - aborting!")
    return

  # mount the gpu; everything CUDA-specific below depends on this flag
  current_gpu = mount_gpu()
  use_cuda = current_gpu.type == "cuda"

  # 2. shoot off Status message and timestamp
  print("[XLNet-MLL]: Started multi-label classification on {} at: {}".
        format(dataset_filenamelist, datetime.datetime.now()))

  # 3. load dataset parts
  get_xtrainp=dataset_filenamelist[0]
  get_xtestp=dataset_filenamelist[1]
  get_ytrainp=dataset_filenamelist[2]
  get_ytestp=dataset_filenamelist[3]
  X_train_raw=pd.read_csv(get_xtrainp, sep = ',', encoding='utf-8')
  X_test_raw=pd.read_csv(get_xtestp, sep = ',', encoding='utf-8')
  y_train_raw=pd.read_csv(get_ytrainp, sep = ',', encoding='utf-8')
  y_test_raw=pd.read_csv(get_ytestp, sep = ',', encoding='utf-8')

  # get test data portion labels as numpy ndarray from dataframe
  y_test = y_test_raw.to_numpy(dtype=int)
  
  if verbose==True:
    print("[XLNet-MLL]: Xtrain dataset {} head: \n {}".format(get_xtrainp, 
                                                        X_train_raw.head()))
    print("[XLNet-MLL]: Xtest dataset {} head: \n {}".format(get_xtestp, 
                                                          X_test_raw.head()))
    print("[XLNet-MLL]: ytrain dataset {} head: \n {}".format(get_ytrainp, 
                                                          y_train_raw.head()))
    print("[XLNet-MLL]: ytest dataset {} head: \n {}".format(get_ytestp, 
                                                          y_test_raw.head()))
    
  # get category indices and number of categories present in dataset for later
  categories=y_train_raw.columns.values.tolist()
  num_labels_df=len(categories)

  # print some overview statistics 
  if verbose == True:
    print("[XLNet-MLL]: Word count train: ", X_train_raw['verbatim'].apply(lambda x: len(x.split(' '))).sum())
    print("[XLNet-MLL]: Word count test: ", X_test_raw['verbatim'].apply(lambda x: len(x.split(' '))).sum())
    print("[XLNet-MLL]: Labels of the dataset: \n", categories, " count: ", len(categories))
    print("[XLNet-MLL]: Number of overall training verbatims: {:,}\n".format(X_train_raw.shape[0]))
    print("[XLNet-MLL]: Number of overall testing verbatims: {:,}\n".format(X_test_raw.shape[0]))

  # 4. perform XLNET - specific preprocessing

  # XLNET requires all text to be in column "text" and all labels as a list in
  # column "labels", hence we have to collect the labels into lists for train/test

  # Train data portion
  # one list of label markers per observation (must be collected before the
  # helper column 'label_list' is added to the label dataframe)
  train_observations_labels = [list(row) for row in y_train_raw.to_numpy()]
  # init list of labels column
  y_train_raw['label_list'] = train_observations_labels  
  # quick inspect it, having a look on the results for plausibility
  if verbose == True:
    print(y_train_raw.head())
    print(y_train_raw.tail())
  # generate new training data df, using X_train_raw
  X_train_raw['labels'] = train_observations_labels
  X_train_raw['text'] = X_train_raw['verbatim']
  # copy text and label into !one! dataframe
  X_train_xlnet = X_train_raw[['caseID', 'text', 'labels']]

  # Test data portion
  test_observations_labels = [list(row) for row in y_test_raw.to_numpy()]
  # init list of labels column
  y_test_raw['label_list'] = test_observations_labels  
  # quick inspect it, having a look on the results for plausibility
  if verbose == True:  
    print(y_test_raw.head())
    print(y_test_raw.tail())
  # generate new test data df, using X_test_raw
  X_test_raw['labels'] = test_observations_labels
  X_test_raw['text'] = X_test_raw['verbatim']
  # copy text and label into !one! dataframe
  X_test_xlnet = X_test_raw[['caseID', 'text', 'labels']]

  # 5. Generate a training evaluation data portion from the train data set
  # re-use the model seed (if the config carries one) to make the split
  # reproducible; without a seed the split stays random
  if isinstance(xlnet_config, dict):
    split_seed = xlnet_config.get("manual_seed")
  else:
    split_seed = getattr(xlnet_config, "manual_seed", None)
  train_df, eval_df = train_test_split(X_train_xlnet, test_size=0.1,
                                       random_state=split_seed)
  # non-conformity regarding multi label stratified splitting, TODO: report in text!

  print("[XLNet-MLL]: Trying to set up the model")
  # 5. set the model up for the current number of labels in the dataset
  model_starttime = time.time() # take the time for fitting and predicting
  xlnet_model = MultiLabelClassificationModel('xlnet', 'xlnet-base-cased', 
                                               num_labels=num_labels_df, 
                                               args=xlnet_config,
                                               use_cuda=use_cuda)
  print("[XLNet-MLL]: Successfully setup the model")
  
  # 6. train the model: pass training portion and evaluation portion
  xlnet_model.train_model(train_df=train_df, eval_df=eval_df)
  print("[XLNet-MLL]: Successfully trained the model")

  # 7. evaluate the model 
  print("[XLNet-MLL]: Model evaluation cycle!")
  result, model_outputs, wrong_predictions = xlnet_model.eval_model(X_test_xlnet)
  print("[XLNet-MLL]: Evaluation of the model (LRAP): " + str(result))

  # stop the time here and calculate the elapsed time
  model_stoptime = time.time()
  mod_elapsed_time = model_stoptime - model_starttime
  print("[XLNet-MLL]: Time elapsed: {} (in seconds)".format(mod_elapsed_time))

  print("Model outputs type: \n", type(model_outputs))

  # keep an untouched copy of the raw outputs for the rank-based metrics;
  # thresholding model_outputs in place would turn these scores into 0/1 too
  y_score = np.array(model_outputs, dtype=float)
  # set all elements >0.5 to 1, everything else to 0
  y_predicted = (y_score > 0.5).astype(int)
  # store the predictions on HDD for later inspection - as csv file
  fname_wout_extension = os.path.splitext(os.path.basename(get_xtestp))[0]
  # os.path.join also works when the folder is given without trailing separator
  dump_path = os.path.join(dump_pred_path,
                           fname_wout_extension + '_XLNet_MLL_prediction.csv')
  print("[XLNet-MLL]: Current dump_path for predictions: ", dump_path)
  np.savetxt(dump_path, y_predicted, delimiter=',')

  print("Model ypred type: \n", type(y_predicted))
  print("y_test type: ", type(y_test))

  # 8. calculate performance metrics
  # label based
  test_f1_macro = f1_score(y_test, y_predicted, average='macro')
  test_f1_micro = f1_score(y_test, y_predicted, average='micro')

  test_precision_macro = precision_score(y_test, y_predicted, average='macro')
  test_precision_micro = precision_score(y_test, y_predicted, average='micro')
  test_recall_macro = recall_score(y_test, y_predicted, average='macro')
  test_recall_micro = recall_score(y_test, y_predicted, average='micro')

  # observation based
  test_acc_score = accuracy_score(y_test, y_predicted)
  test_hamloss = hamming_loss(y_test, y_predicted)
  test_f1_samples = f1_score(y_test, y_predicted, average='samples')

  # addition: 0/1 loss 
  test_zero_one_loss = zero_one_loss(y_test, y_predicted)

  # generate a dataframe from the predictions ndarray for the
  # card/smith f1 function
  ypred_df = pd.DataFrame.from_records(y_predicted)
  test_cardsmith = f1_row_cardsmith(y_test, ypred_df)

  # rank based (on the raw scores)
  test_rankloss = label_ranking_loss(y_test, y_score)
  test_LRAP = label_ranking_average_precision_score(y_test, y_score)
  # NOTE(review): eval_model is assumed to return a dict with an "LRAP" entry
  # for multi-label models - confirm against the installed simpletransformers
  model_LRAP = result.get("LRAP") if isinstance(result, dict) else result

  if verbose == True:
    # cross check the model - calculated LRAP and our LRAP
    lrap_match = model_LRAP is not None and bool(np.isclose(test_LRAP, model_LRAP))
    print("Model {} and scikit {} LRAP match: {}".format(model_LRAP, test_LRAP,
                                                       lrap_match))

  # 8. build a evaluation metric list
  ret_list = [test_f1_macro, test_f1_micro, test_precision_macro, 
              test_precision_micro, test_recall_macro, test_recall_micro, 
              test_acc_score, test_zero_one_loss, test_hamloss, 
              test_f1_samples, test_cardsmith, test_rankloss, test_LRAP, 
              mod_elapsed_time]

  # 9. shoot off Status message and timestamp
  print("[XLNet-MLL]: Finished multi-label classification on {} at: {}".
        format(dataset_filenamelist, datetime.datetime.now()))
  print("=====================================================================")
  print("---------------------------------------------------------------------")
  # 10. finally delete the model and all data to free memory from gpu 
  print("Trying to free the GPU memory by deleting current model and tensors!")
  # the CUDA memory statistics only accept a cuda device, so they are skipped
  # when the run fell back to the CPU
  if use_cuda:
    c = memory_cached(current_gpu)
    a = memory_allocated(current_gpu)
    get_summary = c-a  # free inside cache
    print(f"Memory free inside cache before: {get_summary}")
  del xlnet_model
  del result, model_outputs, wrong_predictions
  if use_cuda:
    torch.cuda.empty_cache()
    c = memory_cached(current_gpu)
    a = memory_allocated(current_gpu)
    get_summary = c-a  # free inside cache
    print(f"Memory free inside cache after: {get_summary}")
  print("=====================================================================")
  # 11. return this to the user
  return ret_list

In [None]:
# 3. setup preliminaries: Mount GoogleDrive, setup model, etc.
if IN_COLAB==True:
  # mount the google drive which has been prepared with the data for the task
  drive.mount('/content/drive')

  # generate the model storage folder paths for later model dumping
  model_storage_folders = []
  for index in range(1, 11):
    folder_base = '/content/drive/My Drive/ma_data/stored_models/xlnet_mll/model_dataset_{}/'.format(index)
    model_storage_folders.append(folder_base)

  # switch on logging
  logname = '/content/drive/My Drive/ma_data/stored_models/xlnet_mll/xlnet_models_run.log'
  logging.basicConfig(filename=logname, 
                      level=logging.DEBUG, 
                      filemode='w')
  transformers_logger = logging.getLogger("transformers")


  # create dataset paths for fixed train/test splits
  data_parts_list = []
  for index in range(1, 11):
    # generate paths
    base_path='/content/drive/My Drive/ma_data/'
    x_trainp=base_path+"dataset{}_Xtrain.csv".format(index)
    x_testp=base_path+"dataset{}_Xtest.csv".format(index)
    y_trainp=base_path+"dataset{}_ytrain.csv".format(index)
    y_testp=base_path+"dataset{}_ytest.csv".format(index)
    # add to list
    data_parts_list.append([x_trainp, x_testp, y_trainp, y_testp])

  # create the predictions storage path 
  pred_storage_path = '/content/drive/My Drive/ma_data/stored_models/xlnet_mll/dumped_predictions/'

  # call the model for each dataset separately; every dataset is paired with
  # its own model storage folder
  xlnet_ds_metrics = []
  for filename_list, model_folder in zip(data_parts_list, model_storage_folders):
    print("[XLNet-MLL]: Saving model to: {} after fine-tuning!".format(model_folder))
    # setup the arguments dictionary for the model
    xlnet_config = {"output_dir": model_folder, 
                    "cache_dir": model_folder,
                    "best_model_dir": model_folder,
                    "fp16": True,
                    "fp16_opt_level": "O1",
                    "max_seq_length": 128, # leave this for comp.
                    "train_batch_size": 8, # leave this for comp.
                    "eval_batch_size": 8, # leave this for comp.
                    "gradient_accumulation_steps": 1, # leave this for comp.
                    "num_train_epochs": 3, # overridden
                    "weight_decay": 0.01, # 
                    "learning_rate": 2e-5, # overridden
                    "adam_epsilon": 1e-6, # overridden
                    "warmup_ratio": 0.06, # overridden
                    "warmup_steps": 120, # overridden
                    "max_grad_norm": 1.0,
                    "do_lower_case": False,
                    "logging_steps": 50,
                    "evaluate_during_training": True, # overridden
                    "evaluate_during_training_steps": 2000,
                    "evaluate_during_training_verbose": True, # overridden
                    "use_cached_eval_features": False,
                    "save_eval_checkpoints": True, 
                    "save_steps": 2000,
                    "no_cache": False,
                    "save_model_every_epoch": True,
                    "tensorboard_dir": model_folder,
                    "overwrite_output_dir": True, # overridden
                    "reprocess_input_data": True,
                    "process_count": cpu_count() - 2 if cpu_count() > 2 else 1,
                    "n_gpu": 1,
                    "silent": False,
                    "use_multiprocessing": True,
                    "wandb_project": None,
                    "wandb_kwargs": {},
                    "use_early_stopping": True,
                    "early_stopping_patience": 3,
                    "early_stopping_delta": 0.01, # overridden
                    "early_stopping_metric": "eval_loss",
                    "early_stopping_metric_minimize": True,
                    "manual_seed": 2020 # overridden
                    }
    # fine tune model on the dataset, extract evaluation metrics from last
    # evaluation run and save these metrics 
    metrics_list = xlnet_multilabel_classification(dataset_filenamelist = filename_list,
                                                   xlnet_config = xlnet_config,
                                                   dump_pred_path = pred_storage_path,
                                                   verbose = False)
    # for each dataset write a list into the overall metrics list 
    xlnet_ds_metrics.append(metrics_list)
    print("Current metrics: {}".format(metrics_list))
  
  ####################### REPORTING OF RESULTS #################################
  print("[XLNet-MLL]: Generating the metrics report!")

  # generate the metrics dataset
  metrics_df = pd.DataFrame(xlnet_ds_metrics)
  colnames = ['F1_macro', 'F1_micro', 'Precision_macro', 'Precision_micro', 
                'Recall_macro', 'Recall_micro', 'Accuracy', 'Zero_One_loss', 
                'Hamming_Loss', 'F1_samples', 'F1_Card_Smith', 
                'Ranking_loss', 'LRAP', 'Model_time']
  metrics_df.columns = colnames
  # insert dataset identifier (1-based, one per processed dataset)
  dataset_ids = list(range(1, len(metrics_df) + 1))
  metrics_df['Dataset_ID'] = dataset_ids
  new_colnames = ['Dataset_ID', 'F1_macro', 'F1_micro', 'Precision_macro', 'Precision_micro', 
                'Recall_macro', 'Recall_micro', 'Accuracy', 'Zero_One_loss', 
                'Hamming_Loss', 'F1_samples', 'F1_Card_Smith', 
                'Ranking_loss', 'LRAP', 'Model_time']
  # reorder the dataset columns
  metrics_df = metrics_df[new_colnames]

  # store it on HDD
  metrics_file_path = '/content/drive/My Drive/ma_data/stored_metrics/XLNet_mll_metrics.csv'
  metrics_df.to_csv(metrics_file_path, index=False, header=True, encoding='utf-8')

  # store latex version on HDD (best effort: a failing export must not stop
  # the report, but the reason is shown instead of being swallowed)
  print("[XLNet-MLL]: Trying to store the *.tex-file!")
  texmet_file_path = '/content/drive/My Drive/ma_data/stored_metrics/XLNet_mll_metrics.tex'
  try:
    metrics_df.to_latex(texmet_file_path, index=False)
  except Exception as latex_error:
    print("[XLNet-MLL]: Failed to print to latex!")
    print("[XLNet-MLL]: Reason: {}".format(latex_error))

  # drop the dataset id (first) and the model time (last) column, so that
  # only the metric columns are plotted
  metrics_df = metrics_df.iloc[:, 1:]
  metrics_df = metrics_df.iloc[:, :-1]
  # plot the data 
  labels = ["dataset {}".format(dataset_id) for dataset_id in dataset_ids]
  fig, ax = plt.subplots(figsize=(15, 10))
  metrics_df.plot.bar(ax=ax)
  # label and style the plot
  ax.set_ylabel('Metric')
  ax.set_xlabel('Dataset')
  fig.suptitle('XLNet - Performance metrics', fontsize=16)
  plt.rc('xtick', labelsize='x-small')
  plt.rc('ytick', labelsize='x-small')
  ax.spines['top'].set_visible(False)
  ax.spines['right'].set_visible(False)
  # restyle left and bottom axis; set_smart_bounds() does not exist in newer
  # matplotlib releases, so it is only called where it is available
  for spine_name in ('left', 'bottom'):
    spine = ax.spines[spine_name]
    if hasattr(spine, 'set_smart_bounds'):
      spine.set_smart_bounds(True)
  ax.legend(loc='best', title='Metrics')
  ax.set_xticklabels(labels)

  # store on HDD for documentation
  plot_path = '/content/drive/My Drive/ma_data/stored_plots/XLNet_mll_metrics.png'
  plt.savefig(plot_path)
  plt.show()

  ################# LAST BUT NOT LEAST - UNMOUNT #################################
  drive.flush_and_unmount()
  print('All changes made in this colab session should now be visible in Drive.')
  
  ########################## LAST STATUS MESSAGE ###############################
  print("[XLNet-MLL]: Done running the model and saving the results!")

else: 
  print("[XLNet-MLL]: No COLAB environment found - aborting!")