### Install packages

In [None]:
# !pip install torch==2.1.0
# !pip install transformers
# !pip install transformers[torch]
# !pip install accelerate -U
# !pip install transformers-interpret
# !pip install transformers accelerate optimum

### Import necessary packages and modules

In [None]:
# Importing the libraries needed
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from numpy import argmax,exp

import torch
from torch import tensor
from torch.utils.data import Dataset, DataLoader
import accelerate

import transformers
from transformers import BertModel, BertTokenizer
from transformers import BertForSequenceClassification
from transformers import Trainer, TrainingArguments

from transformers_interpret import SequenceClassificationExplainer
from pathlib import Path
from optimum.pipelines import pipeline

import re
import gc

In [None]:
# Report the total memory of GPU 0. Guarded so the notebook still runs on a
# machine without CUDA; `total_memory` is None in that case.
if torch.cuda.is_available():
    total_memory = torch.cuda.get_device_properties(0).total_memory
    print(f"Total GPU Memory: {total_memory / (1024 ** 2)}MB")
else:
    total_memory = None
    print("No CUDA device available; running on CPU.")

### Set up inputs

In [None]:
# Model id passed to BertTokenizer.from_pretrained to load the tokenizer.
tokenizer_path = 'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext'
# Directory of the classifier checkpoint loaded with
# BertForSequenceClassification.from_pretrained; the tokenizer is also saved here.
classifier_path = "../bert_par_all/new_bert/model_paragraphs"
# CSV file read with pandas; the notebook uses its 'TEXT' and 'CATEGORY' columns.
data_path = "../bert_par_all/nature_paragraph_data.csv"
# Experiment name; the results file is written to "<experiment>.csv".
experiment = "bert_par_fm"

### Load data and model

In [None]:
# Load the tokenizer and write its files into the classifier directory.
tokenizer = BertTokenizer.from_pretrained(tokenizer_path)
path = Path(classifier_path)
tokenizer.save_pretrained(classifier_path)

# Read the dataset and rebuild the stratified 70/30 split; only the test part
# is kept, everything else is released straight away.
df = pd.read_csv(data_path)
X_all, y_all = df['TEXT'].values, df['CATEGORY'].values
X_train, X_test, y_train, y_test = train_test_split(
    X_all,
    y_all,
    test_size=0.3,
    random_state=42,
    stratify=y_all,
)
del X_all, y_all, X_train, y_train, df

In [None]:
# Classifier checkpoint loaded from `path`.
modeltest = BertForSequenceClassification.from_pretrained(path)

# Arguments handed to the Trainer that wraps the model for prediction.
training_arguments = TrainingArguments(
    # per-epoch evaluation / checkpointing / logging
    evaluation_strategy='epoch',
    save_strategy='epoch',
    logging_strategy='epoch',
    log_level='critical',
    # output locations
    output_dir="./results",
    logging_dir="./logs",
    # optimisation settings
    num_train_epochs=2,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=200,
    weight_decay=0.01,
    load_best_model_at_end=True,
)

trainertest = Trainer(model=modeltest, args=training_arguments)

### Declare useful functions

#### get_words_weights(): calculate the importance weights of the input words using integrated gradients

In [None]:
def get_words_weights(text, model, tokenizer):
  """Compute one attribution weight per whole word of ``text``.

  Token attributions are obtained from ``SequenceClassificationExplainer``.
  Word pieces (tokens starting with ``"##"``) are merged into the preceding
  token: their text is concatenated, their weights summed, and the sum is
  divided by the number of pieces to give the word's average weight.

  Args:
    text: Input string to explain.
    model: Classifier passed to ``SequenceClassificationExplainer``.
    tokenizer: Tokenizer passed to ``SequenceClassificationExplainer``.

  Returns:
    DataFrame with columns ``['Avg_Weight', 'Token']``, one row per merged
    word. The index holds the position of each word's first token in the
    explainer output, so there are gaps where word pieces were merged.
  """
  torch.cuda.empty_cache()
  gc.collect()

  # get token weights using integrated gradients
  # NOTE(review): attribution runs inside no_grad, as before; presumably the
  # explainer re-enables gradients internally -- confirm.
  with torch.no_grad():
    cls_explainer = SequenceClassificationExplainer(
      model,
      tokenizer)
    word_attributions = cls_explainer(text, internal_batch_size=1)

  # Merge word pieces in plain Python lists rather than through chained
  # DataFrame indexing (``df.col[i] = ...``), which pandas does not guarantee
  # to write through and which is a no-op under copy-on-write.
  merged_rows = []  # [first_token_position, word, weight_sum, piece_count]
  for position, (token, weight) in enumerate(word_attributions):
    if token.startswith("##") and merged_rows:
      word = merged_rows[-1]
      word[1] += token.replace("##", "")
      word[2] += weight
      word[3] += 1
    else:
      merged_rows.append([position, token, weight, 1])

  weights_df = pd.DataFrame(
    merged_rows, columns=['Position', 'Token', 'Weight', 'Count']
  ).set_index('Position')
  weights_df.index.name = None
  weights_df['Avg_Weight'] = weights_df['Weight'] / weights_df['Count']

  gc.collect()
  torch.cuda.empty_cache()

  return weights_df[['Avg_Weight', 'Token']]

#### get_new_probs(): calculate new input label and probability after removing its most important token

We use them to calculate the **Decision Flip - Most Informative Token (DFMIT)** and **Faithfulness** metrics. Both compare the original text with the same text after its most important token has been removed. Faithfulness is the average difference in predicted probability between the two versions, and DFMIT is the proportion of texts in the dataset whose predicted class changes.

In [None]:
def get_new_probs(text, n, weights_df, indexes, label, trainertest):
  """Re-classify ``text`` after deleting one of its highest-weighted tokens.

  The first case-insensitive occurrence of the selected token is removed
  from ``text`` and the result is classified with ``trainertest``.

  Relies on the module-level names ``tokenizer``, ``myDataset``, ``softmax``
  and ``warnings``.

  Args:
    text: Original input string.
    n: Position in ``indexes`` of the token to remove. Ignored when
      ``indexes`` holds a single entry.
    weights_df: DataFrame with a ``Token`` column, indexed by the labels
      found in ``indexes``.
    indexes: Index label(s) of the candidate tokens in ``weights_df``.
    label: Class predicted for the unmodified text.
    trainertest: Object whose ``predict(dataset)`` result exposes the logits
      as ``.predictions``.

  Returns:
    ``(pred_label, pred_prob)``. ``pred_label`` is the class predicted for
    the shortened text. ``pred_prob`` is that class's probability when it
    equals ``label``; otherwise it is one minus that probability, which is
    the probability of ``label`` only for a two-class model.

  Raises:
    ValueError: If ``indexes`` is empty.
  """
  warnings.filterwarnings("ignore")
  torch.cuda.empty_cache()

  # Accept a scalar, a one-element array or a longer array of index labels.
  indexes = np.atleast_1d(indexes)
  if len(indexes) == 0:
    raise ValueError("indexes must contain at least one token index")
  idx = indexes[n] if len(indexes) > 1 else indexes[0]

  # re.escape neutralises every regex metacharacter in the token, so only a
  # literal occurrence of the token is removed.
  word_n = weights_df.loc[idx, 'Token']
  pattern = re.compile(re.escape(word_n), re.IGNORECASE)
  text_n = pattern.sub("", text, count = 1)

  test_dataset = myDataset(np.array([text_n], dtype='object'), np.array([0]), tokenizer)

  with torch.no_grad():
    outputs = trainertest.predict(test_dataset)

  predictions = outputs.predictions
  probabilities = softmax(predictions[0])
  # argmax on the probabilities directly; a second softmax adds nothing.
  pred_label = argmax(probabilities)

  if pred_label == label:
    pred_prob = probabilities[pred_label]
  else:
    pred_prob = 1 - probabilities[pred_label]

  return pred_label, pred_prob

#### myDataset(): class to prepare the classifier input data
#### softmax(): calculate probabilities from logits

In [None]:
import warnings
# Install a global "ignore" filter so no warnings are shown from here on.
warnings.filterwarnings("ignore")

class myDataset(Dataset):
  """Dataset that tokenizes raw texts once and serves them as tensors.

  Despite its name, the ``encodings`` argument receives the raw texts; they
  are tokenized in the constructor with truncation and padding enabled.
  """

  def __init__(self, encodings, labels, tokenizer):
    texts = list(encodings)
    self.encodings = tokenizer(texts, truncation=True, padding=True)
    self.labels = labels

  def __getitem__(self, idx):
    # One tensor per tokenizer output field, plus the label of the sample.
    item = {}
    for key, val in self.encodings.items():
      item[key] = tensor(val[idx])
    item['labels'] = tensor(self.labels[idx])
    return item

  def __len__(self):
    # The number of samples is defined by the labels.
    return len(self.labels)

def softmax(vector):
  """Convert a vector of logits into probabilities that sum to 1.

  The maximum logit is subtracted before exponentiation. This leaves the
  result unchanged mathematically but prevents ``exp`` from overflowing to
  ``inf`` (and the output becoming ``nan``) for large logits.

  Args:
    vector: Array-like of logits; normalisation is over all of its elements.

  Returns:
    NumPy array of the same shape holding the probabilities. An empty input
    yields an empty array.
  """
  logits = np.asarray(vector)
  if logits.size == 0:
    return exp(logits)
  e = exp(logits - logits.max())
  return e / e.sum()

### Calculate metrics for each instance and save results in a file 

In [None]:
# only for scientific text (test samples whose category is 0)
scient_test = X_test[y_test == 0]
results_list = []

# Counters updated by the evaluation loop:
s_0 = 0  # samples predicted as class 0 (these get explained)
s_1 = 0  # samples predicted as another class (skipped)
s_e = 0  # samples that raised an exception
torch.cuda.empty_cache()

# memory_reserved replaces the deprecated torch.cuda.memory_cached.
print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')

In [None]:
# Classify each selected text, explain the ones predicted as class 0, remove
# their most important token and record the prediction before and after.
for one_test, text in enumerate(scient_test):

    torch.cuda.empty_cache()
    gc.collect()
    try:
        single_text = np.array([text], dtype='object')
        test_dataset = myDataset(single_text, np.array([0]), tokenizer)

        with torch.no_grad():
            outputs = trainertest.predict(test_dataset)
        probabilities = softmax(outputs.predictions[0])
        pred_label = argmax(softmax(probabilities))

        # Only texts predicted as class 0 are explained.
        if pred_label != 0:
            s_1 += 1
            continue

        s_0 += 1
        text = text.replace("é", "e")
        weights_df = get_words_weights(text, modeltest, tokenizer)
        # Index labels of the five words with the largest average weight.
        indexes = weights_df['Avg_Weight'].nlargest(5).index.values

        new_label, new_prob = get_new_probs(
            text, 0, weights_df, indexes, pred_label, trainertest
        )

        results_list.append(
            [text, pred_label, probabilities[pred_label], new_label, new_prob]
        )

    except Exception as e:
        # Best effort: report the failing sample and carry on with the next.
        print(one_test)
        print("The error is: ",e)
        s_e += 1

torch.cuda.empty_cache()

# Persist one row per explained text as "<experiment>.csv".
result_columns = ['Original', 'old_label', 'old_prob', 'new_label', 'new_prob']
df = pd.DataFrame(results_list, columns=result_columns)
output_path = experiment + ".csv"
df.to_csv(output_path)


### Print average results

In [None]:
import pandas as pd
from os import listdir

In [None]:
# Names of all CSV files in the current working directory.
files = [name for name in listdir(".") if name.endswith(".csv")]

In [None]:
# Print faithfulness and DFMIT for every results file.
for f in files:
    print(f)
    temp_df = pd.read_csv(f, index_col = 0)

    # Faithfulness: mean probability before minus mean probability after the
    # most important token was removed. Columns are selected by name because
    # positional `[0]` access on a labelled Series is deprecated.
    faithfulness = temp_df['old_prob'].mean() - temp_df['new_prob'].mean()
    print(f"faithfulness: {faithfulness}")

    # DFMIT: share of rows whose label changed. Comparing row by row also
    # gives the right answer when every row flipped, and cannot raise a
    # KeyError when label 1 is absent. Files without an 'old_label' column
    # fall back to class 0 as the original label.
    reference_label = temp_df['old_label'] if 'old_label' in temp_df.columns else 0
    flipped = temp_df['new_label'] != reference_label
    dfmit = flipped.mean() if len(temp_df) else 0.0
    print(f"DFMIT: {dfmit}")
    print("")

In [None]:
# Mean original probability of the last file read; the column is selected by
# name to avoid the deprecated positional `[0]` lookup on a labelled Series.
temp_df['old_prob'].mean()