In [1]:
# import spacy
import shap
import torch
from transformers import BertTokenizer, BertForSequenceClassification

from lib.utils import load_jsonl_file
from lib.ner_processing import custom_anonymize_text

SEED = 42
BATCH_SIZE = 16
CLASS_NAMES = ['continue', 'not_continue']

# Load dataset
DATASET = load_jsonl_file("shared_data/dataset_2_6_2b.jsonl")
# Select a sample from the dataset
text  = DATASET[1]["text"]
text_id = DATASET[1]["id"]

# nlp = spacy.load("en_core_web_trf")


def get_device():
  """Returns the appropriate device available in the system: CUDA, MPS, or CPU"""
  if torch.backends.mps.is_available():
    return torch.device("mps")
  elif torch.cuda.is_available():
    return torch.device("cuda")
  else:
    return torch.device("cpu")


# Set device
device = get_device()
print(f"\nUsing device: {str(device).upper()}\n")

# Initialize constants
BERT_MODEL = 'bert-base-uncased'
MODEL_PATH = 'models/2/paper_b_hop_bert_reclass.pth'

# Initialize tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load the model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased',
                                                      num_labels=len(CLASS_NAMES),
                                                      hidden_dropout_prob=0.1)

# Move the model to the device
model = model.to(device)
# Load the model weights
model.load_state_dict(torch.load(MODEL_PATH, map_location=device))
# Set the model to evaluation mode
model.eval()

"""sentence1, sentence2 = text.split('[SEP]')
text = (sentence1.strip(), sentence2.strip())"""

def custom_tokenize(text):
    sentence1, sentence2 = text.split('[SEP]')
    tokenized_output = tokenizer(sentence1.strip(), sentence2.strip(), return_tensors='pt', padding=True, truncation=True, max_length=512)
    # If tokenized_output is a dictionary containing tensors, you need to extract these tensors in a way that's compatible with your model and SHAP.
    # Assuming your model expects input_ids, attention_mask (common in Hugging Face models), you could return them directly:
    input_ids = tokenized_output['input_ids']
    attention_mask = tokenized_output['attention_mask']
    # Now, ensure to return a structure that has a .shape attribute and is expected by your model. You might need to adjust this based on your specific model's needs.
    # For example, if your model expects a single tensor, you could concatenate these tensors or return them as a tuple or list if the model expects multiple inputs.
    return input_ids, attention_mask


def predict(texts):
  encoding = tokenizer.batch_encode_plus(
        batch_text_or_text_pairs=texts,
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors='pt'  # Return PyTorch tensors
  )
  input_ids = encoding['input_ids'].to(device)
  attention_mask = encoding['attention_mask'].to(device)
  
  logits = model(input_ids, attention_mask=attention_mask)[0]  
  probabilities = logits.detach().cpu().numpy()
  return probabilities


# Initialize the SHAP explainer
explainer = shap.Explainer(
  model=predict, 
  masker=custom_tokenize, 
  output_names=CLASS_NAMES, 
  seed=SEED
)

# Compute SHAP values for the selected samples
shap_values = explainer([text])

# Visualize the SHAP values
shap.plots.text(shap_values)

# shap.save_html(f"xnlp/model_3_shap_{text_id}.html", shap.plots.text(shap_values[0]))


Loading data from shared_data/dataset_2_6_2b.jsonl...
Loaded 3566 items.

Using device: MPS



Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

AttributeError: 'str' object has no attribute 'shape'