# Analysis of word importance

In [1]:
import torch
from torch import cuda


# Pick the compute device once: use the GPU when PyTorch reports CUDA as
# available, otherwise fall back to the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cuda'

### Load models

In [2]:
# Registry of the classifiers analysed in this notebook, keyed by a short name.
#   slug      - identifier inserted into the Kaggle model handle when downloading
#   file_name - artifact file name(s) expected inside the local model directory;
#               a single string per entry, except "svm", which lists two files
model_names = dict(
    bert=dict(
        slug="bert-base-uncased-pubmed",
        file_name="bert-base-uncased.pt",
    ),
    roberta=dict(
        slug="roberta-base-pubmed",
        file_name="roberta-base.pt",
    ),
    deberta=dict(
        slug="deberta-base-pubmed",
        file_name="deberta-base.pt",
    ),
    bluebert=dict(
        slug="bluebert-large-pubmed",
        file_name="bluebert_pubmed_uncased_L-24_H-1024_A-16.pt",
    ),
    xlnet=dict(
        slug="xlnet-large-pubmed",
        file_name="xlnet-large-cased.pt",
    ),
    svm=dict(
        slug="svm-linear-pubmed",
        file_name=["svm.pkl", "vectorizer.pkl"],
    ),
)

In [3]:
import kaggle
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
import joblib

from Source_code.z_utils.BERTClassifier import BERTClassifier
from Source_code.z_utils.RoBERTaClassifier import RoBERTaClassifier
from Source_code.z_utils.DeBERTaClassifier import DeBERTaClassifier
from Source_code.z_utils.BlueBERTClassifier import BlueBERTClassifier
from Source_code.z_utils.XLNetClassifier import XLNetClassifier


# Download (if missing) and load every classifier listed in `model_names`.
# Results are collected in `models`, keyed by the short model name. The SVM
# entry is stored as a (svm, vectorizer) tuple; all other entries are the
# objects returned by torch.load, switched to eval mode.
kaggle.api.authenticate()
data_path = "./models/"
if not os.path.exists(data_path):
    os.makedirs(data_path)
    print(f"Directory created: {data_path}")
models = {}

for model_key, model_info in model_names.items():
    file_name = model_info["file_name"]
    # The SVM entry lists two artifacts; every other entry names one file.
    # Normalising to a list lets the existence check cover all of them
    # (formatting the list straight into a path never matched a real file,
    # so the SVM archive was downloaded again on every run).
    file_names = file_name if isinstance(file_name, list) else [file_name]
    target_paths = [os.path.join(data_path, name) for name in file_names]

    # Download only when at least one expected artifact is missing.
    if not all(os.path.exists(path) for path in target_paths):
        slug = model_info["slug"]
        framework = "scikitlearn" if model_key == "svm" else "pytorch"
        kaggle.api.model_instance_version_download_cli(
            f"marcelhiltner/{slug}/{framework}/{slug}/1", data_path, untar=True)

    # NOTE(security): joblib.load and torch.load both unpickle the file, which
    # can execute arbitrary code. Only load artifacts from a trusted source.
    if model_key == "svm":
        # `svm` and `vectorizer` are kept as globals on purpose:
        # predict_proba_svm reads them by name.
        svm = joblib.load(target_paths[0])
        print("svm loaded.")
        vectorizer = joblib.load(target_paths[1])
        print("vectorizer loaded.")
        models[model_key] = (svm, vectorizer)
    else:
        # map_location keeps loading from failing on a CPU-only machine when
        # the checkpoint was saved from a GPU.
        models[model_key] = torch.load(target_paths[0], map_location=device)
        models[model_key].eval()
        print(f"{model_key} loaded.")

bert loaded.
roberta loaded.
deberta loaded.
bluebert loaded.
xlnet loaded.
Downloading svm-linear-pubmed.tar.gz to ./models


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 36.4M/36.4M [00:02<00:00, 17.6MB/s]







svm loaded.
vectorizer loaded.


In [4]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --------------------------------------- 0.0/12.8 MB 435.7 kB/s eta 0:00:30
     ---------------------------------------- 0.1/12.8 MB 1.0 MB/s eta 0:00:13
     -- ------------------------------------- 0.7/12.8 MB 3.9 MB/s eta 0:00:04
     ---- ----------------------------------- 1.4/12.8 MB 6.7 MB/s eta 0:00:02
     ------ --------------------------------- 2.1/12.8 MB 8.5 MB/s eta 0:00:02
     --------- ------------------------------ 3.1/12.8 MB 11.0 MB/s eta 0:00:01
     ------------- -------------------------- 4.4/12.8 MB 13.2 MB/s eta 0:00:01
     ------------------ --------------------- 5.9/12.8 MB 15.6 MB/s eta 0:00:01
     ----------------------- ---------------

### Define prediction functions in the form LIME expects

In [5]:
import spacy
import numpy as np
import torch.nn.functional as F

from Source_code.z_utils.global_constants import MAX_LEN


def predict_proba_plm(texts):
    """
    Computes softmax class probabilities for each text with the current PLM.

    The classifier is not passed in: the function reads the module-level
    ``model`` (and ``device`` / ``MAX_LEN``) at call time, so ``model`` must
    be bound to the desired classifier before calling.

    Args:
        texts (array-like): List or array containing text data.

    Returns:
        array: NumPy array built from one list of probabilities per text.
    """
    def _class_probabilities(raw_text):
        # Tokenize a single text, padded/truncated to MAX_LEN tokens.
        encoded = model.tokenizer.encode_plus(
            str(raw_text),
            None,
            add_special_tokens=True,
            max_length=MAX_LEN,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
            return_token_type_ids=True
        )
        # Inference only, so gradient tracking is switched off.
        with torch.no_grad():
            logits = model(**encoded.to(device))

        # Drop size-1 dimensions, normalise over the last axis and convert
        # to a plain Python list.
        scores = F.softmax(logits.squeeze(), -1)
        return scores.cpu().detach().numpy().tolist()

    return np.array([_class_probabilities(sample) for sample in texts])

In [13]:
def predict_proba_svm(texts):
    """
    Scores the given texts with the SVM classifier.

    Despite the function name, the returned values are the raw outputs of
    ``svm.decision_function``, not probabilities: no normalisation is applied.
    The function reads the module-level ``vectorizer`` and ``svm`` objects.

    Args:
        texts (array-like): List or array containing text data.

    Returns:
        array: 2-D array of decision scores. A 1-D result from
            ``decision_function`` is turned into a single column; a result
            that is already 2-D is returned unchanged.
    """
    vectorized_texts = vectorizer.transform(texts)
    decision = np.asarray(svm.decision_function(vectorized_texts))

    # Only a 1-D score vector needs the extra axis. Reshaping a 2-D score
    # matrix with (-1, 1) would flatten it, so its rows would no longer line
    # up with the input texts.
    if decision.ndim == 1:
        decision = decision.reshape(-1, 1)

    return decision

### Interpret models

In [None]:
import lime
import lime.lime_text

from Source_code.z_utils.data_preprocessing import preprocess_text
from Source_code.z_utils.global_constants import LABELS_MAP, RANDOM_SEED
from Source_code.z_utils.lemmatize import lemmatize


# LIME settings shared by every explanation below.
NUM_FEATURES = 25   # maximum number of words reported per explanation
NUM_SAMPLES = 500   # number of perturbed texts LIME generates per explanation

# A list (unlike the dict view returned by .keys()) is a real, indexable sequence.
class_names = list(LABELS_MAP.keys())
explainer = lime.lime_text.LimeTextExplainer(class_names=class_names, random_state=RANDOM_SEED, verbose=True)
text = 'Physicians and veterinarians often encounter cases where human and animal health intersect. For instance, consider zoonotic diseases like rabies, where humans can contract the virus from infected animals. Additionally, advancements in medical technology have led to the use of animal models for studying human diseases, such as cancer research using mice. Furthermore, the One Health approach emphasizes the interconnectedness of human, animal, and environmental health, highlighting the importance of collaboration between medical and veterinary professionals.'
text_prep = preprocess_text(text)

# Explain the same example text with every loaded model and save each
# explanation as an HTML file in the working directory.
for model_key in models.keys():
    # `model` must stay a module-level name: predict_proba_plm reads it.
    model = models[model_key]

    print("=" * 30)
    print(f'Model {model_key}')
    print("=" * 30)

    if model_key == "svm":
        # Rebind the globals read by predict_proba_svm from the stored tuple
        # instead of relying on names left over from the loading cell.
        svm, vectorizer = model
        lemmatizer = spacy.load('en_core_web_sm')
        proc_text = preprocess_text(text_prep, numbers=True)
        lemm_text = lemmatize(lemmatizer, proc_text)
        # predict_proba_svm returns a single column, so only label 0 exists.
        exp = explainer.explain_instance(
            lemm_text, predict_proba_svm, labels=(0,),
            num_features=NUM_FEATURES, num_samples=NUM_SAMPLES)
        exp.save_to_file("interpretation_svm.html")
    else:
        model.to(device)
        model.eval()
        exp = explainer.explain_instance(
            text_prep, predict_proba_plm,
            num_features=NUM_FEATURES, num_samples=NUM_SAMPLES)
        # Use the part of the checkpoint name after the first '/', or the
        # whole name when it contains no '/'.
        checkpoint_name = model.checkpoint.split('/', 1)[-1]
        exp.save_to_file(f"interpretation_{checkpoint_name}.html")

    exp.show_in_notebook(text=text_prep)
    