In [1]:
import transformers as tf
import pandas as pd
import numpy as np
from tqdm import tqdm
from transformers_interpret import SequenceClassificationExplainer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def load_model_pipeline(username, prefix, model_name, device='cuda'):
    """Build a text-classification pipeline for ``{username}/{prefix}-{model_name}``.

    Args:
        username: First path component of the model identifier.
        prefix: Part of the model identifier placed before the hyphen.
        model_name: Part of the model identifier placed after the hyphen.
        device: Forwarded unchanged as the ``device`` argument of
            ``tf.pipeline``. Defaults to ``'cuda'``, the value that used to be
            hard-coded; pass another device (e.g. ``'cpu'``) on machines
            without a GPU.

    Returns:
        The pipeline object created by ``tf.pipeline`` with
        ``return_all_scores=True``.
    """
    model_id = f'{username}/{prefix}-{model_name}'
    return tf.pipeline('text-classification', model_id, return_all_scores=True, device=device)

def load_data(data_folder):
    """Read the processed train and test pickles and stack them into one frame.

    Args:
        data_folder: Directory containing ``processed/train.pkl`` and
            ``processed/test.pkl``.

    Returns:
        The train rows followed by the test rows, concatenated with
        ``pd.concat`` (original index labels are kept).
    """
    train_path = f'{data_folder}/processed/train.pkl'
    test_path = f'{data_folder}/processed/test.pkl'
    # Train first, then test, so row order matches the on-disk split order.
    splits = [pd.read_pickle(train_path), pd.read_pickle(test_path)]
    return pd.concat(splits)

def clean_data(df : pd.DataFrame, p : 'tf.Pipeline', na_repl='blank', max_tokens=225):
    """Fill missing comments and drop rows whose comment is too long for the model.

    The caller's DataFrame is left untouched; all changes are made on a new
    frame, so the function can be re-run on the same input.

    Args:
        df: Frame with a ``comment`` column.
        p: Object exposing a ``tokenizer`` callable that takes a list of
            strings and returns a mapping with an ``'input_ids'`` entry
            (one id sequence per input string).
        na_repl: Replacement text for missing values in ``comment``.
        max_tokens: Rows whose comment yields more than this many input ids
            are dropped.

    Returns:
        A new DataFrame with ``comment`` filled, an added ``tf_toklen`` column
        holding the number of input ids per comment, and only the rows where
        ``tf_toklen <= max_tokens``.
    """
    print(f'Original df size: {df.shape}')

    print(f'Filling NAN in comment with "{na_repl}"')
    # assign() builds a new frame, so the caller's DataFrame is not mutated.
    cleaned = df.assign(comment=df['comment'].fillna(na_repl))

    comments = cleaned['comment'].tolist()
    # Skip the tokenizer for an empty frame: there is nothing to measure, and
    # an empty batch is not guaranteed to be accepted by every tokenizer.
    token_ids = p.tokenizer(comments)['input_ids'] if comments else []
    cleaned['tf_toklen'] = [len(toks) for toks in token_ids]

    cleaned = cleaned[cleaned['tf_toklen'] <= max_tokens]
    print(f'Size after filtering comments longer than {max_tokens} tokens: {cleaned.shape}')

    return cleaned

def get_word_importances(comments : list[str], p : tf.Pipeline, relative_to: str):
    """Collect per-token attribution scores over a list of comments.

    Args:
        comments: Texts to explain, one explainer call per text.
        p: Pipeline whose ``model`` and ``tokenizer`` are handed to
            ``SequenceClassificationExplainer``.
        relative_to: Passed as ``class_name`` to every explainer call.

    Returns:
        A dict mapping each token string yielded by the explainer to the list
        of scores it received, in the order the comments were processed.
    """
    explain = SequenceClassificationExplainer(p.model, p.tokenizer)

    scores_by_token = {}
    for text in tqdm(comments):
        # Each explainer call yields (token, score) pairs for one comment.
        for token, value in explain(text, class_name=relative_to):
            scores_by_token.setdefault(token, []).append(value)
    return scores_by_token

In [3]:
# Load the text-classification pipeline for the model id 'maxspad/nlp-qual-qual'.
p = load_model_pipeline('maxspad','nlp-qual','qual')
# Read the processed train/test pickles under ../data into a single frame.
df = load_data('../data')



In [4]:
# Keep only comments that tokenize to at most 127 input ids.
# NOTE: this rebinds `df`, so re-running the cell operates on the already-filtered frame.
df = clean_data(df, p, max_tokens=127)
# Gather attribution scores for every remaining comment, relative to class 'LABEL_3'.
word_imports = get_word_importances(df.comment.tolist(), p, 'LABEL_3')

Original df size: (2500, 46)
Filling NAN in comment with "blank"
Size after filtering comments longer than 127 tokens: (2287, 47)


100%|██████████| 2287/2287 [04:31<00:00,  8.43it/s]


In [50]:
# Number of distinct token strings that received at least one attribution score.
len(word_imports)

497

In [3]:
# Use the loader defined in this notebook; `load_model` is not defined in any
# visible cell and raises NameError on a fresh kernel.
p = load_model_pipeline('maxspad','nlp-qual','qual')



In [4]:
from transformers_interpret import SequenceClassificationExplainer

In [26]:
# Number of input ids the pipeline's tokenizer produces for each non-null comment.
tok_lens = [len(toks) for toks in p.tokenizer(df.comment.dropna().tolist())['input_ids']]

719

In [21]:
# Display the configuration of the model wrapped by the pipeline.
p.model.config

BertConfig {
  "_name_or_path": "maxspad/nlp-qual-qual",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.27.4",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}

In [5]:
# Build an explainer from the pipeline's model and tokenizer for ad-hoc inspection.
cls_explainer = SequenceClassificationExplainer(p.model, p.tokenizer)

In [16]:
# Tokenize a batch made of the same comment twice to inspect the batched output.
# NOTE(review): `comment` is only assigned in a cell that appears further down
# (In [13]); on a top-to-bottom run it is undefined at this point.
p.tokenizer([comment, comment])

{'input_ids': [[101, 124, 1107, 25098, 6006, 119, 1960, 192, 120, 184, 18311, 1193, 2941, 117, 1141, 1125, 2846, 1956, 119, 2750, 5531, 117, 5113, 1116, 1106, 3242, 1106, 1294, 8313, 1105, 1759, 1106, 10407, 1172, 119, 102], [101, 124, 1107, 25098, 6006, 119, 1960, 192, 120, 184, 18311, 1193, 2941, 117, 1141, 1125, 2846, 1956, 119, 2750, 5531, 117, 5113, 1116, 1106, 3242, 1106, 1294, 8313, 1105, 1759, 1106, 10407, 1172, 119, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [13]:
# Pick one comment at random and show it with its per-token attribution scores.
# NOTE: sample() is called without random_state, so a different comment is drawn on each run.
comment = df.comment.sample(1).iloc[0]
print(comment)
# No class_name is given here, unlike in get_word_importances — presumably the
# explainer falls back to its own default target class; TODO confirm.
cls_explainer(comment)

3 intubations. Two w/o paralytic, one had difficult features. Good technique, listens to ways to make improvements and works to implement them.


[('[CLS]', 0.0),
 ('3', 0.005185757033069379),
 ('in', 0.04395189975750096),
 ('##tub', -0.02113587613474386),
 ('##ations', 0.0007418072721983587),
 ('.', -0.01244410198356168),
 ('Two', 0.07038895208989684),
 ('w', 0.030665099802163035),
 ('/', 0.06931605859413979),
 ('o', 0.010496900965436122),
 ('para', 0.05789081846478245),
 ('##ly', 0.036046785326364725),
 ('##tic', 0.05362616533392286),
 (',', 0.0369501769953565),
 ('one', 0.10680154519293895),
 ('had', 0.14328514343932056),
 ('difficult', 0.1777208542181716),
 ('features', 0.12301101772702572),
 ('.', 0.07466439980589178),
 ('Good', 0.11170049467262178),
 ('technique', 0.21197054046343647),
 (',', 0.05348651487955829),
 ('listen', 0.44825852529961574),
 ('##s', -0.05666687142929514),
 ('to', 0.14741974984557887),
 ('ways', 0.6395882107306262),
 ('to', 0.14521857051295992),
 ('make', 0.15130049355768155),
 ('improvements', 0.08790667123189103),
 ('and', 0.08584647610711459),
 ('works', 0.14438028478700984),
 ('to', 0.16527520770