# Setup

In [1]:
! pip install --quiet transformers shap sentencepiece datasets --upgrade

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.9/547.9 kB[0m [31m33.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m36.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.3/519.3 kB[0m [31m40.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m31.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m51.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m48.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import transformers
import shap
import torch
import datasets

import numpy as np
import scipy as sp

from transformers import AutoTokenizer, AutoModelForCausalLM

# 1 - mGPT Model

In [4]:
# Load the mGPT checkpoint and its tokenizer from the Hugging Face Hub.
model_path = 'ai-forever/mGPT'
# Prefer the GPU, but fall back to CPU so the cell does not crash on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path).to(device)

Downloading (…)lve/main/config.json:   0%|          | 0.00/725 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.89M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/1.20M [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/3.45G [00:00<?, ?B/s]

In [5]:
# Flag the model as a decoder, as SHAP's text-generation examples do for causal LMs.
model.config.is_decoder = True

# Sampling settings stored under the "text-generation" task key of the model config.
# `max_length` is deliberately not set: it conflicted with `max_new_tokens`
# (both bound the generated length), and `max_new_tokens` is the one that wins.
gen_dict = dict(
    do_sample=True,
    temperature=0.7,
    top_k=50,
    no_repeat_ngram_size=2,
    max_new_tokens=25,
)
# Keep any task-specific settings already present instead of wiping them.
task_params = getattr(model.config, "task_specific_params", None) or {}
task_params["text-generation"] = gen_dict
model.config.task_specific_params = task_params

In [6]:
# Prompt to be continued by the model; wrapped in a list because the explainer is called on a batch.
prompt_text = "Cats are better than dogs because"
s = [prompt_text]

In [7]:
# Generation samples (do_sample=True), so seed the RNGs to make the explained
# continuation repeatable across runs.
transformers.set_seed(0)

# Wrap the causal LM in SHAP's teacher-forcing model wrapper.
shap_model = shap.models.TeacherForcing(model, tokenizer)
# Text masker that substitutes "..." for hidden spans and collapses adjacent masks.
masker = shap.maskers.Text(tokenizer, mask_token="...", collapse_mask_token=True)
# Pass the configured masker: previously the bare tokenizer was passed here,
# so the masker built above was silently ignored.
explainer = shap.Explainer(shap_model, masker)
shap_values = explainer(s)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Partition explainer: 2it [00:10, 10.24s/it]               


In [8]:
# Display SHAP's text plot for the mGPT explanation held in `shap_values`.
shap.plots.text(shap_values)

# 2 - Bloom Model

In [9]:
from transformers import BloomForCausalLM
from transformers import BloomTokenizerFast

In [10]:
# Load the 560M-parameter BLOOM checkpoint and its tokenizer from the Hugging Face Hub.
# Prefer the GPU (as the other model sections do), falling back to CPU when CUDA is unavailable.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = BloomTokenizerFast.from_pretrained("bigscience/bloom-560m")
model = BloomForCausalLM.from_pretrained("bigscience/bloom-560m").to(device)

Downloading (…)okenizer_config.json:   0%|          | 0.00/222 [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/693 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

In [11]:
# Prompt to be continued by the model; wrapped in a list because the explainer is called on a batch.
prompt_text = "Cats are better than dogs because"
s = [prompt_text]

In [12]:
# Flag the model as a decoder, as SHAP's text-generation examples do for causal LMs.
model.config.is_decoder = True

# Sampling settings stored under the "text-generation" task key of the model config.
# `max_length` is deliberately not set: it conflicted with `max_new_tokens`
# (both bound the generated length), and `max_new_tokens` is the one that wins.
gen_dict = dict(
    do_sample=True,
    temperature=0.7,
    top_k=50,
    no_repeat_ngram_size=2,
    max_new_tokens=25,
)
# Keep any task-specific settings already present instead of wiping them.
task_params = getattr(model.config, "task_specific_params", None) or {}
task_params["text-generation"] = gen_dict
model.config.task_specific_params = task_params

In [13]:
# Generation samples (do_sample=True), so seed the RNGs to make the explained
# continuation repeatable across runs.
transformers.set_seed(0)

# Wrap the causal LM in SHAP's teacher-forcing model wrapper.
shap_model = shap.models.TeacherForcing(model, tokenizer)
# Text masker that substitutes "..." for hidden spans and collapses adjacent masks.
masker = shap.maskers.Text(tokenizer, mask_token="...", collapse_mask_token=True)
# Pass the configured masker: previously the bare tokenizer was passed here,
# so the masker built above was silently ignored.
explainer = shap.Explainer(shap_model, masker)
shap_values = explainer(s)

`position_ids` have no functionality in BLOOM and will be removed in v5.0.0. You can safely ignore passing `position_ids`.


  0%|          | 0/42 [00:00<?, ?it/s]

Partition explainer: 2it [00:35, 35.68s/it]               


In [14]:
# Display SHAP's text plot for the BLOOM explanation held in `shap_values`.
shap.plots.text(shap_values)

# 3 - Pythia Model

In [15]:
from transformers import GPTNeoXForCausalLM, AutoTokenizer

In [16]:
# Load the Pythia 410M (v0) checkpoint and its tokenizer from the Hugging Face Hub.
# Prefer the GPU, but fall back to CPU so the cell does not crash on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-410m-v0")
model = GPTNeoXForCausalLM.from_pretrained("EleutherAI/pythia-410m-v0").to(device)

Downloading (…)okenizer_config.json:   0%|          | 0.00/394 [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/537 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/911M [00:00<?, ?B/s]

In [17]:
# Flag the model as a decoder, as SHAP's text-generation examples do for causal LMs.
model.config.is_decoder = True

# Sampling settings stored under the "text-generation" task key of the model config.
# `max_length` is deliberately not set: it conflicted with `max_new_tokens`
# (both bound the generated length), and `max_new_tokens` is the one that wins.
gen_dict = dict(
    do_sample=True,
    temperature=0.7,
    top_k=50,
    no_repeat_ngram_size=2,
    max_new_tokens=25,
)
# Keep any task-specific settings already present instead of wiping them.
task_params = getattr(model.config, "task_specific_params", None) or {}
task_params["text-generation"] = gen_dict
model.config.task_specific_params = task_params

In [18]:
# Prompt to be continued by the model; wrapped in a list because the explainer is called on a batch.
prompt_text = "Cats are better than dogs because"
s = [prompt_text]

In [19]:
# Generation samples (do_sample=True), so seed the RNGs to make the explained
# continuation repeatable across runs.
transformers.set_seed(0)

# Wrap the causal LM in SHAP's teacher-forcing model wrapper.
shap_model = shap.models.TeacherForcing(model, tokenizer)
# Text masker that substitutes "..." for hidden spans and collapses adjacent masks.
masker = shap.maskers.Text(tokenizer, mask_token="...", collapse_mask_token=True)
# Pass the configured masker: previously the bare tokenizer was passed here,
# so the masker built above was silently ignored.
explainer = shap.Explainer(shap_model, masker)
shap_values = explainer(s)

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


In [20]:
# Display SHAP's text plot for the Pythia explanation held in `shap_values`.
shap.plots.text(shap_values)