## Download IMDB dataset

In [None]:
!pip install datasets
from datasets import load_dataset

In [7]:
#train set
train_df_neg = load_dataset('imdb', split='train[:5000]')
train_df_pos = load_dataset('imdb', split='train[12500:17500]')

train_texts = train_df_neg['text'] + train_df_pos['text']
train_labels = train_df_neg['label'] + train_df_pos['label']



In [8]:
#val set
val_df_neg = load_dataset('imdb', split='test[:1000]')
val_df_pos = load_dataset('imdb', split='test[12500:13500]')

val_texts = val_df_neg['text'] + val_df_pos['text']
val_labels = val_df_neg['label'] + val_df_pos['label']



In [9]:
#test set
test_df_neg = load_dataset('imdb', split='test[1001:1251]')
test_df_pos = load_dataset('imdb', split='train[12501:12751]')

test_texts = test_df_neg['text'] + test_df_pos['text']
test_labels = test_df_neg['label'] + test_df_pos['label']



## Fine Tuning Bert model

### Tokenizer

In [10]:
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
from tensorflow.keras.losses import SparseCategoricalCrossentropy
import tensorflow as tf

In [11]:
lm = "bert-base-cased"
tam_batch = 2
epochs = 1

In [12]:
tokenizer = AutoTokenizer.from_pretrained(lm, truncation=True, padding=True)

In [13]:
model = TFAutoModelForSequenceClassification.from_pretrained(lm,num_labels=2)

2022-11-09 09:48:36.748612: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2022-11-09 09:48:36.748728: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
#encoding
train_encodings = tokenizer(train_texts,truncation=True, padding=True)
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), train_labels))

val_encodings = tokenizer(val_texts,truncation=True, padding=True)
val_dataset = tf.data.Dataset.from_tensor_slices((dict(val_encodings), val_labels))

### Training

In [16]:
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
              loss=SparseCategoricalCrossentropy(from_logits=True),metrics=["accuracy"])

In [17]:
model.fit(train_dataset.shuffle(len(train_texts)).batch(tam_batch),
          validation_data = val_dataset.shuffle(len(val_texts)).batch(tam_batch),
          epochs=epochs,batch_size=tam_batch)



<keras.callbacks.History at 0x7ff8c9322040>

In [18]:
#optional: save the model
model.save_pretrained("./pretrained")

In [19]:
#evaluate
test_encodings = tokenizer(test_texts,truncation=True, padding=True)
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), test_labels))

In [20]:
model.evaluate(test_dataset.shuffle(len(test_texts)).batch(tam_batch), return_dict=True, batch_size=tam_batch)



{'loss': 0.13499604165554047, 'accuracy': 0.9599999785423279}

## Using Shap

In [None]:
!pip install shap
import shap
from transformers import pipeline

In [24]:
pipeline_model = pipeline("sentiment-analysis", model='./pretrained', tokenizer=tokenizer, return_all_scores=True)

Some layers from the model checkpoint at ./pretrained were not used when initializing TFBertForSequenceClassification: ['dropout_37']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at ./pretrained.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


In [25]:
pmodel = shap.models.TransformersPipeline(pipeline_model, rescale_to_logits=False)

In [27]:
#pos example
text_example_pos = 'the movie was amazing, the best movie i ever seen. The last one was horrible, but i\'d love this one'

In [28]:
explainer = shap.Explainer(pmodel)
shap_values = explainer([text_example_pos])
shap.plots.text(shap_values[:, :, 1]) #positive

  0%|          | 0/498 [00:00<?, ?it/s]

Partition explainer: 2it [01:57, 117.91s/it]              


In [29]:
#neg example
text_example_neg = 'It was a bad stupid film. You may think it\'s wonderful, but after a while it really dissapoints you'

In [30]:
explainer = shap.Explainer(pmodel)
shap_values = explainer([text_example_neg])
shap.plots.text(shap_values[:, :, 1]) #negative

  0%|          | 0/498 [00:00<?, ?it/s]

Partition explainer: 2it [01:54, 114.72s/it]              


## Making a DF for data visualization

In [31]:
import numpy as np
import pandas as pd

In [None]:
#optional: load model from pretrained
#tokenizer = AutoTokenizer.from_pretrained(lm, truncation=True, padding=True)
#model = TFAutoModelForSequenceClassification.from_pretrained('./pretrained')

In [37]:
results = model.predict(test_dataset.batch(tam_batch))
res = tf.nn.softmax(results.logits, axis=1).numpy()
neg_probs = [i[0] for i in res]
pos_probs = [i[1] for i in res]
pred_labels = np.argmax(res, axis=1)

In [38]:
#create pandas dataframe for better visualization
columns = ['text', 'label', 'neg', 'pos', 'pred']

df = pd.DataFrame(list(zip(test_texts, test_labels, neg_probs, pos_probs, pred_labels)),
                 columns = columns)

In [41]:
#true positives
true_positives = df.loc[(df['label'] == 1) & (df['pred'] ==1)]
true_positives.head(5)

Unnamed: 0,text,label,neg,pos,pred
250,Zentropa is the most original movie I've seen ...,1,0.016014,0.983986,1
252,*Contains spoilers due to me having to describ...,1,0.090437,0.909563,1
253,That was the first thing that sprang to mind a...,1,0.045566,0.954434,1
254,I had started to lose my faith in films of rec...,1,0.012254,0.987746,1
255,Critics need to review what they class as a qu...,1,0.008412,0.991588,1


In [42]:
#true negatives
true_negatives = df.loc[(df['label'] == 0) & (df['pred'] == 0)]
true_negatives.head(5)

Unnamed: 0,text,label,neg,pos,pred
0,"This experimental silent film, made in Switzer...",0,0.542562,0.457438,0
1,"I'm usually a fan of ""art"" and ""foreign"" films...",0,0.726813,0.273187,0
2,"Why did I buy this movie on DVD?, Well the sho...",0,0.987924,0.012076,0
3,It is difficult to rate a writer/director's fi...,0,0.958763,0.041237,0
4,o dear god i suffered having to watch this fil...,0,0.991703,0.008297,0


In [43]:
#false negatives
false_negatives = df.loc[(df['label'] == 1) & (df['pred'] == 0)]
false_negatives.head(5)

Unnamed: 0,text,label,neg,pos,pred
251,Lars Von Trier is never backward in trying out...,1,0.551477,0.448523,0
286,"Yeah, the poster is quite a horrible piece of ...",1,0.988649,0.011351,0
287,"To be honest, I had no idea what this movie wa...",1,0.734964,0.265036,0
305,I couldn't believe the comments made about the...,1,0.704697,0.295303,0
363,"The production quality, cast, premise, authent...",1,0.971803,0.028197,0


In [44]:
#false positive
false_positives = df.loc[(df['label'] == 0) & (df['pred'] == 1)]
false_positives.head(5)

Unnamed: 0,text,label,neg,pos,pred
9,The story starts off in the home of a little g...,0,0.212319,0.787681,1
12,"You may like Tim Burton's fantasies, but not i...",0,0.402035,0.597965,1
15,"This oddity contains Bunuel-like touches, but ...",0,0.486203,0.513797,1
118,"Sorry to disagree with you, but I found the DK...",0,0.012922,0.987078,1
127,I watched this episode with high hopes after s...,0,0.499478,0.500522,1


## Upload to Nuclia

In [None]:
import requests
import sys

In [None]:
#Upload to nuclia via API
BACKEND = "https://europe-1.stashify.cloud/api/v1"
KB = '/kb/c1ccf198-f634-4a8b-b867-d65381b26135'
API_KEY = "eyJ0eXAiOiJKV1QiLCJhbGciOiJSUzI1NiIsImtpZCI6InNhIn0.eyJpc3MiOiJodHRwczovL3N0YXNoaWZ5LmNsb3VkLyIsImV4cCI6MTY5ODA5Njg4NCwiaWF0IjoxNjY2NTYwODg1LCJzdWIiOiJlMzY4MTgyZS1hMzkyLTQ5N2UtOWFiMi02MzJiNzJmNmIyMDAiLCJqdGkiOiJiZjA0NzA5Mi00ZjQ3LTQ4YjctYmI0MC0zYzMwMTQxNzRiNmUiLCJrZXkiOiIxZWY2Y2VlMC1lZDQzLTQwNzMtYWQ2Yy04M2Q0OWFmNGFkZDUiLCJraWQiOiJmMjZhMTdkNC0xMmRkLTRjODktODlkNC03NGZkZGNiMzMxMGUifQ.mwEUikIyDp9V7QuvJskk2nI83teJAO-AA8t73hfm8zMi9E6YCfi9kVjnxXfaPe6GD3fZ_GxqMzbdG303CWlu-DtvHTxu84XD4tFVXh8Uayzmwfmzy5he1tn36q8UveT1kiU_BGxlGPMkn58382KaZ37jSIpuLCQYDaQr893FwbNSpl2AKteAGeq9dTAoh9-2PpwQua1gIGbU1ppG0TEp3o9WtzfFKC85FVzBGWWryGU550SR4dyrQUPMHekBVqGo8HKKwt_fQcN5Ad-Rs4WfbJ8L5IdXnLlJM74uQFw8BvcJEpMDzalFYE3CAwd3FhkZnkKQC_cWLiacZ3BuEuYKa5OcHC1ztYCRe1ChObK9F8o5_BsYlFdGUdoz67Q_pMKSD01EGij4m4vBdv4w4qpICHdPBbvNRaEbiTAX-NbG8A9jBxyu1W0eOivni8fArJ9EVQcp9KmSpYkYwUHC3E_5Jj1cM_clz2pkoiEDDg_hFufMHvoYxbIvd2rbIFMIQtVJfKd_aV9yyD1hwob0_xzFxhlBXtHRt4724uohr-0q2UnMyICg0nrnCZ9-lwXWTJ31kmw-lVwA1ohxODcZMVyQVraq1sIS6BSUoXIAIDiTYb7R1JpagJYDBGysSVNlpMXNBNtrjWTuuUaGuMoDgzQgin-CwBtMcU9Lh_ajBNjy7N0"

In [None]:
def upload(row, name, class_):
    id_ = row[0]
    pos = round(row[1][class_], 2)
    text = row[1]['text']
    
    response = requests.post(
        f'{BACKEND}{KB}/resources',
        headers={
            'content-type': 'application/json',
            'X-STF-Serviceaccount': 'Bearer ' + API_KEY,
        },
        json={
            'title': f'{id_}_{name}_{pos}',
            'texts': {'text': {'format': 'PLAIN', 'body': text}},
        },
        verify=False,
    )
    if response.status_code != 201:
        print(f"Error {response.status_code} importing {row['id']}")

In [None]:
for row in false_negatives.iterrows():
    upload(row, 'false_negatives', 'neg')
    
for row in false_positives.iterrows():
    upload(row, 'false_positives', 'pos')

for row in true_negatives.iterrows():
    upload(row, 'true_negatives', 'neg')

for row in true_positives.iterrows():
    upload(row, 'true_positives', 'pos')

## Overview

### Shap

In [63]:
def select_list(type_):
    if type_ == 'false_positives':
        return list(false_positives['text'])
    elif type_ == 'false_negatives':
        return list(false_negatives['text'])
    elif type_ == 'true_positives':
        return list(true_positives['text'])
    elif type_ == 'true_negatives':
        return list(true_negatives['text'])
    else:
        print('Error. No available type. Select false positives, negatives, true positives or negatives')

## false positives

In [94]:
false_positives_list = select_list('false_positives')
print(len(false_positives_list))

11


In [101]:
explainer = shap.Explainer(pmodel)
shap_values = explainer([false_positives_list[0]])
shap.plots.text(shap_values[:, :, 1])

  0%|          | 0/498 [00:00<?, ?it/s]


Partition explainer: 2it [04:09, 249.77s/it]              [A


In [102]:
explainer = shap.Explainer(pmodel)
shap_values = explainer([false_positives_list[1]])
shap.plots.text(shap_values[:, :, 1])

  0%|          | 0/498 [00:00<?, ?it/s]


Partition explainer: 2it [01:59, 119.56s/it]              [A
