In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load Data

In [2]:
# Directory holding the SHAP CSV files. Later cells build file paths with
# `data_path + "<file name>"`, so the trailing slash is required.
data_path = "../../data/shap/"

## Fine-tuned bert

In [3]:
# Load the per-token SHAP values of the fine-tuned bert model.
# NOTE(review): with read_csv's default NA handling, a literal field such as
# "null" or "NA" is read as NaN — confirm the token column holds no such token.
bert_shap_df = pd.read_csv(data_path + "bert_test_shap-values.csv")
# Bare last expression: renders the loaded frame as the cell output.
bert_shap_df

Unnamed: 0,text_id,token,shap_neg,shap_neut,shap_pos
0,0,[CLS],0.000000,0.000000,0.000000
1,0,17,-0.007158,0.012147,-0.004989
2,0,war,0.009573,-0.021759,0.012187
3,0,en,-0.001289,0.006610,-0.005321
4,0,w,0.002141,-0.001326,-0.000815
...,...,...,...,...,...
26729,441,beg,0.008301,-0.016289,0.007988
26730,441,eg,0.007680,-0.014102,0.006422
26731,441,nen,0.013180,-0.031078,0.017898
26732,441,.,0.000038,-0.006390,0.006353


## Fine-tuned gbert

In [4]:
# Load the per-token SHAP values of the fine-tuned gbert model.
# NOTE(review): with read_csv's default NA handling, a literal field such as
# "null" or "NA" is read as NaN — confirm the token column holds no such token.
gbert_shap_df = pd.read_csv(data_path + "gbert_test_shap-values.csv")
# Bare last expression: renders the loaded frame as the cell output.
gbert_shap_df

Unnamed: 0,text_id,token,shap_neg,shap_neut,shap_pos
0,0,[CLS],0.000000,0.000000,0.000000
1,0,17,-0.005422,0.018636,-0.013214
2,0,waren,-0.005749,0.017784,-0.012035
3,0,wir,-0.002026,0.001944,0.000081
4,0,in,-0.003754,0.010308,-0.006554
...,...,...,...,...,...
16418,441,Liebe,-0.020607,-0.363338,0.383945
16419,441,zu,0.004014,-0.026877,0.022863
16420,441,begegnen,-0.007661,-0.043362,0.051023
16421,441,.,0.000910,-0.046911,0.046001


# Group by Tokens (and Store Data)

## Fine-tuned bert

In [5]:
# Average the three SHAP columns over every occurrence of the same token
# (bert tokenization); the result is indexed by token.
token_grouped_bert_shap_df = (
    bert_shap_df
    .groupby("token")[["shap_neg", "shap_neut", "shap_pos"]]
    .mean()
)
token_grouped_bert_shap_df

Unnamed: 0_level_0,shap_neg,shap_neut,shap_pos
token,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
!,0.007913,-0.124836,0.116923
!,-0.002699,-0.038225,0.040924
"""",-0.001368,-0.001947,0.003315
(,-0.001456,0.003790,-0.002334
),0.001822,-0.004706,0.002884
...,...,...,...
’,-0.000294,-0.000728,0.001022
’,-0.004081,0.009047,-0.004966
“,-0.001390,0.002368,-0.000978
”,0.000328,-0.002791,0.002463


In [6]:
# Persist the per-token mean SHAP values (bert) as CSV inside data_path.
token_grouped_bert_shap_df.to_csv(data_path + "bert_test_shap-values_grouped_tokens.csv")

## Fine-tuned gbert

In [7]:
# Average the three SHAP columns over every occurrence of the same token
# (gbert tokenization); the result is indexed by token.
token_grouped_gbert_shap_df = (
    gbert_shap_df
    .groupby("token")[["shap_neg", "shap_neut", "shap_pos"]]
    .mean()
)
token_grouped_gbert_shap_df

Unnamed: 0_level_0,shap_neg,shap_neut,shap_pos
token,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
!,0.000018,-0.032862,0.032844
!,-0.001365,0.000405,0.000960
"""",0.000479,0.000283,-0.000762
(,0.000003,-0.000758,0.000755
),-0.000791,0.001784,-0.000993
...,...,...,...
’,0.016857,-0.017156,0.000299
’,-0.001406,-0.054415,0.055821
“,-0.000923,-0.003320,0.004243
”,-0.000469,-0.013701,0.014169


In [8]:
# Persist the per-token mean SHAP values (gbert) as CSV inside data_path.
token_grouped_gbert_shap_df.to_csv(data_path + "gbert_test_shap-values_grouped_tokens.csv")

# Group by N-grams (and Store Data)

In [9]:
def get_ngram_data(shap_df, n=1):
    """Sum SHAP values over every run of ``n`` consecutive rows of one sample.

    Rows are taken in positional order. A window of ``n`` consecutive rows
    becomes an n-gram only if all of its rows share the same ``text_id``;
    windows that straddle two samples are skipped.

    Parameters
    ----------
    shap_df : pandas.DataFrame
        Frame with the columns ``text_id``, ``token``, ``shap_neg``,
        ``shap_neut`` and ``shap_pos``, one row per token.
    n : int, default 1
        Number of consecutive tokens per n-gram.

    Returns
    -------
    list of list
        One ``[text_id, token, shap_neg, shap_neut, shap_pos]`` entry per
        n-gram. ``token`` is the window's tokens joined with ``"+"`` and each
        SHAP field is the sum over the window. The list is empty when
        ``n < 1`` or the frame has fewer than ``n`` rows.
    """
    if n < 1:
        return []

    window_count = len(shap_df) - n + 1
    if window_count <= 0:
        return []

    # Extract each column once as a plain list; slicing lists per window avoids
    # building a new DataFrame (and three Series) on every iteration.
    text_ids = shap_df["text_id"].tolist()
    tokens = shap_df["token"].tolist()
    neg_values = shap_df["shap_neg"].tolist()
    neut_values = shap_df["shap_neut"].tolist()
    pos_values = shap_df["shap_pos"].tolist()

    ngram_data = []
    for start in range(window_count):
        stop = start + n
        window_ids = text_ids[start:stop]
        if len(set(window_ids)) > 1:
            continue  # do not concat values from different samples

        ngram_data.append([
            window_ids[0],
            "+".join(tokens[start:stop]),
            sum(neg_values[start:stop]),
            sum(neut_values[start:stop]),
            sum(pos_values[start:stop]),
        ])
    return ngram_data

## Fine-tuned bert

In [10]:
# Build every within-sample bigram of the bert tokens, then average the summed
# SHAP values over all occurrences of the same bigram.
# The column labels are spelled out in the order get_ngram_data emits its
# fields, so they do not depend on the column layout of bert_shap_df.
bigram_bert_shap_df = pd.DataFrame(
    get_ngram_data(bert_shap_df, 2),
    columns=["text_id", "token", "shap_neg", "shap_neut", "shap_pos"],
)
bigram_grouped_bert_shap_df = (
    bigram_bert_shap_df
    .groupby("token")[["shap_neg", "shap_neut", "shap_pos"]]
    .mean()
)
bigram_grouped_bert_shap_df

Unnamed: 0_level_0,shap_neg,shap_neut,shap_pos
token,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
! +Da,-0.003134,-0.042503,0.045637
! +Du,0.018667,-0.123530,0.104863
! +h,0.001734,-0.037391,0.035657
! +l,0.014448,-0.071660,0.057212
!+/,0.025036,-0.059316,0.034280
...,...,...,...
’+s,0.000164,-0.002108,0.001944
“+B,0.014943,-0.010519,-0.004424
“+Mei,-0.008718,0.016785,-0.008067
” +–,0.000874,-0.005806,0.004932


In [11]:
# Persist the per-bigram mean SHAP values (bert) as CSV inside data_path.
bigram_grouped_bert_shap_df.to_csv(data_path + "bert_test_shap-values_grouped_bigrams.csv")

In [12]:
# Build every within-sample trigram of the bert tokens, then average the summed
# SHAP values over all occurrences of the same trigram.
# The column labels are spelled out in the order get_ngram_data emits its
# fields, so they do not depend on the column layout of bert_shap_df.
trigram_bert_shap_df = pd.DataFrame(
    get_ngram_data(bert_shap_df, 3),
    columns=["text_id", "token", "shap_neg", "shap_neut", "shap_pos"],
)
trigram_grouped_bert_shap_df = (
    trigram_bert_shap_df
    .groupby("token")[["shap_neg", "shap_neut", "shap_pos"]]
    .mean()
)
trigram_grouped_bert_shap_df

Unnamed: 0_level_0,shap_neg,shap_neut,shap_pos
token,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
! +Da+ß,-0.001352,-0.070239,0.071591
! +Du +will,0.029652,-0.160327,0.130675
! +h+ö,0.003500,-0.102877,0.099377
! +l+ä,0.025056,-0.090563,0.065507
!+/+/,0.022029,-0.060862,0.038833
...,...,...,...
’+s +und,-0.002888,0.008178,-0.005290
“+B+ran,0.031528,-0.025767,-0.005761
“+Mei+n,-0.016298,0.033564,-0.017266
” +– +und,0.001419,-0.008821,0.007401


In [13]:
# Persist the per-trigram mean SHAP values (bert) as CSV inside data_path.
trigram_grouped_bert_shap_df.to_csv(data_path + "bert_test_shap-values_grouped_trigrams.csv")

## Fine-tuned gbert

In [14]:
# Build every within-sample bigram of the gbert tokens, then average the summed
# SHAP values over all occurrences of the same bigram.
# The column labels are spelled out in the order get_ngram_data emits its
# fields, so they do not depend on the column layout of gbert_shap_df.
bigram_gbert_shap_df = pd.DataFrame(
    get_ngram_data(gbert_shap_df, 2),
    columns=["text_id", "token", "shap_neg", "shap_neut", "shap_pos"],
)
bigram_grouped_gbert_shap_df = (
    bigram_gbert_shap_df
    .groupby("token")[["shap_neg", "shap_neut", "shap_pos"]]
    .mean()
)
bigram_grouped_gbert_shap_df

Unnamed: 0_level_0,shap_neg,shap_neut,shap_pos
token,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
! +Da,-0.000915,-0.030004,0.030920
! +Du,0.002015,-0.040648,0.038633
! +höchst,-0.008332,-0.116163,0.124495
! +lä,-0.006692,-0.033189,0.039880
!+/,-0.003275,0.000155,0.003120
...,...,...,...
’+s,0.031522,-0.037402,0.005880
“+Bran,-0.001555,-0.015329,0.016884
“+Mein,-0.001148,-0.001153,0.002302
” +–,-0.000911,-0.026753,0.027665


In [15]:
# Persist the per-bigram mean SHAP values (gbert) as CSV inside data_path.
bigram_grouped_gbert_shap_df.to_csv(data_path + "gbert_test_shap-values_grouped_bigrams.csv")

In [16]:
# Build every within-sample trigram of the gbert tokens, then average the
# summed SHAP values over all occurrences of the same trigram.
# The column labels are spelled out in the order get_ngram_data emits its
# fields, so they do not depend on the column layout of gbert_shap_df.
trigram_gbert_shap_df = pd.DataFrame(
    get_ngram_data(gbert_shap_df, 3),
    columns=["text_id", "token", "shap_neg", "shap_neut", "shap_pos"],
)
trigram_grouped_gbert_shap_df = (
    trigram_gbert_shap_df
    .groupby("token")[["shap_neg", "shap_neut", "shap_pos"]]
    .mean()
)
trigram_grouped_gbert_shap_df

Unnamed: 0_level_0,shap_neg,shap_neut,shap_pos
token,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
! +Da+ß,-0.002124,-0.077670,0.079794
! +Du +willst,0.003419,0.166016,-0.169435
! +höchst +glück,-0.009788,-0.559148,0.568936
! +lä+che,-0.008613,-0.078021,0.086634
!+/+/,-0.004895,0.007163,-0.002268
...,...,...,...
’+s +und,0.100604,-0.115150,0.014547
“+Bran+nte,-0.002322,-0.024074,0.026396
“+Mein +Sohn,-0.001796,-0.002032,0.003828
” +– +und,-0.001483,-0.040274,0.041758


In [17]:
# Persist the per-trigram mean SHAP values (gbert) as CSV inside data_path.
trigram_grouped_gbert_shap_df.to_csv(data_path + "gbert_test_shap-values_grouped_trigrams.csv")