In [1]:
import torch
import pandas as pd
import numpy as np
from tqdm import tqdm
from multiprocessing import Pool, cpu_count

In [2]:
from datasets import get_agnews_prepraped

# Laden von Testdatensätzen
device = 'cpu'
size_train_batch = 64
size_test_batch = 1028

train_set, test_set, vocab_size, n_classes, vocab = get_agnews_prepraped(random_state=42,
                                                                   batch_sizes=(size_train_batch, size_test_batch))
# Auswahl des gewählten Testbatches mit Text, Label
X_test, Y_test = next(iter(test_set))

looking for vocab in ./datasets/AG_NEWS/vocab.torch
Load Prepared Data


In [3]:
from models import SentenceCNN

# Laden einer trainierten Modells
embed_dim = 128
model = SentenceCNN(n_classes=n_classes, embed_dim=embed_dim, vocab_size=vocab_size)
model.load_state_dict(torch.load('./results/001/model_1'))
loss_fun = torch.nn.CrossEntropyLoss()

In [4]:
from utils import create_df

# Umwandlung X_test zu Dataframe in Form 0/1 (OHE like)
data_df = create_df(X_test, vocab_size)

data_df

87796 columns dropped
data shape: (1028, 8016)


Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,94179,94617,94732,95067,95105,95111,95538,95676,95729,95773
0,0,1,1,1,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,1,1,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,1,1,1,1,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,1,1,1,1,1,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
4,1,1,1,1,1,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1023,1,1,1,0,1,0,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1024,0,1,1,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1025,1,1,1,1,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1026,0,1,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Test Labels zu Series
labels = pd.Series(Y_test, dtype='int64')
labels

0       0
1       2
2       1
3       3
4       1
       ..
1023    2
1024    1
1025    2
1026    1
1027    1
Length: 1028, dtype: int64

In [6]:
from utils import _get_gradients_and_outputs

# Berechung der Gradienten und Prediction des Test Datensatz auf dem Modell
_y_outs, _grads, _y_pred, _acc = _get_gradients_and_outputs(X_test, Y_test, model, loss_fun)
print(_acc)

# Test Predictions to Series
preds = pd.Series(_y_pred, dtype='int64')
preds

RuntimeError: grad can be implicitly created only for scalar outputs

In [7]:
from sklearn.metrics import f1_score, precision_score, recall_score

# Testen unterschiedlicher Methoden

f_score = f1_score(labels, preds, average='macro')
prec = precision_score(labels, preds, average='macro')
recall = recall_score(labels, preds, average='macro')

print(f'recall: {recall}\nprecision: {prec}\nf_score: {f_score}\n')

recall: 0.8763009702411639
precision: 0.875818808256605
f_score: 0.8750922597600582



In [8]:
# Gradienten in Numpy array mit Clean-up von Embedding dim
token_grads_np = np.zeros((_grads.shape[0], _grads.shape[1], 2))
for i in range(_grads.shape[0]):
    for j in range(_grads.shape[1]):
        token_grads_np[i][j][0] = X_test[i][j]
        token_grads_np[i][j][1] = _grads[i][j].sum()

In [9]:
from helper_func import *

if len(intervals_dict) == 0:
    compute_intervals(intervals_dict, data_df, 1)
    
print(len(intervals_dict))
print(intervals_dict)

# unique_values = X_train[name].unique()
#       if len(unique_values) > 2 or max(unique_values) != 1 or min(unique_values) != 0:
#
# columns_name, sind in ohe like nur 1 lang (tokenizer id des wort), greift hier also nicht

0
{}


In [10]:
itemset = set()
encoded_vals = []
shap_threshold = 0
num_cores = cpu_count()

pos_label = '1'
neg_label = '0'

for feature in data_df.columns.to_list():
    itemset.add(str(feature))
    
itemset.add(pos_label)
itemset.add(neg_label)

In [11]:
print(itemset)

{'12207', '74794', '1245', '37743', '7778', '3198', '6634', '470', '33651', '767', '8119', '3910', '72464', '1279', '22641', '85853', '26814', '604', '9592', '3723', '2822', '7287', '18398', '19152', '7823', '24208', '12107', '1665', '8095', '26098', '857', '1860', '5758', '4722', '27076', '32127', '1365', '6527', '6151', '11913', '3783', '3911', '17273', '1847', '45421', '741', '1714', '44816', '3637', '5156', '1295', '27103', '2440', '1580', '11336', '6983', '1234', '6993', '8782', '4055', '6598', '861', '1092', '6426', '355', '2190', '9904', '3831', '1506', '3820', '6365', '3135', '50588', '14110', '65420', '4484', '184', '7838', '40283', '11608', '2534', '1715', '7363', '9046', '1041', '3926', '2057', '4993', '34095', '1161', '26191', '83', '290', '972', '4258', '19349', '12734', '11607', '5231', '6825', '559', '4730', '2453', '22314', '6378', '1563', '19525', '4135', '31625', '53332', '27160', '16137', '641', '2234', '18061', '1833', '14632', '20858', '70', '12712', '1364', '7749'

In [12]:
for indx in tqdm(range(len(preds))):
    instance_features = data_df.iloc[[indx]].to_dict(orient='records')[0]

100%|██████████████████████████████████████████████████████████████████████████████| 1028/1028 [09:39<00:00,  1.77it/s]


In [14]:
print(instance_features)

{'1': 0, '2': 1, '3': 1, '4': 1, '5': 0, '6': 1, '7': 1, '8': 1, '9': 1, '10': 0, '11': 0, '12': 0, '13': 1, '14': 0, '15': 0, '16': 0, '17': 0, '18': 0, '19': 0, '20': 0, '21': 0, '22': 0, '23': 0, '24': 0, '25': 1, '26': 0, '27': 1, '28': 0, '29': 0, '30': 0, '31': 0, '32': 0, '33': 0, '34': 0, '35': 0, '36': 0, '37': 0, '38': 0, '39': 0, '40': 0, '41': 0, '42': 0, '43': 0, '44': 0, '45': 0, '46': 0, '47': 0, '48': 0, '49': 0, '50': 0, '51': 0, '52': 0, '53': 0, '54': 0, '55': 0, '56': 0, '57': 0, '58': 0, '59': 0, '60': 0, '61': 0, '62': 0, '63': 0, '64': 0, '65': 0, '66': 0, '67': 0, '68': 0, '69': 0, '70': 0, '71': 0, '72': 0, '73': 0, '74': 0, '75': 0, '76': 0, '77': 0, '78': 0, '79': 0, '80': 0, '81': 0, '82': 0, '83': 0, '84': 0, '85': 0, '86': 0, '87': 0, '88': 0, '89': 0, '90': 0, '91': 0, '92': 0, '93': 0, '94': 0, '95': 0, '96': 0, '97': 0, '98': 0, '99': 0, '100': 0, '101': 0, '102': 0, '103': 0, '104': 0, '105': 0, '106': 0, '107': 0, '108': 0, '109': 0, '110': 0, '111': 

In [None]:
# from CEGA_w_lime

for indx in tqdm(range(len(pred))):
    
    pos_queue.put(pos_label)
    neg_queue.put(neg_label)
    
    exp = explainer.explain_instance(X_dev.values[indx], clf.predict_proba, num_features=num_features)
    lime_names = [clean_name(name) for name, val in exp.as_list()]
    lime_vals = [val for name, val in exp.as_list()]
    
    instance_features = X_dev.iloc[[indx]].to_dict(orient='records')[0]
    feature_vals = [instance_features[name] for name in lime_names]
    
    zipped = zip(lime_vals, feature_vals,
                 lime_names, [shap_threshold]*len(lime_names))

    p.map(get_relevant_features, zipped)
    
    append_to_encoded_vals(pos_queue, itemset, encoded_vals)
    append_to_encoded_vals(neg_queue, itemset, encoded_vals)

ohe_df = pd.DataFrame(encoded_vals)

In [None]:
# from Rosario cega_utils

exp = gradsPerIteration[indx]#[item[indx] for item in sample] #normalize featureListALL ?

instance_features = data_df.iloc[[indx]].to_dict(orient='records')[0]
            
feature_vals = [instance_features[name] for name in featureNames] #put here grads#   feature values ?? 
zipped = zip(exp, feature_vals, featureNames, [shap_threshold]*len(featureNames))

p.map(get_relevant_features, zipped)
append_to_encoded_vals(pos_queue, itemset, encoded_vals)
append_to_encoded_vals(neg_queue, itemset, encoded_vals)

In [12]:
from mlxtend.frequent_patterns import apriori

apriori(data_df, min_support=0.2, use_colnames=True)

Unnamed: 0,support,itemsets
0,0.267510,(1)
1,0.999027,(2)
2,0.976654,(3)
3,0.806420,(4)
4,0.711089,(5)
...,...,...
527,0.226654,"(2, 3, 5, 9, 7, 4)"
528,0.219844,"(2, 3, 16, 5, 15, 4)"
529,0.255837,"(2, 3, 16, 15, 17, 4)"
530,0.224708,"(2, 3, 16, 5, 15, 17)"


In [14]:
test_df = data_df.assign(category=preds)
test_df

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,94617,94732,95067,95105,95111,95538,95676,95729,95773,category
0,0,1,1,1,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,1,1,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,2
2,0,1,1,1,1,1,1,0,0,1,...,0,0,0,0,0,0,0,0,0,1
3,1,1,1,1,1,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,3
4,1,1,1,1,1,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1023,1,1,1,0,1,0,1,0,1,1,...,0,0,0,0,0,0,0,0,0,2
1024,0,1,1,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1025,1,1,1,1,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,2
1026,0,1,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [26]:
world_df = test_df.loc[test_df['category'] == 0]
drop_list = [col for col in world_df.columns if sum(world_df[col]) <= 0]
world_df.drop(drop_list, axis=1, inplace=True)
world_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,82876,83286,83469,84847,85647,87375,93099,93163,94617,95773
0,0,1,1,1,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
12,1,1,1,1,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
17,0,1,1,1,1,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
22,1,1,1,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
24,0,1,1,1,1,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1000,0,1,1,1,1,1,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1011,0,1,1,1,0,1,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1013,0,1,1,0,1,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1015,1,1,1,1,1,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0


In [33]:
apriori(world_df, min_support=0.5, use_colnames=True)

Unnamed: 0,support,itemsets
0,1.0,(2)
1,0.975309,(3)
2,0.839506,(4)
3,0.753086,(5)
4,0.646091,(6)
5,0.658436,(7)
6,0.621399,(8)
7,0.679012,(9)
8,0.526749,(17)
9,0.975309,"(2, 3)"


In [31]:
sports_df = test_df.loc[test_df['category'] == 1]
drop_list = [col for col in sports_df.columns if sum(sports_df[col]) <= 0]
sports_df.drop(drop_list, axis=1, inplace=True)
drop_list = (['category'])
sports_df.drop(drop_list, axis=1, inplace=True)
sports_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,87529,91470,91914,92114,92966,93131,93552,93639,93948,94101
2,0,1,1,1,1,1,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,1,1,1,1,1,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
5,0,1,1,1,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
9,0,1,1,1,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
11,1,1,1,1,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1019,1,1,1,1,1,1,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1020,1,1,1,0,1,0,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1024,0,1,1,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1026,0,1,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [34]:
apriori(sports_df, min_support=0.5, use_colnames=True)

Unnamed: 0,support,itemsets
0,1.0,(2)
1,0.972603,(3)
2,0.880137,(4)
3,0.64726,(5)
4,0.571918,(6)
5,0.55137,(7)
6,0.619863,(9)
7,0.972603,"(2, 3)"
8,0.880137,"(2, 4)"
9,0.64726,"(5, 2)"
