In [1]:
# Import Dataset, joint model and pytorch train helpers
import transformers
%load_ext autoreload
%autoreload 2
import sys
import torch.optim as optim
import torch
from dataset_parser.wich_parser import WichDataset
from joint_model import JointModel
from util.WeightedRandomSampler import WeightedRandomSampler

In [2]:
# Open data set and split in train and dev and instantiate data loaders
trainset, devset, testset  = WichDataset(fixed_set="train"), WichDataset(fixed_set="val"), WichDataset(fixed_set="test")
res = len(trainset) + len(devset) + len(testset)
print (res)

Loading Wich train set from fixed split.
Successfully loaded wich dataset.
Loading Wich val set from fixed split.
Successfully loaded wich dataset.
Loading Wich test set from fixed split.
Successfully loaded wich dataset.
68443


In [3]:
BATCH_SIZE = 256
sampler_trainset = WeightedRandomSampler(trainset, 5000)
sampler_devset = WeightedRandomSampler(devset, 1000)

trainloader = torch.utils.data.DataLoader(trainset, batch_size=BATCH_SIZE,
                                        num_workers=6, sampler = sampler_trainset)
devloader = torch.utils.data.DataLoader(devset, batch_size=1,
                                        num_workers=6)
testloader = torch.utils.data.DataLoader(testset, batch_size=1,
                                        num_workers=6)

In [4]:
# Instantiate joint model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
joint_model = JointModel(dataset="wich", random_subset_size=5000)
joint_model.to(device)

Successfully initialized TweetNetwork submodel
Successfully initialized TweetClassifier submodel
other done
offensive done
Successfully initialized TweetHistory submodel
Successfully initialized last final classification layer


JointModel(
  (SAGE): GraphSAGE(
    (model): SAGE(
      (convs): ModuleList(
        (0): SAGEConv(2, 32)
        (1): SAGEConv(32, 32)
        (2): SAGEConv(32, 32)
      )
    )
  )
  (BERT): TweetBERT(
    (model): DistilBertForSequenceClassification(
      (distilbert): DistilBertModel(
        (embeddings): Embeddings(
          (word_embeddings): Embedding(31102, 768, padding_idx=0)
          (position_embeddings): Embedding(512, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (transformer): Transformer(
          (layer): ModuleList(
            (0): TransformerBlock(
              (attention): MultiHeadSelfAttention(
                (dropout): Dropout(p=0.1, inplace=False)
                (q_lin): Linear(in_features=768, out_features=768, bias=True)
                (k_lin): Linear(in_features=768, out_features=768, bias=True)
                (v_lin): Linear(in_features=768, 

In [5]:
# load weights from trained model
if "cuda" in str(device):
    pass
    joint_model.load_state_dict(torch.load('../../models/joint_model_wich_nulled_network_10epochs.model'))
else:
    print("check-else")
    joint_model.load_state_dict(torch.load('../../models/joint_model_wich_nulled_network_10epochs.model', map_location=torch.device('cpu')))

In [5]:
# Training settings
criterion = torch.nn.CrossEntropyLoss()
optimizer = optim.Adam(joint_model.parameters(), lr=0.001)

In [22]:
# OPTIONAL: Train joint model
print ("Batch size: {}".format(BATCH_SIZE))
for epoch in range(10):
    #break
    running_loss = 0.0
    for i, data in enumerate(trainloader):
        input_ids, attention_mask, user_id, tweet_label = data['input_ids'], data['attention_mask'], data['userid'], data['label'] #TODO fix this mess
        input_ids, attention_mask, tweet_label = input_ids.to(device),  attention_mask.to(device), tweet_label.to(device)
        predictions = joint_model(input_ids,attention_mask, user_id)
        loss = criterion(predictions, tweet_label)
        loss.backward()
        running_loss += loss
        optimizer.step()
        optimizer.zero_grad()
        if i % 4 == 0:
            print ("Epoch {}: {} tweets processed".format(epoch, i*BATCH_SIZE))
#torch.save(joint_model.state_dict(), "../../models/joint_model_wich_nulled_network_10epochs.model")

Batch size: 256


NameError: name 'criterion' is not defined

In [8]:
# Obtain predictions for the dev/validation set.
y_pred, y_true = [],[]
output_for_print = []
for i, data in enumerate(devloader):
    joint_model.eval()
    input_ids, attention_mask, user_id, tweet_label = data['input_ids'], data['attention_mask'], data['userid'], data['label'] #TODO fix this mess
    input_ids, attention_mask, tweet_label = input_ids.to(device),  attention_mask.to(device), tweet_label.to(device)
    predictions = joint_model(input_ids,attention_mask, user_id)
    predictions = torch.nn.functional.softmax(predictions)
    max_pred = torch.argmax(predictions)
    y_pred.append(max_pred.item())
    y_true.append(tweet_label.item())
    output_for_print.append([i,user_id.item(),tweet_label.item(),max_pred.item()])

0


  predictions = torch.nn.functional.softmax(predictions)


1000
2000
3000
4000
5000
6000
7000


In [25]:
# Print metrics
from sklearn.metrics import confusion_matrix, classification_report
#print (confusion_matrix(y_true=y_true, y_pred=y_pred))
#print (classification_report(y_true=y_true, y_pred=y_pred))

In [18]:
# Obtain predictions for the test set.
test_y_pred, test_y_true = [],[]
for i, data in enumerate(testloader):
    joint_model.eval()
    input_ids, attention_mask, user_id, tweet_label = data['input_ids'], data['attention_mask'], data['userid'], data['label'] #TODO fix this mess
    input_ids, attention_mask, tweet_label = input_ids.to(device),  attention_mask.to(device), tweet_label.to(device)
#    print(input_ids, tweet_label)
    test_predictions = joint_model(input_ids,attention_mask, user_id,shap=True,set_GRAPH_to_NULL = True)
    test_predictions = torch.nn.functional.softmax(test_predictions)
    test_max_pred = torch.argmax(test_predictions)
    test_y_pred.append(test_max_pred.item())
    test_y_true.append(tweet_label.item())

Implicit dimension choice for softmax has been deprecated. Change the call to include dim=X as an argument.


In [19]:
# Print metrics
print (confusion_matrix(y_true=test_y_true, y_pred=test_y_pred))
print (classification_report(y_true=test_y_true, y_pred=test_y_pred, digits=3))

NameError: name 'confusion_matrix' is not defined

In [10]:
# SHAP computations with class ShapExplainer
from SHAP.shap import ShapExplainer

In [14]:
# Shapley configuration
tweet_as_one = True
vocab_as_one=True
network_as_one = True
untokenize = True
dataset = 'wich'
tokenizer_b = transformers.DistilBertTokenizer.from_pretrained('distilbert-base-german-cased')

In [15]:
model_explainer = ShapExplainer(joint_model, tweet_as_one = tweet_as_one, vocab_as_one=vocab_as_one, network_as_one = network_as_one, dataset = dataset, untokenize = untokenize)


In [20]:
shap_output = []
for i, data in enumerate(testloader):
    joint_model.eval()
    input_ids, attention_mask, user_id, tweet_label = data['input_ids'], data['attention_mask'], data['userid'], data['label'] #TODO fix this mess
    input_ids, attention_mask, tweet_label = input_ids.to(device),  attention_mask.to(device), tweet_label.to(device)
    test_predictions = joint_model(input_ids,attention_mask, user_id)
    test_predictions = torch.nn.functional.softmax(test_predictions)
    test_max_pred = torch.argmax(test_predictions)
    shapley_values, predicted_class, feature_distribution, vocab_indices = model_explainer.approximate_shap_values(input_ids, attention_mask, user_id)
    res = tokenizer_b.convert_ids_to_tokens(input_ids[0], skip_special_tokens = True)
    res = tokenizer_b.convert_tokens_to_string(res)
    tweet_none = shapley_values[0,0].item()
    vocab_none = shapley_values[0,1].item()
    network_none = shapley_values[0,2].item()
    
    shap_output.append([i,user_id.item(),tweet_label.item(),test_max_pred.item(),tweet_none,vocab_none,network_none,res])
    if i % 250 == 0:
        print(i)
# Now use the blocks specified in SHAP_plots.ipynb to get SHAP visualizations for the data.

Implicit dimension choice for softmax has been deprecated. Change the call to include dim=X as an argument.
Implicit dimension choice for softmax has been deprecated. Change the call to include dim=X as an argument.
Implicit dimension choice for softmax has been deprecated. Change the call to include dim=X as an argument.
Implicit dimension choice for softmax has been deprecated. Change the call to include dim=X as an argument.


0


KeyboardInterrupt: 

In [17]:
import pandas as pd
df = pd.DataFrame(shap_output)
df.to_excel("shap.xlsx")

In [26]:

# Print metrics
print (confusion_matrix(y_true=test_y_true, y_pred=test_y_pred))
print (classification_report(y_true=test_y_true, y_pred=test_y_pred, digits=3))

[[6296  251]
 [ 173 1181]]
              precision    recall  f1-score   support

           0      0.973     0.962     0.967      6547
           1      0.825     0.872     0.848      1354

    accuracy                          0.946      7901
   macro avg      0.899     0.917     0.908      7901
weighted avg      0.948     0.946     0.947      7901

