In [1]:
import add_path
import torch
import torch.nn as nn
import torch.nn.functional as F
import pickle

In [2]:
from transformers import pipeline
from transformers.pipelines import PIPELINE_REGISTRY
from tucore_gcn_bert_tokenizer import SpeakerBertTokenizer
from tucore_gcn_bert_pipeline import ConversationalSequenceClassificationPipeline
from tucore_gcn_bert_modelling import TUCOREGCN_BertForSequenceClassification, TUCOREGCN_BertConfig
import os
PIPELINE_REGISTRY.register_pipeline(
    "conversational-sequence-classification",
    pipeline_class=ConversationalSequenceClassificationPipeline
)
speaker_tokenizer = SpeakerBertTokenizer.from_pretrained('bert-base-uncased')
model = TUCOREGCN_BertForSequenceClassification(TUCOREGCN_BertConfig.from_json_file("../models/BERT/tucoregcn_bert_mlc.json"))
model.cuda()
model.load_state_dict(torch.load("../TUCOREGCN_BERT_DialogRE/tucoregcn_pytorch_model.pt"))
model.cuda()
classifier = pipeline("conversational-sequence-classification", model=model, tokenizer=speaker_tokenizer, device="cuda:0", n_class=36, max_seq_length=512)

d:\projects\affect\TUCORE-GCN\.venv\Lib\site-packages\dgl\dgl.dll


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'SpeakerBertTokenizer'.


In [11]:
from tucore_gcn_bert_processor import SpeakerRelation, Conversation, Message

c = Conversation(
	messages=[
		Message("Speaker 1", "Howdy! I'm Flowey, Flowey the Flower!"),
		Message("Speaker 2", "Hello Flowey. I'm your very best friend!"),
		Message("Speaker 2", "You're new to the underground, aren'tcha?"),
	],
	speaker_relations=[
		SpeakerRelation("Speaker 1", "Speaker 2")
	]
)

labels, scores, logits = classifier(c).values()
print(labels, scores)

['LABEL_12'] [0.3450720012187958]


In [None]:
from tucore_gcn_transformers.tucore_gcn_bert_processor import DialogRE
gen = DialogRE()._generate_examples("../datasets/DialogRE/dev.json", "dev")

In [None]:
idx, test_entry = next(gen)
test_entry

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'SpeakerBertTokenizer'.


{'dialog': "{entity_2} hey!\n{speaker_2} hey.\n{speaker_3} hey, man. what's up?\n{entity_2} maybe you can tell me. my agent would like to know why i didn't show up at the audition i didn't know i had today. the first good thing she gets me in weeks. how could you not give me the message?!\n{speaker_3} well, i'll tell ya i do enjoy guilt, but, ah, it wasn't me.\n{speaker_2} yes, it was! it was him! uh huh! okay, it was me!\n{entity_2} how is it you?\n{speaker_2} well, it was just, it was all so crazy, you know. i mean, chandler was in the closet, counting to 10, and he was up to 7 and i hadn't found a place to hide yet. i-i-i meant to tell you, and i wrote it all down on my hand. see, all of it.\n{entity_2} yep, that's my audition.\n{speaker_4} see, now this is why i keep notepads everywhere.\n{speaker_2} yep, and that's why we don't invite you to play.\n{speaker_5} what is the great tragedy here? you go get yourself another appointment.\n{entity_2} well, estelle tried, you know. the ca

In [None]:
test_entry['relation']

<tucore_gcn_transformers.tucore_gcn_bert_processor.SpeakerRelation at 0x225187adf90>

In [None]:
(
            tokens,
            input_ids,
            input_mask,
            segment_ids,
            speaker_ids,
            mention_ids,
            turn_masks,
            graph,
        ) = classifier.preprocess(c, 46, 512)

['[CLS]', '{entity_1}', 'how', '##dy', '!', 'i', "'", 'm', 'flow', '##ey', ',', 'flow', '##ey', 'the', 'flower', '!', '{entity_2}', 'hello', 'flow', '##ey', '.', 'i', "'", 'm', 'your', 'very', 'best', 'friend', '!', '{entity_2}', 'you', "'", 're', 'new', 'to', 'the', 'underground', ',', 'aren', "'", 'tc', '##ha', '?', '[SEP]', '{entity_1}', '[SEP]', '{entity_2}', '[SEP]'] [101, 11, 2129, 5149, 999, 1045, 1005, 1049, 4834, 3240, 1010, 4834, 3240, 1996, 6546, 999, 12, 7592, 4834, 3240, 1012, 1045, 1005, 1049, 2115, 2200, 2190, 2767, 999, 12, 2017, 1005, 2128, 2047, 2000, 1996, 5230, 1010, 4995, 1005, 22975, 3270, 1029, 102, 11, 102, 12, 102] [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0] [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1] [0, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11

In [None]:
(
            input_ids,
            segment_ids,
            input_mask,
            speaker_ids,
            graph,
            mention_ids,
			labels,
            turn_masks,
        ) = (torch.LongTensor(input_ids).to("cuda:0"),
torch.LongTensor(segment_ids).to("cuda:0"),
torch.LongTensor(input_mask).to("cuda:0"),
torch.LongTensor(speaker_ids).to("cuda:0"),
[g.to("cuda:0") for g in graph],
torch.LongTensor(mention_ids).to("cuda:0"),
None,
torch.LongTensor(turn_masks).to("cuda:0"))

In [6]:
bert_output = model.tucoregcn_bert.bert(input_ids, speaker_ids=speaker_ids, token_type_ids=segment_ids, encoder_attention_mask=input_mask)
bert_output

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[ 0.3879,  0.5708, -0.5046,  ..., -0.6117, -0.8277,  0.1304],
         [-0.0450,  0.9382,  0.0936,  ...,  0.0105, -1.4474, -0.6745],
         [-0.9549, -0.3507,  0.3520,  ...,  0.8504, -1.5653, -0.8183],
         ...,
         [-0.6475,  0.6178, -0.7366,  ..., -0.9329, -0.9559,  0.2092],
         [-0.6849,  0.6250, -0.6169,  ..., -0.6115, -0.8303,  0.0150],
         [ 0.5369,  0.6531, -0.5462,  ..., -0.8428, -0.9308, -0.0714]]],
       device='cuda:0', grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[ 6.2078e-01,  5.6908e-01,  9.4322e-01,  8.3594e-01, -2.7726e-01,
         -9.6566e-01, -6.6047e-01, -5.1129e-01,  1.3318e-01, -7.5890e-01,
         -6.3225e-01,  7.0253e-01, -3.1610e-01, -4.3767e-01, -5.5505e-01,
          5.9546e-01,  3.6387e-01,  4.5363e-01, -6.1991e-01,  1.8072e-01,
         -7.1707e-02, -9.5231e-01,  8.2422e-01, -9.5213e-01, -7.3405e-01,
          4.1658e-01,  3.5350e-01, -5.6004e-01,  2

In [7]:
sequence_outputs, pooled_outputs = (
		bert_output.last_hidden_state,
		bert_output.pooler_output,
	)
sequence_outputs, attn = model.tucoregcn_bert.turnAttention(
	sequence_outputs, sequence_outputs, sequence_outputs, turn_masks
)
attn

tensor([[[[1.1111, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          [0.0000, 0.0000, 0.0466,  ..., 0.0000, 0.0000, 0.0000],
          [0.0000, 0.0365, 0.0429,  ..., 0.0000, 0.0000, 0.0000],
          ...,
          [0.0000, 0.0000, 0.0000,  ..., 1.1111, 0.0000, 0.0000],
          [0.0000, 0.0000, 0.0000,  ..., 0.0000, 1.1111, 0.0000],
          [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 1.1111]],

         [[1.1111, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          [0.0000, 0.0394, 0.0419,  ..., 0.0000, 0.0000, 0.0000],
          [0.0000, 0.0365, 0.0401,  ..., 0.0000, 0.0000, 0.0000],
          ...,
          [0.0000, 0.0000, 0.0000,  ..., 1.1111, 0.0000, 0.0000],
          [0.0000, 0.0000, 0.0000,  ..., 0.0000, 1.1111, 0.0000],
          [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 1.1111]],

         [[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          [0.0000, 0.0441, 0.0398,  ..., 0.0000, 0.0000, 0.0000],
          [0.0000, 0.0000, 0.0483,  ..., 0

In [8]:
def apply_along_axis(function, x, axis: int = 0):
	return torch.stack([
	    function(x_i) for x_i in torch.unbind(x, dim=axis)
	], dim=axis)

In [9]:
def normalize_attns(v):
	v_min, v_max = v[v>0.01].min(), v[v>0.01].max()
	new_min, new_max = 0, 0.5

	v_p = (v - v_min)/(v_max - v_min)*(new_max - new_min) + new_min
	v_p[v_p<=0] = 0
	return v_p

In [10]:
lim_attn_bert = tuple(attn[:, :, 1:43, 1:43].expand(1,-1,-1,-1,-1))
lim_attn_bert[0].shape

torch.Size([1, 12, 42, 42])

In [11]:
def normalise_all_attns(v):
	test = [[list(torch.unbind(entry_c, dim=0)) for entry_c in entry_b] for entry_b in [torch.unbind(entry_a, dim=0) for entry_a in torch.unbind(v, dim=0)]]
	for i in range(len(test)):
		for j in range(len(test[0])):
			for k in range(len(test[0][0])):
				test[i][j][k] = normalize_attns(test[i][j][k])
	for_illustration = torch.stack([torch.stack(entry_c, dim=0) for entry_c in [[torch.stack(entry_b, dim=0) for entry_b in entry_a] for entry_a in test]], dim=0)
	return for_illustration

In [12]:
norm_attn = [normalise_all_attns(entry) for entry in lim_attn_bert]

In [13]:
from tucore_gcn_transformers.tucore_gcn_bert_view import model_view
html = model_view(norm_attn, tokens[0][1:43], html_action='return')

In [14]:
with open('data.html', 'w') as file:
	file.write(html.data)

In [150]:
attn[:,:,1:43,1:43].max()

tensor(0.0580, device='cuda:0', grad_fn=<MaxBackward1>)

In [151]:
def apply_along_axis(function, x, axis: int = 0):
	return torch.stack([
	    function(x_i) for x_i in torch.unbind(x, dim=axis)
	], dim=axis)

In [152]:
def normalize_attns(v):
	v_min, v_max = v.min(), v.max()
	new_min, new_max = 0, 1

	v_p = (v - v_min)/(v_max - v_min)*(new_max - new_min) + new_min
	return v_p

In [153]:
norm_attn = apply_along_axis(normalize_attns, attn[:,:,1:43,1:43], axis=3)
norm_attn

tensor([[[[0.8433, 0.7996, 0.6903,  ..., 0.0000, 0.0000, 0.0000],
          [0.9147, 0.7941, 0.7395,  ..., 0.0000, 0.0000, 0.0000],
          [0.8225, 0.8152, 0.7875,  ..., 0.0000, 0.0000, 0.0000],
          ...,
          [0.0000, 0.0000, 0.0000,  ..., 0.8203, 0.8014, 0.6958],
          [0.0000, 0.0000, 0.0000,  ..., 0.7358, 0.7985, 0.7268],
          [0.0000, 0.0000, 0.0000,  ..., 0.7355, 0.7370, 0.0000]],

         [[0.8763, 0.7472, 0.7537,  ..., 0.0000, 0.0000, 0.0000],
          [0.8521, 0.7621, 0.6763,  ..., 0.0000, 0.0000, 0.0000],
          [0.8576, 0.7906, 0.7095,  ..., 0.0000, 0.0000, 0.0000],
          ...,
          [0.0000, 0.0000, 0.0000,  ..., 0.6042, 0.0000, 0.7543],
          [0.0000, 0.0000, 0.0000,  ..., 0.7193, 0.7821, 0.0000],
          [0.0000, 0.0000, 0.0000,  ..., 0.6927, 0.7648, 0.8928]],

         [[0.0000, 0.9002, 0.9016,  ..., 0.0000, 0.0000, 0.0000],
          [0.8989, 0.9323, 0.9657,  ..., 0.0000, 0.0000, 0.0000],
          [0.8889, 0.9629, 0.0000,  ..., 0

In [176]:
norm_attn[:,:,1:43,1:43][0][0][16]

tensor([0.0000, 0.4888, 0.4951, 0.4955, 0.5952, 0.5493, 0.5543, 0.0000, 0.4508,
        0.5434, 0.5878, 0.5318, 0.0000, 0.5005, 0.5383, 0.4800, 0.5170, 0.4873,
        0.4610, 0.4654, 0.5885, 0.4129, 0.0000, 0.5670, 0.4796, 0.4786, 0.4126,
        0.4830, 0.5028, 0.5631, 0.4504, 0.4842, 0.0000, 0.0000, 0.5209, 0.5977,
        0.5479, 0.4939, 0.4752, 0.4705, 0.4898], device='cuda:0',
       grad_fn=<SelectBackward0>)

In [178]:
norm_attn[:,:,1:43,1:43][0][0][1][(norm_attn[:,:,1:43,1:43][0][0][1]>0.3)].min()

tensor(0.5995, device='cuda:0', grad_fn=<MinBackward1>)

In [179]:
norm_attn[:,:,1:43,1:43][0][0][1][(norm_attn[:,:,1:43,1:43][0][0][1]>0.3)].max()

tensor(0.9921, device='cuda:0', grad_fn=<MaxBackward1>)

In [148]:
torch.set_printoptions(sci_mode=False)

In [181]:
((norm_attn[:,:,1:43,1:43][0][0][1]-0.5995)/(0.9921-0.5995))*60

tensor([    32.9679,     28.7255,     29.9570,     25.9843,     24.8490,
            42.8263,     58.8003,     37.1217,     10.1437,     45.3317,
            60.0053,     25.3963,     23.9343,     23.5328,     16.2534,
            20.4763,     34.2004,     24.4644,     -0.0072,     10.1718,
            19.4641,      8.6621,     34.6081,     44.5964,     36.6098,
           -91.6200,     11.4393,    -91.6200,    -91.6200,    -91.6200,
           -91.6200,    -91.6200,    -91.6200,    -91.6200,    -91.6200,
           -91.6200,    -91.6200,    -91.6200,    -91.6200,    -91.6200,
           -91.6200], device='cuda:0', grad_fn=<MulBackward0>)

In [172]:
def trunc_attns(v):
	v_min, v_max = v[(v>0.4)].min(), v[(v>0.4)].max()
	new_min, new_max = 0, 1
	v_p = ((v - v_min)/(v_max - v_min)*(new_max - new_min) + new_min)*180
	v_p[v_p<=0] = 0
	return v_p

In [173]:
trunc_attn = apply_along_axis(trunc_attns, norm_attn, axis=3)
trunc_attn

tensor([[[[125.0555, 119.8253,  81.9364,  ...,   0.0000,   0.0000,   0.0000],
          [150.0990, 118.1594,  97.4993,  ...,   0.0000,   0.0000,   0.0000],
          [117.7518, 124.5119, 112.6926,  ...,   0.0000,   0.0000,   0.0000],
          ...,
          [  0.0000,   0.0000,   0.0000,  ..., 125.9845, 120.4146,  80.5000],
          [  0.0000,   0.0000,   0.0000,  ..., 100.5747, 119.5287,  90.6407],
          [  0.0000,   0.0000,   0.0000,  ..., 100.4802, 101.0929,   0.0000]],

         [[136.6346, 104.0937, 102.0086,  ...,   0.0000,   0.0000,   0.0000],
          [128.1483, 108.5745,  77.4901,  ...,   0.0000,   0.0000,   0.0000],
          [130.0492, 117.1320,  87.9887,  ...,   0.0000,   0.0000,   0.0000],
          ...,
          [  0.0000,   0.0000,   0.0000,  ...,  61.0299,   0.0000,  99.6398],
          [  0.0000,   0.0000,   0.0000,  ...,  95.6105, 114.6197,   0.0000],
          [  0.0000,   0.0000,   0.0000,  ...,  87.6192, 109.4289, 144.9254]],

         [[  0.0000, 150.0293,

In [177]:
trunc_attn[:,:,1:43,1:43][0][0][1]

tensor([124.5119, 112.6926, 111.8435, 110.2714, 108.6271, 140.7646, 175.0542,
        126.5410,  79.4143, 147.5887, 177.2875, 101.1655, 105.7277, 104.4473,
         80.1939,  99.0641, 125.2623, 107.7009,  57.4164,  79.7408,  97.8982,
         76.1715, 127.2320, 145.9639, 131.6871,   0.0000,  80.2176,   0.0000,
          0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,
          0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000],
       device='cuda:0', grad_fn=<SelectBackward0>)

In [175]:
from tucore_gcn_transformers.tucore_gcn_bert_view import model_view
model_view(tuple(trunc_attn.expand(1,-1,-1,-1,-1)), tokens[0][1:43])

<IPython.core.display.Javascript object>

In [9]:
bert_output.last_hidden_state[0]

tensor([[ 0.5279,  1.0138, -0.2643,  ..., -0.8992, -0.9599,  0.0524],
        [ 0.1786,  1.0213, -0.3096,  ...,  0.3506, -1.0663, -0.3449],
        [-0.6579, -0.3266,  0.0094,  ...,  0.1761, -1.2165, -0.1764],
        ...,
        [-0.9050, -0.1952, -0.2401,  ..., -0.9366, -1.1067,  0.2013],
        [-0.6985,  0.7810, -0.0891,  ..., -1.1606, -0.8233,  0.0920],
        [ 0.2789,  1.0261, -0.4485,  ..., -0.9486, -0.3965,  0.2999]],
       device='cuda:0', grad_fn=<SelectBackward0>)

In [10]:
state_dict = model.state_dict()
state_dict

OrderedDict([('tucoregcn_bert.bert.embeddings.word_embeddings.weight',
              tensor([[-0.0314, -0.0045,  0.0182,  ..., -0.0309,  0.0204, -0.0345],
                      [-0.0295, -0.0486,  0.0746,  ..., -0.0363,  0.0261, -0.0108],
                      [-0.0338, -0.0583, -0.0123,  ..., -0.0930,  0.0442,  0.0212],
                      ...,
                      [-0.0337, -0.0518, -0.0280,  ..., -0.0174,  0.0078, -0.0010],
                      [-0.0022, -0.0297, -0.0167,  ..., -0.0472, -0.0006,  0.0128],
                      [-0.0631, -0.0144, -0.0232,  ...,  0.0072, -0.0703, -0.0479]],
                     device='cuda:0')),
             ('tucoregcn_bert.bert.embeddings.position_embeddings.weight',
              tensor([[-0.0097, -0.0079,  0.0048,  ..., -0.0249,  0.0091,  0.0052],
                      [-0.0004,  0.0039,  0.0092,  ..., -0.0009,  0.0138,  0.0175],
                      [-0.0104, -0.0081,  0.0199,  ...,  0.0004,  0.0212,  0.0130],
                      ...,
   

In [None]:
print(f"Weights: {len(state_dict)}")
print("\n".join(state_dict.keys()))


In [None]:
print(f"Vocab: {state_dict['tucoregcn_bert.bert.embeddings.word_embeddings.weight'].shape[0]}")
print(f"hidden_size: {state_dict['tucoregcn_bert.bert.embeddings.word_embeddings.weight'].shape[1]}")
print(f"Word Embeddings: {state_dict['tucoregcn_bert.bert.embeddings.word_embeddings.weight'].shape}, {30522*768} floats, {30522*768*4/1000/1000} MB")
print(state_dict['tucoregcn_bert.bert.embeddings.word_embeddings.weight'])

In [None]:
print("Bert Encoder: Layer 0 Attention Weight")
print(f"Vocab: {state_dict['tucoregcn_bert.bert.encoder.layer.0.attention.self.query.weight'].shape[0]}")
print(f"hidden_size: {state_dict['tucoregcn_bert.bert.encoder.layer.0.attention.self.query.weight'].shape[1]}")
print(f"Word Embeddings: {state_dict['tucoregcn_bert.bert.encoder.layer.0.attention.self.query.weight'].shape}, {768*768} floats, {768*768*4/1000/1000} MB")
print(state_dict['tucoregcn_bert.bert.encoder.layer.0.attention.self.query.weight'])