In [17]:
from datasets import load_dataset
import torch
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import mean_squared_error
from datasets import load_dataset
from transformers import BertTokenizer, BertForQuestionAnswering
from transformers import BertTokenizerFast, TFBertModel
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import random

In [2]:
dataset = load_dataset("commonsense_qa")

Downloading and preparing dataset None/None to C:/Users/Mia/.cache/huggingface/datasets/parquet/commonsense_qa-4d2bfefd7b5e8ac5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/1.25M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/160k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/151k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/9741 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1221 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1140 [00:00<?, ? examples/s]

Dataset parquet downloaded and prepared to C:/Users/Mia/.cache/huggingface/datasets/parquet/commonsense_qa-4d2bfefd7b5e8ac5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
generics_kb = load_dataset("generics_kb", "generics_kb")

Found cached dataset generics_kb (C:/Users/Mia/.cache/huggingface/datasets/generics_kb/generics_kb/1.0.0/9b41cde494db24f842a9260588bcfb2e3a257364568666ef240e98c70fb0e709)


  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
train_ds = dataset['train'][:500]
test_ds = dataset['test']
val_ds = dataset['validation']

In [None]:
questions = train_ds['question']
choices = [choice['text'] for choice in train_ds['choices']]
answers = train_ds['answerKey']

In [16]:
kb_sentences = generics_kb['train']['generic_sentence']

In [31]:
import random
# decrease the size of the knowledge base to speed up encoding
kb_sentences = random.sample(kb_sentences, 1000)

In [9]:
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [10]:
input_ids = Input(shape=(100,), name='input_token', dtype='int32')
att_masks = Input(shape=(100,), name='masked_token', dtype='int32')
bert_in = bert_model(input_ids, attention_mask=att_masks)[1]
answer_output = Dense(5, activation='relu', name='answer')(bert_in)

In [11]:
model = Model(inputs=[input_ids, att_masks], outputs=[answer_output])
model.compile(optimizer=Adam(learning_rate=0.01), loss=mean_squared_error, metrics=['accuracy'])

In [12]:
# bert_model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')

In [13]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [14]:
def compute_semantic_similarity(question_tokens, knowledge_base_tokens):
    similarities = [cosine_similarity(
        np.mean(tokenizer.encode_plus(question_tokens, return_tensors='pt')['input_ids'].detach().numpy(), axis=1),
        np.mean(tokenizer.encode_plus(kb_token, return_tensors='pt')['input_ids'].detach().numpy(), axis=1)
    )[0][0] for kb_token in knowledge_base_tokens]

    return similarities

In [15]:
len(kb_sentences)

1000

In [180]:
# Question Tokens Shape: torch.Size([27])
# Knowledge Base Tokens Shape: torch.Size([11])

In [34]:
max_len = 27

In [32]:
knowledge_base_tokens = [tokenizer.encode(sentence, return_tensors='pt')[0] for sentence in kb_sentences]

In [35]:
knowledge_base_tokens_padded = torch.stack([
    torch.nn.functional.pad(token, (0, max_len - len(token)))
    for token in knowledge_base_tokens
])

In [36]:
def convert_answer_for_index(i):
    answer = answers[i]
    converted_answer = choices[i][0]
    if answer == 'A':
        converted_answer = choices[i][0]
    elif answer == 'B':
        converted_answer = choices[i][1]
    elif answer == 'C':
        converted_answer = choices[i][2]
    elif answer == 'D':
        converted_answer = choices[i][3]
    elif answer == 'E':
        converted_answer = choices[i][3]
    return converted_answer    

In [185]:
print(convert_answer_for_index(4))

natural habitat


In [39]:
sentences_merged = []
counter = 0
for i in range(len(questions)):
    question = questions[i]
    choices_i = choices[i]

    question_tokens = tokenizer.encode(question, return_tensors='pt')[0]
    question_tokens = torch.nn.functional.pad(question_tokens, (0, max_len - len(question_tokens)))

    # print("Question Tokens Shape:", question_tokens.shape)
    # print("Knowledge Base Tokens Shape:", knowledge_base_tokens_padded[0].shape)

    similarities = [cosine_similarity(question_tokens.detach().reshape(1, -1).numpy(), kb_token.reshape(1, -1).detach().numpy())[0][0] for kb_token in knowledge_base_tokens_padded]

    most_similar_index = np.argmax(similarities)
    selected_kb_sentence = kb_sentences[most_similar_index]

    input_text = f"{question} {selected_kb_sentence}"
    sentences_merged.append(input_text)
    counter += 1
    print(counter)
    # input_tokens = tokenizer.encode(input_text, return_tensors='pt')

    # input_ids = input_tokens
    # attention_mask = torch.ones_like(input_tokens)  

    # with torch.no_grad():
    #     outputs = bert_model(input_ids, attention_mask=attention_mask)
    #     start_scores = outputs.start_logits
    #     end_scores = outputs.end_logits

    # start_index = torch.argmax(start_scores)
    # end_index = torch.argmax(end_scores)

    # predicted_answer = tokenizer.decode(input_ids[0][start_index:end_index+1])

    # pred.append(predicted_answer)
    # hyp.append(convert_answer_for_index(i))

    # # print("Question:", question)
    # # print("Answer:", predicted_answer)
    # # print("Expected answer: ", convert_answer_for_index(i))

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
