In [204]:
from datasets import load_dataset
import torch
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import mean_squared_error
from datasets import load_dataset
from transformers import BertTokenizer, BertForQuestionAnswering
from transformers import BertTokenizerFast, TFBertModel
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import random

In [205]:
dataset = load_dataset("commonsense_qa")

Found cached dataset parquet (C:/Users/Mia/.cache/huggingface/datasets/parquet/commonsense_qa-4d2bfefd7b5e8ac5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/3 [00:00<?, ?it/s]

In [206]:
generics_kb = load_dataset("generics_kb", "generics_kb")

Found cached dataset generics_kb (C:/Users/Mia/.cache/huggingface/datasets/generics_kb/generics_kb/1.0.0/9b41cde494db24f842a9260588bcfb2e3a257364568666ef240e98c70fb0e709)


  0%|          | 0/1 [00:00<?, ?it/s]

In [208]:
train_ds = dataset['train'][:500]
test_ds = dataset['test']
val_ds = dataset['validation']

In [243]:
questions = train_ds['question']
choices = [choice['text'] for choice in train_ds['choices']]
answers = train_ds['answerKey']

In [210]:
kb_sentences = generics_kb['train']['generic_sentence']

In [211]:
# decrease the size of the knowledge base to speed up encoding
kb_sentences = random.sample(kb_sentences, 1000)

In [212]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

In [213]:
def compute_semantic_similarity(question_tokens, knowledge_base_tokens):
    similarities = [cosine_similarity(
        np.mean(tokenizer.encode_plus(question_tokens, return_tensors='pt')['input_ids'].detach().numpy(), axis=1),
        np.mean(tokenizer.encode_plus(kb_token, return_tensors='pt')['input_ids'].detach().numpy(), axis=1)
    )[0][0] for kb_token in knowledge_base_tokens]

    return similarities

In [214]:
len(kb_sentences)

1000

In [216]:
max_len = 27

In [217]:
knowledge_base_tokens = [tokenizer.encode(sentence, return_tensors='pt')[0] for sentence in kb_sentences]

In [218]:
knowledge_base_tokens_padded = torch.stack([
    torch.nn.functional.pad(token, (0, max_len - len(token)))
    for token in knowledge_base_tokens
])

In [None]:
def convert_answer_for_index(i, ds):
    answer_index = ord(ds['answerKey'][i]) - ord('A')
    return ds['choices'][i]['text'][answer_index] 

In [220]:
sentences_merged = []
# counter = 0
for i in range(len(questions)):
    question = questions[i]
    choices_i = choices[i]

    question_tokens = tokenizer.encode(question, return_tensors='pt')[0]
    question_tokens = torch.nn.functional.pad(question_tokens, (0, max_len - len(question_tokens)))

    # print("Question Tokens Shape:", question_tokens.shape)
    # print("Knowledge Base Tokens Shape:", knowledge_base_tokens_padded[0].shape)

    similarities = [cosine_similarity(question_tokens.detach().reshape(1, -1).numpy(), kb_token.reshape(1, -1).detach().numpy())[0][0] for kb_token in knowledge_base_tokens_padded]

    most_similar_index = np.argmax(similarities)
    selected_kb_sentence = kb_sentences[most_similar_index]

    input_text = f"{question} {selected_kb_sentence}"
    sentences_merged.append(input_text)
    # counter += 1
    # print(counter)

In [267]:
len(sentences_merged)

500

In [222]:
sequences = []
answers_converted = [convert_answer_for_index(i, train_ds) for i in range(0, len(answers))]
for sentence, answer in zip(sentences_merged, answers_converted):
        sequences.append(f'{sentence} - {answer}')

In [268]:
len(sequences)

500

In [281]:
tokenized_data = tokenizer(
    sequences,
    padding='max_length',  
    return_tensors='tf',
    truncation=True,
    max_length=100 
)

In [282]:
len(tokenized_data['input_ids'])

500

In [283]:
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [284]:
input_ids = Input(shape=(100,), name='input_token', dtype='int32')
att_masks = Input(shape=(100,), name='masked_token', dtype='int32')
bert_in = bert_model(input_ids, attention_mask=att_masks)[1]
answer_output = Dense(5, activation='relu', name='answer')(bert_in)

In [285]:
from tensorflow.keras.losses import SparseCategoricalCrossentropy

In [286]:
model = Model(inputs=[input_ids, att_masks], outputs=[answer_output])
model.compile(optimizer=Adam(learning_rate=0.01), loss=SparseCategoricalCrossentropy(), metrics=['accuracy'])

In [288]:
print(len(tokenized_data['input_ids']))
print(len(tokenized_data['attention_mask']))
print(len(answers))

500
500
500


In [312]:
letter_to_index = {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4}
numeric_values_answers = [letter_to_index[letter] for letter in answers]

In [315]:
model.fit([tokenized_data['input_ids'], tokenized_data['attention_mask']], np.array(numeric_values_answers), epochs=3, batch_size=32)

Epoch 1/3

In [202]:
pred_tokens = tokenizer(val_ds['question'], max_length=100, return_tensors='np', truncation=True, pad_to_max_length=True)



In [173]:
pred_tokens['attention_mask'].shape

(1221, 100)

In [174]:
pred = model.predict([pred_tokens['input_ids'], pred_tokens['attention_mask']])



In [176]:
y_pred = []
answerKeys = ['A', 'B', 'C', 'D', 'E']

for p in pred:
    y_pred.append(answerKeys[np.argmax(p)])

In [179]:
y_test = val_ds['answerKey']

In [180]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [181]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

ValueError: Target is multiclass but average='binary'. Please choose another average setting, one of [None, 'micro', 'macro', 'weighted'].

In [182]:
y_pred

['E',
 'E',
 'E',
 'E',
 'E',
 'E',
 'E',
 'E',
 'E',
 'E',
 'E',
 'E',
 'E',
 'E',
 'A',
 'E',
 'E',
 'E',
 'E',
 'E',
 'A',
 'E',
 'E',
 'E',
 'E',
 'E',
 'E',
 'E',
 'E',
 'E',
 'E',
 'E',
 'E',
 'E',
 'E',
 'E',
 'E',
 'E',
 'E',
 'E',
 'E',
 'E',
 'E',
 'E',
 'C',
 'E',
 'E',
 'E',
 'E',
 'E',
 'E',
 'E',
 'E',
 'E',
 'A',
 'E',
 'E',
 'E',
 'E',
 'E',
 'E',
 'E',
 'E',
 'E',
 'E',
 'E',
 'A',
 'E',
 'E',
 'E',
 'E',
 'A',
 'E',
 'E',
 'E',
 'E',
 'E',
 'E',
 'E',
 'A',
 'E',
 'A',
 'A',
 'E',
 'E',
 'E',
 'E',
 'E',
 'A',
 'E',
 'A',
 'E',
 'A',
 'E',
 'E',
 'E',
 'E',
 'E',
 'E',
 'E',
 'C',
 'E',
 'E',
 'E',
 'E',
 'A',
 'E',
 'A',
 'C',
 'E',
 'E',
 'E',
 'E',
 'E',
 'E',
 'E',
 'E',
 'A',
 'E',
 'E',
 'E',
 'E',
 'E',
 'E',
 'A',
 'E',
 'E',
 'E',
 'E',
 'E',
 'E',
 'E',
 'C',
 'E',
 'E',
 'E',
 'E',
 'E',
 'E',
 'E',
 'C',
 'E',
 'E',
 'E',
 'E',
 'E',
 'E',
 'E',
 'E',
 'E',
 'E',
 'E',
 'E',
 'E',
 'E',
 'E',
 'E',
 'E',
 'A',
 'E',
 'A',
 'E',
 'E',
 'A',
 'E',
 'E',
 'E'

In [183]:
y_test

['A',
 'A',
 'B',
 'A',
 'A',
 'C',
 'B',
 'D',
 'A',
 'C',
 'E',
 'D',
 'A',
 'D',
 'C',
 'D',
 'D',
 'E',
 'E',
 'D',
 'D',
 'C',
 'D',
 'D',
 'E',
 'D',
 'B',
 'A',
 'B',
 'B',
 'B',
 'A',
 'E',
 'D',
 'E',
 'C',
 'C',
 'B',
 'A',
 'B',
 'B',
 'B',
 'E',
 'E',
 'D',
 'E',
 'A',
 'E',
 'E',
 'C',
 'C',
 'B',
 'D',
 'C',
 'D',
 'E',
 'B',
 'D',
 'B',
 'A',
 'B',
 'E',
 'C',
 'D',
 'A',
 'A',
 'D',
 'A',
 'C',
 'D',
 'E',
 'E',
 'D',
 'A',
 'C',
 'B',
 'C',
 'E',
 'B',
 'C',
 'B',
 'D',
 'E',
 'A',
 'E',
 'A',
 'D',
 'B',
 'E',
 'C',
 'C',
 'C',
 'D',
 'E',
 'E',
 'E',
 'B',
 'D',
 'B',
 'E',
 'B',
 'D',
 'D',
 'E',
 'D',
 'D',
 'E',
 'A',
 'A',
 'E',
 'C',
 'C',
 'E',
 'D',
 'B',
 'B',
 'E',
 'C',
 'E',
 'C',
 'B',
 'C',
 'D',
 'D',
 'A',
 'B',
 'D',
 'B',
 'E',
 'D',
 'D',
 'C',
 'A',
 'D',
 'D',
 'B',
 'B',
 'E',
 'E',
 'A',
 'D',
 'E',
 'B',
 'A',
 'D',
 'E',
 'C',
 'C',
 'C',
 'E',
 'A',
 'B',
 'D',
 'D',
 'E',
 'D',
 'B',
 'E',
 'A',
 'E',
 'A',
 'D',
 'C',
 'B',
 'C',
 'A',
 'E'