# Downloads and Imports

In [None]:
!pip install datasets



In [None]:
from datasets import load_dataset

In [None]:
dataset = load_dataset('commonsense_qa')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
from transformers import BertTokenizer, BertModel

# Use the same checkpoint as the BertForMultipleChoice model that is fine-tuned
# further down ("bert-base-uncased"), so the tokenizer and the trained model come
# from one checkpoint and the much larger bert-large weights are not downloaded
# without ever being used.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# QA without memory

In [None]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'question_concept', 'choices', 'answerKey'],
        num_rows: 9741
    })
    validation: Dataset({
        features: ['id', 'question', 'question_concept', 'choices', 'answerKey'],
        num_rows: 1221
    })
    test: Dataset({
        features: ['id', 'question', 'question_concept', 'choices', 'answerKey'],
        num_rows: 1140
    })
})

In [None]:
dataset['train']

Dataset({
    features: ['id', 'question', 'question_concept', 'choices', 'answerKey'],
    num_rows: 9741
})

In [None]:
# Pull the question text, the answer-option texts and the gold answer letters
# out of the training split.
train_split = dataset['train']
questions = train_split['question']
choices = [entry['text'] for entry in train_split['choices']]
answers = train_split['answerKey']

In [None]:
questions[0]

'The sanctions against the school were a punishing blow, and they seemed to what the efforts the school had made to change?'

In [None]:
choices[0]

['ignore', 'enforce', 'authoritarian', 'yell at', 'avoid']

In [None]:
answers[0]

'A'

Target input format for each (question, choice) pair: “[CLS] If ... ? [SEP] bedroom [SEP]”

In [None]:
# One "<question> [SEP] <option>" string per answer option, flattened so that the
# options of a question are consecutive entries.
sequences = [
    f'{question} [SEP] {option}'
    for question, options in zip(questions, choices)
    for option in options
]

In [None]:
sequences[0]

'The sanctions against the school were a punishing blow, and they seemed to what the efforts the school had made to change? [SEP] ignore'

In [None]:
# Longest and shortest sequence, measured in characters (len of each string),
# not in tokens.
sequence_lengths = list(map(len, sequences))
max_length = max(sequence_lengths)
min_length = min(sequence_lengths)

In [None]:
max_length

395

In [None]:
min_length

26

In [None]:
# Mean sequence length in characters, truncated to an integer for display.
total_length = sum(len(seq) for seq in sequences)
count = len(sequences)

average_length = total_length / count
avg = int(average_length)

In [None]:
avg

85

In [None]:
for i in range(5):
  print(sequences[i])

The sanctions against the school were a punishing blow, and they seemed to what the efforts the school had made to change? [SEP] ignore
The sanctions against the school were a punishing blow, and they seemed to what the efforts the school had made to change? [SEP] enforce
The sanctions against the school were a punishing blow, and they seemed to what the efforts the school had made to change? [SEP] authoritarian
The sanctions against the school were a punishing blow, and they seemed to what the efforts the school had made to change? [SEP] yell at
The sanctions against the school were a punishing blow, and they seemed to what the efforts the school had made to change? [SEP] avoid


In [None]:
# Tokenize every "<question> [SEP] <option>" string; anything longer than
# 100 tokens is truncated. No padding is applied at this stage.
tokens = tokenizer(
    sequences,
    max_length=100,
    truncation=True,
)

In [None]:
for k, v in tokens.items():
  print(k)

input_ids
token_type_ids
attention_mask


In [None]:
len(tokens["input_ids"])

48705

In [None]:
# Regroup the flat per-option encodings into chunks of five, so that entry i of
# every field holds the five answer options of question i.
CHOICES_PER_QUESTION = 5

# The cell rebinds `tokens` from itself, so running it twice would regroup the
# already-grouped data. Detect that case: once grouped, the first element of an
# entry is a list (one option's ids) instead of a single int token id.
_input_ids = tokens["input_ids"]
_already_grouped = (
    bool(_input_ids)
    and bool(_input_ids[0])
    and isinstance(_input_ids[0][0], list)
)

if not _already_grouped:
    tokens = {
        key: [
            values[start:start + CHOICES_PER_QUESTION]
            for start in range(0, len(values), CHOICES_PER_QUESTION)
        ]
        for key, values in tokens.items()
    }

In [None]:
len(tokens['input_ids'])

9741

In [None]:
len(tokens["input_ids"]) * 5 == 48705

True

In [None]:
len(answers)

9741

In [None]:
answers[0:15]

['A', 'B', 'A', 'D', 'C', 'D', 'E', 'B', 'E', 'D', 'B', 'C', 'C', 'A', 'C']

In [None]:
# Answer keys in option order; a letter's position is its numeric class label.
answer_letters = list('ABCDE')

In [None]:
# Convert each answer letter to its 0-based position in `answer_letters`.
answers_n = list(map(answer_letters.index, answers))

In [None]:
answers_n[0:15]

[0, 1, 0, 3, 2, 3, 4, 1, 4, 3, 1, 2, 2, 0, 2]

Special token ids in the BERT uncased vocabulary:

- [CLS] - 101
- [SEP] - 102
- [PAD] - 0

In [None]:
tokens["attention_mask"][0][1]

[1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1]

In [None]:
tokens["input_ids"][0][1]

[101,
 1996,
 17147,
 2114,
 1996,
 2082,
 2020,
 1037,
 16385,
 2075,
 6271,
 1010,
 1998,
 2027,
 2790,
 2000,
 2054,
 1996,
 4073,
 1996,
 2082,
 2018,
 2081,
 2000,
 2689,
 1029,
 102,
 16306,
 102]

# DataLoader

In [None]:
from torch.utils.data import Dataset, DataLoader

In [None]:
import torch


In [None]:
class MyDataset(Dataset):
    """Multiple-choice dataset yielding fixed-length tensors per question.

    Args:
        tokens: mapping with at least the keys "input_ids" and "attention_mask".
            Entry ``idx`` of each is a list holding one list of ints per answer
            option of question ``idx``.
        answers: sequence of integer labels, one per question.
        max_len: length every option sequence is padded or truncated to.
    """

    def __init__(self, tokens, answers, max_len):
        self.tokens = tokens
        self.answers = answers
        self.max_len = max_len

    def __len__(self):
        """Number of questions (one label per question)."""
        return len(self.answers)

    def _fit_to_max_len(self, rows, pad_value=0):
        """Return copies of ``rows`` right-padded with ``pad_value`` or cut to ``max_len``.

        New lists are built so the lists stored in ``self.tokens`` are never
        modified; the previous implementation overwrote them in place on first
        access, silently changing the shared token data.
        """
        fitted = []
        for row in rows:
            clipped = list(row[:self.max_len])
            clipped.extend([pad_value] * (self.max_len - len(clipped)))
            fitted.append(clipped)
        return fitted

    def __getitem__(self, idx):
        """Build the tensors for question ``idx``.

        Returns:
            dict with
              'input_ids'       - long tensor, (number of options, max_len)
              'attention_mask'  - long tensor, (number of options, max_len);
                                  padded positions are 0
              'numerical_label' - scalar long tensor holding the label
        """
        option_ids = self._fit_to_max_len(self.tokens["input_ids"][idx])
        option_mask = self._fit_to_max_len(self.tokens["attention_mask"][idx])

        return {
            'input_ids': torch.tensor(option_ids, dtype=torch.long),
            'attention_mask': torch.tensor(option_mask, dtype=torch.long),
            'numerical_label': torch.tensor(self.answers[idx], dtype=torch.long),
        }

In [None]:
# Pad to the longest *tokenized* option. `max_length` computed earlier counts
# characters (395), while the tokenizer already caps sequences at 100 tokens, so
# using it here would pad every sequence far beyond what is needed.
max_token_length = max(
    (len(ids) for option_ids in tokens["input_ids"] for ids in option_ids),
    default=0,
)
# NOTE: this rebinds `dataset`, replacing the Hugging Face DatasetDict loaded above.
dataset = MyDataset(tokens, answers_n, max_token_length)

In [None]:
# Shuffle the training examples each epoch; a fixed-seed generator keeps the
# batch order reproducible across kernel restarts.
data_loader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
    generator=torch.Generator().manual_seed(42),
)

# Creating the model

In [None]:
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertModel
import torch.nn.functional as F


pooler_output (torch.FloatTensor of shape (batch_size, hidden_size)) — Last layer hidden-state of the first token of the sequence (classification token) after further processing through the layers used for the auxiliary pretraining task. E.g. for BERT-family of models, this returns the classification token after processing through a linear layer and a tanh activation function. The linear layer weights are trained from the next sentence prediction (classification) objective during pretraining.

Create a DataLoader for building the tensors. Also, convert the answers to numerical values.

In [None]:
from transformers import BertTokenizer, BertForMultipleChoice

# Multiple-choice head on top of BERT base; this rebinds `model`.
model = BertForMultipleChoice.from_pretrained(
    pretrained_model_name_or_path="bert-base-uncased",
)

Some weights of BertForMultipleChoice were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
import torch.optim as optim

# Number of full passes over `data_loader`.
NUM_EPOCHS = 1

criterion = nn.CrossEntropyLoss()
# Fine-tuning BERT needs a small learning rate; 0.01 makes training diverge.
optimizer = optim.Adam(model.parameters(), lr=2e-5)

model.train()
for epoch in range(NUM_EPOCHS):
    for index, batch in enumerate(data_loader):
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        numerical_label = batch['numerical_label']

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        # BertForMultipleChoice returns one score per answer option in `.logits`;
        # its output object has no `pooler_output` attribute.
        logits = outputs.logits

        loss = criterion(logits, numerical_label)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        print(f"Epoch {epoch + 1}, Loss: {loss.item()}")


torch.Size([32, 5, 395])
torch.Size([32, 5, 395])
tensor([0, 1, 0, 3, 2, 3, 4, 1, 4, 3, 1, 2, 2, 0, 2, 3, 4, 3, 3, 1, 4, 1, 3, 4,
        2, 3, 3, 3, 4, 0, 2, 1])


# Get similar sentences

In [None]:
# `generics_kb` is defined by a loading script, so the `datasets` library asks
# for explicit consent to execute it (see the warning this cell printed).
kb = load_dataset('generics_kb', trust_remote_code=True)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/8.64k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/11.9k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/27.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1020868 [00:00<?, ? examples/s]

In [None]:
for key in kb:
  print(key)

train


In [None]:
type(kb['train'])

datasets.arrow_dataset.Dataset

In [None]:
kb['train'][0]

{'source': 'Waterloo',
 'term': 'aa battery',
 'quantifier_frequency': '',
 'quantifier_number': '',
 'generic_sentence': 'AA batteries maintain the settings if the power ever goes off.',
 'score': 0.35092294216156006}

In [None]:
# All generic sentences of the knowledge base (its only split is "train").
kb_train = kb['train']
sentences = kb_train['generic_sentence']

In [None]:
len(sentences)

1020868

In [None]:
len(questions)

9741

In [None]:
!pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentencepiece (from sentence-transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone
  Created wheel for sentence-transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125923 sha256=8dde954e42dc1e21ef2a253594168524b0cd49403c71086a0a854b6f63a20906
  Stored in directory: /root/.cache/pip/wheels/62/f2/10/1e606fd5f02395388f74e7462910fe851042f97238cbbd902f
Successfully built sentence-tr

In [None]:
from sentence_transformers import SentenceTransformer

# Sentence encoder used for retrieval. NOTE: this rebinds `model`, replacing the
# BertForMultipleChoice instance created earlier.
model = SentenceTransformer(
    model_name_or_path='sentence-transformers/all-MiniLM-L6-v2',
)

.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [None]:
from tqdm import tqdm

# Encode the knowledge base in batches. Calling `encode` once per sentence ran
# at ~35 sentences/s (about 8 hours for the full set); passing the whole list
# lets the model process many sentences per forward pass. The result is a
# 2-D NumPy array with one embedding row per sentence, in `sentences` order.
embs = model.encode(
    list(sentences),
    batch_size=256,
    show_progress_bar=True,
    convert_to_numpy=True,
)


  0%|          | 2075/1020868 [00:58<7:59:38, 35.40it/s]


KeyboardInterrupt: 

In [None]:
from sklearn.metrics.pairwise import cosine_similarity


In [None]:
# Accumulators for the knowledge-augmented questions:
# questions1 -> question + best sentence, questions2 -> question + top three.
questions1, questions2 = [], []

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Build the knowledge-base matrix once. It does not change between questions,
# and converting it inside the loop repeated that costly conversion for every
# single question.
embs_array = np.asarray(embs)
if embs_array.ndim != 2 or embs_array.shape[0] < 3:
    raise ValueError(
        "Need at least 3 knowledge-base embeddings to retrieve the top-3 sentences."
    )

for question in tqdm(questions):
    embedding = model.encode(question).reshape(1, -1)

    similarities = cosine_similarity(embedding, embs_array)
    # Indices of the three most similar knowledge-base sentences, best first.
    top_indices = np.argsort(similarities[0])[::-1][:3]

    k1 = sentences[top_indices[0]]
    k2 = sentences[top_indices[1]]
    k3 = sentences[top_indices[2]]

    # Variant 1: question plus the single best sentence.
    questions1.append(f"{question} | {k1}")
    # Variant 2: question plus the three best sentences.
    questions2.append(f"{question} | {k1} {k2} {k3}")


  1%|▏         | 141/9741 [00:14<16:07,  9.92it/s]


KeyboardInterrupt: 

In [None]:
questions1[0]

'The sanctions against the school were a punishing blow, and they seemed to what the efforts the school had made to change? | Abolitionists agitate against Southern slavery and Northern racism.'

In [None]:
questions2[0]

'The sanctions against the school were a punishing blow, and they seemed to what the efforts the school had made to change? | Abolitionists agitate against Southern slavery and Northern racism. Some Aboriginal parents are even afraid to send their kids to school. Aborigines face arbitrary arrest, systemic discrimination, and mistreatment by police.'