In [257]:
from transformers import BertTokenizerFast, BertForSequenceClassification, BertConfig, BertModel, AutoModel, AutoTokenizer
from sentence_transformers import util
import torch
from torch import Tensor
import torch.nn.functional as F
import pandas as pd
import os
import numpy as np
from glob import glob
from nltk.tokenize import sent_tokenize
import json
from pprint import pprint
from tqdm.auto import tqdm

# Download sent_tokenize model

In [258]:
# import nltk
# nltk.download('punkt')

In [259]:
USED_MODEL = "sentence-transformers/all-mpnet-base-v2"

In [260]:
# sentences[0] is the sentence in the paragraph(Premise), sentences[1] is the question sentence(Hypothesis)
file_path = glob(os.path.join('data', '**', '*.json'), recursive=True)
file_path.sort()
def read_data(file):
    with open(file, 'r') as file:
        data = json.load(file)
    df = pd.DataFrame(data["reading_question"])
    df["year"] = data["Year"]
    return df
data = pd.concat(map(read_data, file_path), ignore_index=True)
display(data)

Unnamed: 0,ID,answer,choice,content,question,year
0,41,C,{'A': 'Rapid speech without mistakes is a reli...,There is a long-held belief that when meeting ...,Which of the following statements is true acco...,題目/100學測
1,42,C,{'A': 'Fixing your eyes on the person will mak...,There is a long-held belief that when meeting ...,What is true about fixing your eyes on a perso...,題目/100學測
2,43,A,"{'A': 'Facial expressions.', 'B': 'Physical co...",There is a long-held belief that when meeting ...,Which of the following is NOT discussed in the...,題目/100學測
3,44,D,{'A': 'People have an instinct for interpretin...,There is a long-held belief that when meeting ...,What is the main idea of the passage?,題目/100學測
4,45,A,{'A': 'Maasai people are a threat to elephants...,It is easy for us to tell our friends from our...,"According to the passage, which of the followi...",題目/100學測
...,...,...,...,...,...,...
475,47,B,{'A': 'Little palm civets eat only the outer l...,Coffee experts are willing to pay large sums o...,"Which of the following statements is true, acc...",題目/99指考
476,48,A,{'A': 'He was the son of a Nazi and a victimiz...,Gunter Grass was the winner of the 1999 Nobel ...,What caused Grass to feel confused and trouble...,題目/99指考
477,49,B,{'A': 'He victimized the Poles and criticized ...,Gunter Grass was the winner of the 1999 Nobel ...,Why has Grass been praised as “the conscience ...,題目/99指考
478,50,C,{'A': 'He was inspired by a fine arts master i...,Gunter Grass was the winner of the 1999 Nobel ...,Why was Grass’s trip to important to him?,題目/99指考


In [261]:
tokenizer = AutoTokenizer.from_pretrained(USED_MODEL)


In [262]:
def tokenize_question(data):
    A, B, C, D = data["choice"].values()
    # A_tokenized = tokenizer.batch_encode_plus(list(map(lambda x: data["question"]+A+"[SEP]"+x, data["content_sent_token"])), padding="max_length",truncation=True, return_tensors="pt")
    # B_tokenized = tokenizer.batch_encode_plus(list(map(lambda x: data["question"]+B+"[SEP]"+x, data["content_sent_token"])), padding="max_length",truncation=True, return_tensors="pt")
    # C_tokenized = tokenizer.batch_encode_plus(list(map(lambda x: data["question"]+C+"[SEP]"+x, data["content_sent_token"])), padding="max_length",truncation=True, return_tensors="pt")
    # D_tokenized = tokenizer.batch_encode_plus(list(map(lambda x: data["question"]+D+"[SEP]"+x, data["content_sent_token"])), padding="max_length",truncation=True, return_tensors="pt")
    A_tokenized = tokenizer(data["question"]+A, padding="max_length", max_length=128, truncation=True, return_tensors="pt")
    B_tokenized = tokenizer(data["question"]+B, padding="max_length", max_length=128, truncation=True, return_tensors="pt")
    C_tokenized = tokenizer(data["question"]+C, padding="max_length", max_length=128, truncation=True, return_tensors="pt")
    D_tokenized = tokenizer(data["question"]+D, padding="max_length", max_length=128, truncation=True, return_tensors="pt")
    return (A_tokenized, B_tokenized, C_tokenized, D_tokenized)
def tokenize_content(data):
    content_token = list(map(lambda x: tokenizer(x, padding="max_length", max_length=128, truncation=True, return_tensors="pt"), data["content_sent_token"])) 
    return content_token

In [263]:
data["content_sent_token"] = data["content"].apply(lambda x: sent_tokenize(x))
data["content_token"] = data.apply(lambda x: tokenize_content(x), axis=1)
data["statement_token"] = data.apply(lambda x: tokenize_question(x), axis=1)

In [264]:
# display(data["content_token"][0][0])
# display(data["statement_token"][0][0])
# print(tokenizer.decode(data["input"][0][0]["input_ids"][1]))

In [265]:
model = AutoModel.from_pretrained(USED_MODEL)

In [273]:
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
def bert_forward(input, pbar):
    model_output = model(**input)
    embedding = mean_pooling(model_output, input["attention_mask"])
    embedding = F.normalize(embedding, p=2, dim=1)[0]
    pbar.update(1)
    return embedding
        

In [267]:
model.to(device="cuda")
def move_to_cuda(data):
    return list(map(lambda x: x.to(device="cuda"), data))
def move_to_cpu(data):
    return list(map(lambda x: x.to(device="cpu"), data))
data["content_token"] = data["content_token"].apply(lambda x: move_to_cuda(x))
data["statement_token"] = data["statement_token"].apply(lambda x: move_to_cuda(x))

In [268]:
with torch.no_grad():
    model_output1 = model(**data["content_token"][0][0])
    model_output2 = model(**data["statement_token"][0][0])
sentences1_embedding = mean_pooling(model_output1, data["content_token"][0][0]["attention_mask"])
sentences2_embedding = mean_pooling(model_output2, data["statement_token"][0][0]["attention_mask"])
sentences1_embedding = F.normalize(sentences1_embedding, p=2, dim=1)[0]
sentences2_embedding = F.normalize(sentences2_embedding, p=2, dim=1)[0]
print(util.dot_score(sentences1_embedding, sentences2_embedding))

tensor([[0.1014]], device='cuda:0')


In [280]:
with tqdm(total=sum(data["content_embedding"].apply(len))+sum(data["statement_embedding"].apply(len))) as pbar: 
    with torch.no_grad():
        data["content_embedding"] = data["content_token"].apply(lambda x: list(map(lambda y: bert_forward(y, pbar), x)))
        data["statement_embedding"] = data["statement_token"].apply(lambda x: list(map(lambda y: bert_forward(y, pbar), x)))

100%|██████████| 8428/8428 [00:26<00:00, 313.10it/s]


In [270]:
# data["content_embedding"].apply(lambda x: print(len(x)))

In [219]:
# list to tensor
data["content_embedding"] = data["content_embedding"].apply(lambda x: torch.stack(x))
data["statement_embedding"] = data["statement_embedding"].apply(lambda x: torch.stack(x))

RuntimeError: stack expects a non-empty TensorList

In [None]:
def predict(data):
    statement = data["statement_embedding"]
    content = data["content_embedding"]
    score = torch.mm(s, c.T).sum(dim=1)
    return score
data["predict_score"] = data.apply(lambda x: predict(x), axis=1)

In [None]:
question = 206

In [None]:
print(data["question"][question])
pprint(data["choice"][question])
print(data["content_sent_token"][question][0])
print(data["answer"][question])

Which of the following is true regarding Japan’s national anthem?
{'A': 'It was not written until the 20th century.',
 'B': 'The lyrics was written by a Japanese officer.',
 'C': 'The melody was first composed by a British musician.',
 'D': 'The current version is barely influenced by western music.'}
During the past three hundred years, when a country gains its freedom or independence, one of the first things established is a national anthem.
C


In [None]:
statement1 = data["statement_embedding"][question][0]
statement2 = data["statement_embedding"][question][1]
statement3 = data["statement_embedding"][question][2]
statement4 = data["statement_embedding"][question][3]
content = data["content_embedding"][question]
socre1 = 0
socre2 = 0
socre3 = 0
socre4 = 0
for i in range(len(content)):
    socre1 += util.dot_score(statement1[0], content[i])
    socre2 += util.dot_score(statement2[0], content[i])
    socre3 += util.dot_score(statement3[0], content[i])
    socre4 += util.dot_score(statement4[0], content[i])
print(f"A: {socre1}, B: {socre2}, C: {socre3}, D: {socre4}")


A: tensor([[5.3357]]), B: tensor([[5.3111]]), C: tensor([[5.8168]]), D: tensor([[6.2368]])


In [None]:
# use all-MiniLM-L6-v2 to compare the sentecnse by NLI and create label for the sentences
model = AutoModelForSequenceClassification.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

inputs = tokenizer(sentences[0], sentences[1], return_tensors="pt")
outputs = model(**inputs)
logits = outputs.logits
# model predicts the probability of the label
probs = torch.softmax(logits, dim=1)
# get the index of the label with the highest probability
pred = torch.argmax(probs, dim=1)
print(pred)

# get the probability of the label
print(probs)



NameError: name 'AutoModelForSequenceClassification' is not defined