# BERT Setup and Knowledge Retrieval

## 1. Imports

In [None]:
# Imports
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch as pt

Loading the BERT question-answering model (`bert-base-cased-squad2`) and its tokenizer from a local pretrained HuggingFace checkpoint

In [None]:
# Checkpoint path used for both the tokenizer and the model below
file_path = "../../models/bert-base-cased-squad2"
tokenizer = AutoTokenizer.from_pretrained(file_path) 
# Model with a question-answering head, loaded from the same checkpoint path
model = AutoModelForQuestionAnswering.from_pretrained(file_path)

Preparing the question and the context, then running the model on them

In [None]:
question = "Which city is Pakistan's most cosmopolitan city?"
text = "Karachi is Pakistan's most cosmopolitan city, linguistically, ethnically, and religiously diverse, as well as one of Pakistan's most secular and socially liberal cities."
# Alternative, longer context kept for experimentation:
# text = '''The 1973 oil crisis began in October 1973 when the members of the Organization of Arab Petroleum Exporting Countries (OAPEC, consisting of the Arab members of OPEC plus Egypt and Syria) proclaimed an oil embargo. By the end of the embargo in March 1974, the price of oil had risen from US$3 per barrel to nearly $12 globally; US prices were significantly higher. The embargo caused an oil crisis, or "shock", with many short- and long-term effects on global politics and the global economy. It was later called the "first oil shock", followed by the 1979 oil crisis, termed the "second oil shock."'''
# Encode the question and the context together as one (question, context) pair
inputs = tokenizer.encode_plus(question, text, return_tensors='pt')

# Inference only: disable autograd so no graph is built for the forward pass.
# return_dict=False makes the model return a plain tuple that can be unpacked.
with pt.no_grad():
    start_scores, end_scores = model(**inputs, return_dict=False)

In [None]:
# Highest-scoring start position, and one past the highest-scoring end
# position so it can serve as an exclusive slice bound
token_start = pt.argmax(start_scores)
token_end = pt.argmax(end_scores) + 1

# Slice the answer ids out of the encoded pair and turn them back into text
answer_ids = inputs["input_ids"][0][token_start:token_end]
answer_tokens = tokenizer.convert_ids_to_tokens(answer_ids)
ans = tokenizer.convert_tokens_to_string(answer_tokens)

In [None]:
# Show the extracted answer span
print(ans)

<h3>Sentence similarity using BERT</h3>

In [None]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, BertModel
import torch
import numpy as np
from scipy import spatial
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
class BERTBased:
    """BERT helper for extractive question answering and sentence similarity.

    The tokenizer and the model are loaded from a HuggingFace checkpoint.
    With ``application='QA'`` a question-answering model is loaded (required
    by :meth:`getAnswers`); any other value loads the plain ``BertModel``.
    The similarity methods only rely on ``output_hidden_states=True``.
    """

    def __init__(self, application='QA', model_path="../../models/bert-base-cased-squad2"):
        """
        Load the tokenizer and the model.
        :param application: 'QA' loads ``AutoModelForQuestionAnswering``; anything else loads ``BertModel``
        :param model_path: Checkpoint location handed to ``from_pretrained`` for both tokenizer and model
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        if application == 'QA':
            self.model = AutoModelForQuestionAnswering.from_pretrained(model_path)
        else:
            self.model = BertModel.from_pretrained(model_path)
        self.TAG = 'BERTBased'

    def getAnswers(self, message, knowledge_source):
        """
        Extract the answer to a question from a knowledge text.
        :param message: The question
        :param knowledge_source: Text in which the answer is searched
        :return: Tuple ``(response, span_start_index, span_end_index)``; ``response`` is the empty string when
                 the predicted span decodes to "[CLS]"
        """
        print(f"Input: {message}\nKnowledge Source: {knowledge_source[0:30]}")
        inputs = self.tokenizer.encode_plus(message, knowledge_source, return_tensors='pt')

        # Inference only: no autograd graph is needed
        with torch.no_grad():
            start_scores, end_scores = self.model(**inputs, return_dict=False)

        input_ids = inputs["input_ids"][0]
        token_start = torch.argmax(start_scores)
        token_end = torch.argmax(end_scores) + 1  # +1 because the slice end is exclusive
        response = self.tokenizer.convert_tokens_to_string(
            self.tokenizer.convert_ids_to_tokens(input_ids[token_start:token_end]))

        # Text of everything that precedes the answer (question + leading part of the knowledge source)
        answer_prefix = self.tokenizer.convert_tokens_to_string(
            self.tokenizer.convert_ids_to_tokens(input_ids[:token_start], skip_special_tokens=True)
        )
        # NOTE(review): offsets are derived from detokenized text lengths, so they are only an estimate of
        # the character position inside knowledge_source -- confirm before relying on exact indices
        span_start_index = len(answer_prefix) - len(message)
        span_end_index = span_start_index + len(response)

        print(f"[{self.TAG}] Response: {response}\nanswer prefix: {answer_prefix}\n\n, start index: {span_start_index},"
              f" end index: {span_end_index}")
        return response if response != "[CLS]" else "", span_start_index, span_end_index

    @staticmethod
    def _mean_pool(token_embeddings, attention_mask):
        """Average token embeddings over the sequence axis, counting only positions where the mask is 1."""
        mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        summed = torch.sum(token_embeddings * mask, 1)
        # Clamp so a fully masked row cannot cause a division by zero
        counts = torch.clamp(mask.sum(1), min=1e-9)
        return summed / counts

    @staticmethod
    def _select_best(candidates, scores):
        """Print every candidate with its score and return the highest-scoring one (None if there is none)."""
        best_score = float('-inf')
        best_candidate = None
        for candidate, score in zip(candidates, scores):
            print(f"sentence: {candidate}\tscore: {score}")
            if score > best_score:
                best_score = score
                best_candidate = candidate
        return best_candidate

    def vectorize(self, sentences, layer=0, pooled=False):
        """
        Encode a batch of sentences with the model and return hidden states as a numpy array.
        :param sentences: Non-empty list of sentences
        :param layer: Index into the model's ``hidden_states`` tuple (default 0, the first entry)
        :param pooled: If False (default) return the per-token states of shape (batch, tokens, hidden);
                       if True return one mask-aware mean-pooled vector per sentence, shape (batch, hidden)
        :return: numpy array of hidden states
        :raises ValueError: If ``sentences`` is empty
        """
        tokenized = [self.tokenizer.encode(sent, add_special_tokens=True) for sent in sentences]
        if not tokenized:
            raise ValueError("sentences must contain at least one sentence")

        pad_id = self.tokenizer.pad_token_id
        if pad_id is None:
            pad_id = 0

        # Right-pad every sentence to the longest one and remember which positions are real tokens
        max_len = max(len(ids) for ids in tokenized)
        input_ids = torch.tensor(
            [ids + [pad_id] * (max_len - len(ids)) for ids in tokenized], dtype=torch.long)
        attention_mask = torch.tensor(
            [[1] * len(ids) + [0] * (max_len - len(ids)) for ids in tokenized], dtype=torch.long)

        with torch.no_grad():
            outputs = self.model(input_ids, attention_mask=attention_mask, output_hidden_states=True)

        hidden = outputs.hidden_states[layer]
        if pooled:
            hidden = self._mean_pool(hidden, attention_mask)

        vectors = hidden.detach().cpu().numpy()
        print(f"Vectors length: {vectors.shape}")
        return vectors

    def get_most_similar_sentence(self, sentence, candidates):
        """
        Given the sentence to be matched, find the candidate sentence that is most similar using BERT's hidden states
        :param sentence: Sentence to be matched
        :param candidates: Candidates from which the one that is most similar to sentence must be found
        :return: The candidate sentence that is most similar to the sentence parameter, or None without candidates
        """
        candidates = list(candidates)
        if not candidates:
            return None

        # Vectorize sentence to be matched, and all candidate sentences in a batch. One pooled vector per
        # sentence is required because the cosine distance below only accepts 1-D inputs.
        vectorized_sentences = self.vectorize([sentence] + candidates, layer=-1, pooled=True)
        sent_vector = vectorized_sentences[0]
        candidate_vectors = vectorized_sentences[1:]

        # scipy returns a cosine *distance*; convert it to a similarity so that larger means closer
        scores = [1.0 - spatial.distance.cosine(sent_vector, candidate_vector)
                  for candidate_vector in candidate_vectors]
        return self._select_best(candidates, scores)

    def test_forward(self, sentence, candidates):
        """
        Find the most similar candidate using mean-pooled hidden states from the last layer
        :param sentence: Sentence to be matched
        :param candidates: Candidates from which the one that is most similar to sentence must be found
        :return: The candidate with the highest cosine similarity to sentence, or None without candidates
        """
        candidates = list(candidates)
        if not candidates:
            return None

        input_ids = []
        attention_masks = []
        for sent in [sentence] + candidates:
            # tokenize each sentence to a fixed length so the results can be stacked
            new_tokens = self.tokenizer.encode_plus(sent, max_length=128, truncation=True,
                                                    padding='max_length', return_tensors='pt')
            input_ids.append(new_tokens['input_ids'][0])
            attention_masks.append(new_tokens['attention_mask'][0])

        # reformat list of tensors into single tensor
        input_ids = torch.stack(input_ids)
        attention_mask = torch.stack(attention_masks)

        with torch.no_grad():
            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask,
                                 output_hidden_states=True)

        # Last entry of hidden_states, independent of how many layers the checkpoint has
        mean_pooled = self._mean_pool(outputs.hidden_states[-1], attention_mask)
        mean_pooled = mean_pooled.detach().cpu().numpy()

        # Row 0 is the sentence to match; the remaining rows are the candidates
        scores = cosine_similarity(
            [mean_pooled[0]],
            mean_pooled[1:]
        )[0]

        return self._select_best(candidates, scores)

In [None]:
# Rebinds `model` (previously the raw HuggingFace model) to a BERTBased wrapper in 'QA' mode
model = BERTBased(application='QA')

In [None]:
# Any application other than 'QA' makes BERTBased load the plain BertModel
model2 = BERTBased(application='EM')

In [None]:
# Match a user utterance against three candidate facts with the QA-mode instance
utterance = "I can't remember the name of Mercedes' founder"
facts = [
    "Mercedes is a German automotive manufacturer",
    "Mercedes is founded by karl benz in 1901",
    "Mercedes was the largest manufacturer of cars in 1920",
]
print(model.test_forward(utterance, facts))

In [None]:
# Match a user utterance against three candidate facts with the second instance
utterance = "Yeah! It's one of the largest manufactures, I believe"
facts = [
    "Mercedes is a German automotive manufacturer",
    "Mercedes was founded by karl benz in 1901",
    "Mercedes was the largest manufacturer of cars in 1920",
]
print(model2.get_most_similar_sentence(utterance, facts))

In [None]:
# en_core_web_sm is imported as a package; load() returns the pipeline object
# that is called on text in the next cell
import en_core_web_sm
nlp = en_core_web_sm.load()

In [None]:
# Run the pipeline over a three-sentence paragraph and keep the text of each detected sentence
paragraph = ("Mercedes is a German automotive manufacturer. Mercedes was founded by karl benz in 1901." +
             " Mercedes was the largest manufacturer of cars in 1920.")
parsed = nlp(paragraph)
out = [sentence.text for sentence in parsed.sents]

In [None]:
# Show the whole list, then the first sentence and its Python type
first_sentence = out[0]
print(out)
print(first_sentence)
print(type(first_sentence))

<h2>Sentence Similarity using Sentence Transformers</h2>

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Rebinds `model` once more, this time to a SentenceTransformer loaded from a local directory
model = SentenceTransformer('../../models/all-mpnet-base-v2')

# Sample sentences; NOTE: `sentences` is reassigned in the next cell before anything is encoded
sentences = [
    "Three years later, the coffin was still full of Jello.",
    "The fish dreamed of escaping the fishbowl and into the toilet where he saw his friend go.",
    "The person box was packed with jelly many dozens of months later.",
    "Standing on one's head at job interviews forms a lasting impression.",
    "It took him a month to finish the meal.",
    "He found a leprechaun in his walnut shell."
]

In [None]:
# The first entry is the query; the remaining entries are the candidates
sentences = [
    "Is Mercedes Pakistani?",
    "Mercedes is a German automotive manufacturer",
    "Mercedes is owned by karl benz in 1901",
    "Mercedes was the largest manufacturer of cars in 1920",
]

sentence_embeddings = model.encode(sentences)

# Similarity of the query to every candidate; kept as the final bare
# expression so the notebook displays the result
query_embedding = sentence_embeddings[0]
candidate_embeddings = sentence_embeddings[1:]
cosine_similarity([query_embedding], candidate_embeddings)