## Project for the Intelligent Systems course (2022)
 - Matheus Galvão
 - Matteo Prina
 - Luan Klein 

In [5]:
import torch
from transformers import AutoTokenizer, AutoModel
import torch.nn as nn
from torchvision import datasets
from torchvision import transforms
import json

In [2]:
# Load the question-answering dataset (presumably SQuAD v2.0, judging by the
# file name -- confirm). JSON is UTF-8 by specification, so the encoding is
# given explicitly instead of relying on the platform default, which is not
# UTF-8 on every system and would break on non-ASCII text.
with open("data/train-v2.0.json", 'r', encoding='utf-8') as f:
    data = json.load(f)

In [3]:
# get the available questions and answers for a given topic
def get_qa(topic, data):
    """Collect the answerable questions of one topic with their first answer.

    Parameters
    ----------
    topic : str
        Value compared against the ``'title'`` field of each dataset entry.
    data : dict
        Parsed dataset. Must have a top-level ``'data'`` list whose entries
        carry ``'title'`` and ``'paragraphs'``; each paragraph carries a
        ``'qas'`` list of question records.

    Returns
    -------
    tuple of (list, list)
        ``(questions, answers)`` as parallel lists, taken from the first
        entry whose title equals ``topic``. Both lists are empty when no
        entry matches, so the result can always be tuple-unpacked.
    """
    questions = []
    answers = []
    for entry in data['data']:
        if entry['title'] != topic:
            continue
        for paragraph in entry['paragraphs']:
            for qa in paragraph['qas']:
                # Records without an 'is_impossible' flag are treated as
                # answerable; records with no answer text are skipped
                # because there is nothing to pair the question with.
                if qa.get('is_impossible', False) or not qa['answers']:
                    continue
                questions.append(qa['question'])
                # Only the first listed answer is kept.
                answers.append(qa['answers'][0]['text'])
        # Only the first entry with a matching title is used.
        break
    return questions, answers

# Extract the answerable question/answer pairs for the 'Premier_League' topic.
questions, answers = get_qa(topic='Premier_League', data=data)

# Report how many questions were collected for that topic.
print("Number of available questions: {}".format(len(questions)))

Number of available questions: 357


In [6]:
def get_model(model_name):
    """Load a pretrained model together with its tokenizer.

    Parameters
    ----------
    model_name : str
        Identifier passed unchanged to both ``from_pretrained`` calls.

    Returns
    -------
    tuple
        ``(model, tokenizer)``, in that order.
    """
    encoder = AutoModel.from_pretrained(model_name)
    text_tokenizer = AutoTokenizer.from_pretrained(model_name)
    return encoder, text_tokenizer
  
# Load the model and its tokenizer once; later cells pass both to get_embeddings.
model, tokenizer = get_model(model_name="paraphrase-MiniLM-L6-v2")

In [7]:
# Mean Pooling - Take attention mask into account for correct averaging
# source: https://huggingface.co/sentence-transformers/paraphrase-MiniLM-L6-v2
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings, counting only unmasked positions.

    Parameters
    ----------
    model_output : sequence
        Its first element is used as the token embeddings tensor.
    attention_mask : torch.Tensor
        Mask with one fewer dimension than the token embeddings; it is
        expanded along a new trailing axis to the embeddings' size.
        Presumably 1 marks real tokens and 0 marks padding -- confirm.

    Returns
    -------
    torch.Tensor
        Masked sum over dimension 1 divided by the mask sum over
        dimension 1.
    """
    hidden_states = model_output[0]

    # Broadcast the mask to the embeddings' size so masked positions
    # contribute zero to the sum below.
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()

    masked_total = (hidden_states * mask).sum(1)
    # Clamp the denominator so a fully masked row divides by 1e-9
    # instead of zero.
    token_count = mask.sum(1).clamp(min=1e-9)

    return masked_total / token_count

def get_embeddings(questions, tokenizer, model):
    """Turn a batch of sentences into one pooled embedding per sentence.

    Parameters
    ----------
    questions : list of str
        Sentences to embed; passed to ``tokenizer`` as a single batch.
    tokenizer : callable
        Called with ``padding=True, truncation=True, return_tensors='pt'``;
        its result must expose an ``'attention_mask'`` entry and be
        unpackable as keyword arguments for ``model``.
    model : callable
        Invoked on the tokenized batch with gradient tracking disabled.

    Returns
    -------
    torch.Tensor
        Result of ``mean_pooling`` applied to the model output and the
        batch's attention mask.
    """
    # Tokenize the whole batch at once, padding to a common length.
    batch = tokenizer(
        questions,
        padding=True,
        truncation=True,
        return_tensors='pt',
    )

    # Inference only: no gradients are needed for the forward pass.
    with torch.no_grad():
        outputs = model(**batch)

    # Collapse the per-token outputs into one vector per sentence.
    return mean_pooling(outputs, batch['attention_mask'])

# Sanity check: embed only the first three questions and inspect the shape
# of the result (one row per question).
embeddings = get_embeddings(questions[:3], tokenizer, model)
print("Embeddings shape: {}".format(embeddings.shape))

Embeddings shape: torch.Size([3, 384])


In [8]:
# Embed a new question; it is wrapped in a list so it is tokenized as a
# one-sentence batch, the same way the sample questions were.
new_question = 'Which days have the most events played at?'
new_embedding = get_embeddings([new_question], tokenizer, model)

# squared Euclidean distance between sample questions and new_question
# (new_embedding has a single row, so the subtraction broadcasts across the
# rows of `embeddings`; a smaller value means a closer sample question)
((embeddings - new_embedding)**2).sum(axis=1)

tensor([71.4030, 59.8726, 23.9430])