In [1]:
# This is a proof of concept of how we intend to measure
# similarity between people in the community.

import torch
from transformers import BertTokenizer, BertModel
from scipy.spatial.distance import cosine

  from .autonotebook import tqdm as notebook_tqdm


In [9]:

# Load the pre-trained tokenizer (vocabulary) matching the BERT checkpoint used below.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Free-text profiles of the people we want to compare.
user1 = """my name is Hachem Betrouni, I am a Data Science and AI student from Algeria,
            currently working at InstaDeep on predicting and generating new variants of Sars-CoV-2 using GANs"""

user2 = """mohcen is a computer vision expert from algeria, worked with instadeep 
            on predicting biological properties of protein sequences"""

user3 = """smail from algeria talks about innovation and startups, passionate about the food industry"""

user4 = """koko from russia works in a mining field, and has a family of four kids and 2 wives"""

users = [user1, user2, user3, user4]
# Wrap every profile in BERT's special start ([CLS]) and end ([SEP]) markers.
users = ["[CLS] " + user + " [SEP]" for user in users]

# Split each marked-up profile into tokens, then map the tokens to vocabulary ids.
tokenized_users = [tokenizer.tokenize(user) for user in users]
indexed_tokens = [tokenizer.convert_tokens_to_ids(tokens) for tokens in tokenized_users]

# One all-ones list per user, with the same length as that user's token list.
# NOTE(review): in Hugging Face `BertModel.forward` the second positional
# parameter is `attention_mask`, not `token_type_ids`; how these values are
# interpreted depends on how the forward-pass cell hands them to the model.
segments_ids = [[1] * len(tokens) for tokens in tokenized_users]

# Wrapping each id list in an outer list adds a leading batch dimension of
# size 1, so every tensor has shape [1, n_tokens].
tokens_tensor = [torch.tensor([ids]) for ids in indexed_tokens]
segments_tensors = [torch.tensor([ids]) for ids in segments_ids]


In [10]:

# Instantiate the pre-trained BERT encoder. `output_hidden_states=True` asks the
# forward pass to return the hidden states of every layer, not just the last one.
model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True)

# Switch to evaluation mode: we only run inference here, so training-only
# behaviour such as dropout must be turned off.
model.eval()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [11]:
# Run every user's text through BERT and collect the hidden states of all layers.
# Because the model was loaded with `output_hidden_states=True`, each forward pass
# returns one tensor for the embedding output plus one per encoder layer.
hidden_states = []
with torch.no_grad():  # inference only: skip gradient bookkeeping
    for tokens, segments in zip(tokens_tensor, segments_tensors):
        # `BertModel.forward` takes (input_ids, attention_mask, token_type_ids, ...),
        # so the all-ones `segments` tensor that used to be passed positionally was
        # in fact bound to `attention_mask`. Passing it by keyword keeps the exact
        # same computation while making that explicit; `token_type_ids` stays at
        # its default, which is what a single-sentence input needs.
        outputs = model(input_ids=tokens, attention_mask=segments)
        # Named access instead of the positional `outputs[2]`, which silently
        # depends on which optional outputs happen to be present.
        hidden_states.append(outputs.hidden_states)

In [12]:
# Rearrange each user's per-layer hidden states into a token-major tensor:
#   stack   -> [layers, batch, tokens, hidden]
#   squeeze -> [layers, tokens, hidden]   (drops the size-1 batch dimension)
#   permute -> [tokens, layers, hidden]
token_embeddings = [
    torch.stack(layer_outputs, dim=0).squeeze(dim=1).permute(1, 0, 2)
    for layer_outputs in hidden_states
]

In [13]:
# `token_vecs` is a list with one tensor per user. For each user we take the
# second-to-last hidden state (`[-2]`) and drop the batch dimension (`[0]`),
# leaving a [n_tokens x hidden_size] tensor; n_tokens differs between users.
token_vecs = [layer_outputs[-2][0] for layer_outputs in hidden_states]

In [14]:

# Mean-pool each user's token vectors along the token axis, producing a single
# fixed-size embedding per user regardless of how long the text was.
sentence_embedding = [user_token_vecs.mean(dim=0) for user_token_vecs in token_vecs]

In [15]:
# Measure the cosine similarity between user1 and every other user.
# scipy's `cosine` returns a *distance*, so similarity is `1 - distance`.
reference_embedding = sentence_embedding[0]

# User numbers are 1-based and user1 is the reference, so the others start at 2.
for other_number, other_embedding in enumerate(sentence_embedding[1:], start=2):
    print(
        f"similarity between user1 and user{other_number} :",
        1 - cosine(reference_embedding, other_embedding),
    )

similarity between user1 and user2 : 0.9203354716300964
similarity between user1 and user3 : 0.8274563550949097
similarity between user1 and user4 : 0.7707939147949219
