In [90]:
# Suffix shared by the saved model weights and the Annoy index file names
# that the later cells load.
version = "_tiny2_context_softmax_final"

# Maximum number of tokens handed to the tokenizer (longer inputs are truncated).
max_length = 2048

In [78]:
from transformers import AutoTokenizer, AutoModel
import torch
# Checkpoint used for the tokenizer and for both encoders.
# An alternative checkpoint that was tried earlier is
# "ai-forever/sbert_large_mt_nlu_ru"; to use it, change MODEL_NAME here.
# NOTE(review): the saved weights and the Annoy index loaded further down
# presumably have to match whichever checkpoint is selected — confirm before switching.
MODEL_NAME = "cointegrated/rubert-tiny2"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Two independent encoder instances: one embeds questions, the other embeds answers.
model_question = AutoModel.from_pretrained(MODEL_NAME)
model_answer = AutoModel.from_pretrained(MODEL_NAME)

In [91]:
# Load the saved weights for both encoders, mapping every tensor onto the CPU.
cpu_device = torch.device('cpu')
model_question.load_state_dict(
    torch.load(f'models/model_anchor{version}.bin', map_location=cpu_device)
)
model_answer.load_state_dict(
    torch.load(f'models/model_pos_neg{version}.bin', map_location=cpu_device)
)

<All keys matched successfully>

In [92]:
def embed_bert_cls(model_output):
    """Return unit-length embeddings taken from the first token position.

    Args:
        model_output: object exposing ``last_hidden_state``, a 3-D tensor
            indexed as ``[batch, token, feature]``.

    Returns:
        A 2-D tensor ``[batch, feature]`` whose rows are the position-0 hidden
        states, L2-normalised along the feature dimension.
    """
    first_token_states = model_output.last_hidden_state[:, 0, :]
    # p=2 and dim=1 are the defaults of F.normalize; spelled out for clarity.
    return torch.nn.functional.normalize(first_token_states, p=2.0, dim=1)

def _embed_texts(model, texts):
    """Tokenize ``texts``, run them through ``model`` and return normalised embeddings.

    Shared implementation behind ``get_question_embs`` and ``get_answer_embs``,
    which previously duplicated this code and differed only in the model used.

    Args:
        model: encoder called as ``model(**encoded_input)``.
        texts: a string or a list of strings accepted by the tokenizer.

    Returns:
        The result of ``embed_bert_cls`` applied to the model output.
    """
    # Inputs longer than the notebook-level ``max_length`` are truncated;
    # shorter ones in a batch are padded to a common length.
    encoded_input = tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors='pt',
    )

    # Inference only: no gradients are needed.
    with torch.no_grad():
        model_output = model(**encoded_input)

    return embed_bert_cls(model_output)


def get_question_embs(question):
    """Embed ``question`` (string or list of strings) with the question encoder."""
    return _embed_texts(model_question, question)


def get_answer_embs(answer):
    """Embed ``answer`` (string or list of strings) with the answer encoder."""
    return _embed_texts(model_answer, answer)

In [93]:
import pandas as pd
# Validation pairs; later cells read its "anchor_one_str" and "positive" columns.
df_test = pd.read_csv("data/val_onestr.csv")

# Candidate table; later cells look rows up by Annoy item id in its "message" column.
df = pd.read_csv("data/grammar_cities_fix.csv")

In [94]:
import pickle

def write_list(filename, to_store):
    """Pickle ``to_store`` into the file at ``filename`` (overwriting it)."""
    with open(filename, mode='wb') as sink:
        pickle.dump(to_store, sink)

def read_list(filename):
    """Load and return the object pickled in the file at ``filename``.

    NOTE: unpickling can execute arbitrary code, so only use this on files
    that come from a trusted source.
    """
    with open(filename, mode='rb') as source:
        return pickle.load(source)

In [95]:
from annoy import AnnoyIndex

# The index file is named after `version` without its first character.
index_path = f'{version[1:]}.ann'

# 312 is the vector dimensionality the index is opened with; it has to match
# the size of the vectors queried against it later in the notebook.
ANN = AnnoyIndex(312, 'angular')
ANN.load(index_path)

True

# Cosine

In [96]:
import torch.nn as nn

def cosine_similarity_transform(angular_distance):
    """Map a distance ``d`` to ``(2 - d**2) / 2``.

    The notebook applies this to distances returned by the 'angular' Annoy
    index; Annoy documents that metric as ``sqrt(2 * (1 - cos))``, for which
    this expression recovers the cosine similarity.
    """
    squared_distance = angular_distance ** 2
    return (2 - squared_distance) / 2


# Row-wise cosine similarity between two batches of vectors.
cos = nn.CosineSimilarity(dim=1, eps=1e-6)

def get_vector_top1(response):
    """Embed the message of the first hit in an ``(ids, distances)`` response.

    The first id is looked up in ``df['message']`` and the resulting text is
    embedded with the answer encoder.
    """
    neighbour_ids, _distances = response
    best_id = neighbour_ids[0]
    return get_answer_embs(df['message'][best_id])

In [97]:
# Average cosine similarity between each validation question and its single
# nearest neighbour in the Annoy index.
total_cosine = 0
for message in df_test["anchor_one_str"]:
    query_vector = get_question_embs(message)[0]
    _, distances = ANN.get_nns_by_vector(query_vector, 1, include_distances=True)
    # Annoy reports an angular distance; convert it back to a cosine similarity.
    total_cosine += cosine_similarity_transform(distances[0])
print(total_cosine / len(df_test))

0.9538758407546579


In [98]:
# Average cosine similarity between each validation question and its own
# positive answer, each embedded by its respective encoder.
total_cosine_ans = 0
for question_text, answer_text in zip(df_test["anchor_one_str"], df_test["positive"]):
    question_vec = get_question_embs(question_text)
    answer_vec = get_answer_embs(answer_text)
    total_cosine_ans += cos(question_vec, answer_vec)
mean_cosine_ans = total_cosine_ans / len(df_test)
# `cos` returns a one-element tensor per pair, so unwrap it before printing.
print(float(mean_cosine_ans[0]))

0.6343808770179749


In [99]:
# Disabled experiment: the code below is wrapped in a string literal, so it is
# never executed (the trailing `;` suppresses the cell output). It would
# compare the embedding of the true positive answer with the embedding of the
# top-1 retrieved message.
# NOTE(review): inside the string the index is queried with `vector`, not the
# `vector_q` computed two lines above it — looks like a bug; fix before re-enabling.
'''total_cosine_ans_search = 0
for i in df_test.index:
    vector_q = get_question_embs(df_test["anchor_one_str"][i])[0]
    vector_real = get_answer_embs(df_test["positive"][i])
    response = ANN.get_nns_by_vector(vector, 1, include_distances=True)
    vector_found = get_vector_top1(response)
    total_cosine_ans_search += cos(vector_real, vector_found)
print(float((total_cosine_ans_search / df_test.shape[0])[0]))''';

# Recall@k

In [100]:
def find_among_first_k(to_find, response, k):
    """Return 1 if ``to_find`` is among the first ``k`` hits of ``response``, else 0.

    Args:
        to_find: text to look for.
        response: ``(ids, distances)`` pair; each id is looked up in
            ``df['message']``.
        k: number of leading hits to inspect. Previously this argument was
            ignored and every returned id was scanned, so a response longer
            than ``k`` could produce a false hit.

    Returns:
        1 when one of the first ``k`` messages equals ``to_find``, otherwise 0.
    """
    ids, _ = response
    for id_cur in ids[:k]:
        if df['message'][id_cur] == to_find:
            return 1
    return 0

In [101]:
# A question's embedding does not depend on k, so encode every validation
# question once instead of re-running the encoder for each value of k.
question_vectors = [get_question_embs(message)[0] for message in df_test["anchor_one_str"]]

for k in [3, 5, 10, 15]:
    total_recall = 0
    for vector, positive in zip(question_vectors, df_test['positive']):
        response = ANN.get_nns_by_vector(vector, k, include_distances=True)
        # Counts 1 when the expected answer text is among the k retrieved messages.
        total_recall += find_among_first_k(positive, response, k)
    print(f'Recall@{k}:', total_recall / df_test.shape[0])

Recall@3: 0.2983606557377049
Recall@5: 0.33114754098360655
Recall@10: 0.37540983606557377
Recall@15: 0.4098360655737705


# MRR

In [12]:
def get_place(to_find, response):
    """Return the 1-based rank of ``to_find`` in ``response``, or -1 if absent.

    ``response`` is an ``(ids, distances)`` pair; each id is looked up in
    ``df['message']`` and compared with ``to_find`` in order.
    """
    ids, _ = response
    for rank, id_cur in enumerate(ids, start=1):
        if df['message'][id_cur] == to_find:
            return rank
    return -1

In [13]:
from tqdm import tqdm 
import numpy as np

# Mean reciprocal rank of the expected answer over the whole validation set.
MRR = 0
places = []  # 1-based rank per validation row; -1 when the answer was not found
test_size = df_test.shape[0]
# Rank against every item stored in the index rather than a hard-coded count.
n_candidates = ANN.get_n_items()
for i in tqdm(df_test.index):
    message = df_test["anchor_one_str"][i]
    # get_question_embs returns a (1, dim) batch; Annoy expects a single 1-D vector.
    vector = get_question_embs(message)[0]
    response = ANN.get_nns_by_vector(vector, n_candidates, include_distances=True)
    place = get_place(df_test['positive'][i], response)
    places.append(place)
    # A missing answer (place == -1) contributes 0 to MRR, not 1 / -1.
    if place > 0:
        MRR += 1 / place
MRR = MRR / test_size
print(MRR)

100%|██████████| 610/610 [00:16<00:00, 37.67it/s]

0.2973106644737639



