In [1]:
# -*- coding: utf-8 -*-
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
import pandas as pd
import numpy as np
import json
import pathlib
import random
import zipfile

import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

from transformers import BertTokenizer, BertConfig, AutoTokenizer
from transformers.optimization import AdamW

from scipy.special import expit
from keras.preprocessing.sequence import pad_sequences
from sklearn import metrics

from sklearn.metrics.pairwise import cosine_similarity

In [2]:
def seed_everything(seed):
    """Seed every random number generator this notebook touches.

    Seeds Python's ``random``, NumPy's global RNG and PyTorch (CPU plus CUDA),
    exports ``PYTHONHASHSEED``, and puts cuDNN into deterministic mode with
    benchmarking switched off.

    Args:
        seed: Integer seed applied to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Host-side generators all take the same seed.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    # CUDA generators: every device first, then the current one.
    torch.cuda.manual_seed_all(seed)
    torch.cuda.manual_seed(seed)

    # Give up cuDNN autotuning in exchange for repeatable kernel selection.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

def seed_worker(worker_id):
    """DataLoader ``worker_init_fn`` that re-seeds NumPy and ``random``.

    The seed is ``torch.initial_seed()`` reduced modulo 2**32 so that it fits
    the 32-bit range ``np.random.seed`` accepts.

    Args:
        worker_id: Worker index passed in by the DataLoader; not used.
    """
    derived_seed = torch.initial_seed() % (1 << 32)
    for seed_fn in (np.random.seed, random.seed):
        seed_fn(derived_seed)

In [3]:
def encode_text_pairs(tokenizer, sentences, max_len=None):
    """Tokenize texts into fixed-length, right-padded model inputs.

    Each text is tokenized, truncated to ``max_len - 2`` tokens, wrapped in
    the tokenizer's own start/end special tokens, converted to ids and padded
    on the right with the tokenizer's pad id.

    Args:
        tokenizer: Object exposing ``tokenize``, ``convert_tokens_to_ids``,
            ``cls_token``, ``sep_token`` and ``pad_token_id`` (the Hugging
            Face tokenizer interface).
        sentences: Sized iterable of strings (e.g. a list or pandas Series).
        max_len: Length of every output row. Defaults to the notebook-level
            ``MAX_LEN`` when omitted.

    Returns:
        Tuple ``(input_ids, attention_masks, token_type_ids)`` of NumPy
        arrays, each of shape ``(len(sentences), max_len)``:
        ``input_ids`` is int64, ``attention_masks`` is float64 with 1.0 on
        real tokens and 0.0 on padding, ``token_type_ids`` is all-zero int64.

    Raises:
        ValueError: If ``max_len`` leaves no room for the two special tokens,
            or the tokenizer does not define them.
    """
    if max_len is None:
        max_len = MAX_LEN
    if max_len < 2:
        raise ValueError('max_len must be at least 2 to hold the special tokens')

    start_token, end_token = tokenizer.cls_token, tokenizer.sep_token
    if start_token is None or end_token is None:
        raise ValueError('tokenizer must define cls_token and sep_token')

    # Pad with the tokenizer's real pad id; 0 is only correct for some vocabularies.
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = 0

    n_rows = len(sentences)
    input_ids = np.full((n_rows, max_len), pad_id, dtype=np.int64)
    attention_masks = np.zeros((n_rows, max_len), dtype=np.float64)
    # Single-segment inputs: every position belongs to segment 0.
    token_type_ids = np.zeros((n_rows, max_len), dtype=np.int64)

    for row, sentence in enumerate(sentences):
        tokens = [start_token] + tokenizer.tokenize(sentence)[:max_len - 2] + [end_token]
        ids = tokenizer.convert_tokens_to_ids(tokens)
        input_ids[row, :len(ids)] = ids
        # Mask by length, not by id value, so a real token with id 0 stays visible.
        attention_masks[row, :len(ids)] = 1.0

    return input_ids, attention_masks, token_type_ids

In [4]:
# Fixed seed for all random number generators, applied via seed_everything.
SEED = 128
seed_everything(SEED)

In [5]:
# Token length every encoded text is truncated/padded to; read as a global
# by encode_text_pairs.
MAX_LEN = 512

In [6]:
# Task identifier and input file name; both are used to build the data path,
# and task_name also names the output predictions file.
task_name = 'RuMedTest'
data_file_name = 'test.jsonl'

In [7]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [8]:
# `device` holds the name "cuda" or "cpu" here, so the former `device == -1`
# branch could never run; convert the name straight into a torch.device.
device = torch.device(device)

In [9]:
# Absolute path of the current working directory; the predictions file is
# written here.
out_path = pathlib.Path('.').absolute()

In [10]:
# Start from the absolute current working directory; refined into the data
# file path in the next cell.
data_path = pathlib.Path().absolute()

In [11]:
# Go three directory levels up from the working directory, then down into
# data/<task_name>/<data_file_name>.
# NOTE(review): this depends on where the notebook is launched from — confirm
# the expected working directory.
data_path = data_path.parent.parent.parent / 'data' / task_name / data_file_name

In [12]:
# Display the resolved input path as a sanity check.
data_path

PosixPath('/ayb/vol2/home/blinoff/RuMedBench-draft-main/data/RuMedTest/test.jsonl')

In [13]:
# Load the test set from a JSON Lines file (one JSON record per line).
test_data = pd.read_json(data_path, lines=True)

In [14]:
# Name of the column holding each record's identifier.
index_id = 'idx'

# Only RuMedTest is configured; reject anything else before going further.
if task_name != 'RuMedTest':
    raise ValueError('unknown task')

# Columns holding the answer candidates, the question column, and the key
# under which the predicted answer is written.
options = ['1', '2', '3', '4']
question_id = 'question'
label_id = 'answer'

In [15]:
# Load the tokenizer that ships with the RuBioRoBERTa checkpoint.
tokenizer = AutoTokenizer.from_pretrained("alexyalunin/RuBioRoBERTa")

In [16]:
# Load the RuBioRoBERTa checkpoint with a token-classification head; its
# per-token outputs are what get_embeddings returns.
# NOTE(review): the load warning printed for this cell reports that
# classifier.weight / classifier.bias are newly initialized, i.e. the head is
# untrained — confirm that scoring with an untrained head is intended.
# NOTE(review): RobertaForSequenceClassification is imported but not used in
# the cells visible here.
from transformers import RobertaForSequenceClassification, RobertaForTokenClassification
model = RobertaForTokenClassification.from_pretrained('alexyalunin/RuBioRoBERTa')

Some weights of the model checkpoint at alexyalunin/RuBioRoBERTa were not used when initializing RobertaForTokenClassification: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at alexyalunin/RuBioRoBERTa and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able t

In [17]:
# Move the model's parameters onto the selected device.
model = model.to(device)

In [18]:
# Switch to evaluation mode for inference; the trailing `;` hides the
# model repr in the cell output.
model.eval();

In [19]:
def get_embeddings(texts):
    """Run the model over ``texts`` and return its sigmoid-squashed outputs.

    Texts are encoded with ``encode_text_pairs``, fed through the global
    ``model`` in sequential batches of 16 on the global ``device`` without
    gradient tracking, and the first element of each model output is
    collected.

    Args:
        texts: Collection of strings accepted by ``encode_text_pairs``.

    Returns:
        NumPy array: the per-batch outputs stacked along the first axis and
        passed through ``expit``. Callers in this notebook unpack its shape
        as three dimensions.
    """
    # Encode once, then wrap the three aligned arrays as a tensor dataset.
    ids_np, mask_np, type_np = encode_text_pairs(tokenizer, texts)
    dataset = TensorDataset(
        torch.tensor(ids_np),
        torch.tensor(mask_np),
        torch.tensor(type_np),
    )

    batch_size = 16
    loader = DataLoader(
        dataset,
        sampler=SequentialSampler(dataset),  # keep input order
        batch_size=batch_size,
        worker_init_fn=seed_worker,
    )

    chunks = []
    for cpu_batch in loader:
        b_input_ids, b_input_mask, b_token_type_ids = (t.to(device) for t in cpu_batch)
        with torch.no_grad():
            raw = model(
                b_input_ids,
                token_type_ids=b_token_type_ids,
                attention_mask=b_input_mask,
            )[0]
            chunks.append(raw.detach().cpu().numpy())

    return expit(np.vstack(chunks))

In [20]:
# Embed the question column; use the configured column name rather than a
# second hard-coded copy of it.
q_vecs = get_embeddings(test_data[question_id])

In [21]:
# q_vecs is three-dimensional; take its sizes so the last two axes can be
# flattened in the next cell.
nsamples, nx, ny = q_vecs.shape

In [22]:
# Flatten each sample to a single vector of length nx*ny for the cosine
# similarity computed below.
d2_q_vecs = q_vecs.reshape((nsamples,nx*ny))

In [23]:
# Score every answer option against its own question.
#
# Only matching rows are compared, so the cosine is computed row by row
# instead of building the full (questions x questions) similarity matrix and
# reading off its diagonal.
q_norms = np.linalg.norm(d2_q_vecs, axis=1)

sims = []
for option in options:
    option_vecs = get_embeddings(test_data[option])

    nsamples, nx, ny = option_vecs.shape
    d2_option_vecs = option_vecs.reshape((nsamples, nx*ny))

    dots = (d2_q_vecs * d2_option_vecs).sum(axis=1)
    norms = q_norms * np.linalg.norm(d2_option_vecs, axis=1)
    # A zero vector has no direction; score it as 0 instead of dividing by 0.
    sim = np.divide(dots, norms, out=np.zeros_like(dots), where=norms > 0)
    sims.append(sim)

# One row per question, one column per option.
sims = np.array(sims).T

In [24]:
# One output record per test row: its identifier and the 1-based position of
# the highest-scoring option, stored as a string.
recs = [
    {index_id: row_idx, label_id: str(1 + np.argmax(scores))}
    for row_idx, scores in zip(test_data[index_id], sims)
]

In [25]:
# Write the predictions as JSON Lines: one record per line.
out_fname = out_path / f'{task_name}.jsonl'
# ensure_ascii=False can emit non-ASCII characters, so pin the file encoding
# instead of relying on the platform default.
with open(out_fname, 'w', encoding='utf-8') as fw:
    for rec in recs:
        json.dump(rec, fw, ensure_ascii=False)
        fw.write('\n')