In [1]:
# Print the status of the GPU(s) visible to this runtime.
!nvidia-smi

Fri Jun 10 01:32:29 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   40C    P0    45W / 300W |      0MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
!pip install -q sentence-transformers

In [3]:
import os
import sys

import torch
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader

from sentence_transformers import SentenceTransformer, models
from sentence_transformers.datasets import DenoisingAutoEncoderDataset
from sentence_transformers.losses import DenoisingAutoEncoderLoss
from sentence_transformers import util

In [4]:
# Load the pickled corpus and move its index levels into regular columns.
clean_legal_corpus_df = (
    pd.read_pickle(
        '/content/drive/MyDrive/NLP/Information_Retrieval/Legal_Text_Retrieval_Zalo2021/datasets/clean_legal_corpus.pkl'
    )
    .reset_index()
)
clean_legal_corpus_df

Unnamed: 0,law_id,article_id,text
0,01/2009/tt-bnn,1,phạm_vi áp_dụng thông_tư hướng_dẫn tuần_tra ca...
1,01/2009/tt-bnn,2,tổ_chức lực_lượng hàng trước mùa mưa_lũ uỷ_ban...
2,01/2009/tt-bnn,3,tiêu_chuẩn thành_viên lực_lượng tuần_tra canh_...
3,01/2009/tt-bnn,4,nhiệm_vụ lực_lượng tuần_tra canh_gác đê chấp_h...
4,01/2009/tt-bnn,5,phù_hiệu lực_lượng tuần_tra canh_gác đê phù_hi...
...,...,...,...
61420,99/2020/nđ-cp,60,thẩm_quyền xử_phạt hải_quan chi_cục trưởng chi...
61421,99/2020/nđ-cp,61,thẩm_quyền xử_phạt quản_lý thị_trường kiểm_soá...
61422,99/2020/nđ-cp,62,thẩm_quyền xử_phạt thanh_tra thanh_tra_viên gi...
61423,99/2020/nđ-cp,63,phân_định thẩm_quyền xử_phạt chủ_tịch uỷ_ban_n...


In [5]:
# Article texts, in DataFrame row order.
corpus = clean_legal_corpus_df.loc[:, 'text'].values
# One {'law_id': ..., 'article_id': ...} record per row, aligned with `corpus`.
ids = clean_legal_corpus_df.loc[:, ['law_id', 'article_id']].to_dict(orient='records')

In [6]:
# Restore the sentence encoder from the saved checkpoint directory.
embedder = SentenceTransformer(
    model_name_or_path='/content/drive/MyDrive/NLP/Information_Retrieval/Legal_Text_Retrieval_Zalo2021/checkpoints/TSDAE_legal_corpus/153570'
)
embedder

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: RobertaModel 
  (1): Pooling({'word_embedding_dimension': 1024, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)

In [7]:
# Embed every corpus text; the result is returned as one torch tensor.
corpus_embeddings = embedder.encode(
    corpus,
    show_progress_bar=True,
    convert_to_tensor=True,
)

Batches:   0%|          | 0/1920 [00:00<?, ?it/s]

In [8]:
top_k = 20


def mine_nearest_neighbours(doc_ids, embeddings, k, query_chunk_size=256):
    """Return, for every document, the ids of its `k` most similar *other* documents.

    Args:
        doc_ids: Sequence of document identifiers, aligned row-by-row with
            `embeddings`.
        embeddings: 2-D tensor holding one embedding per document.
        k: Number of neighbours to keep per document. Capped at the number of
            other documents so small corpora do not make `torch.topk` fail.
        query_chunk_size: Number of query rows scored against the whole corpus
            per step; bounds the size of the temporary score matrix.

    Returns:
        A list with one ``{'positive': doc_id, 'negative': [doc_id, ...]}`` dict
        per document, in input order. The 'negative' ids are ordered by
        decreasing cosine similarity to the 'positive' document.
    """
    # L2-normalise once so cosine similarity reduces to a dot product; calling
    # util.cos_sim per row would re-normalise the full corpus on every iteration.
    normalized = torch.nn.functional.normalize(embeddings, p=2, dim=1)
    num_docs = normalized.shape[0]
    num_neighbours = max(0, min(k, num_docs - 1))

    pairs = []
    for start in range(0, num_docs, query_chunk_size):
        scores = normalized[start:start + query_chunk_size] @ normalized.T
        # Mask each document's similarity with itself explicitly. Dropping the
        # rank-0 hit instead is wrong when the corpus contains duplicate texts:
        # a duplicate can take rank 0 and the document would then be listed as
        # its own negative.
        rows = torch.arange(scores.shape[0], device=scores.device)
        scores[rows, rows + start] = float('-inf')
        neighbour_rows = torch.topk(scores, k=num_neighbours, dim=1).indices.tolist()
        for offset, neighbours in enumerate(neighbour_rows):
            pairs.append({
                'positive': doc_ids[start + offset],
                'negative': [doc_ids[j] for j in neighbours],
            })
    return pairs


neg_pairs_top_20 = mine_nearest_neighbours(ids, corpus_embeddings, top_k)

In [9]:
# Preview the first ten mined entries.
neg_pairs_top_20[:10]

[{'negative': [{'article_id': '1', 'law_id': '26/2009/tt-bnn'},
   {'article_id': '2', 'law_id': '113/2007/nđ-cp'},
   {'article_id': '7', 'law_id': '54/2013/tt-bnnptnt'},
   {'article_id': '1', 'law_id': '113/2007/nđ-cp'},
   {'article_id': '11', 'law_id': '54/2013/tt-bnnptnt'},
   {'article_id': '1', 'law_id': '53/2019/nđ-cp'},
   {'article_id': '1', 'law_id': '54/2013/tt-bnnptnt'},
   {'article_id': '16', 'law_id': '01/2009/tt-bnn'},
   {'article_id': '2', 'law_id': '53/2019/nđ-cp'},
   {'article_id': '17', 'law_id': '01/2009/tt-bnn'},
   {'article_id': '7', 'law_id': '26/2009/tt-bnn'},
   {'article_id': '4', 'law_id': '54/2013/tt-bnnptnt'},
   {'article_id': '17', 'law_id': '53/2019/nđ-cp'},
   {'article_id': '18', 'law_id': '53/2019/nđ-cp'},
   {'article_id': '2', 'law_id': '54/2013/tt-bnnptnt'},
   {'article_id': '9', 'law_id': '54/2013/tt-bnnptnt'},
   {'article_id': '10', 'law_id': '01/2009/tt-bnn'},
   {'article_id': '4', 'law_id': '01/2009/tt-bnn'},
   {'article_id': '4', 'la

In [10]:
import pickle

# Serialise the mined pairs to a pickle file in the datasets folder.
with open(
    '/content/drive/MyDrive/NLP/Information_Retrieval/Legal_Text_Retrieval_Zalo2021/datasets/neg_pairs_top_20.pkl',
    mode='wb',
) as pkl_out:
    pickle.dump(neg_pairs_top_20, pkl_out)

In [11]:
# Read the pickle written above back from disk; the result is kept in `test`.
with open(
    '/content/drive/MyDrive/NLP/Information_Retrieval/Legal_Text_Retrieval_Zalo2021/datasets/neg_pairs_top_20.pkl',
    mode='rb',
) as pkl_in:
    test = pickle.load(pkl_in)

In [12]:
# Preview the first ten entries loaded back from the pickle file.
test[:10]

[{'negative': [{'article_id': '1', 'law_id': '26/2009/tt-bnn'},
   {'article_id': '2', 'law_id': '113/2007/nđ-cp'},
   {'article_id': '7', 'law_id': '54/2013/tt-bnnptnt'},
   {'article_id': '1', 'law_id': '113/2007/nđ-cp'},
   {'article_id': '11', 'law_id': '54/2013/tt-bnnptnt'},
   {'article_id': '1', 'law_id': '53/2019/nđ-cp'},
   {'article_id': '1', 'law_id': '54/2013/tt-bnnptnt'},
   {'article_id': '16', 'law_id': '01/2009/tt-bnn'},
   {'article_id': '2', 'law_id': '53/2019/nđ-cp'},
   {'article_id': '17', 'law_id': '01/2009/tt-bnn'},
   {'article_id': '7', 'law_id': '26/2009/tt-bnn'},
   {'article_id': '4', 'law_id': '54/2013/tt-bnnptnt'},
   {'article_id': '17', 'law_id': '53/2019/nđ-cp'},
   {'article_id': '18', 'law_id': '53/2019/nđ-cp'},
   {'article_id': '2', 'law_id': '54/2013/tt-bnnptnt'},
   {'article_id': '9', 'law_id': '54/2013/tt-bnnptnt'},
   {'article_id': '10', 'law_id': '01/2009/tt-bnn'},
   {'article_id': '4', 'law_id': '01/2009/tt-bnn'},
   {'article_id': '4', 'la