#Imports and mounting

In [1]:
!pip install sentence-transformers
import pandas as pd
import json
import random
import csv
from collections import defaultdict
import pickle
import os
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

from sentence_transformers import models, losses, util, SentenceTransformer, SentencesDataset, InputExample, evaluation
from sentence_transformers.cross_encoder import CrossEncoder
from torch import nn
from torch.utils.data import DataLoader
import torch

random.seed(13)








###Reading the android dataset; android_related.json

In [2]:
from google.colab import drive
drive.mount('/gdrive')
folder = '/gdrive/MyDrive/Linked'
os.makedirs(folder,exist_ok = True)
with open('/gdrive/MyDrive/sbert_supervised(for_data_only)/android_rearranged_linked.txt','rb') as a:
    qr = pickle.load(a)

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


###Some Important Functions

In [3]:
def string_sentence(i,qr):
    return qr.loc[i,'Title'] + ' '  + ' '.join(qr.loc[i,'Tags']) + ' '  + qr.loc[i,'Text']

In [4]:
def both_related(qr):
    for i in qr.dropna(subset=['Linked']).index.values:
        if(len(qr.loc[i,'Linked'])!=0):
            for j in qr.loc[i,'Linked']:
                if(j in qr.index.values):
                    if((i in qr.loc[j,'Linked'])==0):
                        qr.loc[j,'Linked'].append(i)
    return qr

In [5]:
def insert_sent(index,qr):
    a = list()
    for j in range(0,len(index)):
        t1 = string_sentence(index[j][0],qr)
        t2 = string_sentence(index[j][1],qr)
        a.append([t1,t2])
    return a

In [6]:
qr.info()

<class 'pandas.core.frame.DataFrame'>
Float64Index: 21217 entries, 31.0 to 218179.0
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   AcceptedAnswerId  7955 non-null   float64
 1   AnswerCount       21217 non-null  float64
 2   AnswerIds         17132 non-null  object 
 3   Attachments       4246 non-null   object 
 4   OS                8331 non-null   object 
 5   PostTypeId        21217 non-null  float64
 6   Related           21217 non-null  object 
 7   Score             21217 non-null  float64
 8   Tags              21217 non-null  object 
 9   Text              21217 non-null  object 
 10  Title             21217 non-null  object 
 11  Related_Scores    17132 non-null  object 
 12  New_Related       17132 non-null  object 
 13  Linked            8797 non-null   object 
dtypes: float64(4), object(10)
memory usage: 2.4+ MB


#Creating Datas for different objectives (Only Once)

Source File: https://github.com/UKPLab/sentence-transformers/blob/master/examples/training/quora_duplicate_questions/create_splits.py

Objectives: 

1. Classfication   
2. Related Question Mining
3. Information Retreival



In [7]:
# qr = both_related(qr)
train_qr,dev_qr = train_test_split(qr,test_size=0.02,shuffle = True,random_state=13)
train_qr,test_qr = train_test_split(train_qr,test_size=0.02,shuffle=True,random_state = 13)

###Positive Pair Generation

In [8]:
def pos_pair_gen(qr,qr_index):
    index = 0
    data_pos = list()
    al_pos = list()
    related_not_found = 0
    # score = list()
    for i in qr_index: 
        for j in qr.loc[i,'Linked']:
            if( ( [i,j] in al_pos or [j,i] in al_pos) != 1 ):
                if(j in qr.index.values):
                    # t1 = string_sentence(i,qr)
                    # t2 = string_sentence(j,qr)
                    data_pos.append([i,j])
                    index+= 1
                    al_pos.append([ i , j ])
                    # score.append(1)
                    # break
                else:
                    related_not_found+=1
                    # print('Related not found,parent id: ',i,' id:',j,'overall count: ',related_not_found)
    print(f'Len of data {len(data_pos)} and related not found = {related_not_found}')
    return data_pos

###Negative Data Pairs

In [9]:
def neg_pair_gen(qr,num_pairs,al_acc=list()):
    data_neg = list()
    for i in range(0,num_pairs):
        id1 = random.choice(qr.dropna(subset=['Linked']).index.values)
        id2 = random.choice(qr.dropna(subset=['Linked']).index.values)
        while(id2 in qr.loc[id1,'Linked'] or id1 in qr.loc[id2,'Linked'] or [id1,id2] in al_acc or [id2,id1] in al_acc):
            id1 = random.choice(qr.dropna(subset=['Linked']).index.values)
            id2 = random.choice(qr.dropna(subset=['Linked']).index.values)
        # t1 = string_sentence(id1,qr)
        # t2 = string_sentence(id2,qr)
        data_neg.append([id1,id2])
        # score.append(0)
        al_acc.append([id1,id2])
    print(len(data_neg))
    # print(len(score))
    return data_neg,al_acc

###Add transitive closure (if a,b and b,c duplicates => a,c are duplicates) (Not for related)

In [10]:
def transitive(qr,al_pos):
    for i in qr.index.values:
        for j in qr.loc[i,'Related']:
            if(j in qr.index.values):
                for k in qr.loc[j,'Related']:
                    if(k in qr.index.values):
                        if(([i,k] in al_pos or [k,i] in al_pos or i==k) ==0):
                            al_pos.append([i,k])
    print(len(al_pos))
    return al_pos
    # with open('/gdrive/MyDrive/quora_android/data_pos.txt','wb') as a:
    #     pickle.dump(data_pos,a)
    #     pickle.dump(al_pos,a)

###Dataset Formation, splitting and saving

In [11]:
train_data_pos = pos_pair_gen(train_qr,train_qr.dropna(subset=['Linked']).index.values)
train_data_neg,al_acc = neg_pair_gen(train_qr,len(train_data_pos))
dev_data_pos = pos_pair_gen(pd.concat([dev_qr,train_qr]),dev_qr.dropna(subset=['Linked']).index.values)
dev_data_neg,al_acc = neg_pair_gen(dev_qr,len(dev_data_pos),al_acc)
test_data_pos = pos_pair_gen(pd.concat([test_qr,train_qr]),test_qr.dropna(subset=['Linked']).index.values)
test_data_neg,al_acc = neg_pair_gen(test_qr,len(test_data_pos),al_acc)

Len of data 7118 and related not found = 494
7118
Len of data 244 and related not found = 5
244
Len of data 255 and related not found = 5
255


In [12]:
os.makedirs(folder+'/data/splits',exist_ok=True)
with open(folder+'/data/splits/pandas_split.txt','wb') as a:
    pickle.dump(train_qr,a)
    pickle.dump(dev_qr,a)
    pickle.dump(test_qr,a)

In [13]:
with open(folder+'/data/splits/data_pos_neg.txt','wb') as a:
    pickle.dump(train_data_pos,a)
    pickle.dump(train_data_neg,a)
    pickle.dump(dev_data_pos,a)
    pickle.dump(dev_data_neg,a)
    pickle.dump(test_data_pos,a)
    pickle.dump(test_data_neg,a)
    
    # For next step, save and load al_acc for paraphase mining negative pairs

In [14]:
# with open(folder+'/data/android.txt','rb') as a:
#     data_pos = pickle.load(a)
#     al_pos = pickle.load(a)
#     data_neg = pickle.load(a)
#     score = pickle.load(a)

with open(folder+'/data/splits/data_pos_neg.txt','rb') as a:
    train_data_pos = pickle.load(a)
    train_data_neg = pickle.load(a)
    dev_data_pos = pickle.load(a)
    dev_data_neg = pickle.load(a)
    test_data_pos = pickle.load(a)
    test_data_neg = pickle.load(a)

In [20]:
len(train_data_neg)

7118

In [21]:
# data = data_pos + data_neg
# data,score = shuffle(data,score)
# train_data,dev_data,train_score,dev_score = train_test_split(data,score,test_size=0.15,random_state=13)
# test_data,dev_data,test_score,dev_score = train_test_split(dev_data,dev_score,test_size=0.33,random_state=13)

train_score = np.ones(2*len(train_data_pos)).tolist()
for i in range(len(train_data_pos),2*len(train_data_pos)):
    train_score[i] = 0
train_data = train_data_pos + train_data_neg
train_data,train_score = shuffle(train_data,train_score)

dev_score = np.ones(2*len(dev_data_pos)).tolist()
for i in range(len(dev_data_pos),2*len(dev_data_pos)):
    dev_score[i] = 0
dev_data = dev_data_pos + dev_data_neg
dev_data,dev_score = shuffle(dev_data,dev_score)

test_score = np.ones(2*len(test_data_pos)).tolist()
for i in range(len(test_data_pos),2*len(test_data_pos)):
    test_score[i] = 0
test_data = test_data_pos + test_data_neg
test_data,test_score = shuffle(test_data,test_score)

In [22]:
print(train_score[:10])

[1.0, 1.0, 0, 1.0, 1.0, 0, 1.0, 1.0, 1.0, 1.0]


In [25]:
with open(folder+'/data/splits/train_dev_test.txt','wb') as a:
    pickle.dump(train_data,a)
    pickle.dump(train_score,a)
    pickle.dump(dev_data,a)
    pickle.dump(dev_score,a)
    pickle.dump(test_data,a)
    pickle.dump(test_score,a)

#Sentence Transformer (For Training, start from here)

###Loading Data

In [None]:
with open(folder+'/data/splits/pandas_split.txt','rb') as a:
    train_qr=pickle.load(a)
    dev_qr = pickle.load(a)
    test_qr = pickle.load(a)

In [None]:
with open(folder+'/data/splits/data_pos_neg.txt','rb') as a:
    train_data_pos = pickle.load(a)
    train_data_neg = pickle.load(a)
    dev_data_pos = pickle.load(a)
    dev_data_neg = pickle.load(a)
    test_data_pos = pickle.load(a)
    test_data_neg = pickle.load(a)

In [None]:
with open(folder+'/data/splits/train_dev_test.txt','rb') as a:
    train_data=pickle.load(a)
    train_score = pickle.load(a)
    dev_data = pickle.load(a)
    dev_score = pickle.load(a)
    test_data = pickle.load(a)
    test_score = pickle.load(a)

In [None]:
train_data = insert_sent(train_data,qr)
dev_data = insert_sent(dev_data,qr)
test_data = insert_sent(test_data,qr)

###Creating Dataset 

In [None]:
train_examples = list()
for i in range(0,len(train_data)):
    train_examples.append(InputExample(texts=[train_data[i][0],train_data[i][1]], label=train_score[i]))

dev_s1=list()
dev_s2 = list()
for i in range(0,len(dev_data)):
    dev_s1.append(dev_data[i][0])
    dev_s2.append(dev_data[i][1])
test_s1=list()
test_s2 = list()
for i in range(0,len(test_data)):
    test_s1.append(test_data[i][0])
    test_s2.append(test_data[i][1])

##Model

###Creating Model

In [None]:
# For model from HuggingFace Eg: BERT/ROberta etc.
# word_embedding_model = models.Transformer('distilbert-base-uncased',max_seq_length=512)
# pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())

# model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# #For Pretrrained Sentence-Transformers, downloaded from the github of sentence-transformers
model = SentenceTransformer('stsb-distilbert-base')

HBox(children=(FloatProgress(value=0.0, max=244715968.0), HTML(value='')))




####Models Parameters

In [None]:
batch_size = 16
num_epochs = 10
os.makedirs(folder+'/bert_w_spaces',exist_ok=True)
model_path = folder+'/bert_w_spaces'

In [None]:
!nvidia-smi

NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running.



In [None]:
distance_metric = losses.SiameseDistanceMetric.COSINE_DISTANCE
margin = 0.8

train_constrantiveloss = DataLoader(train_examples,batch_size=batch_size,shuffle=True)
train_loss_ConstrativeLoss = losses.OnlineContrastiveLoss(model=model, distance_metric=distance_metric, margin=margin)

###MultipleNegativeRanking Loss

In [None]:
train_samples_MNRloss = list()
for i in range(0,len(train_data_pos)):
    train_samples_MNRloss.append(InputExample(texts=[train_data_pos[i][0],train_data_pos[i][1]],label=1))
    train_samples_MNRloss.append(InputExample(texts=[train_data_pos[i][1],train_data_pos[i][0]],label=1))

train_MNRloss = DataLoader(train_samples_MNRloss,batch_size=batch_size,shuffle=True)
train_loss_MNRloss = losses.MultipleNegativesRankingLoss(model)

###Evaluators

There are three evaluators mentioned in training_multi-task-learning.py

1.   BinaryClassification Evaluator
2.   Paraphase Mining (equivalent to top k related but for small number of queries)
3.   Information retreival(equivalent to top k related but for a larger number of querires; Use a bi encoder and cross encoder for efficient and fast retreival)

Both paraphase mining and information retrieval evaluators uses metrics such as Mean Reciprocal Rank (MRR), Recall@k, and Normalized Discounted Cumulative Gain (NDCG)

** **For now only implementing binary classifier evaluator** **



In [None]:
evaluator_dev = list()
evaluator_test = list()

In [None]:
def corpus(qr):
    corpus = dict()
    for i in qr.index.values:
        corpus[i] = string_sentence(i,qr)
    return corpus

####Binary Evaluator

In [None]:
evaluator_b_dev = evaluation.BinaryClassificationEvaluator(dev_s1,dev_s2,dev_score,show_progress_bar=True)
evaluator_b_test = evaluation.BinaryClassificationEvaluator(test_s1, test_s2, test_score,show_progress_bar=True)
evaluator_dev.append(evaluator_b_dev)
evaluator_test.append(evaluator_b_test)

####Paraphrase Mining Evaluator

In [None]:
def paraphrase_dataset(index,qr,qr_orig):
    # Currently the code is for positive pairs only  
    duplicates_list = list()
    duplicates_dict = dict()
    for i in index:
        if(len(qr_orig.loc[i,'Related'])!=0):
            for j in qr_orig.loc[i,'Related']:
                if(j in qr.index.values):
                    duplicates_list.append([i,j])
                    # if(i in duplicates_dict_dev):
                    #     duplicates_dict_dev[i][j] = True
                    # else:
                    #     duplicates_dict_dev[i] = dict()
                    #     duplicates_dict_dev[i][j] = True #For negative pairs, create dataset again and save al_acc
    return duplicates_list

In [None]:
## For dev dataset
duplicates_list_dev = paraphrase_dataset(dev_qr.index.values,train_qr,qr) 
print(len(duplicates_list_dev))

## For test dataset
duplicates_list_test = paraphrase_dataset(test_qr.index.values,train_qr,qr)
print(len(duplicates_list_test))

1820
1738


In [None]:
# evaluator_p_dev = evaluation.ParaphraseMiningEvaluator(corpus(train_qr),duplicates_list_dev,show_progress_bar=True)
# evaluator_p_test = evaluation.ParaphraseMiningEvaluator(corpus(train_qr),duplicates_list_test,show_progress_bar=True)
# evaluator_dev.append(evaluator_p_dev)
# evaluator_test.append(evaluator_p_test)

####Information Retreival Evaluator

In [None]:
def info_ret(index,qr,qr_orig,limit):
    queries = dict()
    rel_docs = dict()
    cnt = 0
    done = list()
    lim_index = 0
    for j in index:
        if(len(qr_orig.loc[j,'Related'])!=0):
            for k in qr_orig.loc[j,'Related']:
                if(k in qr.index.values):
                    lim_index+=1
                    break
    if(lim_index<limit):
        limit = lim_index
    for i in range(0,limit):
        id = random.choice(index)
        while(id in done):
            id = random.choice(index)
        if(len(qr_orig.loc[id,'Related'])!=0):
            for j in qr_orig.loc[id,'Related']:
                if(j in qr.index.values):
                    queries[id] = string_sentence(id,qr_orig)
                    done.append(id)
                    if(id in rel_docs):
                        rel_docs[id].append(j)
                    else:
                        rel_docs[id] = list()
                        rel_docs[id].append(j)
        cnt+=1
    return queries,rel_docs

In [None]:
#For dev dataset
queries_dev,rel_docs_dev = info_ret(dev_qr.index.values,train_qr,qr,100)
#For test dataset
queries_test,rel_docs_test = info_ret(test_qr.index.values,train_qr,qr,100)
#For train
queries_train,rel_docs_train = info_ret(train_qr.index.values,train_qr,qr,100)

In [None]:
print(len(queries_train))

77


In [None]:
with open(folder+'/devinfo_100.txt','wb') as a:
    pickle.dump(queries_dev,a)
    pickle.dump(rel_docs_dev,a)

with open(folder+'/testinfo_100.txt','wb') as a:
    pickle.dump(queries_test,a)
    pickle.dump(rel_docs_test,a)

with open(folder+'/traininfo_100.txt','wb') as a:
    pickle.dump(queries_train,a)
    pickle.dump(rel_docs_train,a)

In [None]:
with open(folder+'/devinfo_100.txt','rb') as a:
    queries_dev = pickle.load(a)
    rel_docs_dev= pickle.load(a)

with open(folder+'/testinfo_100.txt','rb') as a:
    queries_test= pickle.load(a)
    rel_docs_test= pickle.load(a)

with open(folder+'/traininfo_100.txt','rb') as a:
    queries_train= pickle.load(a)
    rel_docs_train= pickle.load(a)

In [None]:
evaluator_i_dev = evaluation.InformationRetrievalEvaluator(queries_dev,corpus(train_qr),rel_docs_dev,show_progress_bar=True)
evaluator_i_test = evaluation.InformationRetrievalEvaluator(queries_test,corpus(train_qr),rel_docs_test,show_progress_bar=True)
evaluator_i_train = evaluation.InformationRetrievalEvaluator(queries_train,corpus(train_qr),rel_docs_train,show_progress_bar=True)
evaluator_dev.append(evaluator_i_dev)
evaluator_test.append(evaluator_i_test)

####Sequential Evaluator

Combining all the three evaluators mentioned above

In [None]:
seq_evaluator_dev = evaluation.SequentialEvaluator(evaluator_dev, main_score_function=lambda scores: scores[-1])
seq_evaluator_test = evaluation.SequentialEvaluator(evaluator_test, main_score_function=lambda scores: scores[-1])

Evaluating Model without training

In [None]:
# seq_evaluator_dev(model, epoch=0, steps=0, output_path=model_path)
# seq_evaluator_test(model, epoch=0, steps=0, output_path=model_path)

In [None]:
model_pre = SentenceTransformer(folder+'/bert_w_spaces')
model_idk = SentenceTransformer(folder+'/distilbert_margin_9')

In [None]:
os.makedirs(folder+'/bert_w_spaces/trial',exist_ok=True)
evaluator_i_train(model_idk,output_path=folder+'/bert_w_spaces/trial')

HBox(children=(FloatProgress(value=0.0, description='Batches', max=3.0, style=ProgressStyle(description_width=…

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/IPython/core/interactiveshell.py", line 2882, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-42-89880170e8bd>", line 2, in <module>
    evaluator_i_train(model_idk,output_path=folder+'/bert_w_spaces/trial')
  File "/usr/local/lib/python3.7/dist-packages/sentence_transformers/evaluation/InformationRetrievalEvaluator.py", line 104, in __call__
    query_embeddings = model.encode(self.queries, show_progress_bar=self.show_progress_bar, batch_size=self.batch_size, convert_to_tensor=True)
  File "/usr/local/lib/python3.7/dist-packages/sentence_transformers/SentenceTransformer.py", line 194, in encode
    out_features = self.forward(features)
  File "/usr/local/lib/python3.7/dist-packages/torch/nn/modules/container.py", line 119, in forward
    input = module(input)
  File "/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py", line 889, in _call_impl
    re

KeyboardInterrupt: ignored

###Fitting Model

In [None]:
import math
warmup_steps = math.ceil(len(train_examples) * num_epochs / batch_size * 0.1) #10% of train data for warm-up

In [None]:
print(torch.cuda.is_available())

False


In [None]:
x = list()
y = list()
def callback_model(score,epoch,steps):
    if(steps==-1):
        steps=0
        epoch+=1
    x.append(float(epoch + steps/89590))
    y.append(float(score))
    print('Score after epoch ', str(epoch),' and steps ', str(steps),'is ',str(score))

In [None]:
model.fit(train_objectives=[(train_constrantiveloss,train_loss_ConstrativeLoss),(train_MNRloss,train_loss_MNRloss)],
          evaluator=seq_evaluator_dev,
          epochs=num_epochs,
          evaluation_steps=4000,
          warmup_steps = warmup_steps,
          output_path=model_path,
          output_path_ignore_not_empty=True,
          use_amp = True,
          callback = callback_model
          )

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=10.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=9533.0, style=ProgressStyle(description_w…

TypeError: ignored

Plots

In [None]:
# %matplotlib inline
# from matplotlib import pyplot as plt
# plt.plot(x,y)
# plt.show()

# os.makedirs(folder+'/plots',exist_ok=True)
# plt.savefig(folder+'/plots/accuarcy.png')
# with open(folder+'/plots/plot_data.txt','wb') as f:
#     pickle.dump(x,f)
#     pickle.dump(y,f)

#Testing

###Loading Models

In [None]:
os.makedirs(folder+'/test/pretrained',exist_ok=True)
os.makedirs(folder+'/test/final_3',exist_ok=True)
os.makedirs(folder+'/test/sbert_naive',exist_ok=True)
os.makedirs(folder+'/test/quora_we',exist_ok=True)

os.makedirs(folder+'/dev_t/pretrained',exist_ok=True)
os.makedirs(folder+'/dev_t/final_3',exist_ok=True)
os.makedirs(folder+'/dev_t/sbert_naive',exist_ok=True)
os.makedirs(folder+'/dev_t/quora_we',exist_ok=True)

os.makedirs(folder+'/train_t/pretrained',exist_ok=True)
os.makedirs(folder+'/train_t/final_3',exist_ok=True)
os.makedirs(folder+'/train_t/sbert_naive',exist_ok=True)
os.makedirs(folder+'/train_t/quora_we',exist_ok=True)

model_pre = SentenceTransformer('stsb-distilbert-base')
model_final = SentenceTransformer('/gdrive/MyDrive/final_model_3')
model_unk = SentenceTransformer('//gdrive/MyDrive/sbert_supervised')
model_quora_we = SentenceTransformer(folder+'/distilbert_quora')

In [None]:
sent1 = ['Shroud','Valorant']
sent2 = ['Omen','Sage']
a = model.encode(sent1,convert_to_tensor=True)
b = model.encode(sent2,convert_to_tensor=True)
cos_sim = util.pytorch_cos_sim(a, b)
cos = nn.CosineSimilarity(dim=1,eps=1e-6)
cos_sim2 = cos(a,b)

In [None]:
evaluator_i_train(model_pre, epoch=0, steps=0, output_path=folder+'/train_t/pretrained')
evaluator_i_train(model_final, epoch=0, steps=0, output_path=folder+'/train_t/final_3')
evaluator_i_train(model_unk, epoch=0, steps=0, output_path=folder+'/train_t/sbert_naive')
evaluator_i_train(model_quora_we, epoch=0, steps=0, output_path=folder+'/train_t/quora_we')

In [None]:
seq_evaluator_dev(model_pre, epoch=0, steps=0, output_path=folder+'/dev_t/pretrained')
seq_evaluator_dev(model_final, epoch=0, steps=0, output_path=folder+'/dev_t/final_3')
seq_evaluator_dev(model_unk, epoch=0, steps=0, output_path=folder+'/dev_t/sbert_naive')
seq_evaluator_dev(model_quora_we, epoch=0, steps=0, output_path=folder+'/dev_t/quora_we')

In [None]:
seq_evaluator_test(model_pre, epoch=0, steps=0, output_path=folder+'/test/pretrained')
seq_evaluator_test(model_final, epoch=0, steps=0, output_path=folder+'/test/final_3')
seq_evaluator_test(model_unk, epoch=0, steps=0, output_path=folder+'/test/sbert_naive')
seq_evaluator_test(model_quora_we, epoch=0, steps=0, output_path=folder+'/test/quora_we')

In [None]:
os.makedirs(folder+'/test',exist_ok=True)
seq_evaluator_test(model,output_path=folder+'/test')

In [None]:
os.makedirs(folder+'/train',exist_ok=True)
evaluator_i_train(model,output_path=folder+'/train')

# AUC

In [None]:
torch.cuda.current_device()

0

In [None]:
from sentence_transformers import util
import torch.nn as nn

In [None]:
model = SentenceTransformer(folder+'/distilbert_margin_8')

In [None]:
train_s1 = list()
train_s2 = list()
for j in test_data:
    train_s1.append(j[0])
    train_s2.append(j[1])

In [None]:
emb1 = model.encode(train_s1,show_progress_bar=True,convert_to_tensor = True,device=0)
emb2 = model.encode(train_s2,show_progress_bar=True,convert_to_tensor = True,device=0)

HBox(children=(FloatProgress(value=0.0, description='Batches', max=106.0, style=ProgressStyle(description_widt…




HBox(children=(FloatProgress(value=0.0, description='Batches', max=106.0, style=ProgressStyle(description_widt…




In [None]:
cos = nn.CosineSimilarity(dim=1,eps=1e-6)
y_pred = cos(emb1,emb2)

In [None]:
import sklearn.metrics as skm
score = skm.roc_auc_score(test_score,y_pred.cpu().numpy(),max_fpr = 0.05)

In [None]:
score

0.8625472456036328