In [38]:
from torch.utils.data import DataLoader
from sentence_transformers import models, losses, util
from sentence_transformers.cross_encoder import CrossEncoder
from sentence_transformers.cross_encoder.evaluation import CECorrelationEvaluator
from sentence_transformers import SentencesDataset, LoggingHandler, SentenceTransformer
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, BinaryClassificationEvaluator
from sentence_transformers.readers import InputExample
from datetime import datetime
from zipfile import ZipFile
import logging
import csv
import sys
import torch
import math
import gzip
import os
import nltk

In [3]:
#### Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout

#You can specify any huggingface/transformers pre-trained model here, for example, bert-base-uncased, roberta-base, xlm-roberta-base
#model_name = sys.argv[1] if len(sys.argv) > 1 else 'bert-base-uncased'
model_name = "roberta-base"
batch_size = 16
num_epochs = 1
max_seq_length = 128
use_cuda = torch.cuda.is_available()

###### Read Datasets ######
sts_dataset_path = 'datasets/stsbenchmark.tsv.gz'
qqp_dataset_path = 'quora-IR-dataset'


# Check if the STSb dataset exsist. If not, download and extract it
if not os.path.exists(sts_dataset_path):
    util.http_get('https://sbert.net/datasets/stsbenchmark.tsv.gz', sts_dataset_path)


# Check if the QQP dataset exists. If not, download and extract
if not os.path.exists(qqp_dataset_path):
    logging.info("Dataset not found. Download")
    zip_save_path = 'quora-IR-dataset.zip'
    util.http_get(url='https://sbert.net/datasets/quora-IR-dataset.zip', path=zip_save_path)
    with ZipFile(zip_save_path, 'r') as zipIn:
        zipIn.extractall(qqp_dataset_path)


cross_encoder_path = 'output/cross-encoder/stsb_indomain_'+model_name.replace("/", "-")+'-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
bi_encoder_path = 'output/bi-encoder/qqp_cross_domain_'+model_name.replace("/", "-")+'-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")


In [4]:
###### Cross-encoder (simpletransformers) ######

logging.info("Loading cross-encoder model: {}".format(model_name))
# Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for cross-encoder model
cross_encoder = CrossEncoder(model_name, num_labels=1)

###### Bi-encoder (sentence-transformers) ######

logging.info("Loading bi-encoder model: {}".format(model_name))

# Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
word_embedding_model = models.Transformer(model_name, max_seq_length=max_seq_length)

# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=True,
                               pooling_mode_cls_token=False,
                               pooling_mode_max_tokens=False)

bi_encoder = SentenceTransformer(modules=[word_embedding_model, pooling_model])

2020-11-01 18:06:41 - Loading cross-encoder model: roberta-base


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

2020-11-01 18:06:47 - Use pytorch device: cuda
2020-11-01 18:06:47 - Loading bi-encoder model: roberta-base
2020-11-01 18:06:52 - Use pytorch device: cuda


In [5]:
logging.info("Step 1: Train cross-encoder: {} with STSbenchmark (source dataset)".format(model_name))

gold_samples = []
dev_samples = []
test_samples = []

with gzip.open(sts_dataset_path, 'rt', encoding='utf8') as fIn:
    reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
    for row in reader:
        score = float(row['score']) / 5.0  # Normalize score to range 0 ... 1

        if row['split'] == 'dev':
            dev_samples.append(InputExample(texts=[row['sentence1'], row['sentence2']], label=score))
        elif row['split'] == 'test':
            test_samples.append(InputExample(texts=[row['sentence1'], row['sentence2']], label=score))
        else:
            #As we want to get symmetric scores, i.e. CrossEncoder(A,B) = CrossEncoder(B,A), we pass both combinations to the train set
            gold_samples.append(InputExample(texts=[row['sentence1'], row['sentence2']], label=score))
            gold_samples.append(InputExample(texts=[row['sentence2'], row['sentence1']], label=score))


# We wrap gold_samples (which is a List[InputExample]) into a pytorch DataLoader
train_dataloader = DataLoader(gold_samples, shuffle=True, batch_size=batch_size)


# We add an evaluator, which evaluates the performance during training
evaluator = CECorrelationEvaluator.from_input_examples(dev_samples, name='sts-dev')

# Configure the training
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) #10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))

# Train the cross-encoder model
cross_encoder.fit(train_dataloader=train_dataloader,
          evaluator=evaluator,
          epochs=num_epochs,
          evaluation_steps=1000,
          warmup_steps=warmup_steps,
          output_path=cross_encoder_path)

2020-11-01 18:06:52 - Step 1: Train cross-encoder: roberta-base with STSbenchmark (source dataset)
2020-11-01 18:06:52 - Warmup-steps: 72


HBox(children=(FloatProgress(value=0.0, description='Epoch', max=1.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=719.0, style=ProgressStyle(description_wi…


2020-11-01 18:08:21 - CECorrelationEvaluator: Evaluating the model on sts-dev dataset after epoch 0:
2020-11-01 18:08:23 - Correlation:	Pearson: 0.9071	Spearman: 0.9036
2020-11-01 18:08:23 - Save model to output/cross-encoder/stsb_indomain_roberta-base-2020-11-01_18-06-41



In [78]:
logging.info("Step 2: Label QQP (target dataset) with cross-encoder: {}".format(model_name))

cross_encoder = CrossEncoder(cross_encoder_path)

silver_data = []

with open(os.path.join(qqp_dataset_path, "classification/train_pairs.tsv"), encoding='utf8') as fIn:
    reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
    for row in reader:
        if row['is_duplicate'] == '1':
            silver_data.append([row['question1'], row['question2']])

silver_scores1 = cross_encoder.predict(silver_data)

# All model predictions should be between [0,1]
assert all(0.0 <= score <= 1.0 for score in silver_scores1)

binary_silver_scores1 = [1 if score >= 0.5 else 0 for score in silver_scores1]


2020-11-01 19:03:00 - Step 2: Label QQP (target dataset) with cross-encoder: roberta-base
2020-11-01 19:03:04 - Use pytorch device: cuda


HBox(children=(FloatProgress(value=0.0, description='Batches', max=3240.0, style=ProgressStyle(description_wid…




In [79]:
count1 = 0
count2 = 0
for i in binary_silver_scores1:
    if i == 1:
        count1 += 1
    else:
        count2 += 1
print(count1)
print(count2)

92399
11264


In [8]:
import ast

filename = r'/home/miboj/NLP/document-summarizer/data/processed/articles.json'
file = open(filename, encoding='ascii', errors='ignore')
text = file.read()
file.close()

json_content = ast.literal_eval(text)

In [45]:
import re
def remove_empty_string(input_string):
    for e, i in enumerate(input_string):
        try:
            if i[-1] == ' ' and input_string[e+1][-1] == ' ':
                input_string[e] = i.rstrip()
        except IndexError:
            print('Out of index')
    joined_string = ''.join(input_string)
    try:
        for e, i in enumerate(joined_string):
            if i == ' ' and joined_string[e+1] == ' ':
                del i
    except IndexError:
            print()
    sentences = nltk.sent_tokenize(joined_string)
    return sentences

In [47]:
sen_list = []
for i in json_content:
    for sen in i['content']:
        for o in nltk.sent_tokenize(sen):
            sen_list.append(o)

In [50]:
sen_list[100000]

'But the current serviceability state of this equipment, particularly those with the Indian Air Force (IAF) and the Navy, is less than 50 percent because of a lack of spares.'

In [74]:
silver_data = []
count = 0
for i in sen_list:
    try:
        silver_data.append([sen_list[count], sen_list[count+1]])
    except IndexError:
        break
    count += 2

In [75]:
silver_data[2]

['When it comes to leading at the highest levels of joint strategy and policy, and as someone who sets the standard for critical collaboration with our allies and partners, there is no one more qualified for the role of vice chief.',
 ' Allvin will succeed the current vice chief, Gen. Seve Wilson, who is expected to retire after 39 years in uniform.']

In [76]:

silver_scores = cross_encoder.predict(silver_data)

# All model predictions should be between [0,1]
assert all(0.0 <= score <= 1.0 for score in silver_scores)

binary_silver_scores = [1 if score >= 0.5 else 0 for score in silver_scores]

HBox(children=(FloatProgress(value=0.0, description='Batches', max=2270.0, style=ProgressStyle(description_wid…




In [77]:
count = 0
count2 = 0
for i in binary_silver_scores:
    if i == 1:
        count += 1
    else:
        count2 += 1
        
print(count)
print(count2)

4412
68201


In [83]:
logging.info("Step 3: Train bi-encoder: {} over labeled QQP (target dataset)".format(model_name))

# Convert the dataset to a DataLoader ready for training
logging.info("Loading BERT labeled QQP dataset")
qqp_train_data = list(InputExample(texts=[data[0], data[1]], label=score) for (data, score) in zip(silver_data, binary_silver_scores))

train_dataset = SentencesDataset(qqp_train_data, bi_encoder)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
train_loss = losses.MultipleNegativesRankingLoss(bi_encoder)

###### Classification ######
# Given (quesiton1, question2), is this a duplicate or not?
# The evaluator will compute the embeddings for both questions and then compute
# a cosine similarity. If the similarity is above a threshold, we have a duplicate.
logging.info("Read QQP dev dataset")

dev_sentences1 = []
dev_sentences2 = []
dev_labels = []

with open(os.path.join(qqp_dataset_path, "classification/dev_pairs.tsv"), encoding='utf8') as fIn:
    reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
    for row in reader:
        dev_sentences1.append(row['question1'])
        dev_sentences2.append(row['question2'])
        dev_labels.append(int(row['is_duplicate']))

evaluator = BinaryClassificationEvaluator(dev_sentences1, dev_sentences2, dev_labels)

# Configure the training.
warmup_steps = math.ceil(len(train_dataset) * num_epochs / batch_size * 0.1) #10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))

# Train the bi-encoder model
bi_encoder.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=10,
          #evaluation_steps=1000,
          warmup_steps=warmup_steps,
          output_path=bi_encoder_path,
          output_path_ignore_not_empty=True
          )

2020-11-01 19:49:59 - Step 3: Train bi-encoder: roberta-base over labeled QQP (target dataset)
2020-11-01 19:49:59 - Loading BERT labeled QQP dataset
2020-11-01 19:49:59 - Read QQP dev dataset
2020-11-01 19:50:00 - Warmup-steps: 454


HBox(children=(FloatProgress(value=0.0, description='Epoch', max=10.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=4539.0, style=ProgressStyle(description_w…


2020-11-01 19:59:02 - Binary Accuracy Evaluation of the model on  dataset after epoch 0:
2020-11-01 19:59:53 - Accuracy with Cosine-Similarity:           77.64	(Threshold: 0.8493)
2020-11-01 19:59:53 - F1 with Cosine-Similarity:                 70.89	(Threshold: 0.7801)
2020-11-01 19:59:53 - Precision with Cosine-Similarity:          60.73
2020-11-01 19:59:53 - Recall with Cosine-Similarity:             85.14
2020-11-01 19:59:53 - Average Precision with Cosine-Similarity:  73.21

2020-11-01 19:59:54 - Accuracy with Manhatten-Distance:           77.57	(Threshold: 197.8090)
2020-11-01 19:59:54 - F1 with Manhatten-Distance:                 70.81	(Threshold: 233.6341)
2020-11-01 19:59:54 - Precision with Manhatten-Distance:          61.94
2020-11-01 19:59:54 - Recall with Manhatten-Distance:             82.64
2020-11-01 19:59:54 - Average Precision with Manhatten-Distance:  73.02

2020-11-01 19:59:54 - Accuracy with Euclidean-Distance:           77.60	(Threshold: 9.0927)
2020-11-01 19:59:

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=4539.0, style=ProgressStyle(description_w…


2020-11-01 20:08:42 - Binary Accuracy Evaluation of the model on  dataset after epoch 1:
2020-11-01 20:09:33 - Accuracy with Cosine-Similarity:           77.62	(Threshold: 0.8424)
2020-11-01 20:09:33 - F1 with Cosine-Similarity:                 71.21	(Threshold: 0.7595)
2020-11-01 20:09:33 - Precision with Cosine-Similarity:          60.70
2020-11-01 20:09:33 - Recall with Cosine-Similarity:             86.11
2020-11-01 20:09:33 - Average Precision with Cosine-Similarity:  73.21

2020-11-01 20:09:34 - Accuracy with Manhatten-Distance:           77.08	(Threshold: 203.9983)
2020-11-01 20:09:34 - F1 with Manhatten-Distance:                 70.08	(Threshold: 233.7149)
2020-11-01 20:09:34 - Precision with Manhatten-Distance:          61.75
2020-11-01 20:09:34 - Recall with Manhatten-Distance:             80.99
2020-11-01 20:09:34 - Average Precision with Manhatten-Distance:  72.12

2020-11-01 20:09:34 - Accuracy with Euclidean-Distance:           77.11	(Threshold: 9.1936)
2020-11-01 20:09:

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=4539.0, style=ProgressStyle(description_w…


2020-11-01 20:18:20 - Binary Accuracy Evaluation of the model on  dataset after epoch 2:
2020-11-01 20:19:11 - Accuracy with Cosine-Similarity:           77.74	(Threshold: 0.8379)
2020-11-01 20:19:11 - F1 with Cosine-Similarity:                 70.99	(Threshold: 0.7499)
2020-11-01 20:19:11 - Precision with Cosine-Similarity:          61.07
2020-11-01 20:19:11 - Recall with Cosine-Similarity:             84.76
2020-11-01 20:19:11 - Average Precision with Cosine-Similarity:  71.62

2020-11-01 20:19:11 - Accuracy with Manhatten-Distance:           77.05	(Threshold: 191.3977)
2020-11-01 20:19:11 - F1 with Manhatten-Distance:                 70.13	(Threshold: 242.3848)
2020-11-01 20:19:11 - Precision with Manhatten-Distance:          60.23
2020-11-01 20:19:11 - Recall with Manhatten-Distance:             83.92
2020-11-01 20:19:11 - Average Precision with Manhatten-Distance:  70.66

2020-11-01 20:19:12 - Accuracy with Euclidean-Distance:           77.09	(Threshold: 8.9445)
2020-11-01 20:19:

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=4539.0, style=ProgressStyle(description_w…


2020-11-01 20:27:55 - Binary Accuracy Evaluation of the model on  dataset after epoch 3:
2020-11-01 20:28:49 - Accuracy with Cosine-Similarity:           78.62	(Threshold: 0.8367)
2020-11-01 20:28:49 - F1 with Cosine-Similarity:                 72.15	(Threshold: 0.7766)
2020-11-01 20:28:49 - Precision with Cosine-Similarity:          63.66
2020-11-01 20:28:49 - Recall with Cosine-Similarity:             83.26
2020-11-01 20:28:49 - Average Precision with Cosine-Similarity:  74.64

2020-11-01 20:28:49 - Accuracy with Manhatten-Distance:           78.30	(Threshold: 219.6533)
2020-11-01 20:28:49 - F1 with Manhatten-Distance:                 71.58	(Threshold: 257.6602)
2020-11-01 20:28:49 - Precision with Manhatten-Distance:          62.60
2020-11-01 20:28:49 - Recall with Manhatten-Distance:             83.56
2020-11-01 20:28:49 - Average Precision with Manhatten-Distance:  74.10

2020-11-01 20:28:50 - Accuracy with Euclidean-Distance:           78.30	(Threshold: 10.1187)
2020-11-01 20:28

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=4539.0, style=ProgressStyle(description_w…


2020-11-01 20:37:36 - Binary Accuracy Evaluation of the model on  dataset after epoch 4:
2020-11-01 20:38:27 - Accuracy with Cosine-Similarity:           77.85	(Threshold: 0.8187)
2020-11-01 20:38:27 - F1 with Cosine-Similarity:                 71.17	(Threshold: 0.7658)
2020-11-01 20:38:27 - Precision with Cosine-Similarity:          61.85
2020-11-01 20:38:27 - Recall with Cosine-Similarity:             83.79
2020-11-01 20:38:27 - Average Precision with Cosine-Similarity:  72.47

2020-11-01 20:38:27 - Accuracy with Manhatten-Distance:           77.38	(Threshold: 205.4686)
2020-11-01 20:38:27 - F1 with Manhatten-Distance:                 70.35	(Threshold: 229.6387)
2020-11-01 20:38:27 - Precision with Manhatten-Distance:          62.67
2020-11-01 20:38:27 - Recall with Manhatten-Distance:             80.19
2020-11-01 20:38:27 - Average Precision with Manhatten-Distance:  71.77

2020-11-01 20:38:28 - Accuracy with Euclidean-Distance:           77.40	(Threshold: 9.4526)
2020-11-01 20:38:

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=4539.0, style=ProgressStyle(description_w…


2020-11-01 20:47:12 - Binary Accuracy Evaluation of the model on  dataset after epoch 5:
2020-11-01 20:48:04 - Accuracy with Cosine-Similarity:           78.37	(Threshold: 0.8401)
2020-11-01 20:48:04 - F1 with Cosine-Similarity:                 71.30	(Threshold: 0.7566)
2020-11-01 20:48:04 - Precision with Cosine-Similarity:          61.40
2020-11-01 20:48:04 - Recall with Cosine-Similarity:             85.01
2020-11-01 20:48:04 - Average Precision with Cosine-Similarity:  73.30

2020-11-01 20:48:05 - Accuracy with Manhatten-Distance:           77.99	(Threshold: 194.8791)
2020-11-01 20:48:05 - F1 with Manhatten-Distance:                 70.76	(Threshold: 239.9119)
2020-11-01 20:48:05 - Precision with Manhatten-Distance:          61.75
2020-11-01 20:48:05 - Recall with Manhatten-Distance:             82.85
2020-11-01 20:48:05 - Average Precision with Manhatten-Distance:  72.79

2020-11-01 20:48:05 - Accuracy with Euclidean-Distance:           78.06	(Threshold: 9.0642)
2020-11-01 20:48:

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=4539.0, style=ProgressStyle(description_w…


2020-11-01 20:56:51 - Binary Accuracy Evaluation of the model on  dataset after epoch 6:
2020-11-01 20:57:42 - Accuracy with Cosine-Similarity:           78.50	(Threshold: 0.8432)
2020-11-01 20:57:42 - F1 with Cosine-Similarity:                 71.97	(Threshold: 0.7622)
2020-11-01 20:57:42 - Precision with Cosine-Similarity:          62.23
2020-11-01 20:57:42 - Recall with Cosine-Similarity:             85.33
2020-11-01 20:57:42 - Average Precision with Cosine-Similarity:  73.87

2020-11-01 20:57:43 - Accuracy with Manhatten-Distance:           78.17	(Threshold: 207.6480)
2020-11-01 20:57:43 - F1 with Manhatten-Distance:                 71.30	(Threshold: 246.2399)
2020-11-01 20:57:43 - Precision with Manhatten-Distance:          62.63
2020-11-01 20:57:43 - Recall with Manhatten-Distance:             82.75
2020-11-01 20:57:43 - Average Precision with Manhatten-Distance:  73.38

2020-11-01 20:57:43 - Accuracy with Euclidean-Distance:           78.23	(Threshold: 9.5726)
2020-11-01 20:57:

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=4539.0, style=ProgressStyle(description_w…


2020-11-01 21:06:29 - Binary Accuracy Evaluation of the model on  dataset after epoch 7:
2020-11-01 21:07:21 - Accuracy with Cosine-Similarity:           78.51	(Threshold: 0.8462)
2020-11-01 21:07:21 - F1 with Cosine-Similarity:                 71.99	(Threshold: 0.7699)
2020-11-01 21:07:21 - Precision with Cosine-Similarity:          63.19
2020-11-01 21:07:21 - Recall with Cosine-Similarity:             83.62
2020-11-01 21:07:21 - Average Precision with Cosine-Similarity:  73.95

2020-11-01 21:07:21 - Accuracy with Manhatten-Distance:           78.10	(Threshold: 207.1911)
2020-11-01 21:07:21 - F1 with Manhatten-Distance:                 71.34	(Threshold: 239.8456)
2020-11-01 21:07:21 - Precision with Manhatten-Distance:          63.36
2020-11-01 21:07:21 - Recall with Manhatten-Distance:             81.61
2020-11-01 21:07:21 - Average Precision with Manhatten-Distance:  73.31

2020-11-01 21:07:21 - Accuracy with Euclidean-Distance:           78.11	(Threshold: 9.5683)
2020-11-01 21:07:

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=4539.0, style=ProgressStyle(description_w…


2020-11-01 21:16:08 - Binary Accuracy Evaluation of the model on  dataset after epoch 8:
2020-11-01 21:17:02 - Accuracy with Cosine-Similarity:           78.63	(Threshold: 0.8316)
2020-11-01 21:17:02 - F1 with Cosine-Similarity:                 71.96	(Threshold: 0.7690)
2020-11-01 21:17:02 - Precision with Cosine-Similarity:          63.35
2020-11-01 21:17:02 - Recall with Cosine-Similarity:             83.29
2020-11-01 21:17:02 - Average Precision with Cosine-Similarity:  73.78

2020-11-01 21:17:02 - Accuracy with Manhatten-Distance:           78.16	(Threshold: 212.4594)
2020-11-01 21:17:02 - F1 with Manhatten-Distance:                 71.42	(Threshold: 246.7106)
2020-11-01 21:17:02 - Precision with Manhatten-Distance:          63.18
2020-11-01 21:17:02 - Recall with Manhatten-Distance:             82.12
2020-11-01 21:17:02 - Average Precision with Manhatten-Distance:  73.25

2020-11-01 21:17:03 - Accuracy with Euclidean-Distance:           78.22	(Threshold: 9.7475)
2020-11-01 21:17:

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=4539.0, style=ProgressStyle(description_w…


2020-11-01 21:25:47 - Binary Accuracy Evaluation of the model on  dataset after epoch 9:
2020-11-01 21:26:39 - Accuracy with Cosine-Similarity:           78.64	(Threshold: 0.8312)
2020-11-01 21:26:39 - F1 with Cosine-Similarity:                 71.91	(Threshold: 0.7704)
2020-11-01 21:26:39 - Precision with Cosine-Similarity:          63.42
2020-11-01 21:26:39 - Recall with Cosine-Similarity:             83.02
2020-11-01 21:26:39 - Average Precision with Cosine-Similarity:  73.87

2020-11-01 21:26:39 - Accuracy with Manhatten-Distance:           78.27	(Threshold: 214.8118)
2020-11-01 21:26:39 - F1 with Manhatten-Distance:                 71.44	(Threshold: 242.2879)
2020-11-01 21:26:39 - Precision with Manhatten-Distance:          64.37
2020-11-01 21:26:39 - Recall with Manhatten-Distance:             80.25
2020-11-01 21:26:39 - Average Precision with Manhatten-Distance:  73.37

2020-11-01 21:26:39 - Accuracy with Euclidean-Distance:           78.33	(Threshold: 9.8158)
2020-11-01 21:26:

In [8]:
bi_encoder = SentenceTransformer(bi_encoder_path)

logging.info("Read QQP test dataset")
test_sentences1 = []
test_sentences2 = []
test_labels = []

with open(os.path.join(qqp_dataset_path, "classification/test_pairs.tsv"), encoding='utf8') as fIn:
    reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
    for row in reader:
        test_sentences1.append(row['question1'])
        test_sentences2.append(row['question2'])
        test_labels.append(int(row['is_duplicate']))

evaluator = BinaryClassificationEvaluator(test_sentences1, test_sentences2, test_labels)
bi_encoder.evaluate(evaluator)

2020-11-01 16:18:00 - Load pretrained SentenceTransformer: output/bi-encoder/qqp_cross_domain_roberta-base-2020-11-01_15-00-31
2020-11-01 16:18:00 - Load SentenceTransformer from folder: output/bi-encoder/qqp_cross_domain_roberta-base-2020-11-01_15-00-31
2020-11-01 16:18:03 - Use pytorch device: cuda
2020-11-01 16:18:03 - Read QQP test dataset
2020-11-01 16:18:03 - Binary Accuracy Evaluation of the model on  dataset:
2020-11-01 16:19:55 - Accuracy with Cosine-Similarity:           76.97	(Threshold: 0.7849)
2020-11-01 16:19:55 - F1 with Cosine-Similarity:                 74.69	(Threshold: 0.7238)
2020-11-01 16:19:55 - Precision with Cosine-Similarity:          64.50
2020-11-01 16:19:55 - Recall with Cosine-Similarity:             88.71
2020-11-01 16:19:55 - Average Precision with Cosine-Similarity:  75.68

2020-11-01 16:19:55 - Accuracy with Manhatten-Distance:           76.45	(Threshold: 246.6084)
2020-11-01 16:19:55 - F1 with Manhatten-Distance:                 74.00	(Threshold: 278.2

0.7567855674036059

In [21]:
for i in range(1,1500):
    if test_labels[i] != 0 and test_labels[i] != 1:
        print(test_sentences1[i])
        print(test_sentences2[i])
        print(test_labels[i])

IndexError: list index out of range