# Setup

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
! unzip drive/MyDrive/sbert-data.zip

Archive:  drive/MyDrive/sbert-data.zip
  inflating: data/multinli_dev_matched.csv  
  inflating: data/multinli_dev_mismatched.csv  
  inflating: data/multinli_train.csv  
  inflating: data/newspaper.json     
  inflating: data/snli_dev.csv       
  inflating: data/snli_test.csv      
  inflating: data/snli_train.csv     
  inflating: data/sts-dev.csv        
  inflating: data/sts-test.csv       
  inflating: data/sts-train.csv      


In [7]:
! pip install transformers sentence_transformers



# Stuff

In [8]:
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
from tqdm import tqdm
from transformers import BertTokenizer, BertModel
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, models, InputExample, losses, evaluation, util

from typing import List

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [9]:
mapLabels = { 'entailment': 1.0, 'neutral': 0.0, 'contradiction': -1.0 }
def readCsv(filenames: List[str], isSTS: bool = False):
    samples = []
    s1s = []
    s2s = []
    labels = []
    for filename in filenames:
        c = 0
        with open(filename, 'r') as f:
            if not isSTS:
                next(f)
            for l in tqdm(f):
                data = l.split('\t')

                label = None
                if isSTS:
                    label = float(data[4]) / 2.5 - 1
                elif data[0] in mapLabels:
                    label = float(mapLabels[data[0]])

                if label is not None:
                    samples.append(InputExample(texts=[data[5], data[6]], label=label, ))
                    s1s.append(data[5])
                    s2s.append(data[6])
                    labels.append(label)
    return (
        DataLoader(samples, shuffle=True, batch_size=64),
        evaluation.EmbeddingSimilarityEvaluator(s1s, s2s, labels, write_csv=True, show_progress_bar=True, name='diocanevaluator'),
        s1s, s2s, torch.Tensor(labels)
    )

In [10]:
dataPreTr, evaluatorPre, _, _, _ = readCsv([ './data/snli_train.csv', './data/multinli_train.csv'])
dataTr, evaluator, _, _, _ = readCsv([ './data/sts-train.csv'], isSTS=True)
dataDe, _, _, _, _ = readCsv([ './data/sts-dev.csv'], isSTS=True)
dataTe, _, s1Test, s2Test, lTest = readCsv([ './data/sts-test.csv'], isSTS=True)

550152it [00:04, 115069.14it/s]
392700it [00:03, 117602.34it/s]
5749it [00:00, 241642.82it/s]
1500it [00:00, 224014.81it/s]
1379it [00:00, 144439.75it/s]


In [11]:
word_embedding_model = models.Transformer('albert-base-v2', max_seq_length=64)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
model = SentenceTransformer(modules=[word_embedding_model, pooling_model], device=device)
loss = losses.CosineSimilarityLoss(model)

Downloading:   0%|          | 0.00/684 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/45.2M [00:00<?, ?B/s]

Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertModel: ['predictions.dense.bias', 'predictions.LayerNorm.bias', 'predictions.LayerNorm.weight', 'predictions.bias', 'predictions.decoder.weight', 'predictions.dense.weight', 'predictions.decoder.bias']
- This IS expected if you are initializing AlbertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/742k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.25M [00:00<?, ?B/s]

In [None]:
model.fit(
    train_objectives=[(dataPreTr, loss)],
    epochs=1,
    warmup_steps=10,
    evaluation_steps=1000,
    evaluator=evaluatorPre,
    output_path='diocan.model'
)

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/14720 [00:00<?, ?it/s]

Batches:   0%|          | 0/58880 [00:00<?, ?it/s]

Batches:   0%|          | 0/58880 [00:00<?, ?it/s]

In [None]:
model.fit(
    train_objectives=[(dataTr, loss)],
    epochs=1,
    warmup_steps=10,
    evaluation_steps=1000,
    evaluator=evaluator,
    output_path='diocan.model'
)


In [None]:
e1 = model.encode(s1Test)
e2 = model.encode(s2Test)
cos_sim = util.cos_sim(e1, e2)
predictions = torch.Tensor([ cos_sim[i][i] for i in range(cos_sim.shape[0]) ]).round()


In [None]:
torch.sum(lTest.round() == predictions) / len(predictions)

In [None]:
torch.mean(lTest - predictions)

# Already done

In [None]:
import math
import os
import gzip
import csv
import logging
from datetime import datetime
from torch.utils.data import DataLoader
from sentence_transformers import models, losses
from sentence_transformers import LoggingHandler, SentenceTransformer, util, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

In [None]:
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])

In [None]:
nliPath = 'data/AllNLI.tsv.gz'
slsPath = 'data/stsbenchmark.tsv.gz'
if not os.path.exists(nliPath): util.http_get('https://sbert.net/datasets/AllNLI.tsv.gz', nliPath)
if not os.path.exists(slsPath): util.http_get('https://sbert.net/datasets/stsbenchmark.tsv.gz', slsPath)

  0%|          | 0.00/40.8M [00:00<?, ?B/s]

  0%|          | 0.00/392k [00:00<?, ?B/s]

In [None]:
modelName = 'distilbert-base-uncased'
batchSize = 16
nEpochs = 1
savePath = 'models/nli_'+modelName.replace("/", "-")+'-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

In [None]:
word_embedding_model = models.Transformer(modelName)
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False
)
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

2021-12-07 13:09:42 - Use pytorch device: cuda


In [None]:
mapLabel = {"contradiction": 0, "entailment": 1, "neutral": 2}
def loadDataset(path, split, isSts=False, perc=1):
    with gzip.open(path, 'rt', encoding='utf8') as fIn:
        data = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
        samples = [
            InputExample(texts=[row['sentence1'], row['sentence2']], label=float(row['score']) / 5.0 if isSts else mapLabel[row['label']])
            for row in data if row['split'] == split
        ]
    return samples[:int(len(samples)*perc)]

In [None]:
trSamples = loadDataset(nliPath, 'train', perc=0.01)
devSamples = loadDataset(slsPath, 'dev', isSts=True, perc=0.01)

trDataloader = DataLoader(trSamples, shuffle=True, batch_size=batchSize)
trLoss = losses.SoftmaxLoss(model=model, sentence_embedding_dimension=model.get_sentence_embedding_dimension(), num_labels=len(mapLabel))

devEval = EmbeddingSimilarityEvaluator.from_input_examples(devSamples, batch_size=batchSize, name='sts-dev')

warmup = math.ceil(len(trSamples) * nEpochs * 0.1) #10% of train data for warm-up

2021-12-07 13:10:27 - Softmax loss: #Vectors concatenated: 3


In [None]:
model.fit(
    train_objectives=[(trDataloader, devSamples)],
    evaluator=devEval,
    epochs=nEpochs,
    evaluation_steps=1000,
    warmup_steps=warmup,
    output_path=savePath
)

AttributeError: ignored

In [None]:
testSamples = loadDataset(slsPath, 'test', isSts=True)
model = SentenceTransformer(savePath)
testEval = EmbeddingSimilarityEvaluator.from_input_examples(testSamples, batch_size=batchSize, name='sts-test')
testEval(model, output_path=savePath)