# Load GDrive

In [None]:
# Mount Google Drive at /content/drive (this import is only available on Colab).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Dependencies

In [None]:
# Pinned to the version this notebook was run with so a fresh runtime reproduces it;
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install sentence-transformers==2.1.0

Collecting sentence-transformers
  Downloading sentence-transformers-2.1.0.tar.gz (78 kB)
[K     |████████████████████████████████| 78 kB 3.2 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.14.1-py3-none-any.whl (3.4 MB)
[K     |████████████████████████████████| 3.4 MB 24.7 MB/s 
[?25hCollecting tokenizers>=0.10.3
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 37.3 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 54.1 MB/s 
[?25hCollecting huggingface-hub
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 487 kB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_

# Common

In [None]:
import math
import os
import gzip
import csv
import torch
import json
from datetime import datetime
from tqdm import tqdm
from typing import Dict, List, Tuple, Callable
from torch.utils.data import DataLoader
from sentence_transformers import models, losses, util
from sentence_transformers import LoggingHandler, SentenceTransformer, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, LabelAccuracyEvaluator

In [None]:
def loadDataset(filename: str, getScore: Callable[[Dict], float]) -> Tuple[List[InputExample], List[InputExample], List[InputExample]]:
    """Read a gzipped, tab-separated sentence-pair file and group it by its `split` column.

    Args:
        filename: Path to a gzip-compressed TSV whose header row contains at
            least the columns `sentence1`, `sentence2` and `split`.
        getScore: Maps one parsed row (a dict keyed by column name) to the
            label stored on the resulting `InputExample`.

    Returns:
        A `(train, dev, test)` tuple of `InputExample` lists. Rows whose
        `split` value is none of `train`, `dev` or `test` are skipped.
    """
    splits = {'train': [], 'dev': [], 'test': []}
    with gzip.open(filename, 'rt', encoding='utf8') as f:
        # QUOTE_NONE: quote characters inside sentences are plain text, not CSV quoting.
        for row in tqdm(csv.DictReader(f, delimiter='\t', quoting=csv.QUOTE_NONE)):
            bucket = splits.get(row['split'])
            if bucket is None:
                # Unknown split: skip before building an example that would be discarded.
                continue
            bucket.append(InputExample(texts=[row['sentence1'], row['sentence2']], label=getScore(row)))
    return (splits['train'], splits['dev'], splits['test'])

In [None]:
# Local cache locations for the two benchmark archives.
nliPath = 'data/AllNLI.tsv.gz'
slsPath = 'data/stsbenchmark.tsv.gz'

# Fetch each archive only when it is not already on disk.
if not os.path.exists(nliPath):
    util.http_get('https://sbert.net/datasets/AllNLI.tsv.gz', nliPath)
if not os.path.exists(slsPath):
    util.http_get('https://sbert.net/datasets/stsbenchmark.tsv.gz', slsPath)

  0%|          | 0.00/40.8M [00:00<?, ?B/s]

  0%|          | 0.00/392k [00:00<?, ?B/s]

In [None]:
# Integer class id for each NLI label string.
label2int = {"contradiction": 0, "neutral": 1, "entailment": 2 }
# NLI examples are labelled with the class id of their `label` column.
NLItr, NLIdev, NLItest = loadDataset(nliPath, lambda row: label2int[row['label']])
# STS examples are labelled with score / 2.5 - 1; assuming raw scores lie in [0, 5]
# this rescales them onto [-1, 1] — TODO confirm the raw range.
STStr, STSdev, STStest = loadDataset(slsPath, lambda row: (float(row['score']) / 2.5) - 1.0)

981382it [00:12, 75800.06it/s]
8628it [00:00, 84266.59it/s]


In [None]:
# Checkpoint name handed to models.Transformer, and the training batch size,
# shared by every experiment below.
modelName = 'distilbert-base-uncased'
batchSize = 16

# Regression

In [None]:
# Output directory for the model trained directly on STS with a regression objective.
savePath = f'models/sbert_{modelName}_regression_STS'

In [None]:
# Sentence encoder: a transformer producing token embeddings, followed by
# mean pooling (CLS and max pooling disabled) into one vector per sentence.
embeddingModel = models.Transformer(modelName)
poolingModel = models.Pooling(
    word_embedding_dimension=embeddingModel.get_word_embedding_dimension(),
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False,
    pooling_mode_mean_tokens=True,
)
model = SentenceTransformer(modules=[embeddingModel, poolingModel])

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [None]:
# Regression setup: cosine-similarity loss over shuffled STS training pairs,
# with the STS dev split used for evaluation during training.
loss = losses.CosineSimilarityLoss(model=model)
dataloaderTr = DataLoader(STStr, shuffle=True, batch_size=batchSize)
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(STSdev, name='sts-dev')

In [None]:
# Train on STS for nEpochs, running the dev evaluator every 100 steps;
# savePath is passed as the output location.
nEpochs = 10
model.fit(
    train_objectives=[(dataloaderTr, loss)],
    evaluator=evaluator,
    epochs=nEpochs,
    evaluation_steps=100,
    # Warm-up length: one tenth of the total number of training batches.
    warmup_steps=math.ceil(len(dataloaderTr) * nEpochs  / 10),
    output_path=savePath
)

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Iteration:   0%|          | 0/360 [00:00<?, ?it/s]

Iteration:   0%|          | 0/360 [00:00<?, ?it/s]

Iteration:   0%|          | 0/360 [00:00<?, ?it/s]

Iteration:   0%|          | 0/360 [00:00<?, ?it/s]

Iteration:   0%|          | 0/360 [00:00<?, ?it/s]

Iteration:   0%|          | 0/360 [00:00<?, ?it/s]

Iteration:   0%|          | 0/360 [00:00<?, ?it/s]

Iteration:   0%|          | 0/360 [00:00<?, ?it/s]

Iteration:   0%|          | 0/360 [00:00<?, ?it/s]

Iteration:   0%|          | 0/360 [00:00<?, ?it/s]

In [None]:
# Reload the model stored at savePath and score it on the STS test split.
model = SentenceTransformer(savePath)
testEvaluator = EmbeddingSimilarityEvaluator.from_input_examples(STStest, name='sts-test')
testEvaluator(model, output_path=savePath)

0.7045706722263464

# Classification

Training on NLI

In [None]:
# Output directory for the model trained on NLI with a classification objective.
savePath = f'models/sbert_{modelName}_classification_NLI'

In [None]:
# Start again from the base checkpoint: transformer token embeddings followed by
# mean pooling (CLS and max pooling disabled) into one vector per sentence.
embeddingModel = models.Transformer(modelName)
poolingModel = models.Pooling(
    word_embedding_dimension=embeddingModel.get_word_embedding_dimension(),
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False,
    pooling_mode_mean_tokens=True,
)
model = SentenceTransformer(modules=[embeddingModel, poolingModel])

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [None]:
# Classification setup: softmax loss with one output per entry in label2int.
loss = losses.SoftmaxLoss(model=model, sentence_embedding_dimension=model.get_sentence_embedding_dimension(), num_labels=len(label2int))
dataloaderTr = DataLoader(NLItr, shuffle=True, batch_size=batchSize)
# The dev loader is not shuffled and uses a larger batch size than training.
dataloaderDev = DataLoader(NLIdev, shuffle=False, batch_size=512) 
# The accuracy evaluator is handed the loss object as its softmax model.
evaluator = LabelAccuracyEvaluator(dataloaderDev, name='nli-dev', softmax_model=loss)

In [None]:
# Train on NLI for a single epoch, running the dev evaluator every 2000 steps;
# savePath is passed as the output location.
nEpochs = 1
model.fit(
    train_objectives=[(dataloaderTr, loss)],
    evaluator=evaluator,
    epochs=nEpochs,
    evaluation_steps=2000,
    # Warm-up length: one tenth of the total number of training batches.
    warmup_steps=math.ceil(len(dataloaderTr) * nEpochs  / 10),
    output_path=savePath
)

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/58880 [00:00<?, ?it/s]

In [None]:
# Reload the model stored at savePath. Use the GPU when one is present and fall
# back to CPU instead of crashing on a hard-coded 'cuda:0'.
model = SentenceTransformer(savePath).to('cuda:0' if torch.cuda.is_available() else 'cpu')
dataloaderTest = DataLoader(NLItest, shuffle=False, batch_size=512)
# NOTE(review): softmax_model=loss is the loss object built for training, which was
# constructed around the pre-reload `model`; it appears to keep its own reference to
# that encoder — confirm the reloaded model is really the one being scored here.
testEvaluator = LabelAccuracyEvaluator(dataloaderTest, name='nli-test', softmax_model=loss)
testEvaluator(model, output_path=savePath)

0.7963471713471714

Fine-tuning on STS

In [None]:
# Output directory for the NLI-trained model after further fine-tuning on STS.
savePath = f'models/sbert_{modelName}_classification_NLI-STS'

In [None]:
# Fine-tuning setup on top of the current `model`: cosine-similarity loss over
# shuffled STS training pairs, evaluated on the STS dev split.
# Use the GPU when one is present and fall back to CPU instead of crashing on a
# hard-coded 'cuda:0'.
loss = losses.CosineSimilarityLoss(model=model).to('cuda:0' if torch.cuda.is_available() else 'cpu')
dataloaderTr = DataLoader(STStr, shuffle=True, batch_size=batchSize)
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(STSdev, name='sts-dev')

In [None]:
# Fine-tune on STS for nEpochs, running the dev evaluator every 100 steps;
# savePath is passed as the output location.
nEpochs = 10
model.fit(
    train_objectives=[(dataloaderTr, loss)],
    evaluator=evaluator,
    epochs=nEpochs,
    evaluation_steps=100,
    # Warm-up length: one tenth of the total number of training batches.
    warmup_steps=math.ceil(len(dataloaderTr) * nEpochs  / 10),
    output_path=savePath
)

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Iteration:   0%|          | 0/360 [00:00<?, ?it/s]

Iteration:   0%|          | 0/360 [00:00<?, ?it/s]

Iteration:   0%|          | 0/360 [00:00<?, ?it/s]

Iteration:   0%|          | 0/360 [00:00<?, ?it/s]

Iteration:   0%|          | 0/360 [00:00<?, ?it/s]

Iteration:   0%|          | 0/360 [00:00<?, ?it/s]

Iteration:   0%|          | 0/360 [00:00<?, ?it/s]

Iteration:   0%|          | 0/360 [00:00<?, ?it/s]

Iteration:   0%|          | 0/360 [00:00<?, ?it/s]

Iteration:   0%|          | 0/360 [00:00<?, ?it/s]

In [None]:
# Reload the model stored at savePath and score it on the STS test split.
model = SentenceTransformer(savePath)
testEvaluator = EmbeddingSimilarityEvaluator.from_input_examples(STStest, name='sts-test')
testEvaluator(model, output_path=savePath)

0.7626878851160475

# Newspaper query

In [None]:
# Create the embeddings directory if it does not exist yet.
! mkdir -p embeddings

In [None]:
# File in which the corpus embeddings tensor is cached.
embedingsFilename = 'embeddings/embeddings.tensor'

In [None]:
# Download the newspaper dataset once; skip the request when the file is already on disk.
if not os.path.exists('data/newspaper.json'):
  util.http_get('http://nicolatoscan.altervista.org/newspaper.json', 'data/newspaper.json')

  0%|          | 0.00/83.7M [00:00<?, ?B/s]

In [None]:
# Read the JSON-lines file in a single pass (one JSON object per line), keeping
# headline and link at the same index so corpus[i] and links[i] describe the
# same article.
corpus = []
links = []
with open('data/newspaper.json', 'r', encoding='utf-8') as f:
  for line in f:
    if not line.strip():
      # Tolerate blank lines (e.g. a trailing newline) instead of failing to parse them.
      continue
    record = json.loads(line)
    corpus.append(record['headline'])
    links.append(record['link'])

In [None]:
def getCorpusEmbeding():
  """Return the embeddings of `corpus`, using the cache at `embedingsFilename`.

  The cache is reused only when it holds exactly one row per corpus entry;
  otherwise the embeddings are computed with the current global `model` and
  written back to the cache file.

  NOTE: the cache is not tied to a particular model. Delete the file after
  retraining or switching `model`, otherwise stale embeddings are returned.

  Returns:
    A tensor with one embedding per entry of `corpus`.
  """
  device = 'cuda' if torch.cuda.is_available() else 'cpu'
  if os.path.exists(embedingsFilename):
    # map_location lets a cache saved on a GPU runtime load on a CPU-only one.
    # NOTE: torch.load unpickles the file — only load caches you created yourself.
    cached = torch.load(embedingsFilename, map_location=device)
    if len(cached) == len(corpus):
      return cached
    # Row count differs from the corpus: the cache belongs to other data, so rebuild it.
  emb = model.encode(corpus, convert_to_tensor=True)
  cacheDir = os.path.dirname(embedingsFilename)
  if cacheDir:
    os.makedirs(cacheDir, exist_ok=True)
  torch.save(emb, embedingsFilename)
  return emb
embeddings = getCorpusEmbeding()

In [None]:
def query(query, k=5):
  """Print the `k` corpus headlines most similar to `query`, each followed by its link.

  Similarity is the cosine similarity between the encoded query and the
  precomputed global `embeddings`; results are printed best match first.

  Args:
    query: Text to search for.
    k: Number of results to print; capped at the corpus size.
  """
  # torch.topk raises when k exceeds the number of scores, so cap it.
  k = min(k, len(corpus))
  if k <= 0:
    return
  cosScores = util.pytorch_cos_sim(
    model.encode(query, convert_to_tensor=True),
    embeddings
  )[0]
  best = torch.topk(cosScores, k=k)
  for idx in best.indices.tolist():
    print(corpus[idx], links[idx])

In [None]:
# Example search: print the ten headlines closest to the query text.
query('President of the USA', k=10)

The Case For President Pence https://www.huffingtonpost.com/entry/the-case-for-president-pence_us_59224398e4b03b485cb262f1
The United Base Of America https://www.huffingtonpost.com/entry/the-united-base-of-america_us_59d4dabbe4b0da85e7f5ed2f
Come To Listen, Mr. President https://www.huffingtonpost.com/entry/come-to-listen-mr-president_us_59ca33c0e4b0f2df5e83b146
Joe Scarborough: Steve Bannon Is President Of The United States https://www.huffingtonpost.com/entry/steve-bannon-president-joe-scarborough_us_593175c0e4b0c242ca2344ab
From the Steps of the United States Supreme Court https://www.huffingtonpost.com/entry/from-the-steps-of-the-united-states-supreme-court_b_7182938.html
The 'President Of The United States' Made An Appearance At The Tonys https://www.huffingtonpost.com/entry/frank-underwood-president-tony-awards_us_593eab9be4b0b13f2c6c724a
Americans https://www.huffingtonpost.com/entry/post_us_5b9d5d4ae4b03a1dcc871566
About That American Idea https://www.huffingtonpost.comhttp://w

# Save model to GDrive

In [None]:
# Archive the whole models directory into models.zip.
! zip -r models.zip models

updating: models/ (stored 0%)
updating: models/sbert_distilbert-base-uncased_classification_NLI-STS/ (stored 0%)
updating: models/sbert_distilbert-base-uncased_classification_NLI-STS/tokenizer_config.json (deflated 41%)
updating: models/sbert_distilbert-base-uncased_classification_NLI-STS/sentence_bert_config.json (deflated 4%)
updating: models/sbert_distilbert-base-uncased_classification_NLI-STS/similarity_evaluation_sts-test_results.csv (deflated 46%)
updating: models/sbert_distilbert-base-uncased_classification_NLI-STS/pytorch_model.bin (deflated 8%)
updating: models/sbert_distilbert-base-uncased_classification_NLI-STS/vocab.txt (deflated 53%)
updating: models/sbert_distilbert-base-uncased_classification_NLI-STS/config.json (deflated 44%)
updating: models/sbert_distilbert-base-uncased_classification_NLI-STS/1_Pooling/ (stored 0%)
updating: models/sbert_distilbert-base-uncased_classification_NLI-STS/1_Pooling/config.json (deflated 47%)
updating: models/sbert_distilbert-base-uncased_c

In [None]:
# Create the Drive target folder first so the copy does not fail when it is missing.
! mkdir -p drive/MyDrive/SBERT && cp models.zip drive/MyDrive/SBERT/models.zip

In [None]:
# Archive the embeddings directory into embeddings.zip.
! zip -r embeddings.zip embeddings

  adding: embeddings/ (stored 0%)
  adding: embeddings/embeddings.tensor (deflated 7%)


In [None]:
# Create the Drive target folder first so the copy does not fail when it is missing.
! mkdir -p drive/MyDrive/SBERT && cp embeddings.zip drive/MyDrive/SBERT/embeddings.zip

# Download from GDrive

In [None]:
# The archive is stored on Drive as models.zip (not model.zip), so copy and
# unpack it under that name.
! cp drive/MyDrive/SBERT/models.zip .

In [None]:
! unzip models.zip

In [None]:
# Copy the embeddings archive from Drive into the working directory.
! cp drive/MyDrive/SBERT/embeddings.zip .

In [None]:
# Unpack the embeddings archive in the working directory.
! unzip embeddings.zip