In [1]:
!pip install sentence_transformers datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting datasets
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m20.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers<5.0.0,>=4.6.0 (from sentence_transformers)
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m91.1 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece (from sentence_transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━

In [2]:
from google.colab import drive
drive.mount('/content/drive')
!cd '/content/drive/MyDrive/2023 Spring/DataScinece Project'

Mounted at /content/drive


In [3]:
import math
import logging
from datetime import datetime

import torch
from torch.utils.data import DataLoader
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, models, LoggingHandler, losses, util
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import InputExample

In [4]:
logging.basicConfig(
    format="%(asctime)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=logging.INFO,
    handlers=[LoggingHandler()],
)

In [5]:
pretrained_model_name = "allenai/longformer-base-4096"
sts_num_epochs = 4
train_batch_size = 16

sts_model_save_path = 'output/training_sts-'+pretrained_model_name.replace("/", "-")+'-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

In [6]:
glue_sts_train = load_dataset("glue", "stsb", split='train[:90%]')
glue_sts_valid = load_dataset("glue", "stsb", split='train[-10%:]') 
glue_sts_test = load_dataset("glue", "stsb", split='validation')

print('Length of Train : ',len(glue_sts_train))
print('Length of Valid : ',len(glue_sts_valid))
print('Length of Test : ',len(glue_sts_test))

Downloading builder script:   0%|          | 0.00/28.8k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/28.7k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/27.9k [00:00<?, ?B/s]

Downloading and preparing dataset glue/stsb to /root/.cache/huggingface/datasets/glue/stsb/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad...


Downloading data:   0%|          | 0.00/803k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5749 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1500 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1379 [00:00<?, ? examples/s]

Dataset glue downloaded and prepared to /root/.cache/huggingface/datasets/glue/stsb/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad. Subsequent calls will reuse this data.




Length of Train :  5174
Length of Valid :  575
Length of Test :  1500


In [7]:
glue_sts_train[0]

{'sentence1': 'A plane is taking off.',
 'sentence2': 'An air plane is taking off.',
 'label': 5.0,
 'idx': 0}

In [8]:
def make_sts_input_example(dataset):
    ''' 
    Transform to InputExample
    ''' 
    input_examples = []
    for i, data in enumerate(dataset):
        sentence1 = data['sentence1']
        sentence2 = data['sentence2']
        score = (data['label']) / 5.0  
        input_examples.append(InputExample(texts=[sentence1, sentence2], label=score))

    return input_examples

In [9]:
sts_train_examples = make_sts_input_example(glue_sts_train)
sts_valid_examples = make_sts_input_example(glue_sts_valid)
sts_test_examples = make_sts_input_example(glue_sts_test)

In [10]:
train_dataloader = DataLoader(
    sts_train_examples,
    shuffle=True,
    batch_size=train_batch_size,
)

dev_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    sts_valid_examples,
    name="sts-dev",
)

test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    sts_test_examples,
    name="sts-test",
)

In [11]:
embedding_model = models.Transformer(
    model_name_or_path=pretrained_model_name, 
    max_seq_length=2048,
    do_lower_case=True
)


pooling_model = models.Pooling(
    embedding_model.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False,
)

model = SentenceTransformer(modules=[embedding_model, pooling_model])

Downloading (…)lve/main/config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/597M [00:00<?, ?B/s]

Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerModel: ['lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing LongformerModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [12]:
train_loss = losses.CosineSimilarityLoss(model=model)

warmup_steps = math.ceil(len(sts_train_examples) * sts_num_epochs / train_batch_size * 0.1) 
logging.info("Warmup-steps: {}".format(warmup_steps))

model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=dev_evaluator,
    epochs=sts_num_epochs,
    evaluation_steps=int(len(train_dataloader)*0.1),
    warmup_steps=warmup_steps,
    output_path=sts_model_save_path
)

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Iteration:   0%|          | 0/324 [00:00<?, ?it/s]

Iteration:   0%|          | 0/324 [00:00<?, ?it/s]

Iteration:   0%|          | 0/324 [00:00<?, ?it/s]

Iteration:   0%|          | 0/324 [00:00<?, ?it/s]

In [14]:
test_evaluator(model, output_path=sts_model_save_path)

0.8813661918701089

In [16]:
docs = ['By Andrea Shalal and Pete Schroeder WASHINGTON (Reuters) - JPMorgan Chase &amp; Co\'s deal to buy First Republic Bank pushed the Biden administration into a corner, leaving officials scrambling to explain how their stance against mergers squared with allowing the largest U.S. bank to get even bigger. At a White House event on small business on Monday, President Joe Biden hailed the sale of the troubled San Francisco-based lender, saying it would protect all depositors and avert a government bailout. He did not mention JPMorgan and underscored his call for stronger banking regulations. Senator Elizabeth Warren, a Democrat and member of the Senate Banking Committee who has been pushing for tighter banking regulations, blasted the decision, sounding a theme that could hound Biden, who last week announced his bid to win another term in the White House and has struggled with low approval ratings. "A poorly supervised bank was snapped up by an even bigger bank — ultimately taxpayers will be on the hook," Warren tweeted. White House press secretary Karine Jean-Pierre said JPMorgan\'s acquisition of First Republic\'s assets was necessary to ensure continued resilience of the banking system and came at no cost to taxpayers. "No recent administration has done more to promote competition, address (the) concentration process across industries," she told a White House briefing. Jean-Pierre added that Biden administration officials valued the fact that community banks offer services to those who might not otherwise have banking access. The deal for the failed lender comes amid increased discussion among U.S. regulators about tightening rules on bank mergers, with officials growing worried that consolidation could undermine financial stability and leave communities wanting for services. Administration officials, mindful of the impact of a JPMorgan takeover on the banking sector, prodded smaller lenders to submit bids and worked hard to find a different solution, but the size of JPMorgan\'s offer ultimately gave it an edge, according to sources familiar with the process. Current law means the Federal Deposit Insurance Corp was legally bound to choose the offer that cost the least, said Aaron Klein, a former Treasury official and Senate staffer who helped craft the Dodd-Frank reform law passed in the wake of the global financial crisis. In the end, the need to avert contagion in the banking sector trumped worries about JPMorgan\'s becoming more powerful, former officials said. "Too big to fail is obviously a worry, but right now you\'ve got to put out the hottest fire first," said Ben Harris, who left his post as Treasury assistant secretary for economic policy at the end of March and had served as chief economist to Biden when he was President Barack Obama\'s vice president. (Reporting by Andrea Shalal and Pete Schroeder; additional reporting by David Lawder, Sruthi Shankar, Chris Prentice and Douglas Gillison; Editing by Leslie Adler)',
 '(Bloomberg) -- Senator Joe Manchin said he would push to repeal parts of the Inflation Reduction Act that he claims could raise the national debt. Manchin, who played a key role in the IRA being signed into law last year, said President Joe Biden is attempting to enforce the bill’s environmental provisions at an ever-increasing cost. “I’m going to repeal sections of what they’re trying to expand on,” Manchin said at the Milken Institute Global Conference in Beverly Hills, California, Monday. “They’re trying to expand the bill beyond what we wanted.” The Congressional Budget Office estimates that the cost of some energy-related tax credits has ballooned to $570 billion from $270 billion due to how the administration is enacting the bill, Manchin said. “I’m just so tired of all of us as Americans are getting games played,” he said. Biden is under increasing pressure to come to an agreement on raising the debt ceiling to avoid default. Manchin is the only Democrat in the chamber who isn’t calling for Congress to pass an increase of the ceiling with no strings attached, depriving Senate Majority Leader Chuck Schumer of a key vote. The US will not default, but should focus on how to reduce its debt, Manchin said. ©2023 Bloomberg L.P.',
 "(Adds more details, background) By David Shepardson WASHINGTON, May 1 (Reuters) - The United States will end its COVID-19 vaccination requirements for international travelers and federal workers on May 11, when the coronavirus public health emergency ends, the White House said on Monday. In February, the U.S. House of Representatives voted to lift the requirement that most foreign air travelers be vaccinated against COVID-19, one of the few remaining pandemic travel restrictions still in place. The Biden administration last June dropped its requirement that people arriving in the U.S. by air must test negative for COVID but kept in place Centers for Disease Control and Prevention (CDC) vaccination requirements for most foreign travelers. The rules barred Serbian tennis star Novak Djokovic from taking part in some U.S. tournaments because he is not vaccinated against COVID-19, but from May 12 he could freely enter and play in major American tournaments like the U.S. Open. The Homeland Security Department also said Monday starting May 12 it will no longer require non-U.S. travelers entering the United States via land ports of entry and ferries to be vaccinated against COVID-19 and provide proof of vaccination upon request. The Biden administration's rules imposed in September 2021 requiring about 3.5 million federal employees and contractors to be vaccinated or face firing or disciplinary action have not been enforced for over a year after a series of court rulings. A federal appeals court in March upheld a decision blocking enforcement of the employee vaccine requirement. The White House told federal agencies in October 2022 not to enforce the contractor vaccine requirements even after a nationwide injunction was lifted. The Health and Human Services Department said it will start the process to end vaccination requirements for Head Start educators and government-certified healthcare facilities. (Reporting by David Shepardson and Jasper Ward; Editing by Eric Beech and Sonali Paul)"]
document_embeddings = model.encode(docs)

query = "What threatens Biden right now?"
query_embedding = model.encode(query)

top_k = min(5, len(docs))

cos_scores = util.pytorch_cos_sim(query_embedding, document_embeddings)[0]

top_results = torch.topk(cos_scores, k=top_k)

print(f"Question: {query}")
print(f"\n<Top {top_k} Articles>\n")

for i, (score, idx) in enumerate(zip(top_results[0], top_results[1])):
    print(f"{i+1}: {docs[idx]} {'(similarity: {:.4f})'.format(score)}\n")

Question: What threatens Biden right now?

<Top 3 Articles>

1: By Andrea Shalal and Pete Schroeder WASHINGTON (Reuters) - JPMorgan Chase &amp; Co's deal to buy First Republic Bank pushed the Biden administration into a corner, leaving officials scrambling to explain how their stance against mergers squared with allowing the largest U.S. bank to get even bigger. At a White House event on small business on Monday, President Joe Biden hailed the sale of the troubled San Francisco-based lender, saying it would protect all depositors and avert a government bailout. He did not mention JPMorgan and underscored his call for stronger banking regulations. Senator Elizabeth Warren, a Democrat and member of the Senate Banking Committee who has been pushing for tighter banking regulations, blasted the decision, sounding a theme that could hound Biden, who last week announced his bid to win another term in the White House and has struggled with low approval ratings. "A poorly supervised bank was sn