<a href="https://colab.research.google.com/github/ounospanas/AIDL_A_02/blob/main/Retrieving_Similar_News_Posts_with_Transformers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fine-tune RoBERTa on STS-B

In [None]:
!pip install datasets transformers[sentencepiece]



In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

# download dataset: the "stsb" configuration of the "glue" dataset
raw_datasets = load_dataset("glue", "stsb")

# define transformer and tokenizer
# `checkpoint` is reused further down when the model itself is loaded
checkpoint = "roberta-large"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# tokenization function applied to the dataset via `map`
def tokenize_function(example):
    """Tokenize the sentence pair(s) found in `example`.

    Reads the "sentence1" and "sentence2" entries of `example` and passes
    them together to the module-level `tokenizer` with truncation enabled.
    Returns whatever the tokenizer returns.
    """
    first_sentence = example["sentence1"]
    second_sentence = example["sentence2"]
    return tokenizer(first_sentence, second_sentence, truncation=True)

# apply tokenization to data
# batched=True: tokenize_function receives batches of examples rather than single rows
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
# collator built from the same tokenizer; used as `collate_fn` by the DataLoaders
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Downloading:   0%|          | 0.00/7.78k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.47k [00:00<?, ?B/s]

Downloading and preparing dataset glue/stsb (download: 784.05 KiB, generated: 1.09 MiB, post-processed: Unknown size, total: 1.86 MiB) to /root/.cache/huggingface/datasets/glue/stsb/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad...


Downloading:   0%|          | 0.00/803k [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset glue downloaded and prepared to /root/.cache/huggingface/datasets/glue/stsb/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading:   0%|          | 0.00/482 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [None]:
# keep only the model inputs and expose the target under the name "labels"
tokenized_datasets = (
    tokenized_datasets
    .remove_columns(["sentence1", "sentence2", "idx"])
    .rename_column("label", "labels")
)
# return torch tensors when the datasets are indexed
tokenized_datasets.set_format(type="torch")
# show the remaining columns of the training split
tokenized_datasets["train"].column_names

['labels', 'input_ids', 'attention_mask']

In [None]:
# build the train/eval dataloaders

from torch.utils.data import DataLoader

# settings shared by both loaders
loader_kwargs = dict(batch_size=8, collate_fn=data_collator)

# only the training split is shuffled
train_dataloader = DataLoader(tokenized_datasets["train"], shuffle=True, **loader_kwargs)
eval_dataloader = DataLoader(tokenized_datasets["validation"], **loader_kwargs)

In [None]:
# retrieve pretrained model and set num of labels to 1 (it is a regression task)
# NOTE: as the warning printed by this cell states, the classifier weights are
# newly initialized, so the model has to be trained before it is used.
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=1)

Downloading:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.weight', 'classif

In [None]:
# define an optimizer, an optimization scheduler and the number of epochs
# `transformers.AdamW` is deprecated and no longer exported by recent releases,
# so the PyTorch implementation is used instead.
from torch.optim import AdamW
from transformers import get_scheduler

# eps and weight_decay are set explicitly to the defaults of the former
# transformers.AdamW (torch.optim.AdamW defaults to eps=1e-8, weight_decay=1e-2).
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)
num_epochs = 3

# one scheduler step per batch, over all epochs
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

2157


In [None]:
import torch

# prefer the GPU when PyTorch can see one, otherwise stay on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
device

device(type='cuda')

In [None]:
# train loop
from tqdm.auto import tqdm

# one progress-bar tick per optimizer step (num_training_steps in total)
progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        # move every tensor of the batch to the device the model lives on
        batch = {k: v.to(device) for k, v in batch.items()}
        # the batch contains "labels", and the loss is read from the model output
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        # apply the update, advance the LR schedule, then clear the gradients
        # so they do not accumulate into the next batch
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

  0%|          | 0/2157 [00:00<?, ?it/s]

In [None]:
# run inference to get the eval scores
# `datasets.load_metric` has been removed from recent `datasets` releases; the
# same Pearson / Spearman correlations are computed directly with SciPy.
from scipy.stats import pearsonr, spearmanr

model.eval()
predictions, references = [], []
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    # logits have one value per example (num_labels=1); drop the trailing axis
    predictions.append(outputs.logits.squeeze(-1).cpu())
    references.append(batch["labels"].cpu())

predictions = torch.cat(predictions).numpy()
references = torch.cat(references).numpy()

# same keys as the dict shown in the original cell output
{
    "pearson": float(pearsonr(predictions, references)[0]),
    "spearmanr": float(spearmanr(predictions, references)[0]),
}

Downloading:   0%|          | 0.00/1.86k [00:00<?, ?B/s]

{'pearson': 0.9146322356739052, 'spearmanr': 0.9127506130733711}

In [None]:
# store model: only the weights (state_dict) are written to 'roberta_stsb.pt'
torch.save(model.state_dict(), 'roberta_stsb.pt')

In [None]:
# load model
# map_location remaps the stored tensors onto `device`, so a checkpoint saved
# on a GPU can also be restored on a CPU-only runtime.
model.load_state_dict(torch.load('roberta_stsb.pt', map_location=device))

{'pearson': 0.9146322356739052, 'spearmanr': 0.9127506130733711}

# Download example dataset

In [None]:
# install library
! pip install -q kaggle

In [None]:
# import files class to upload files to colab
from google.colab import files

In [None]:
# upload kaggle.json
# The result is bound to a name instead of being the cell's last expression:
# the returned dict holds the raw file contents (the Kaggle API key), which
# would otherwise be echoed into the saved notebook output.
uploaded = files.upload()
# show only the uploaded file names, never their contents
list(uploaded)

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"pkasnesis","key":"<REDACTED>"}'}

In [None]:
# Make directory named kaggle and copy kaggle.json file there.
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download news category dataset and unzip to news folder
! kaggle datasets download 'rmisra/news-category-dataset'
! mkdir news
! unzip news-category-dataset.zip  -d news

Downloading news-category-dataset.zip to /content
 67% 17.0M/25.4M [00:00<00:00, 49.9MB/s]
100% 25.4M/25.4M [00:00<00:00, 84.9MB/s]
Archive:  news-category-dataset.zip
  inflating: news/News_Category_Dataset_v2.json  


In [None]:
# Convert json to list
# Every non-empty line of the file is parsed as one JSON document.

import json

# The handle is named `news_file` rather than `files`, so the
# `google.colab.files` module imported earlier is not overwritten and the
# upload cell keeps working when it is re-run.
with open('news/News_Category_Dataset_v2.json', encoding='utf-8') as news_file:
    # blank lines are skipped: json.loads would raise on them
    list_ = [json.loads(line) for line in news_file if line.strip()]

In [None]:
# Turn the parsed records into a dataframe

import pandas as pd

# one row per parsed JSON record
data = pd.DataFrame(data=list_)
# preview the first five rows
data.head(n=5)

Unnamed: 0,category,headline,authors,link,short_description,date
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...,Melissa Jeltsen,https://www.huffingtonpost.com/entry/texas-ama...,She left her husband. He killed their children...,2018-05-26
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,Of course it has a song.,2018-05-26
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57,Ron Dicker,https://www.huffingtonpost.com/entry/hugh-gran...,The actor and his longtime girlfriend Anna Ebe...,2018-05-26
3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,Ron Dicker,https://www.huffingtonpost.com/entry/jim-carre...,The actor gives Dems an ass-kicking for not fi...,2018-05-26
4,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...,Ron Dicker,https://www.huffingtonpost.com/entry/julianna-...,"The ""Dietland"" actress said using the bags is ...",2018-05-26


In [None]:
# select the text column that will be embedded

descriptions = data.loc[:, 'short_description']

# Create sentence embeddings with Sentence Transformers (SRoBERTa)

In [None]:
!pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence-transformers-2.1.0.tar.gz (78 kB)
[?25l[K     |████▏                           | 10 kB 35.2 MB/s eta 0:00:01[K     |████████▍                       | 20 kB 40.1 MB/s eta 0:00:01[K     |████████████▌                   | 30 kB 41.5 MB/s eta 0:00:01[K     |████████████████▊               | 40 kB 44.8 MB/s eta 0:00:01[K     |████████████████████▉           | 51 kB 31.4 MB/s eta 0:00:01[K     |█████████████████████████       | 61 kB 34.5 MB/s eta 0:00:01[K     |█████████████████████████████▏  | 71 kB 24.0 MB/s eta 0:00:01[K     |████████████████████████████████| 78 kB 7.5 MB/s 
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone
  Created wheel for sentence-transformers: filename=sentence_transformers-2.1.0-py3-none-any.whl size=120999 sha256=9a02ff9b720b257167a65fad02ecf42e8bea1de48e9c3a26b5197079b4991f63
  Stored in directory: /root/

In [None]:
from sentence_transformers import SentenceTransformer

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# load the pretrained Sentence-Transformers model used to embed the news descriptions
embedder = SentenceTransformer('roberta-large-nli-stsb-mean-tokens')

Downloading:   0%|          | 0.00/748 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.00k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/335 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/191 [00:00<?, ?B/s]

In [None]:
# create embeddings (takes around 22 min) and store them
# On a re-run the stored file is reused instead of encoding everything again.
import os

embeddings_path = 'post_embeddings.npy'

post_embeddings = np.load(embeddings_path) if os.path.exists(embeddings_path) else None

# Recompute when there is no cache, or when the cached array does not hold one
# row per description (e.g. it was produced from a different dataset).
if post_embeddings is None or len(post_embeddings) != len(descriptions):
    post_embeddings = embedder.encode(descriptions)
    np.save(embeddings_path, post_embeddings)

# Retrieve k most similar posts/news with cosine similarity

In [None]:
# add an example post and get the embeddings
# `input_post` is the query text; it is embedded with the same `embedder`
# that produced `post_embeddings`, so the two can be compared directly.

input_post = 'A man killed his wife'
input_emb = embedder.encode(input_post)

In [None]:
%%timeit
# time the cosine similarity between the input post embedding and all stored post embeddings
# (input_emb is wrapped in a list so that it is passed as a single-row input)

cosine_similarity([input_emb],post_embeddings)

1 loop, best of 5: 805 ms per loop


In [None]:
# function for retrieving the k most similar news based on their textual similarity (SRoBERTa)

def get_highest_similarity(embedding, post_embeddings, highest = 32):
    '''
    Return the posts whose embeddings are most similar to `embedding`.

    embedding: embedding vector of the query post
    post_embeddings: 2D array with one embedding per stored post
    highest: how many relevant posts to retrieve; values larger than the
        number of posts return all of them, 0 returns an empty dict

    Returns a dict mapping the row index of each retrieved post (as a string)
    to a one-element list holding its cosine similarity. Entries are ordered
    from the least to the most similar of the retrieved posts.

    Raises ValueError if `highest` is negative.
    '''
    if highest < 0:
        raise ValueError("highest must be non-negative, got {}".format(highest))
    if highest == 0:
        # guard needed because the slice [-0:] would select every post
        return {}

    # cosine_similarity returns a (1, n_posts) matrix; keep its single row
    text_similarities = cosine_similarity([embedding], post_embeddings)[0]

    # argsort is ascending, so the last `highest` indices are the best matches
    high_txt = np.argsort(text_similarities)[-highest:]

    return {str(idx): [text_similarities[idx]] for idx in high_txt}

In [None]:
# get 32 most similar ones
# result: dict keyed by the post's row index (as a string), value is [similarity]
highest_texts = get_highest_similarity(input_emb, post_embeddings, highest=32)

# Batch, tokenize and run inference using the RoBERTa-large model fine-tuned on STS-B

In [None]:
# build [query, candidate] pairs that can be handed to the tokenizer
k_similar_posts = []

for post_idx, similarity in highest_texts.items():
    # look the description up once and reuse it for printing and pairing
    description = data.iloc[int(post_idx)]['short_description']
    print(similarity, description)
    k_similar_posts.append([input_post, description])

[0.57522124] "He wanted to die, he wanted to end it all."
[0.5755888] The suspect killed himself inside a home after a standoff with police.
[0.57569] The actor's death was ruled a suicide.
[0.576674] The suspect was found dead of a self-inflicted gunshot wound.
[0.5769218] The suspect is accused of taking the life of an elderly man who just happened to cross his path.
[0.57790345] The suspect called his mother before killing himself.
[0.58078116] A grand jury indicted a 73-year-old man on Thursday for the alleged murder of his first wife more than 50 years ago who he
[0.5827253] "Kill me!" suspect says.
[0.5829408] Authorities say Kevin Janson Neal killed his wife late Monday before going on a shooting spree the following day.
[0.58308804] She left her husband. He killed their children. Just another day in America.
[0.5849714] Police are searching for the child's father over the woman's death.
[0.5860293] Michael Stasko allegedly shot wife and daughter dead before turning gun on himse

In [None]:
# tokenize the [query, candidate] pairs as one padded batch of tensors
tokenized_similar_posts = tokenizer(
    k_similar_posts,
    padding=True,
    truncation=True,
    return_tensors='pt',
)
print(tokenized_similar_posts)

# move the model inputs to the device used for inference
for key in ('input_ids', 'attention_mask'):
    tokenized_similar_posts[key] = tokenized_similar_posts[key].to(device)

{'input_ids': tensor([[  0, 250, 313,  ...,   1,   1,   1],
        [  0, 250, 313,  ...,   1,   1,   1],
        [  0, 250, 313,  ...,   1,   1,   1],
        ...,
        [  0, 250, 313,  ...,   1,   1,   1],
        [  0, 250, 313,  ...,   1,   1,   1],
        [  0, 250, 313,  ...,   1,   1,   1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}


In [None]:
# run inference using the RoBERTa_large STSb models
# eval mode plus no_grad: a forward pass only, no gradients are tracked
model.eval()
with torch.no_grad():
    outputs = model(**tokenized_similar_posts)

In [None]:
# show the pair with the highest score from the fine-tuned model; it differs from,
# and is more relevant than, the top result of the SRoBERTa retrieval
pair_scores = outputs.logits.cpu().detach().numpy()
best_pair = int(np.argmax(pair_scores))
k_similar_posts[best_pair]

['A man killed his wife',
 'Authorities say Kevin Janson Neal killed his wife late Monday before going on a shooting spree the following day.']