In [1]:
!pip install -U "sentence-transformers==3.4.1" datasets --quiet

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/480.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m471.0/480.6 kB[0m [31m19.6 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/179.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

There are two scenarios:
- The dataset already has embeddings pre-computed and stored in one of its columns.
- The dataset has no pre-computed embeddings.

We will use the `sentence-transformers` library to compute embeddings of the target dataset.
- Then we will be able to compute the similarity between instances in the target dataset and the seed domain text, using the same embedding model.

In [2]:
import sentence_transformers
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, util
import numpy as np
import pandas as pd
import os
import torch
print(sentence_transformers.__version__)

3.4.1


In [4]:
# Notebook configuration: everything a reader may want to tune lives here.
DATASET_ID = "rajpurkar/squad_v2" # Hugging Face Hub id of the dataset to download
DATASET_SUBSET = None # in case you want to filter from a specific subset (None here)
# Embedding model to use; candidates can be compared at https://huggingface.co/spaces/mteb/leaderboard
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L12-v2"
EMBEDDING_COLUMN = "embedding" # the name of the embedding column (either existing or to create)

In [5]:
# Download all splits of the dataset. DATASET_SUBSET is forwarded as the dataset
# configuration name so that changing it in the config cell actually takes effect;
# None (the value set there) keeps the default configuration.
dataset = load_dataset(DATASET_ID, name=DATASET_SUBSET)
dataset

README.md:   0%|          | 0.00/8.92k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/16.4M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.35M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/130319 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11873 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 130319
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 11873
    })
})

In [6]:
# Instantiate the sentence-embedding model selected by EMBEDDING_MODEL.
model = SentenceTransformer(EMBEDDING_MODEL)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/352 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [7]:
# The dataset does not contain embeddings so we'll create some.
def embed_texts(example):
    """Embed the ``context`` texts of a batch of rows.

    Written for ``Dataset.map(..., batched=True)``: ``example`` is then a dict
    mapping each column name to the list of values of the batch.

    Args:
        example: Batch of rows; only ``example["context"]`` is read.

    Returns:
        A dict with one key, ``EMBEDDING_COLUMN``, holding one embedding
        (a list of floats) per input text.
    """
    # No explicit device is passed: encode() then runs on the device the model
    # already lives on, so the notebook also works on machines without CUDA.
    embeddings = model.encode(
        example["context"],
        show_progress_bar=False,
    )
    # Convert to nested Python lists so the values can be stored in a dataset column.
    return {EMBEDDING_COLUMN: embeddings.tolist()}

# Map the function across every split of the dataset, in batches of 256 rows,
# so that the model encodes many texts per call.
dataset = dataset.map(embed_texts, batched=True, batch_size=256)

Map:   0%|          | 0/130319 [00:00<?, ? examples/s]

Map:   0%|          | 0/11873 [00:00<?, ? examples/s]

In [11]:
# Seed phrases describing the target domain: rows whose text is semantically
# similar to at least one of them are the rows we want to keep.
QUERIES = [
    "Rocket Science",
    "Satellite Thermal Control",
    "Aerospace Engineering",
    "Space Propulsion",
    "Spacecraft Design",
    "Space Mission Engineering and Operations",
    "Space Technology Research",
    "Astronautical Engineering",
    "Space Systems Engineering",
    "Human Spaceflight",
    "Space Exploration"
]

# Minimum cosine similarity to the closest query for a row to be kept.
SIMILARITY_THRESHOLD = 0.3

# Embed the seed phrases with the same model used for the dataset. With
# convert_to_tensor=True the result stays on the device the model runs on,
# so no hard-coded "cuda" transfer is needed.
query_embeddings = model.encode(QUERIES, convert_to_tensor=True)

# Check how much data will remain using a random sample of (at most) 1000 instances.
num_samples = 1000
train_split = dataset['train']
# Cap the sample size so that select() cannot index past the end of a small split.
sample_dataset = train_split.shuffle(seed=42).select(
    range(min(num_samples, len(train_split)))
)

def is_similar(example):
    """Tell whether a row is close enough to at least one seed query.

    Args:
        example: A single dataset row; only ``example[EMBEDDING_COLUMN]``
            (the row embedding as a list of floats) is read.

    Returns:
        True when the highest cosine similarity between the row embedding and
        any of ``query_embeddings`` is strictly above ``SIMILARITY_THRESHOLD``.
    """
    # Build the tensor directly on the device (and with the dtype) of the query
    # embeddings, instead of assuming that a CUDA device is available.
    example_embedding = torch.tensor(
        example[EMBEDDING_COLUMN],
        dtype=query_embeddings.dtype,
        device=query_embeddings.device,
    ).unsqueeze(0)  # add a leading axis: one row embedding against all queries
    sims = util.cos_sim(query_embeddings, example_embedding)
    max_sim = sims.max().item()
    return max_sim > SIMILARITY_THRESHOLD

# Apply the similarity filter to the sample only, to preview how many rows pass the threshold.
sample_dataset_filtered = sample_dataset.filter(is_similar)
sample_dataset_filtered

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers', 'embedding'],
    num_rows: 9
})

In [12]:
# Show each retained context, separated by a blank line, for a manual sanity check.
for context_text in sample_dataset_filtered['context']:
    print(f"{context_text} \n")

Earth's surface and the clouds absorb visible and invisible radiation from the sun and re-emit much of the energy as infrared back to atmosphere. Certain substances in the atmosphere, chiefly cloud droplets and water vapor, but also carbon dioxide, methane, nitrous oxide, sulfur hexafluoride, and chlorofluorocarbons, absorb this infrared, and re-radiate it in all directions including back to Earth. Thus, the greenhouse effect keeps the atmosphere and surface much warmer than if the infrared absorbers were absent from the atmosphere. 

In Canada, there are Affiliate Schools, Colleges, Institutes of Technology/Polytechnic Institutes, and Universities that offer instruction in a variety of programs that can lead to: engineering and applied science degrees, apprenticeship and trade programs, certificates, and diplomas. Affiliate Schools are polytechnic divisions belonging to a national university and offer select technical and engineering programs. Colleges, Institutes of Technology/Polyte

In [13]:
# Filter the whole dataset: calling filter() on the DatasetDict applies is_similar
# to the train and validation splits in one go.
dataset_filtered = dataset.filter(is_similar)
dataset_filtered

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers', 'embedding'],
        num_rows: 1759
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers', 'embedding'],
        num_rows: 218
    })
})

In [14]:
# Colab-only cell: publish the filtered dataset to the Hugging Face Hub.
import os
from google.colab import userdata

USERNAME = "patrickfleith"
DATASET_NAME = "squad-v2-space-filtered"

# Read the token from the Colab secrets store and expose it as HF_TOKEN
# (presumably what the upload below authenticates with -- the token is never hard-coded).
os.environ['HF_TOKEN'] = userdata.get('HF_TOKEN')

target_repo = f"{USERNAME}/{DATASET_NAME}"
dataset_filtered.push_to_hub(repo_id=target_repo)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/855 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/patrickfleith/squad-v2-space-filtered/commit/e087a0b1e6496800c62e67659cbc3895c24dff51', commit_message='Upload dataset', commit_description='', oid='e087a0b1e6496800c62e67659cbc3895c24dff51', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/patrickfleith/squad-v2-space-filtered', endpoint='https://huggingface.co', repo_type='dataset', repo_id='patrickfleith/squad-v2-space-filtered'), pr_revision=None, pr_num=None)

To summarise, here is the code
```python
dataset = load_dataset(DATASET_ID) # for instance "rajpurkar/squad_v2"
model = SentenceTransformer(EMBEDDING_MODEL) # for instance "all-MiniLM-L12-v2". Check https://huggingface.co/spaces/mteb/leaderboard to choose.

# The dataset does not contain embeddings so we'll create some.
def embed_texts(example):
    """Embed the "context" texts of a batch of rows (used with batched=True)."""
    # No explicit device: encode() runs on the device the model already lives on,
    # so the snippet also works on machines without CUDA.
    embeddings = model.encode(
        example["context"],
        show_progress_bar=False
    )
    return {"embedding": embeddings.tolist()}

# Map the function across the dataset in batches of 256 rows
dataset = dataset.map(embed_texts, batched=True, batch_size=256)

# Variations of seed text to embed for similarity matching
QUERIES = [
    "Rocket Science",
    "Satellite Thermal Control",
    "Aerospace Engineering",
    "Space Propulsion",
    "Spacecraft Design",
    "Space Mission Engineering and Operations",
    "Space Technology Research",
    "Astronautical Engineering",
    "Space Systems Engineering",
    "Human Spaceflight",
    "Space Exploration"
]

# embed the seed texts; the tensor stays on the device the model runs on
query_embeddings = model.encode(QUERIES, convert_to_tensor=True)

# You need to try some threshold on a sample of the dataset to find out the right threshold
SIMILARITY_THRESHOLD = 0.3

def is_similar(example):
    """Return True when the row is closer than the threshold to at least one seed text."""
    # Create the row embedding on the same device as the query embeddings
    # instead of assuming that a CUDA device is available.
    example_embedding = torch.tensor(
        example["embedding"], device=query_embeddings.device
    ).unsqueeze(0)
    sims = util.cos_sim(query_embeddings, example_embedding)
    max_sim = sims.max().item()
    return max_sim > SIMILARITY_THRESHOLD

# filter the whole dataset (train and validation set) simultaneously
dataset_filtered = dataset.filter(is_similar)
dataset_filtered
```