In [1]:
%load_ext beam_setup

Setting up the Beam environment for interactive use
Standard modules will be automatically imported so you can use them without explicit import
Done importing packages. It took:  4.1 seconds
Beam library is loaded from path: /home/elad/docker/beamds/src/beam
The Beam version is: 2.4.6b


In [2]:
from beam.similarity import TFIDF, SparnnSimilarity, DenseSimilarity
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import AutoTokenizer
from beam import beam_logger as logger
from beam.utils import Timer

# Tokenize the sentences

In [3]:
# Load the 20 newsgroups dataset

logger.info(f"Loaded dataset: newsgroups_train")
newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')

tokenizer_name = "mistralai/Mistral-7B-v0.1"
logger.info(f"Loaded tokenizer: {tokenizer_name}")
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

logger.info(f"Tokenizing data: newsgroups_train")
x = tokenizer(newsgroups_train.data, add_special_tokens=False)["input_ids"]

logger.info(f"Tokenizing data: newsgroups_test")
q = tokenizer(newsgroups_test.data, add_special_tokens=False)["input_ids"]
index = newsgroups_train['filenames']

[32m2024-02-26 15:00:36[0m | BeamLog | [1mINFO[0m | [1mLoaded dataset: newsgroups_train[0m
[32m2024-02-26 15:00:36[0m | BeamLog | [1mINFO[0m | [1mLoaded tokenizer: mistralai/Mistral-7B-v0.1[0m
[32m2024-02-26 15:00:37[0m | BeamLog | [1mINFO[0m | [1mTokenizing data: newsgroups_train[0m
[32m2024-02-26 15:00:39[0m | BeamLog | [1mINFO[0m | [1mTokenizing data: newsgroups_test[0m


## Apply TFIDFVectorizer (SKlearn)

In [4]:
x_str = [' '.join([str(i) for i in xi]) for xi in x]

with Timer(name='TfidfVectorizer.fit_transform', logger=logger) as t:
    tfidf = TfidfVectorizer()
    vectors = tfidf.fit_transform(x_str)

    logger.info(f"Transformed data: {vectors.shape}")
    logger.critical(f"1x2: {(vectors[0].toarray() * vectors[1].toarray()).sum()}")

[32m2024-02-26 15:00:42[0m | BeamLog | [1mINFO[0m | [1mStarting timer: TfidfVectorizer.fit_transform[0m
[32m2024-02-26 15:00:46[0m | BeamLog | [1mINFO[0m | [1mTransformed data: (11314, 23459)[0m
[32m2024-02-26 15:00:46[0m | BeamLog | [41m[1mCRITICAL[0m | [41m[1m1x2: 0.229594450049725[0m
[32m2024-02-26 15:00:46[0m | BeamLog | [1mINFO[0m | [1mTimer TfidfVectorizer.fit_transform paused. Elapsed time: 4.6792     Sec[0m


## Apply Beam's TFIDF

In [17]:
with Timer(name='BeamTFIDF.fit_transform', logger=logger) as t:
    # Create a TFIDF model
    tfidf = TFIDF(sparse_framework='torch', device=0, n_workers=1)

    # Fit the model
    vectors = tfidf.fit_transform(x, index)

    logger.info(f"Transformed data: {vectors.shape}")

    try:
        logger.critical(f"1x2: {(vectors[0].to_dense() * vectors[1].to_dense()).sum()}")
    except AttributeError:
        logger.critical(f"1x2: {(vectors[0].toarray() * vectors[1].toarray()).sum()}")

[32m2024-02-26 15:27:59[0m | BeamLog | [1mINFO[0m | [1mStarting timer: BeamTFIDF.fit_transform[0m
[32m2024-02-26 15:27:59[0m | BeamLog | [1mINFO[0m | [1mStarting transformer process: self[0m
[32m2024-02-26 15:27:59[0m | BeamLog | [1mINFO[0m | [1mSplitting data to chunks for transformer: self[0m
[32m2024-02-26 15:27:59[0m | BeamLog | [1mINFO[0m | [1mStarting transformer: self with 1 workers. Number of queued tasks is 1.[0m
[32m2024-02-26 15:27:59[0m | BeamLog | [1mINFO[0m | [1mStarting task: 0 (self)[0m
[32m2024-02-26 15:28:01[0m | BeamLog | [1mINFO[0m | [1mFinished task: 0 (self). Elapsed time: 2.813175678253174[0m
[32m2024-02-26 15:28:01[0m | BeamLog | [1mINFO[0m | [1mRunning queue (length=1) on the main thread: self with 1 worker[0m
[32m2024-02-26 15:28:01[0m | BeamLog | [1mINFO[0m | [1mFinish running queue: self.[0m
[32m2024-02-26 15:28:01[0m | BeamLog | [1mINFO[0m | [1mFinished transformer process: self. Collating results...[0m


In [18]:
sim = tfidf.search(q[:10], k=2)

[32m2024-02-26 15:28:08[0m | BeamLog | [1mINFO[0m | [1mStarting transformer process: self[0m
[32m2024-02-26 15:28:08[0m | BeamLog | [1mINFO[0m | [1mSplitting data to chunks for transformer: self[0m
[32m2024-02-26 15:28:08[0m | BeamLog | [1mINFO[0m | [1mStarting transformer: self with 1 workers. Number of queued tasks is 1.[0m
[32m2024-02-26 15:28:08[0m | BeamLog | [1mINFO[0m | [1mStarting task: 0 (self)[0m
[32m2024-02-26 15:28:08[0m | BeamLog | [1mINFO[0m | [1mFinished task: 0 (self). Elapsed time: 0.00810098648071289[0m
[32m2024-02-26 15:28:08[0m | BeamLog | [1mINFO[0m | [1mRunning queue (length=1) on the main thread: self with 1 worker[0m
[32m2024-02-26 15:28:08[0m | BeamLog | [1mINFO[0m | [1mFinish running queue: self.[0m
[32m2024-02-26 15:28:08[0m | BeamLog | [1mINFO[0m | [1mFinished transformer process: self. Collating results...[0m


In [19]:
sim

Similarities(index=array([['/root/scikit_learn_data/20news_home/20news-bydate-train/sci.electronics/53691',
        '/root/scikit_learn_data/20news_home/20news-bydate-train/sci.electronics/53894'],
       ['/root/scikit_learn_data/20news_home/20news-bydate-train/sci.crypt/15605',
        '/root/scikit_learn_data/20news_home/20news-bydate-train/talk.politics.misc/176993'],
       ['/root/scikit_learn_data/20news_home/20news-bydate-train/alt.atheism/51241',
        '/root/scikit_learn_data/20news_home/20news-bydate-train/alt.atheism/51240'],
       ['/root/scikit_learn_data/20news_home/20news-bydate-train/alt.atheism/51319',
        '/root/scikit_learn_data/20news_home/20news-bydate-train/alt.atheism/53323'],
       ['/root/scikit_learn_data/20news_home/20news-bydate-train/alt.atheism/51179',
        '/root/scikit_learn_data/20news_home/20news-bydate-train/alt.atheism/51181'],
       ['/root/scikit_learn_data/20news_home/20news-bydate-train/sci.med/59165',
        '/root/scikit_learn_dat

## Caculate BM25 scores

In [20]:
with Timer(name='BeamTFIDF.bm25', logger=logger) as t:
    # Transform the test data
    scores = tfidf.bm25(q)

# Print the shape of the transformed data
print(scores)

[32m2024-02-26 15:28:08[0m | BeamLog | [1mINFO[0m | [1mStarting timer: BeamTFIDF.bm25[0m
[32m2024-02-26 15:28:08[0m | BeamLog | [1mINFO[0m | [1mStarting transformer process: self[0m
[32m2024-02-26 15:28:08[0m | BeamLog | [1mINFO[0m | [1mSplitting data to chunks for transformer: self[0m
[32m2024-02-26 15:28:08[0m | BeamLog | [1mINFO[0m | [1mStarting transformer: self with 1 workers. Number of queued tasks is 1.[0m
[32m2024-02-26 15:28:08[0m | BeamLog | [1mINFO[0m | [1mStarting task: 0 (self)[0m
[32m2024-02-26 15:28:12[0m | BeamLog | [1mINFO[0m | [1mFinished task: 0 (self). Elapsed time: 3.3209149837493896[0m
[32m2024-02-26 15:28:12[0m | BeamLog | [1mINFO[0m | [1mRunning queue (length=1) on the main thread: self with 1 worker[0m
[32m2024-02-26 15:28:12[0m | BeamLog | [1mINFO[0m | [1mFinish running queue: self.[0m
[32m2024-02-26 15:28:12[0m | BeamLog | [1mINFO[0m | [1mFinished transformer process: self. Collating results...[0m
[32m202

In [21]:
sparnn = SparnnSimilarity()

In [22]:
sparnn.add(vectors, index=index)

In [27]:
vq = tfidf.transform(q[:10])

[32m2024-02-26 15:28:42[0m | BeamLog | [1mINFO[0m | [1mStarting transformer process: self[0m
[32m2024-02-26 15:28:43[0m | BeamLog | [1mINFO[0m | [1mSplitting data to chunks for transformer: self[0m
[32m2024-02-26 15:28:43[0m | BeamLog | [1mINFO[0m | [1mStarting transformer: self with 1 workers. Number of queued tasks is 1.[0m
[32m2024-02-26 15:28:43[0m | BeamLog | [1mINFO[0m | [1mStarting task: 0 (self)[0m
[32m2024-02-26 15:28:43[0m | BeamLog | [1mINFO[0m | [1mFinished task: 0 (self). Elapsed time: 0.011901617050170898[0m
[32m2024-02-26 15:28:43[0m | BeamLog | [1mINFO[0m | [1mRunning queue (length=1) on the main thread: self with 1 worker[0m
[32m2024-02-26 15:28:43[0m | BeamLog | [1mINFO[0m | [1mFinish running queue: self.[0m
[32m2024-02-26 15:28:43[0m | BeamLog | [1mINFO[0m | [1mFinished transformer process: self. Collating results...[0m


In [30]:
sparnn.search(vq, k=2)

Similarities(index=array([['/root/scikit_learn_data/20news_home/20news-bydate-train/sci.electronics/53894',
        '/root/scikit_learn_data/20news_home/20news-bydate-train/talk.religion.misc/84165'],
       ['/root/scikit_learn_data/20news_home/20news-bydate-train/sci.crypt/15605',
        '/root/scikit_learn_data/20news_home/20news-bydate-train/comp.graphics/38377'],
       ['/root/scikit_learn_data/20news_home/20news-bydate-train/alt.atheism/53380',
        '/root/scikit_learn_data/20news_home/20news-bydate-train/alt.atheism/51240'],
       ['/root/scikit_learn_data/20news_home/20news-bydate-train/talk.politics.guns/54229',
        '/root/scikit_learn_data/20news_home/20news-bydate-train/alt.atheism/52499'],
       ['/root/scikit_learn_data/20news_home/20news-bydate-train/alt.atheism/51179',
        '/root/scikit_learn_data/20news_home/20news-bydate-train/alt.atheism/51191'],
       ['/root/scikit_learn_data/20news_home/20news-bydate-train/sci.med/59183',
        '/root/scikit_learn

# Apply beam dense similarity

In [31]:
from sentence_transformers import SentenceTransformer

In [32]:
device = beam_device(0)
dense_model = SentenceTransformer('BAAI/bge-base-en-v1.5', device=str(device))
d = dense_model.get_sentence_embedding_dimension()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [None]:
xd = dense_model.encode(newsgroups_train.data, show_progress_bar=True)

Batches:   0%|          | 0/354 [00:00<?, ?it/s]

In [None]:
qd = dense_model.encode(newsgroups_test.data, show_progress_bar=True)

In [40]:
dense_sim = DenseSimilarity(d=d, expected_population=len(x),
                 metric='cosine', training_device='cpu', inference_device='cpu', ram_footprint=2**8*int(1e9),
                 gpu_footprint=24*int(1e9), exact=False, nlists=None, faiss_M=None,
                 reducer='umap')

[32m2024-02-26 15:34:45[0m | BeamLog | [1mINFO[0m | [1mUsing HNSW64. Expected RAM footprint is 40.549     MB[0m


In [43]:
dense_sim.add(xd)

In [44]:
similarities = dense_sim.search(qd[:10], k=2)

In [45]:
similarities

Similarities(index=array([[ 3123, 30165],
       [ 3123, 30165],
       [ 3123, 30165],
       [ 3123, 30165],
       [ 3123, 30165],
       [ 3123, 30165],
       [ 3123, 30165],
       [ 3123, 30165],
       [ 3123, 30165],
       [ 3123, 30165]]), distance=array([[1.0000006, 1.0000006],
       [1.0000006, 1.0000006],
       [1.0000006, 1.0000006],
       [1.0000006, 1.0000006],
       [1.0000006, 1.0000006],
       [1.0000006, 1.0000006],
       [1.0000006, 1.0000006],
       [1.0000006, 1.0000006],
       [1.0000006, 1.0000006],
       [1.0000006, 1.0000006]], dtype=float32), values=None, sparse_scores=None, metric='cosine', model='faiss')