In [146]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [147]:
# import transformers
# model = transformers.AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
# tokenizer = transformers.AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')

In [179]:
omics_model_cfg = {
    "embedding_dim": 64,
    "hidden_dim": 128,
    "num_layers": 2,
    "num_heads": 4,
    "use_self_attention": False,
    "activation": "relu",
    "dropout": 0.1,
}

In [180]:
from sentence_transformers.models import MMContextEncoder

model = MMContextEncoder(
    text_encoder_name="sentence-transformers/all-MiniLM-L6-v2",
    omics_processor_name="none",
    omics_encoder_cfg=omics_model_cfg,
)

In [181]:
from sentence_transformers import SentenceTransformer

modules = [model]
bimodal_model = SentenceTransformer(modules=modules)

In [183]:
bimodal_model.save("../../../data/models/mmcontext_model")

In [153]:
model = SentenceTransformer("../../../data/models/mmcontext_model")

In [154]:
# from sentence_transformers.dataclass import omicsSample
import numpy as np
import random


def simulate_omics_dataset(
    num_samples: int,
    num_features: int,
    mean: float = 0.0,
    std_dev: float = 1.0,
    zero_fraction: float = 0.5,  # Fraction of values to set to zero
) -> list[dict]:
    """
    Simulates a dataset of omicsSample instances, with random zeros in counts.

    Parameters
    ----------
    num_samples : int
        Number of samples to simulate.
    num_features : int
        Number of features per sample.
    mean : float, optional
        Mean of the normal distribution used to generate counts, default is 0.0.
    std_dev : float, optional
        Standard deviation of the normal distribution used to generate counts, default is 1.0.
    zero_fraction : float, optional
        Fraction of the counts to randomly set to zero, default is 0.1.

    Returns
    -------
    list[omicsSample]
        A list of simulated omicsSample instances.
    """
    dataset = []

    for sample_id in range(1, num_samples + 1):
        # Generate random counts from a normal distribution
        counts = np.random.normal(loc=mean, scale=std_dev, size=num_features)
        counts = np.abs(counts)  # Make all counts positive
        # Randomly set some values to zero
        num_zeros = int(zero_fraction * num_features)
        zero_indices = np.random.choice(range(num_features), size=num_zeros, replace=False)
        counts[zero_indices] = 0

        # Generate random feature IDs
        featureIDs = [f"{random.randint(10000, 99999)}" for _ in range(num_features)]

        # Create a mapping from feature ID to index
        feature_to_idx = {feature_id: idx for idx, feature_id in enumerate(featureIDs)}

        # Create an omicsSample instance
        sample = {
            "counts": counts,
            "featureIDs": featureIDs,
            "feature_to_idx": feature_to_idx,
            "sample_id": str(sample_id),
            "is_omics": True,
        }

        dataset.append(sample)

    return dataset

In [155]:
# Example usage
num_samples = 5
num_features = 64
simulated_omicsSamples = simulate_omics_dataset(num_samples, num_features)

model.encode(
    [simulated_omicsSamples[0], simulated_omicsSamples[1], simulated_omicsSamples[2], simulated_omicsSamples[3]],
    batch_size=2,
    device="mps",
)
# model.encode(["This is a whole sentence. Maybe it is long, or not.","This is a another sentence.","one more","and another one"], batch_size=2)

array([[ 6.27828777e-01, -1.15590954e+00,  1.05809890e-01,
         1.28279865e+00,  4.61147904e-01, -1.90415710e-01,
        -1.07769382e+00, -7.64065146e-01, -1.16255488e-02,
        -6.28843963e-01, -1.49026692e+00, -8.07196319e-01,
        -2.64787376e-01,  2.47096324e+00,  2.00408340e+00,
        -9.25594747e-01, -3.15756679e-01, -6.21650279e-01,
        -7.46325791e-01, -4.23527420e-01, -4.95181233e-01,
         1.22440822e-01, -7.53941596e-01, -1.19791830e+00,
        -1.58664241e-01, -1.39167786e+00,  2.17830229e+00,
        -1.06094408e+00,  3.27068120e-01,  1.35503501e-01,
        -3.46453160e-01,  2.45105553e+00, -5.24826586e-01,
        -5.78065217e-01,  2.00095668e-01, -8.13486993e-01,
         2.14783698e-01, -5.67257524e-01,  1.22807288e+00,
         5.68531632e-01, -3.15219849e-01, -7.14145839e-01,
        -7.98902035e-01, -5.05987227e-01,  1.91066980e+00,
        -4.39112276e-01,  4.58433598e-01,  1.34201753e+00,
        -5.36090076e-01,  2.69287825e+00, -7.41880596e-0

In [156]:
# from sentence_transformers.dataclass import omicsSample
class InputExample:
    """Structure for one input example with texts, the label and a unique id"""

    def __init__(self, guid: str = "", texts: list[dict, str] = None, label: int | float = 0):
        """
        Creates one InputExample with the given texts, guid and label

        Args:
            guid: id for the example
            texts: the texts for the example.
            label: the label for the example
        """
        self.guid = guid
        self.texts = texts
        self.label = label

    def __str__(self):
        return "<InputExample> label: {}, texts: {}".format(str(self.label), "; ".join(self.texts))

In [168]:
simulated_dataset = []
for idx in range(0, len(simulated_omicsSamples) - 1, 2):
    # We can use image pairs directly. Because our images aren't labeled, we use a random label as an example
    # train_dataset.append(InputExample(texts=[photos[idx], photos[idx + 1]], label=random.choice([0, 1])))

    # Or images and text together
    # simulated_dataset.append(InputExample(texts=[simulated_omicsSamples[idx], simulated_omicsSamples[idx+1]], label=1))
    simulated_dataset.append(
        InputExample(texts=[simulated_omicsSamples[idx], "This is another unrelated caption"], label=0)
    )
    simulated_dataset.append(InputExample(texts=[simulated_omicsSamples[idx], "This is a related caption"], label=1))
    # simulated_dataset.append(InputExample(texts=["This is just text", "this is related text"], label = 1))
    # simulated_dataset.append(InputExample(texts=["This is just text", "this is unrelated text"], label = 0))

In [169]:
# We'll create a DataLoader that batches our data and prepare a contrastive loss function
from sentence_transformers import losses
from torch.utils.data import DataLoader

train_dataloader = DataLoader(simulated_dataset, shuffle=True, batch_size=4)
train_loss = losses.ContrastiveLoss(model=model)

In [170]:
model.fit([(train_dataloader, train_loss)], epochs=5, show_progress_bar=True)


[A
  0%|          | 0/5 [03:13<?, ?it/s]
100%|██████████| 5/5 [00:20<00:00,  2.82s/it]
100%|██████████| 5/5 [00:20<00:00,  4.10s/it]

{'train_runtime': 20.4788, 'train_samples_per_second': 0.977, 'train_steps_per_second': 0.244, 'train_loss': 0.21001834869384767, 'epoch': 5.0}





In [172]:
model.similarity(simulated_omicsSamples[0], "This is a similar caption")

RuntimeError: Could not infer dtype of dict

In [190]:
from transformers.utils.import_utils import is_accelerate_available

is_accelerate_available("0.28.0")

True

In [153]:
from transformers.utils.import_utils import _accelerate_available, _accelerate_version

print(_accelerate_available)  # Should be True if the package is installed
print(_accelerate_version)  # Should print the version string, e.g., "0.26.0"

True
1.3.0


In [151]:
from transformers.utils.import_utils import _is_package_available

print(_is_package_available("accelerate", return_version=True))

(True, '1.3.0')


In [152]:
import importlib
import transformers.utils.import_utils

importlib.reload(transformers.utils.import_utils)

<module 'transformers.utils.import_utils' from '/Users/mengerj/repos/sentence-transformers/.venv/lib/python3.12/site-packages/transformers/utils/import_utils.py'>

In [149]:
import sys

print(sys.executable)  # Path to the current Python interpreter
print(sys.path)  # Directories being searched for packages

/Users/mengerj/repos/sentence-transformers/.venv/bin/python
['/Users/mengerj/.local/share/uv/python/cpython-3.12.8-macos-aarch64-none/lib/python312.zip', '/Users/mengerj/.local/share/uv/python/cpython-3.12.8-macos-aarch64-none/lib/python3.12', '/Users/mengerj/.local/share/uv/python/cpython-3.12.8-macos-aarch64-none/lib/python3.12/lib-dynload', '', '/Users/mengerj/repos/sentence-transformers/.venv/lib/python3.12/site-packages']


In [133]:
import importlib


def _is_package_available(pkg_name: str, return_version: bool = False) -> tuple[bool, str] | bool:
    # Check if the package spec exists and grab its version to avoid importing a local directory
    package_exists = importlib.util.find_spec(pkg_name) is not None
    package_version = "N/A"
    if package_exists:
        try:
            # Primary method to get the package version
            package_version = importlib.metadata.version(pkg_name)
        except importlib.metadata.PackageNotFoundError:
            # Fallback method: Only for "torch" and versions containing "dev"
            if pkg_name == "torch":
                try:
                    package = importlib.import_module(pkg_name)
                    temp_version = getattr(package, "__version__", "N/A")
                    # Check if the version contains "dev"
                    if "dev" in temp_version:
                        package_version = temp_version
                        package_exists = True
                    else:
                        package_exists = False
                except ImportError:
                    # If the package can't be imported, it's not available
                    package_exists = False
            else:
                # For packages other than "torch", don't attempt the fallback and set as not available
                package_exists = False
        print(f"Detected {pkg_name} version: {package_version}")
    if return_version:
        return package_exists, package_version
    else:
        return package_exists

In [173]:
from sentence_transformers import models
from PIL import Image

clip = models.CLIPModel()
model = SentenceTransformer(modules=[clip])
# Encode an image:
img_emb = model.encode(Image.open("two_dogs_in_snow.jpg"))

In [174]:
photo = Image.open("two_dogs_in_snow.jpg")

In [175]:
train_dataset = []
# We can use image pairs directly. Because our images aren't labeled, we use a random label as an example
# train_dataset.append(InputExample(texts=[photos[idx], photos[idx + 1]], label=random.choice([0, 1])))

# Or images and text together
train_dataset.append(InputExample(texts=[photo, "This is the caption"], label=1))
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=4)
train_loss = losses.ContrastiveLoss(model=model)

In [176]:
model.fit([(train_dataloader, train_loss)], epochs=5, show_progress_bar=True)

  0%|          | 0/5 [05:52<?, ?it/s]                                
100%|██████████| 5/5 [00:00<00:00,  6.38it/s]

{'train_runtime': 0.788, 'train_samples_per_second': 6.345, 'train_steps_per_second': 6.345, 'train_loss': 0.3352612018585205, 'epoch': 5.0}





In [178]:
model.similarity(photo, "This is a similar caption")

RuntimeError: Could not infer dtype of JpegImageFile