In [1]:
"""
Mount Drive onto this notebook. Note that there is a specific
file structure. In particular, we have
-- MyDrive
-- -- EVO
-- -- -- CRyPTIC_reuse_table_20231208.csv
-- -- -- h37rv_genebank_flatfile.gbff

See comments to obtain any missing files.
"""

import pickle
import os
import pandas as pd
import numpy as np

from tqdm import tqdm
from google.colab import drive
# Mount Google Drive so the EVO folder below is reachable from this runtime.
drive.mount('/content/drive')

# Root folder on Drive; the data paths built in later cells are prefixed with it.
evo_general_dir = '/content/drive/MyDrive/EVO/'

# Sub-folder name, presumably for the per-sample VCF files.
samples_dir = 'vcfs/'

# obtain via wget: ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/195/955/GCA_000195955.2_ASM19595v2/
# NOTE(review): the URL above names the GCA_ accession while the filename below
# starts with GCF_ -- confirm which of the two is intended.
h37rv_genome_file = 'GCF_000195955.2_ASM19595v2_genomic.fna'

# obtain via wget: ftp.ebi.ac.uk/pub/databases/cryptic/release_june2022/reuse/
cryptic_general_file = 'CRyPTIC_reuse_table_20231208.csv'

# note: you can use Python variables in terminal, e.g. in the next section we use
# wget -P $output_dir $vcf_file
# where output_dir and vcf_file are variables

!pip install evo-model

import torch
from evo import Evo

!pip install gumpy

import gumpy as gp
from collections import Counter

class CustomEmbedding(torch.nn.Module):
  """
    Pass-through replacement for the Evo model's unembedding layer.

    Installed as a monkey patch so that the model yields its embeddings
    instead of logits.
  """
  def unembed(self, u):
    # Identity: the input is handed back untouched.
    passthrough = u
    return passthrough


class GeneEmbeddingFactory():
  """
    Just a wrapper around Evo. Because we call Evo often enough, this
    makes our lives marginally better. It loads the model and tokenizer
    once and exposes helpers to build a sample's gene and to run the model
    on a single sequence.

    22.5 GB of GPU RAM is barely enough to perform inference on 500-length
    sequences. (Storing the model requires ~12, running inference requires ~10.)
  """
  def __init__(self,
               ref_genome, # Genome
               version='evo-1-8k-base',
               device='cuda:0',
               logits=False):
    """
      input:
      - ref_genome: Genome (presumably a gumpy Genome) that VCFs are added to
      - version: string, checkpoint name handed to Evo(...)
      - device: string, torch device the model is moved to
      - logits: bool; if False, the model's `unembed` is replaced by
        CustomEmbedding so that embeddings are returned instead of logits
    """
    self.ref_genome = ref_genome

    evo_model = Evo(version)
    self.model = evo_model.model
    self.tokenizer = evo_model.tokenizer

    self.device = device
    self.model.to(device)
    self.model.eval()

    if not logits:
      self.model.unembed = CustomEmbedding()


  def get_gene(self, vcf, gene_name):
    """
      Note: Unless you have a lot of RAM, it is not possible to
      load 12,228 whole M. tb genomes. Hence, we work on a gene-by-gene
      basis.

      input:
      - vcf: gumpy VCFFile object, M. tb VCF
      - gene_name: string

      output:
      - gene: gumpy Gene object
    """
    # Reference genome combined with this sample's VCF
    # (presumably applies the sample's variants -- confirm against gumpy docs).
    genome = self.ref_genome + vcf

    return genome.build_gene(gene_name)

  def run(self,
          sequence,
          to_cpu=True):
    """
      Tokenize `sequence` and pass it through the model as a batch of one.

      input:
      - sequence: string, handed to the tokenizer
      - to_cpu: bool; if True the result is converted to a float64 numpy
        array, otherwise the model's output tensor is returned as-is

      output:
      - embed: (batch, length, embed dim) model output; these are logits
        instead when the factory was built with logits=True
    """
    input_ids = torch.tensor(
        self.tokenizer.tokenize(sequence),
        dtype=torch.int,
    ).to(self.device).unsqueeze(0)

    # Inference only: without no_grad the forward pass records an autograd
    # graph, which wastes the little GPU memory that is left.
    with torch.no_grad():
      embed, _ = self.model(input_ids) # (batch, length, embed dim)

    if to_cpu:
      # Copy off the device *before* widening to float64 so the larger
      # copy is never allocated in GPU memory.
      embed = embed.detach().cpu().to(dtype=torch.float64).numpy()

    return embed

Mounted at /content/drive
Collecting evo-model
  Downloading evo_model-0.1.2-py3-none-any.whl (20 kB)
Collecting stripedhyena==0.2.2 (from evo-model)
  Downloading stripedhyena-0.2.2-py3-none-any.whl (30 kB)
Collecting biopython (from evo-model)
  Downloading biopython-1.83-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m51.1 MB/s[0m eta [36m0:00:00[0m
Collecting flash-attn>=2.0.0 (from stripedhyena==0.2.2->evo-model)
  Downloading flash_attn-2.5.9.post1.tar.gz (2.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m90.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting einops (from flash-attn>=2.0.0->stripedhyena==0.2.2->evo-model)
  Downloading einops-0.8.0-py3-none-any.whl (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.2/43.2 kB[0m [31m6.6 MB/s[0m eta [36m0:00:

In [2]:
"""
  To change which embeddings are being generated,
  change the following variables...
  - data_dir
  - med
"""

data_dir = 'rif_embeddings_v2/'
med = 'RIF'

# Parallel arrays: the i-th sequence belongs to the i-th unique id.
unique_ids = np.load(evo_general_dir + data_dir + 'unique_ids.npy')
seqs = np.load(evo_general_dir + data_dir + 'seqs.npy')

assert len(unique_ids) == len(seqs), "There are more unique_ids than seqs, or vice versa!"

# NOTE: unpickling can execute arbitrary code -- only load a pickle you created yourself.
# The context manager closes the file handle, which a bare open() would leave dangling.
with open(evo_general_dir + 'h37rv_genebank.pkl', 'rb') as ref_genome_file:
  ref_genome = pickle.load(ref_genome_file)

# logits=True leaves the model's unembed untouched, so the model returns logits.
gef = GeneEmbeddingFactory(ref_genome, logits=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.89k [00:00<?, ?B/s]

configuration_hyena.py:   0%|          | 0.00/3.13k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/togethercomputer/evo-1-131k-base:
- configuration_hyena.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_hyena.py:   0%|          | 0.00/5.55k [00:00<?, ?B/s]

utils.py:   0%|          | 0.00/2.87k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/togethercomputer/evo-1-131k-base:
- utils.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


engine.py:   0%|          | 0.00/13.4k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/togethercomputer/evo-1-131k-base:
- engine.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


layers.py:   0%|          | 0.00/5.39k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/togethercomputer/evo-1-131k-base:
- layers.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.py:   0%|          | 0.00/19.5k [00:00<?, ?B/s]

tokenizer.py:   0%|          | 0.00/4.40k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/togethercomputer/evo-1-131k-base:
- tokenizer.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


positional_embeddings.py:   0%|          | 0.00/4.94k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/togethercomputer/evo-1-131k-base:
- positional_embeddings.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


cache.py:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/togethercomputer/evo-1-131k-base:
- cache.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/togethercomputer/evo-1-131k-base:
- model.py
- tokenizer.py
- positional_embeddings.py
- cache.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/togethercomputer/evo-1-131k-base:
- modeling_hyena.py
- utils.py
- engine.py
- layers.py
- model.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json:   0%|          | 0.00/34.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/3.00G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/69.0 [00:00<?, ?B/s]

In [3]:
df = pd.read_csv(evo_general_dir + data_dir + 'rif_labels.csv')

# Lookup table pairing each unique id with its sequence (the two arrays are parallel).
temp = pd.DataFrame({'SEQ': seqs, 'UNIQUEID': unique_ids})

# Left join: keep every labelled row and attach its sequence by UNIQUEID.
df = pd.merge(df, temp, on='UNIQUEID', how='left')

# A left join leaves SEQ as NaN for any label without a matching sequence.
# Fail here rather than part-way through the long scoring loop.
missing_seq = df['SEQ'].isna()
assert not missing_seq.any(), f"{int(missing_seq.sum())} labelled rows have no matching sequence!"

In [4]:
from evo.scoring import prepare_batch, logits_to_logprobs

# taken from evo, for reference
def logits_to_logprobs(
    logits: torch.Tensor,
    input_ids: torch.Tensor,
    trim_bos: bool = True,
) -> torch.Tensor:
    """
    Takes in a tensor of logits of dimension (batch, length, vocab).
    Computes the log-likelihoods using a softmax along the vocab dimension.
    Uses the `input_ids` to index into the log-likelihoods and returns the likelihood
    of the provided sequence at each position with dimension (batch, length).

    When `trim_bos` is True the final prediction and the first token id are
    dropped before indexing, so the result has one position fewer than `input_ids`.

    `input_ids` may be of any integer dtype; it is cast to int64 for the gather.
    """
    softmax_logprobs = torch.log_softmax(logits, dim=-1)
    if trim_bos:
        softmax_logprobs = softmax_logprobs[:, :-1] # Remove last prediction.
        input_ids = input_ids[:, 1:] # Trim BOS added by tokenizer.
    assert softmax_logprobs.shape[1] == input_ids.shape[1], \
        "logits and input_ids disagree on sequence length"

    logprobs = torch.gather(
        softmax_logprobs,                # Gather likelihoods...
        2,                               # along the vocab dimension...
        input_ids.long().unsqueeze(-1)   # using the token ids (as int64) to index.
    ).squeeze(-1)

    return logprobs


# note: seqs should be a batch of 1, ... don't have the GPU RAM for more
# lightly modified version of score_sequences
def get_logprobs(
    seqs,
    model,
    tokenizer,
    device='cuda:0'
):
    """
    Score `seqs` with `model` and return the log-probability of each token.

    The sequences are batched with a BOS token prepended, run through the
    model without gradient tracking, and the resulting logits are converted
    with `logits_to_logprobs` (BOS trimmed). The result is returned as a
    float32 numpy array on the CPU.
    """
    batch_ids, batch_lengths = prepare_batch(seqs, tokenizer, device=device, prepend_bos=True)
    # One length entry is expected per row of the batch.
    assert(batch_ids.shape[0] == len(batch_lengths))

    with torch.inference_mode():
        raw_logits, _unused = model(batch_ids) # (batch, length, vocab)

    per_token = logits_to_logprobs(raw_logits, batch_ids, trim_bos=True)
    return per_token.float().cpu().numpy()

In [5]:
# Pull the sequence column out once; rows are then scored in order.
sequences = df['SEQ'].tolist()

logprobs = []
for i in tqdm(range(len(sequences))):
  # Each call scores a batch of exactly one sequence.
  # The [:] slice takes the whole sequence.
  scored = get_logprobs(
      [sequences[i][:]],
      gef.model,
      gef.tokenizer,
      device=gef.device
  )
  logprobs.append(scored)

100%|██████████| 12072/12072 [58:20<00:00,  3.45it/s]


In [6]:
# Persist the collected log-probabilities next to the other files in data_dir.
logprob_path = evo_general_dir + data_dir + 'rif_log_prob_0_500.npy'
np.save(logprob_path, logprobs)

In [7]:
# Sanity check: one entry was appended per row of df, so this should equal len(df).
len(logprobs)

12072