In [9]:
from pathlib import Path
from transformers import AutoModelForCausalLM, AutoTokenizer
from gradientlab.data_utils.experiment_path import get_ckpt_path_by_exp_name
import pandas as pd
from tqdm import tqdm
import torch
import torch.nn.functional as F

In [24]:
# The experiment name is the name of the directory the notebook runs from;
# it is used to look up the checkpoint location for this experiment.
exp_name = Path(".").resolve().name
ckpt_path = get_ckpt_path_by_exp_name(exp_name)

# Prefer the GPU, but fall back to CPU so the notebook still runs on a machine
# without CUDA instead of failing when the model is moved to the device.
device = "cuda" if torch.cuda.is_available() else "cpu"

# NOTE: trust_remote_code=True runs model code shipped with the checkpoint, so
# only point this at checkpoints you trust.
# eval() puts the model in inference mode before it is moved to the device.
model = AutoModelForCausalLM.from_pretrained(ckpt_path, trust_remote_code=True).eval().to(device)
tokenizer = AutoTokenizer.from_pretrained(ckpt_path)


Residual Scaling: 0.144338 (1/sqrt(2×24))
Scaled 48 parameters:
  ✓ decoder.blocks.0.attn.out_proj.weight
  ✓ decoder.blocks.0.ffn.w3.weight
  ✓ decoder.blocks.1.attn.out_proj.weight
  ✓ decoder.blocks.1.ffn.w3.weight
  ✓ decoder.blocks.2.attn.out_proj.weight
  ✓ decoder.blocks.2.ffn.w3.weight
  ✓ decoder.blocks.3.attn.out_proj.weight
  ✓ decoder.blocks.3.ffn.w3.weight
  ✓ decoder.blocks.4.attn.out_proj.weight
  ✓ decoder.blocks.4.ffn.w3.weight
  ... and 38 more



In [25]:
# Raw table, kept untouched in df_orig so its other columns stay available.
df_orig = pd.read_csv("/home/mascit/Downloads/dataset_Hanna_updated_preprocessed.csv")

# Working frame: only the 23bp guide sequence and the BE39 MELJUSO z-score,
# renamed to the generic "sequence" / "target" names used by the later cells.
column_names = {"sgRNA_23bp": "sequence", "BE39:MELJUSO:zscore": "target"}
df = df_orig[list(column_names)].rename(columns=column_names)

# Rows without a measured z-score cannot be labelled, so they are dropped.
df = df[df["target"].notna()]

In [26]:
# Binary label: 1 when the BE39 MELJUSO z-score is below -2, otherwise 0.
# The label is computed from the untouched raw column in df_orig (aligned on
# df's index) rather than from df["target"] itself. Thresholding df["target"]
# in place is not idempotent: on a second run the values are already 0/1, none
# of them is below -2, and every label would silently become 0.
df["target"] = df_orig.loc[df.index, "BE39:MELJUSO:zscore"].lt(-2).astype("int64")

In [None]:
# Alternative target preparation: round to one decimal place and floor at -4.
# This cell has no execution count. If it runs after the binarisation cell,
# the 0/1 labels pass through with their values unchanged.
def _round_and_floor(z):
    """Round ``z`` to one decimal place, then replace results below -4 with -4.0."""
    rounded = round(z, 1)
    return rounded if rounded >= -4 else -4.0


df["target"] = df["target"].apply(_round_and_floor)

In [27]:
# One dict per row ({"sequence": ..., "target": ...}); the batching loop and
# the export cell both iterate over this list.
data = df.to_dict("records")

# Number of records; displayed as the cell output and used as the loop bound.
len_data = len(data)
len_data

12032

In [31]:
# Inspect every column available in the raw table (df keeps only two of them).
df_orig.columns

Index(['sgRNA sequence', 'A_Content', 'T_Content', 'C_Content', 'G_Content',
       'GC_Content', 'Tm_Global', 'Tm_PAM_Proximal', 'Tm_Middle', 'Stem_Loop',
       'Free_Energy', 'Base_Accessibility', 'sgRNA context sequence',
       'Gene symbol', 'Ensembl gene ID', 'Ensembl transcript ID',
       'Gene strand', 'Genome assembly', 'Chromosome',
       'sgRNA genomic position', 'sgRNA strand', 'PAM', 'Edit', '# edits',
       '# silent edits', 'Nucleotide edits', 'Amino acid edits',
       'Mutation category', 'Mutation bin', 'Clinical significance',
       'On-target efficacy score', 'Match Bin I counts', 'offtargets-filter',
       'BE39:MELJUSO:zscore', 'CAS9:MELJUSO:zscore', 'BE39:A375:zscore',
       'CAS9:A375:zscore', 'BE4max:A375:zscore', 'BE39:OVCAR8:zscore',
       'BE39:HAP1:zscore', 'BE39:HA1E:zscore', 'sgRNA_23bp'],
      dtype='object')

In [28]:
batch_size = 16
idx = -1  # position along dim 1 of the hidden states that is used as the feature vector
features = []

for start in tqdm(range(0, len_data, batch_size)):
    # Wrap every sequence of the batch in the chat start/end markers.
    prompts = [
        f"<|im_start|>{record['sequence']}<|im_end|>"
        for record in data[start:start + batch_size]
    ]

    # No padding is requested, so this relies on every prompt in a batch
    # tokenizing to the same length.
    encoded = tokenizer(prompts, return_tensors="pt", add_special_tokens=False, return_attention_mask=False)
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    with torch.inference_mode():
        out = model(**encoded)

    # Assumes out.hidden_states is a single (batch, seq, dim) tensor - TODO
    # confirm against the remote-code model. The selected vectors are
    # L2-normalised along dim 1 before being stored as plain Python lists.
    pooled = F.normalize(out.hidden_states[:, idx], p=2, dim=1)
    features.extend(pooled.tolist())

100%|██████████| 752/752 [00:09<00:00, 81.62it/s]


In [29]:
# zip() stops at the shorter input, so a stale `features` list (for example
# after `data` was rebuilt without re-running the extraction cell) would
# silently drop or misalign rows. Fail loudly instead.
if len(data) != len(features):
    raise ValueError(
        f"data has {len(data)} records but features has {len(features)} vectors; "
        "re-run the feature extraction cell"
    )

# One flat row per record: the original fields plus one feature_<i> column per
# component of its feature vector.
new_ds = [
    {**item, **{f"feature_{i}": f for i, f in enumerate(feature)}}
    for item, feature in zip(data, features)
]

final_ds = pd.DataFrame(new_ds)
# The output file name records which hidden-state position (idx) was used.
final_ds.to_csv(f"/media/mascit/data/Projects/python/nucelotides_downstream/data.tmp/std_cls__ds20251123__exp20251118_0_lm_30m_idx{idx}_1.csv", index=False)