# Imports

In [None]:
%pip install -r ../requirements.txt  

In [4]:
import sys, os

sys.path.append(os.path.abspath("../src"))

import numpy as np
import pandas as pd

import torch
from torch import cuda

from collections import defaultdict

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AutoConfig,
    Trainer,
    TrainingArguments,
    PreTrainedTokenizer
)
from collections import defaultdict
import torch
from captum.attr import IntegratedGradients, NoiseTunnel
import evaluate
mcc = evaluate.load("matthews_correlation")
f1_metric = evaluate.load("f1")
accuracy_metric = evaluate.load("accuracy")
precision_metric = evaluate.load("precision")
recall_metric = evaluate.load("recall")
import inspect
from transformers_interpret import SequenceClassificationExplainer
from collections import defaultdict
from transformers import AutoConfig, AutoTokenizer
import numpy as np
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    matthews_corrcoef
)
from modeling_hyena import HyenaDNAForSequenceClassification

from collections import defaultdict
from captum.attr import IntegratedGradients, Saliency, NoiseTunnel
from datasets import Dataset, load_dataset

# Data collection

In [4]:
dataset = load_dataset("leannmlindsey/GUE", name="prom_core_all")
print("Available splits:", dataset.keys())

Available splits: dict_keys(['train', 'test', 'dev'])


In [5]:
print("Number of samples in each split:")
for split, dataset_split in dataset.items():
    sequences = dataset_split['sequence']
    avg_length = sum(len(seq) for seq in sequences) / len(sequences)
    print(f"{split}: {len(dataset_split)} samples, average sequence length: {avg_length:.2f}")

Number of samples in each split:
train: 47356 samples, average sequence length: 70.00
test: 5920 samples, average sequence length: 70.00
dev: 5920 samples, average sequence length: 70.00


# Loading model

In [40]:
model_name = "LongSafari/hyenadna-tiny-1k-seqlen-hf"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForSequenceClassification.from_pretrained(model_name, trust_remote_code=True, num_labels=2)

Some weights of HyenaDNAForSequenceClassification were not initialized from the model checkpoint at LongSafari/hyenadna-tiny-1k-seqlen-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [41]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [42]:
model.to(device)

HyenaDNAForSequenceClassification(
  (hyena): HyenaDNAModel(
    (backbone): HyenaLMBackbone(
      (embeddings): HyenaEmbeddings(
        (word_embeddings): Embedding(16, 128)
      )
      (dropout): Dropout(p=0.1, inplace=False)
      (layers): ModuleList(
        (0-1): 2 x HyenaBlock(
          (mixer): HyenaOperator(
            (dropout): Dropout(p=0.0, inplace=False)
            (in_proj): Linear(in_features=128, out_features=384, bias=True)
            (out_proj): Linear(in_features=128, out_features=128, bias=True)
            (short_filter): Conv1d(384, 384, kernel_size=(3,), stride=(1,), padding=(2,), groups=384)
            (filter_fn): HyenaFilter(
              (dropout): Dropout(p=0.0, inplace=False)
              (pos_emb): HyenaPositionalEmbedding()
              (implicit_filter): Sequential(
                (0): Linear(in_features=5, out_features=64, bias=True)
                (1): HyenaSin()
                (2): Linear(in_features=64, out_features=64, bias=True)
  

# Training

In [None]:
def preprocess_function(examples):
    return tokenizer(examples['sequence'], truncation=True, padding=True)

In [9]:
encoded_dataset = dataset.map(preprocess_function, batched=True)
train_dataset = encoded_dataset["train"]
test_dataset = encoded_dataset["test"]
val_dataset = encoded_dataset["dev"]

Map:   0%|          | 0/47356 [00:00<?, ? examples/s]

Map:   0%|          | 0/5920 [00:00<?, ? examples/s]

Map:   0%|          | 0/5920 [00:00<?, ? examples/s]

In [10]:
def compute_metrics(p):
    logits = p.predictions
    if isinstance(logits, tuple):
        logits = logits[0]
    preds = np.argmax(logits, axis=1)
    labels = p.label_ids

    return {
        "accuracy": accuracy_score(labels, preds),
        "precision": precision_score(labels, preds),
        "recall": recall_score(labels, preds),
        "f1": f1_score(labels, preds),
        "mcc": matthews_corrcoef(labels, preds),
    }

## Classic Fine-Tuning

#### Experiment 1 -- best model F1 = 0.82
epochs 10, lr 6e-4, lr scheduler linear, bs64

In [46]:
training_args = TrainingArguments(
    output_dir="./results/hyenadna-tiny-1k-seqlen-promoter",
    label_names=["labels"],
    num_train_epochs=10,                
    learning_rate=6e-4,                  
    optim="adamw_torch",
    weight_decay=0.01,
    lr_scheduler_type="linear",         
    warmup_ratio=0.1,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    fp16=True,
    save_total_limit=3,
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    save_safetensors=False,         
)


trainer = Trainer(
    model = model, 
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,   
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.can_return_loss = True
trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Mcc
1,0.4744,0.449067,0.786824,0.843263,0.707956,0.769708,0.581692
2,0.4257,0.428839,0.803041,0.820431,0.779121,0.799242,0.606972
3,0.4054,0.419246,0.809122,0.787023,0.850957,0.817742,0.620105
4,0.3884,0.422016,0.805068,0.792562,0.829809,0.810758,0.610685
5,0.3707,0.419422,0.810642,0.810702,0.813696,0.812196,0.621263
6,0.3498,0.418055,0.813514,0.799234,0.840551,0.819372,0.627738
7,0.3185,0.434708,0.809628,0.802614,0.824438,0.81338,0.619406
8,0.277,0.477532,0.800507,0.787772,0.826116,0.806489,0.601594
9,0.228,0.53631,0.798311,0.78707,0.821417,0.803876,0.597068
10,0.1772,0.610527,0.793243,0.794364,0.794898,0.794631,0.586468


TrainOutput(global_step=7400, training_loss=0.341518507261534, metrics={'train_runtime': 197.8185, 'train_samples_per_second': 2393.911, 'train_steps_per_second': 37.408, 'total_flos': 87614994954240.0, 'train_loss': 0.341518507261534, 'epoch': 10.0})

In [47]:
model.save_pretrained(
    "./results/hyenadna-tiny-1k-seqlen-promoter-10ep-lr6e-4-linear-bs64",
    safe_serialization=False
)

tokenizer.save_pretrained(
    "./results/hyenadna-tiny-1k-seqlen-promoter-10ep-lr6e-4-linear-bs64"
)


('./results/hyenadna-tiny-1k-seqlen-promoter-10ep-lr6e-4-linear-bs64\\tokenizer_config.json',
 './results/hyenadna-tiny-1k-seqlen-promoter-10ep-lr6e-4-linear-bs64\\special_tokens_map.json',
 './results/hyenadna-tiny-1k-seqlen-promoter-10ep-lr6e-4-linear-bs64\\added_tokens.json')

 [7400/7400 03:19, Epoch 10/10]


In [48]:
test_metrics = trainer.evaluate(eval_dataset=test_dataset)
print("=== Test metrics ===")
for k, v in test_metrics.items():
    print(f"{k}: {v:.4f}")

=== Test metrics ===
eval_loss: 0.4013
eval_accuracy: 0.8167
eval_precision: 0.8024
eval_recall: 0.8416
eval_f1: 0.8216
eval_mcc: 0.6342
eval_runtime: 0.8965
eval_samples_per_second: 6603.3390
eval_steps_per_second: 103.7350
epoch: 10.0000


#### Experiment 2
epochs 10, lr 5e-4, lr scheduler linear, bs32

In [49]:
training_args2 = TrainingArguments(
    output_dir="./results/hyenadna-tiny-1k-seqlen-promoter2",
    label_names=["labels"],
    num_train_epochs=10,                
    learning_rate=5e-4,                  
    optim="adamw_torch",
    weight_decay=0.01,
    lr_scheduler_type="linear",         
    warmup_ratio=0.1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    fp16=True,
    save_total_limit=3,
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    save_safetensors=False,         
)


trainer2 = Trainer(
    model = model, 
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,   
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer2.can_return_loss = True
trainer2.train()

  trainer2 = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Mcc
1,0.3238,0.447361,0.797635,0.803201,0.791876,0.797498,0.59535
2,0.3489,0.432583,0.804561,0.822592,0.779792,0.80062,0.610073
3,0.3265,0.454681,0.797466,0.770352,0.851292,0.808802,0.597998
4,0.2921,0.470336,0.796115,0.801361,0.790869,0.79608,0.592299
5,0.251,0.531072,0.803209,0.796988,0.817053,0.806895,0.606533
6,0.2052,0.56359,0.796284,0.803284,0.788184,0.795662,0.592706
7,0.1518,0.655575,0.793243,0.785738,0.810003,0.797686,0.586671
8,0.1068,0.768097,0.791723,0.791,0.796576,0.793778,0.583424
9,0.0679,0.936919,0.785811,0.786216,0.788855,0.787534,0.571598
10,0.0379,1.10911,0.785473,0.788196,0.784491,0.786339,0.570948


TrainOutput(global_step=7400, training_loss=0.21118232933250633, metrics={'train_runtime': 197.7669, 'train_samples_per_second': 2394.536, 'train_steps_per_second': 37.418, 'total_flos': 87614994954240.0, 'train_loss': 0.21118232933250633, 'epoch': 10.0})

In [50]:
model.save_pretrained(
    "./results/hyenadna-tiny-1k-seqlen-promoter-10ep-lr5e-4-linear-bs32",
    safe_serialization=False
)

tokenizer.save_pretrained(
    "./results/hyenadna-tiny-1k-seqlen-promoter-10ep-lr5e-4-linear-bs32"
)


('./results/hyenadna-tiny-1k-seqlen-promoter-10ep-lr5e-4-linear-bs32\\tokenizer_config.json',
 './results/hyenadna-tiny-1k-seqlen-promoter-10ep-lr5e-4-linear-bs32\\special_tokens_map.json',
 './results/hyenadna-tiny-1k-seqlen-promoter-10ep-lr5e-4-linear-bs32\\added_tokens.json')

 [7400/7400 03:23, Epoch 10/10]


In [51]:
test_metrics = trainer2.evaluate(eval_dataset=test_dataset)
print("=== Test metrics ===")
for k, v in test_metrics.items():
    print(f"{k}: {v:.4f}")

=== Test metrics ===
eval_loss: 0.4429
eval_accuracy: 0.8027
eval_precision: 0.7793
eval_recall: 0.8460
eval_f1: 0.8113
eval_mcc: 0.6076
eval_runtime: 1.3285
eval_samples_per_second: 4456.0200
eval_steps_per_second: 70.0020
epoch: 10.0000


#### Experiment 3
epochs 10, lr 2e-5, lr scheduler cosine, bs64

In [52]:
training_args3 = TrainingArguments(
    output_dir="./results/hyenadna-tiny-1k-seqlen-promoter3",
    label_names=["labels"],
    num_train_epochs=10,                
    learning_rate=2e-5,                  
    optim="adamw_torch",
    weight_decay=0.01,
    lr_scheduler_type="cosine",         
    warmup_ratio=0.1,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    fp16=True,
    save_total_limit=3,
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    save_safetensors=False,         
)


trainer3 = Trainer(
    model = model, 
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,   
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer3.can_return_loss = True
trainer3.train()

  trainer3 = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Mcc
1,0.2662,0.475093,0.800338,0.810577,0.787177,0.798706,0.600987
2,0.2913,0.460298,0.794932,0.80233,0.78617,0.794168,0.590023
3,0.2621,0.518748,0.784966,0.763434,0.829809,0.795239,0.571869
4,0.2278,0.55673,0.786993,0.768605,0.825109,0.795856,0.575345
5,0.1812,0.63178,0.791385,0.790473,0.796576,0.793513,0.58275
6,0.1396,0.694781,0.788345,0.790768,0.787848,0.789306,0.576686
7,0.0957,0.844527,0.783108,0.779242,0.793891,0.786498,0.566247
8,0.0649,1.023687,0.779561,0.776603,0.788855,0.782681,0.559129
9,0.0374,1.185608,0.784122,0.783973,0.788184,0.786073,0.568219
10,0.0167,1.288606,0.785473,0.792337,0.777442,0.784819,0.571083


TrainOutput(global_step=7400, training_loss=0.15828348649514687, metrics={'train_runtime': 199.8985, 'train_samples_per_second': 2369.002, 'train_steps_per_second': 37.019, 'total_flos': 87614994954240.0, 'train_loss': 0.15828348649514687, 'epoch': 10.0})

In [53]:
model.save_pretrained(
    "./results/hyenadna-tiny-1k-seqlen-promoter-10ep-lr2e-5-cosine-bs32",
    safe_serialization=False
)

tokenizer.save_pretrained(
    "./results/hyenadna-tiny-1k-seqlen-promoter-10ep-lr2e-5-cosine-bs32",
)


('./results/hyenadna-tiny-1k-seqlen-promoter-10ep-lr2e-5-cosine-bs32\\tokenizer_config.json',
 './results/hyenadna-tiny-1k-seqlen-promoter-10ep-lr2e-5-cosine-bs32\\special_tokens_map.json',
 './results/hyenadna-tiny-1k-seqlen-promoter-10ep-lr2e-5-cosine-bs32\\added_tokens.json')

 [7400/7400 03:23, Epoch 10/10]


In [54]:
test_metrics = trainer3.evaluate(eval_dataset=test_dataset)
print("=== Test metrics ===")
for k, v in test_metrics.items():
    print(f"{k}: {v:.4f}")

=== Test metrics ===
eval_loss: 0.4830
eval_accuracy: 0.7944
eval_precision: 0.8084
eval_recall: 0.7732
eval_f1: 0.7904
eval_mcc: 0.5895
eval_runtime: 0.9936
eval_samples_per_second: 5958.2460
eval_steps_per_second: 93.6010
epoch: 10.0000


#### Experiment 4
epochs 10, lr 5e-4, lr scheduler linear, bs8

In [55]:
training_args3 = TrainingArguments(
    output_dir="./results/hyenadna-tiny-1k-seqlen-promoter4",
    label_names=["labels"],
    num_train_epochs=10,                
    learning_rate=5e-4,                  
    optim="adamw_torch",
    weight_decay=0.01,
    lr_scheduler_type="linear",         
    warmup_ratio=0.1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    fp16=True,
    save_total_limit=3,
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    save_safetensors=False,         
)


trainer3 = Trainer(
    model = model, 
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,   
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer3.can_return_loss = True
trainer3.train()

  trainer4 = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Mcc
1,0.239,0.515105,0.783446,0.804231,0.752937,0.777739,0.568223
2,0.2833,0.471888,0.8,0.811957,0.784156,0.797814,0.600428
3,0.2513,0.514518,0.785642,0.771947,0.814703,0.792749,0.572003
4,0.2142,0.585591,0.786993,0.780719,0.801947,0.791191,0.574109
5,0.1684,0.657293,0.790709,0.787129,0.800604,0.793809,0.581442
6,0.1285,0.713919,0.788514,0.801186,0.771064,0.785836,0.577526
7,0.0861,0.865254,0.782264,0.778327,0.793219,0.785702,0.56456
8,0.0585,1.026394,0.784966,0.780223,0.797247,0.788644,0.569992
9,0.0313,1.185943,0.783277,0.788239,0.778449,0.783314,0.566615
10,0.0152,1.285922,0.786318,0.789723,0.784156,0.786929,0.572651


TrainOutput(global_step=7400, training_loss=0.14758749446353397, metrics={'train_runtime': 198.4493, 'train_samples_per_second': 2386.303, 'train_steps_per_second': 37.289, 'total_flos': 87614994954240.0, 'train_loss': 0.14758749446353397, 'epoch': 10.0})

In [56]:
model.save_pretrained(
    "./results/hyenadna-tiny-1k-seqlen-promoter-10ep-lr5e-4-linear-bs8",
    safe_serialization=False
)

tokenizer.save_pretrained(
    "./results/hyenadna-tiny-1k-seqlen-promoter-10ep-lr5e-4-linear-bs8",
)


('./results/hyenadna-tiny-1k-seqlen-promoter-10ep-lr5e-4-linear-bs8\\tokenizer_config.json',
 './results/hyenadna-tiny-1k-seqlen-promoter-10ep-lr5e-4-linear-bs8\\special_tokens_map.json',
 './results/hyenadna-tiny-1k-seqlen-promoter-10ep-lr5e-4-linear-bs8\\added_tokens.json')

 [7400/7400 03:19, Epoch 10/10]

In [57]:
test_metrics = trainer3.evaluate(eval_dataset=test_dataset)
print("=== Test metrics ===")
for k, v in test_metrics.items():
    print(f"{k}: {v:.4f}")

=== Test metrics ===
eval_loss: 0.4613
eval_accuracy: 0.8008
eval_precision: 0.8153
eval_recall: 0.7793
eval_f1: 0.7969
eval_mcc: 0.6023
eval_runtime: 0.9632
eval_samples_per_second: 6146.1090
eval_steps_per_second: 96.5520
epoch: 10.0000


## LoRa

In [None]:
base_model = HyenaDNAForSequenceClassification.from_pretrained(
    "LongSafari/hyenadna-tiny-1k-seqlen-hf",
    trust_remote_code=True,
    num_labels=2,
)
tokenizer = AutoTokenizer.from_pretrained(
    "LongSafari/hyenadna-tiny-1k-seqlen-hf",
    trust_remote_code=True,
)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model.to(device)

The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
Some weights of HyenaDNAForSequenceClassification were not initialized from the model checkpoint at LongSafari/hyenadna-tiny-1k-seqlen-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


HyenaDNAForSequenceClassification(
  (hyena): HyenaDNAModel(
    (backbone): HyenaLMBackbone(
      (embeddings): HyenaEmbeddings(
        (word_embeddings): Embedding(16, 128)
      )
      (dropout): Dropout(p=0.1, inplace=False)
      (layers): ModuleList(
        (0-1): 2 x HyenaBlock(
          (mixer): HyenaOperator(
            (dropout): Dropout(p=0.0, inplace=False)
            (in_proj): Linear(in_features=128, out_features=384, bias=True)
            (out_proj): Linear(in_features=128, out_features=128, bias=True)
            (short_filter): Conv1d(384, 384, kernel_size=(3,), stride=(1,), padding=(2,), groups=384)
            (filter_fn): HyenaFilter(
              (dropout): Dropout(p=0.0, inplace=False)
              (pos_emb): HyenaPositionalEmbedding()
              (implicit_filter): Sequential(
                (0): Linear(in_features=5, out_features=64, bias=True)
                (1): HyenaSin()
                (2): Linear(in_features=64, out_features=64, bias=True)
  

In [10]:
for name, param in base_model.named_parameters():
    print(f"{name}: {param.shape}")

hyena.backbone.embeddings.word_embeddings.weight: torch.Size([16, 128])
hyena.backbone.layers.0.mixer.in_proj.weight: torch.Size([384, 128])
hyena.backbone.layers.0.mixer.in_proj.bias: torch.Size([384])
hyena.backbone.layers.0.mixer.out_proj.weight: torch.Size([128, 128])
hyena.backbone.layers.0.mixer.out_proj.bias: torch.Size([128])
hyena.backbone.layers.0.mixer.short_filter.weight: torch.Size([384, 1, 3])
hyena.backbone.layers.0.mixer.short_filter.bias: torch.Size([384])
hyena.backbone.layers.0.mixer.filter_fn.bias: torch.Size([128])
hyena.backbone.layers.0.mixer.filter_fn.implicit_filter.0.weight: torch.Size([64, 5])
hyena.backbone.layers.0.mixer.filter_fn.implicit_filter.0.bias: torch.Size([64])
hyena.backbone.layers.0.mixer.filter_fn.implicit_filter.1.freq: torch.Size([1, 64])
hyena.backbone.layers.0.mixer.filter_fn.implicit_filter.2.weight: torch.Size([64, 64])
hyena.backbone.layers.0.mixer.filter_fn.implicit_filter.2.bias: torch.Size([64])
hyena.backbone.layers.0.mixer.filter_fn

In [11]:
base_model 

HyenaDNAForSequenceClassification(
  (hyena): HyenaDNAModel(
    (backbone): HyenaLMBackbone(
      (embeddings): HyenaEmbeddings(
        (word_embeddings): Embedding(16, 128)
      )
      (dropout): Dropout(p=0.1, inplace=False)
      (layers): ModuleList(
        (0-1): 2 x HyenaBlock(
          (mixer): HyenaOperator(
            (dropout): Dropout(p=0.0, inplace=False)
            (in_proj): Linear(in_features=128, out_features=384, bias=True)
            (out_proj): Linear(in_features=128, out_features=128, bias=True)
            (short_filter): Conv1d(384, 384, kernel_size=(3,), stride=(1,), padding=(2,), groups=384)
            (filter_fn): HyenaFilter(
              (dropout): Dropout(p=0.0, inplace=False)
              (pos_emb): HyenaPositionalEmbedding()
              (implicit_filter): Sequential(
                (0): Linear(in_features=5, out_features=64, bias=True)
                (1): HyenaSin()
                (2): Linear(in_features=64, out_features=64, bias=True)
  

In [12]:
full_target_modules = [
    # Mixer layers
    "hyena.backbone.layers.0.mixer.in_proj",
    "hyena.backbone.layers.0.mixer.out_proj",
    "hyena.backbone.layers.1.mixer.in_proj",
    "hyena.backbone.layers.1.mixer.out_proj",
    "hyena.backbone.layers.2.mixer.in_proj",
    "hyena.backbone.layers.2.mixer.out_proj",
    "hyena.backbone.layers.3.mixer.in_proj",
    "hyena.backbone.layers.3.mixer.out_proj",
    
    # MLP (FeedforwardNetwork) layers
    "hyena.backbone.layers.0.mlp.fc1",
    "hyena.backbone.layers.0.mlp.fc2",
    "hyena.backbone.layers.1.mlp.fc1",
    "hyena.backbone.layers.1.mlp.fc2",
    "hyena.backbone.layers.2.mlp.fc1",
    "hyena.backbone.layers.2.mlp.fc2",
    "hyena.backbone.layers.3.mlp.fc1",
    "hyena.backbone.layers.3.mlp.fc2",
]
mixer_only = [
    "hyena.backbone.layers.0.mixer.in_proj",
    "hyena.backbone.layers.0.mixer.out_proj",
    "hyena.backbone.layers.1.mixer.in_proj",
    "hyena.backbone.layers.1.mixer.out_proj",
    "hyena.backbone.layers.2.mixer.in_proj",
    "hyena.backbone.layers.2.mixer.out_proj",
    "hyena.backbone.layers.3.mixer.in_proj",
    "hyena.backbone.layers.3.mixer.out_proj",
]
ffn_only = [
    "hyena.backbone.layers.0.mlp.fc1",
    "hyena.backbone.layers.0.mlp.fc2",
    "hyena.backbone.layers.1.mlp.fc1",
    "hyena.backbone.layers.1.mlp.fc2",
    "hyena.backbone.layers.2.mlp.fc1",
    "hyena.backbone.layers.2.mlp.fc2",
    "hyena.backbone.layers.3.mlp.fc1",
    "hyena.backbone.layers.3.mlp.fc2",
]

In [13]:
from peft import get_peft_model, LoraConfig, TaskType
from transformers import TrainingArguments, Trainer
base_model = base_model

### Experiment 1: full modules, 10 epochs, BS=64, LR=5e-4

In [12]:
peft_config1 = LoraConfig(
    task_type=TaskType.SEQ_CLS, inference_mode=False,
    r=8, lora_alpha=32, lora_dropout=0.1,
    target_modules=full_target_modules,
)
model1 = get_peft_model(base_model, peft_config1)
model1.print_trainable_parameters()

training_args1 = TrainingArguments(
    output_dir="./results/exp1_full_10ep_5e-4",
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    num_train_epochs=10,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs/exp1_full_10ep_5e-4",
    logging_steps=10,
    learning_rate=5e-4,
    weight_decay=0.01,
    save_safetensors=False

)
trainer1 = Trainer(
    model=model1,
    args=training_args1,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)
trainer1.train()
model1 = model1.unload()  

trainable params: 33,024 || all params: 469,376 || trainable%: 7.0357


No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
wandb: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
wandb: Currently logged in as: romanbokhyan (romanbokhyan-hse-university). Use `wandb login --relogin` to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Mcc
1,0.4532,0.430749,0.801182,0.831982,0.757972,0.793255,0.605004
2,0.4043,0.42346,0.798649,0.84064,0.740181,0.787219,0.601954
3,0.3916,0.416961,0.809797,0.786399,0.853978,0.818796,0.6217
4,0.3807,0.426959,0.807601,0.804434,0.816046,0.810198,0.615216
5,0.3493,0.426677,0.809459,0.802946,0.823431,0.813059,0.619045
6,0.4067,0.423571,0.809966,0.805135,0.821081,0.81303,0.619991
7,0.342,0.430819,0.814696,0.822702,0.805304,0.81391,0.62957
8,0.3074,0.442434,0.808108,0.80483,0.816717,0.81073,0.616232
9,0.3415,0.446895,0.805405,0.807679,0.804968,0.806321,0.610807
10,0.2967,0.462842,0.80625,0.811353,0.801276,0.806283,0.612563


 [7400/7400 02:58, Epoch 10/10]


### Experiment 2: mixer only, 10 epochs, BS=64, LR=5e-4

In [14]:
peft_config2 = LoraConfig(
    task_type=TaskType.SEQ_CLS, inference_mode=False,
    r=8, lora_alpha=32, lora_dropout=0.1,
    target_modules=mixer_only,
)
model2 = get_peft_model(base_model, peft_config2)
model2.print_trainable_parameters()

training_args2 = TrainingArguments(
    output_dir="./results/exp3_mixer_only_10ep_5e-4",
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    num_train_epochs=10,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs/exp3_mixer_only_10ep_5e-4",
    logging_steps=10,
    learning_rate=5e-4,
    weight_decay=0.01,
    save_safetensors=False

)
trainer2 = Trainer(
    model=model2,
    args=training_args2,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)
trainer2.train()
model2 = model2.unload()

trainable params: 12,544 || all params: 448,896 || trainable%: 2.7944


No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
wandb: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
wandb: Currently logged in as: romanbokhyan (romanbokhyan-hse-university). Use `wandb login --relogin` to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Mcc
1,0.4594,0.434438,0.802703,0.824436,0.772407,0.797574,0.606774
2,0.412,0.422418,0.806081,0.812991,0.798254,0.805556,0.612293
3,0.3955,0.421325,0.805912,0.788462,0.839543,0.813201,0.612951
4,0.4101,0.42077,0.808953,0.815789,0.801276,0.808467,0.618033
5,0.3609,0.423496,0.808108,0.796207,0.831487,0.813465,0.616704
6,0.4238,0.417598,0.814696,0.814085,0.818731,0.816402,0.629373
7,0.3645,0.424635,0.813851,0.81674,0.812353,0.814541,0.627709
8,0.3653,0.424576,0.813007,0.809319,0.822088,0.815654,0.626039
9,0.3691,0.426708,0.813851,0.81998,0.807318,0.813599,0.6278
10,0.3388,0.428368,0.812838,0.819174,0.805975,0.812521,0.625781


 [7400/7400 02:37, Epoch 10/10]


### Experiment 3: ffn only, 10 epochs, BS=64, LR=5e-4

In [17]:
peft_config3 = LoraConfig(
    task_type=TaskType.SEQ_CLS, inference_mode=False,
    r=8, lora_alpha=32, lora_dropout=0.1,
    target_modules=ffn_only,
)
model3 = get_peft_model(base_model, peft_config3)
model3.print_trainable_parameters()

training_args3 = TrainingArguments(
    output_dir="./results/exp4_ffn_only_5ep_6e-4",
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    num_train_epochs=10,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs/exp4_ffn_only_5ep_6e-4",
    logging_steps=10,
    learning_rate=5e-4,
    weight_decay=0.01,
    save_safetensors=False

)
trainer3 = Trainer(
    model=model3,
    args=training_args3,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)
trainer3.train()
model3 = model3.unload()



trainable params: 20,736 || all params: 457,088 || trainable%: 4.5365


No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Mcc
1,0.4583,0.447005,0.790541,0.816989,0.752266,0.783293,0.583133
2,0.4107,0.434286,0.797804,0.808946,0.783149,0.795838,0.595982
3,0.4096,0.433834,0.801858,0.779567,0.84525,0.811081,0.605668
4,0.4087,0.433757,0.80625,0.813699,0.797583,0.80556,0.612655
5,0.3689,0.435935,0.808446,0.803754,0.819402,0.811503,0.616946
6,0.4389,0.432795,0.808108,0.805031,0.816381,0.810667,0.616227
7,0.361,0.433374,0.810135,0.810512,0.812689,0.811599,0.62025
8,0.3729,0.436383,0.806081,0.800657,0.818395,0.809429,0.612243
9,0.3952,0.434857,0.809966,0.81445,0.805975,0.810191,0.619976
10,0.3482,0.437095,0.810135,0.815584,0.804632,0.810071,0.620344


 [7400/7400 02:28, Epoch 10/10]


### Experiment 4: short_filter only, 10 epochs, BS=64, LR=5e-4 

In [18]:
short_filter_only = [f"hyena.backbone.layers.{i}.mixer.short_filter" for i in range(4)]
peft_config4 = LoraConfig(
    task_type=TaskType.SEQ_CLS, inference_mode=False,
    r=8, lora_alpha=32, lora_dropout=0.1,
    target_modules=short_filter_only,
)
model4 = get_peft_model(base_model, peft_config4)
model4.print_trainable_parameters()

training_args4 = TrainingArguments(
    output_dir="./results/exp5_short_filter_only_5ep_6e-4",
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    num_train_epochs=10,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs/exp5_short_filter_only_5ep_6e-4",
    logging_steps=10,
    learning_rate=5e-4,
    weight_decay=0.01,
    save_safetensors=False
)

trainer4 = Trainer(
    model=model4,
    args=training_args4,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)
trainer4.train()
model4 = model4.unload()



trainable params: 24,832 || all params: 461,184 || trainable%: 5.3844


No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Mcc
1,0.4554,0.430216,0.798142,0.821094,0.765693,0.792427,0.597822
2,0.4124,0.426351,0.807601,0.806462,0.812689,0.809564,0.615186
3,0.4144,0.444446,0.792568,0.743668,0.896945,0.813147,0.597599
4,0.4095,0.435201,0.795777,0.795789,0.799261,0.797521,0.591531
5,0.3633,0.417335,0.812162,0.804568,0.827795,0.816016,0.624503
6,0.4178,0.410098,0.817061,0.804432,0.840886,0.822255,0.63466
7,0.3536,0.410344,0.816385,0.82464,0.806647,0.815544,0.632959
8,0.3578,0.412064,0.817905,0.817574,0.821417,0.819491,0.635792
9,0.3542,0.415035,0.819426,0.801262,0.852635,0.826151,0.640022
10,0.3209,0.414623,0.818581,0.817394,0.823431,0.820401,0.637148


 [7400/7400 02:36, Epoch 10/10]


### Experiment 5: FFN modules, 10 epochs, BS=32, LR=5e-4, r=16, r-alpha = 16

In [19]:
peft_config5 = LoraConfig(
    task_type=TaskType.SEQ_CLS, inference_mode=False,
    r=16, lora_alpha=16, lora_dropout=0.1,
    target_modules=ffn_only, bias='all'
)
model5 = get_peft_model(base_model, peft_config5)
model5.print_trainable_parameters()

training_args5 = TrainingArguments(
    output_dir="./results/exp4_ffn_only_5ep_6e-4",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=10,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs/exp4_ffn_only_5ep_6e-4",
    logging_steps=10,
    learning_rate=5e-4,
    weight_decay=0.01,
    save_safetensors=False,
    label_names=["labels"],
)
trainer5 = Trainer(
    model=model5,
    args=training_args5,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)
trainer5.train()
model5 = model5.unload()



trainable params: 45,568 || all params: 477,568 || trainable%: 9.5417


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Mcc
1,0.4868,0.435273,0.805068,0.82278,0.780799,0.80124,0.611053
2,0.4356,0.430312,0.805068,0.798691,0.819067,0.80875,0.610256
3,0.4029,0.437333,0.797297,0.759405,0.874119,0.812734,0.601184
4,0.4015,0.431223,0.802196,0.790862,0.825109,0.807623,0.60484
5,0.3702,0.43286,0.800845,0.786442,0.829473,0.807384,0.602447
6,0.3958,0.424653,0.808446,0.81092,0.807654,0.809284,0.616891
7,0.3482,0.427731,0.809122,0.821342,0.793219,0.807036,0.618679
8,0.3545,0.427939,0.808446,0.80236,0.821752,0.81194,0.616999
9,0.3781,0.428002,0.809966,0.809413,0.814032,0.811715,0.619913
10,0.3313,0.433793,0.810811,0.814552,0.807989,0.811257,0.621646


 [14800/14800 05:28, Epoch 10/10]


### Experiment 6: embeddings only, 10 epochs, BS=64, LR=5e-4

In [21]:
embeddings_only = ["hyena.backbone.embeddings.word_embeddings"]
peft_config6 = LoraConfig(
    task_type=TaskType.SEQ_CLS, inference_mode=False,
    r=16, lora_alpha=32, lora_dropout=0.1,
    target_modules=embeddings_only,
)
model6 = get_peft_model(base_model, peft_config6)
model6.print_trainable_parameters()

training_args6 = TrainingArguments(
    output_dir="./results/exp7_embeddings_only_5ep_6e-4",
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    num_train_epochs=10,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs/exp7_embeddings_only_5ep_6e-4",
    logging_steps=10,
    learning_rate=5e-4,
    weight_decay=0.01,
    save_safetensors=False,
    label_names=["labels"],

)
trainer6 = Trainer(
    model=model6,
    args=training_args6,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,

)
trainer6.train()
model6 = model6.unload()



trainable params: 2,560 || all params: 438,912 || trainable%: 0.5833


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Mcc
1,0.4706,0.465025,0.786149,0.811115,0.74958,0.779135,0.574164
2,0.447,0.45025,0.788514,0.785078,0.798254,0.791611,0.577046
3,0.45,0.446567,0.792399,0.786509,0.806311,0.796287,0.584901
4,0.462,0.444129,0.797973,0.798061,0.801276,0.799665,0.595924
5,0.4053,0.443595,0.795439,0.804828,0.783484,0.794013,0.591142
6,0.4685,0.442061,0.795439,0.798246,0.794226,0.796231,0.590882
7,0.3985,0.440228,0.801182,0.805838,0.796912,0.80135,0.602414
8,0.4126,0.441922,0.802534,0.798483,0.812689,0.805523,0.605103
9,0.4171,0.440292,0.800169,0.806067,0.793891,0.799932,0.60043
10,0.4199,0.438195,0.800338,0.809934,0.788184,0.798911,0.600948


 [7400/7400 02:25, Epoch 10/10]


### Experiment 7: mixer only, 12 epochs, BS=8, LR=3e-4

In [20]:
peft_config7 = LoraConfig(
    task_type=TaskType.TOKEN_CLS, inference_mode=False,
    r=16, lora_alpha=32, lora_dropout=0.1,
    target_modules=mixer_only, bias='all'
)
model7 = get_peft_model(base_model, peft_config7)
model7.print_trainable_parameters()

training_args7 = TrainingArguments(
    output_dir="./results/exp7_embeddings_only_5ep_6e-4",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=12,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs/exp7_embeddings_only_5ep_6e-4",
    logging_steps=10,
    learning_rate=3e-4,
    weight_decay=0.01,
    save_safetensors=False,
    label_names=["labels"],

)
trainer7 = Trainer(
    model=model7,
    args=training_args7,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,

)
trainer7.train()
model7 = model7.unload()



trainable params: 29,184 || all params: 461,184 || trainable%: 6.3281


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Mcc
1,0.4289,0.428381,0.805912,0.817708,0.790534,0.803891,0.612234
2,0.3459,0.430634,0.807264,0.793798,0.833501,0.813165,0.615167
3,0.4469,0.450373,0.807095,0.785514,0.848271,0.815688,0.615971
4,0.481,0.441327,0.808446,0.808425,0.811682,0.81005,0.616871
5,0.4131,0.451517,0.810135,0.802216,0.826452,0.814153,0.620468
6,0.2876,0.446828,0.809291,0.81082,0.810003,0.810411,0.618568
7,0.2912,0.487171,0.808953,0.83698,0.770393,0.802307,0.620075
8,0.444,0.482913,0.805405,0.809346,0.802283,0.805799,0.61084
9,0.4301,0.492555,0.80777,0.808375,0.810003,0.809188,0.615521
10,0.4239,0.505958,0.804561,0.801656,0.812689,0.807135,0.609129


 [71040/71040 23:49, Epoch 12/12]


# Interpretation methods

In [11]:
dir_best = "./results/hyenadna-tiny-1k-seqlen-promoter-10ep-lr6e-4-linear-bs64"
config = AutoConfig.from_pretrained(dir_best, trust_remote_code=True)
config.num_labels = 2
best_model = HyenaDNAForSequenceClassification.from_pretrained(
    dir_best,
    config=config,
    trust_remote_code=True
)

best_tokenizer = AutoTokenizer.from_pretrained(
    dir_best,
    trust_remote_code=True
)

if best_tokenizer.pad_token is None:
    best_tokenizer.add_special_tokens({"pad_token": best_tokenizer.sep_token})
    best_model.resize_token_embeddings(len(best_tokenizer))

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


In [12]:
def new_build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
    cls = [self.cls_token_id]
    sep = [self.sep_token_id]
    result = cls + token_ids_0 + sep
    if token_ids_1 is not None:
        result += token_ids_1 + sep
    return result
best_tokenizer.build_inputs_with_special_tokens = new_build_inputs_with_special_tokens.__get__(best_tokenizer, PreTrainedTokenizer)
best_model.to(device)


HyenaDNAForSequenceClassification(
  (hyena): HyenaDNAModel(
    (backbone): HyenaLMBackbone(
      (embeddings): HyenaEmbeddings(
        (word_embeddings): Embedding(16, 128)
      )
      (dropout): Dropout(p=0.1, inplace=False)
      (layers): ModuleList(
        (0-1): 2 x HyenaBlock(
          (mixer): HyenaOperator(
            (dropout): Dropout(p=0.0, inplace=False)
            (in_proj): Linear(in_features=128, out_features=384, bias=True)
            (out_proj): Linear(in_features=128, out_features=128, bias=True)
            (short_filter): Conv1d(384, 384, kernel_size=(3,), stride=(1,), padding=(2,), groups=384)
            (filter_fn): HyenaFilter(
              (dropout): Dropout(p=0.0, inplace=False)
              (pos_emb): HyenaPositionalEmbedding()
              (implicit_filter): Sequential(
                (0): Linear(in_features=5, out_features=64, bias=True)
                (1): HyenaSin()
                (2): Linear(in_features=64, out_features=64, bias=True)
  

## IG

In [13]:
print(inspect.signature(SequenceClassificationExplainer.__init__))

(self, model: transformers.modeling_utils.PreTrainedModel, tokenizer: transformers.tokenization_utils.PreTrainedTokenizer, attribution_type: str = 'lig', custom_labels: Optional[List[str]] = None)


### Token-wise

In [None]:
explainer = SequenceClassificationExplainer(best_model, best_tokenizer)

true_positives = []
for example in dataset["dev"]:
    seq = example["sequence"]
    inputs = best_tokenizer(
        seq,
        truncation=True,
        padding=True,
        return_tensors="pt"
    ).to(device)
    with torch.no_grad():
        logits = best_model(**inputs).logits
    pred = logits.argmax(dim=-1).item()
    true = int(example["label"])
    if true == 1 and pred == 1:
        true_positives.append(seq)

print(f"Found {len(true_positives)} true positives.")

token_scores = defaultdict(list)
for seq in true_positives:
    attributions = explainer(seq, n_steps=50)
    for token, score in attributions:
        token_scores[token].append(score)

for token, scores in token_scores.items():
    avg_score = sum(scores) / len(scores)
    print(f"{token:}: {avg_score:}")


Found 2512 true positives.
       [CLS]: +0.0000
           G: +0.0401
           C: -0.0049
           A: -0.0336
           T: +0.0184
       [SEP]: +0.0000


2m 9.7s

### K-mer, k = 5

In [28]:
explainer = SequenceClassificationExplainer(best_model, best_tokenizer)

true_positives = []
for example in dataset["dev"]:
    seq = example["sequence"]
    inputs = best_tokenizer(
        seq,
        truncation=True,
        padding=True,
        return_tensors="pt"
    ).to(device)
    with torch.no_grad():
        logits = best_model(**inputs).logits
    pred = logits.argmax(dim=-1).item()
    true = int(example["label"])
    if true == 1 and pred == 1:
        true_positives.append(seq)

print(f"Found {len(true_positives)} true positives.")

k = 5
kmer_scores = defaultdict(list)

for seq in true_positives:
    attributions = explainer(seq, n_steps=50)
    tokens, scores = zip(*attributions)
    specials = set(best_tokenizer.all_special_tokens)
    filtered = [(t, s) for t, s in zip(tokens, scores) if t not in specials]
    tokens, scores = zip(*filtered)

    for i in range(len(tokens) - k + 1):
        kmer = "".join(tokens[i: i + k])
        kmer_score = sum(scores[i: i + k]) / k
        kmer_scores[kmer].append(kmer_score)

sorted_kmers = sorted(
    ((kmer, sum(scores) / len(scores)) for kmer, scores in kmer_scores.items()),
    key=lambda x: x[1],
    reverse=True
)

for kmer, avg_score in sorted_kmers:
    print(f"{kmer:>5}: {avg_score:+.4f}")


Found 2512 true positives.
GGCTG: +0.0652
GCTGG: +0.0635
GCTGC: +0.0617
TTTCC: +0.0614
CTGCT: +0.0610
CTTCC: +0.0584
TTCCG: +0.0581
TTCCT: +0.0574
TGCTG: +0.0566
GGGAG: +0.0555
TTTCT: +0.0551
AGCTG: +0.0544
TGGCT: +0.0542
TTCCC: +0.0529
CTGGC: +0.0529
CTTTT: +0.0523
CGGAG: +0.0514
GGGCT: +0.0508
TGCTT: +0.0501
TTGCT: +0.0499
CTGGG: +0.0497
GGAGG: +0.0495
CGCTG: +0.0480
GCGGA: +0.0477
CTGCG: +0.0474
GGAGC: +0.0468
CTTTC: +0.0467
CTGCC: +0.0460
CAGAG: +0.0459
GGTGG: +0.0456
GGCTT: +0.0456
TGGGC: +0.0454
GCTGA: +0.0451
GGCGG: +0.0450
TTTTT: +0.0449
TGGAG: +0.0448
TGGGA: +0.0447
TGGGG: +0.0437
CAGTT: +0.0436
GCCGG: +0.0434
GGCAG: +0.0432
CTGAG: +0.0426
TTTTC: +0.0423
GCCTG: +0.0421
GAGAG: +0.0421
AGTGG: +0.0418
CGGGA: +0.0415
TGGCG: +0.0414
GGGGA: +0.0414
GAGCT: +0.0412
AGAGA: +0.0409
CTGGA: +0.0409
TGTGG: +0.0409
TGCGG: +0.0408
TTTTG: +0.0405
GGGGG: +0.0404
GTGGG: +0.0404
CTGCA: +0.0403
TTGGC: +0.0402
GAGGG: +0.0401
GTGGC: +0.0400
GCTTG: +0.0400
CAGCT: +0.0399
CTTGG: +0.0399
TTCTT: +0.039

2m 9.7s

In [29]:
df_result = pd.DataFrame(sorted_kmers, columns=["kmer", "avg_score"])
df_result.to_csv("5mers_interpretation_IGseqclass.csv", index=False)
print("Results saved to sorted_kmers.csv")

Results saved to sorted_kmers.csv


## Smoothed Integrated Gradients (IG with noise)

### K-mer, k = 5

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
best_model.to(device).eval()

embedding_layer = best_model.get_input_embeddings()

def forward_embeddings(inputs_embeds, attention_mask):
    outputs = best_model(
        inputs_embeds=inputs_embeds,
        attention_mask=attention_mask
    )
    return outputs.logits

ig = IntegratedGradients(forward_embeddings)
nt = NoiseTunnel(ig)

true_positives = []
for ex in dataset["dev"]:
    seq = ex["sequence"]
    enc = best_tokenizer(seq, truncation=True, padding=True, return_tensors="pt").to(device)
    with torch.no_grad():
        pred = best_model(**enc).logits.argmax(dim=-1).item()
    if int(ex["label"]) == 1 and pred == 1:
        true_positives.append(seq)

print(f"Found {len(true_positives)} true positives.")

k = 5
kmer_scores = defaultdict(list)

for seq in true_positives:
    enc = best_tokenizer(seq, truncation=True, padding=True, return_tensors="pt").to(device)
    input_ids = enc["input_ids"]
    mask = enc.get("attention_mask", (input_ids != best_tokenizer.pad_token_id).long())

    with torch.no_grad():
        pred = best_model(**enc).logits.argmax(dim=-1).item()

    embeds = embedding_layer(input_ids)  # [1, seq_len, emb_dim]
    baseline = torch.zeros_like(embeds)  

    attributions = nt.attribute(
        inputs=embeds,
        baselines=baseline,
        target=pred,
        nt_type="smoothgrad",
        nt_samples=50,
        stdevs=0.02,
        additional_forward_args=(mask,),
    )

    token_attr = attributions.sum(dim=-1).squeeze(0).cpu().tolist()
    tokens = best_tokenizer.convert_ids_to_tokens(input_ids.squeeze(0))

    tokens = tokens[1:-1]
    token_attr = token_attr[1:-1]

    for i in range(len(tokens) - k + 1):
        kmer = "".join(tokens[i: i + k])
        kmer_score = sum(token_attr[i: i + k]) / k
        kmer_scores[kmer].append(kmer_score)

sorted_kmers = sorted(
    ((kmer, sum(v) / len(v)) for kmer, v in kmer_scores.items()),
    key=lambda x: x[1],
    reverse=True
)

for kmer, avg in sorted_kmers:
    print(f"{kmer:>5}: {avg:+.4f}")


Found 2512 true positives.
GCTGG: +0.0912
CTGGG: +0.0875
GGTGG: +0.0852
CGCGG: +0.0825
TGGGG: +0.0825
GCGGG: +0.0808
GTTGG: +0.0807
TGGGA: +0.0805
GCGCG: +0.0804
GGAAG: +0.0801
GCCGG: +0.0798
CGCGC: +0.0788
GGGTG: +0.0786
GTGGG: +0.0784
CCGGG: +0.0781
GGCTG: +0.0780
CGGGG: +0.0769
GGGAG: +0.0766
GGTTG: +0.0765
TGGTG: +0.0756
GGCGG: +0.0755
CCCGG: +0.0754
TGGAG: +0.0752
TTTTT: +0.0743
GGCGC: +0.0742
TGCTG: +0.0739
GCGGA: +0.0739
CGCCG: +0.0737
GGGGG: +0.0725
TTGGG: +0.0723
CCGCG: +0.0722
CTGGA: +0.0721
GGAGG: +0.0716
GGGGA: +0.0715
TGGCG: +0.0715
GGGCG: +0.0712
GCTGC: +0.0711
CCTGG: +0.0709
TGCGG: +0.0692
CGCTG: +0.0691
GCGCC: +0.0691
CCGGA: +0.0689
GTTTG: +0.0683
TGTGG: +0.0683
TGTTG: +0.0681
CGGAG: +0.0678
CTGGT: +0.0674
GGAGC: +0.0671
GTTGC: +0.0671
GGGAA: +0.0669
CGGCG: +0.0667
CGGGA: +0.0667
GTGGA: +0.0664
GGCCG: +0.0663
GCCGC: +0.0661
TGGTT: +0.0660
CGGTG: +0.0659
GGGGT: +0.0658
GCTTG: +0.0655
CTGCG: +0.0655
TTGGA: +0.0654
TTTGG: +0.0653
TGGAA: +0.0653
CGGGC: +0.0652
GCGGC: +0.065

9m 49.2s

In [27]:
df_result = pd.DataFrame(sorted_kmers, columns=["kmer", "avg"])
df_result.to_csv("5mers_interpretation_smoothgrad.csv", index=False)
print("Results saved to sorted_kmers.csv")

Results saved to sorted_kmers.csv


## Interpets arrangment

In [32]:
import pandas as pd

def get_ranked_features(fw: pd.DataFrame, id_col: str = "kmer") -> pd.DataFrame:
    num = fw.drop(columns=[id_col])
    means = num.mean()
    dev = num.sub(means).div(means.abs()) * 100
    dev[id_col]    = fw[id_col]
    dev["mean_dev"] = dev.drop(columns=[id_col]).mean(axis=1)
    return dev[[id_col, "mean_dev"]].sort_values("mean_dev", ascending=False)

df_ig = (
    pd.read_csv("5mers_interpretation_IGseqclass.csv")
      .rename(columns={"avg_score": "IG_imp"})
)
df_sg = (
    pd.read_csv("5mers_interpretation_smoothgrad.csv")
      .rename(columns={"avg":      "SG_imp"})
)

merged = df_ig.merge(df_sg, on="kmer")

merged = merged[(merged["IG_imp"] > 0) & (merged["SG_imp"] > 0)]

ranked = get_ranked_features(merged)

ranked.to_csv("ranking2_IG_vs_SG_corrected.csv", index=False)
print("saved ranking2_IG_vs_SG_corrected.csv")


saved ranking2_IG_vs_SG_corrected.csv
