In [1]:
import pandas as pd
import numpy as np
import pickle

# Load data from the pickle file
file_path = '/content/drive/MyDrive/MLRG/processed_data.pkl'
with open(file_path, 'rb') as f:
    data = pickle.load(f)
# Restore species integer values
data['species'] = data['species'].apply(lambda x: np.argmax(x))

# Base encodings
base_encodings = {
    (1, 0, 0, 0): "A",
    (0, 1, 0, 0): "T",
    (0, 0, 1, 0): "C",
    (0, 0, 0, 1): "G"
}

# Function to decode the sequence and omit [0, 0, 0, 0]
def decode_sequence(encoded_seq):
    return ''.join([base_encodings[tuple(base)] for base in encoded_seq if tuple(base) in base_encodings])

# Restore upstream200 sequences
data['upstream200'] = data['upstream200'].apply(decode_sequence)

# Restore stress_name integer values
data['stress_name'] = data['stress_name'].apply(lambda x: np.argmax(x))

# Show the restored data
print(data.head())
%load_ext autoreload
%autoreload 2

   species                                        upstream200    stress  \
0        0  CCTTCCAAGCTTACGACGAGGGTTCGATTCCCTTCACCCGCTCCAA...  0.033641   
0        0  CCTTCCAAGCTTACGACGAGGGTTCGATTCCCTTCACCCGCTCCAA...  0.013922   
0        0  CCTTCCAAGCTTACGACGAGGGTTCGATTCCCTTCACCCGCTCCAA... -0.806374   
0        0  CCTTCCAAGCTTACGACGAGGGTTCGATTCCCTTCACCCGCTCCAA... -0.026784   
0        0  CCTTCCAAGCTTACGACGAGGGTTCGATTCCCTTCACCCGCTCCAA...  0.922333   

   stress_name  
0           10  
0            1  
0            4  
0            0  
0            8  


In [2]:
%cd /content/drive/MyDrive/MLRG

/content/drive/MyDrive/MLRG


In [3]:
! pip install stripedhyena==0.2.2

Collecting stripedhyena==0.2.2
  Downloading stripedhyena-0.2.2-py3-none-any.whl (30 kB)
Collecting flash-attn>=2.0.0 (from stripedhyena==0.2.2)
  Downloading flash_attn-2.6.1.tar.gz (2.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting einops (from flash-attn>=2.0.0->stripedhyena==0.2.2)
  Downloading einops-0.8.0-py3-none-any.whl (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.2/43.2 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->flash-attn>=2.0.0->stripedhyena==0.2.2)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->flash-attn>=2.0.0->stripedhyena==0.2.2)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cud

# 10-test

In [4]:
import torch
from torch import nn, optim
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import pickle
import numpy as np
from torch.utils.data import Dataset, DataLoader

class StressPredictor(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim=1):
        super(StressPredictor, self).__init__()
        self.fc1 = nn.Linear(input_dim + 2, hidden_dim)
        self.relu1 = nn.LeakyReLU()
        self.dropout1 = nn.Dropout(0.4)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim // 2)
        self.relu2 = nn.LeakyReLU()
        self.dropout2 = nn.Dropout(0.4)
        self.fc3 = nn.Linear(hidden_dim // 2, hidden_dim // 4)
        self.relu3 = nn.LeakyReLU()
        self.fc4 = nn.Linear(hidden_dim // 4, output_dim)
        self.initialize_weights()

    def initialize_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.kaiming_uniform_(m.weight, nonlinearity='leaky_relu')
                nn.init.zeros_(m.bias)

    def forward(self, hidden_state, species, stress_name):
        hidden_state = hidden_state.squeeze(1)
        species = species.view(-1, 1)
        stress_name = stress_name.view(-1, 1)
        x = torch.cat((hidden_state, species, stress_name), dim=1)
        x = self.fc1(x)
        x = self.relu1(x)
        x = self.dropout1(x)
        x = self.fc2(x)
        x = self.relu2(x)
        x = self.dropout2(x)
        x = self.fc3(x)
        x = self.relu3(x)
        x = self.fc4(x)
        return x

class HiddenStatesDataset(Dataset):
    def __init__(self, hidden_states_list, scaler):
        self.hidden_states_list = hidden_states_list
        self.scaler = scaler

    def __len__(self):
        return len(self.hidden_states_list)

    def __getitem__(self, idx):
        hidden_state, species, stress_name, stress = self.hidden_states_list[idx]
        hidden_state = hidden_state.to(torch.float32).cpu().numpy()
        hidden_state = self.scaler.transform(hidden_state)
        return torch.tensor(hidden_state, dtype=torch.float32), torch.tensor(species, dtype=torch.float32), torch.tensor(stress_name, dtype=torch.float32), torch.tensor(stress, dtype=torch.float32)



In [5]:
filtered_data = data[data['species'] == 0]
for index, row in filtered_data.head(1).iterrows():
    print(row['upstream200'])

CCTTCCAAGCTTACGACGAGGGTTCGATTCCCTTCACCCGCTCCAAGCAGTACACATGCCCATGTGGCTCAGTGGTAGAGCACTCCCTTGGTAAGGGAGAGGTCGCGCGTTCGATCCGCGCCATGGGCACCACAAATTCCAAAGTCTTTTCTTCAGTCAGCAGCTTCAGTCAAAAGCGCAATCCAGGTCAGGAGTCAGCCATG


In [6]:
import torch
import pickle
import pandas as pd
from evo import Evo

# Function to save hidden states
def save_hidden_states(data, tokenizer, model, device, output_file):
    hidden_states_list = []

    for idx, row in data.iterrows():
        sequence = row['upstream200']
        input_ids = torch.tensor(
            tokenizer.tokenize(sequence),
            dtype=torch.int,
        ).to(device).unsqueeze(0)

        with torch.no_grad():
            _ = model(input_ids)

        hidden_state = hidden_states[-1][0].mean(dim=1).to(torch.bfloat16)
        hidden_states_list.append((hidden_state.cpu(), row['species'], row['stress_name'], row['stress']))

    with open(output_file, 'wb') as f:
        pickle.dump(hidden_states_list, f)



    # Filter data where species == 0
data = data[data['species'] == 0].head(10)

    # Load the evo model
device = 'cuda:0'
evo_model = Evo('evo-1-131k-base')
model, tokenizer = evo_model.model, evo_model.tokenizer
model.to(device)
model.eval()

import torch.nn as nn
def register_hooks(model):
    hooks = []
    hidden_states = []

    def hook_fn(module, input, output):
        hidden_states.append(output)

    for name, module in model.named_modules():
        if isinstance(module, nn.Module):
            hooks.append(module.register_forward_hook(hook_fn))

    return hooks, hidden_states

global hidden_states
hooks, hidden_states = register_hooks(model)

    # Save hidden states
output_file = '/content/drive/MyDrive/MLRG/hidden_states.pkl'
save_hidden_states(data, tokenizer, model, device, output_file)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.90k [00:00<?, ?B/s]

configuration_hyena.py:   0%|          | 0.00/3.13k [00:00<?, ?B/s]

modeling_hyena.py:   0%|          | 0.00/5.55k [00:00<?, ?B/s]

layers.py:   0%|          | 0.00/5.39k [00:00<?, ?B/s]

utils.py:   0%|          | 0.00/2.87k [00:00<?, ?B/s]

model.py:   0%|          | 0.00/19.5k [00:00<?, ?B/s]

engine.py:   0%|          | 0.00/13.4k [00:00<?, ?B/s]

positional_embeddings.py:   0%|          | 0.00/4.94k [00:00<?, ?B/s]

cache.py:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

tokenizer.py:   0%|          | 0.00/4.40k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/34.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/3.00G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/69.0 [00:00<?, ?B/s]

In [10]:
def train_model(predictor, dataloader, criterion, optimizer, num_epochs=25, accumulation_steps=4):
    for epoch in range(num_epochs):
        predictor.train()
        running_loss = 0.0
        optimizer.zero_grad()

        for i, (hidden_state, species, stress_name, stress) in enumerate(dataloader):
            hidden_state, species, stress_name, stress = hidden_state.to(device), species.to(device), stress_name.to(device), stress.to(device)
            outputs = predictor(hidden_state, species, stress_name).squeeze(1)
            # Ensure stress has the same shape as outputs
            stress = stress.view_as(outputs)
            loss = criterion(outputs, stress)
            loss.backward()

            if (i + 1) % accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()

            running_loss += loss.item()

        epoch_loss = running_loss / len(dataloader)
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}')

    return predictor




In [18]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, explained_variance_score, max_error

def evaluate_model(predictor, dataloader):
    predictor.eval()
    actuals = []
    predictions = []

    with torch.no_grad():
        for hidden_state, species, stress_name, stress in dataloader:
            hidden_state, species, stress_name, stress = hidden_state.to(device), species.to(device), stress_name.to(device), stress.to(device)
            outputs = predictor(hidden_state, species, stress_name).squeeze(1)
            # Ensure stress has the same shape as outputs
            stress = stress.view_as(outputs)
            actuals.append(stress.cpu().numpy())
            predictions.append(outputs.cpu().numpy())

    actuals = [item for sublist in actuals for item in sublist]
    predictions = [item for sublist in predictions for item in sublist]

    mse = mean_squared_error(actuals, predictions)
    r2 = r2_score(actuals, predictions)
    mae = mean_absolute_error(actuals, predictions)
    evs = explained_variance_score(actuals, predictions)
    max_err = max_error(actuals, predictions)

    print(f'MSE: {mse:.4f}, R2: {r2:.4f}, MAE: {mae:.4f}, Explained Variance Score: {evs:.4f}, Max Error: {max_err:.4f}')

def main():
    input_file = '/content/drive/MyDrive/MLRG/hidden_states.pkl'
    with open(input_file, 'rb') as f:
        hidden_states_list = pickle.load(f)

    hidden_states = [item[0].to(torch.float32).cpu().numpy().squeeze(0) for item in hidden_states_list]
    scaler = StandardScaler().fit(hidden_states)

    dataset = HiddenStatesDataset(hidden_states_list, scaler)
    dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

    device = 'cuda:0'
    hidden_dim = 512
    predictor = StressPredictor(input_dim=hidden_dim, hidden_dim=512).to(device)

    criterion = nn.MSELoss()
    optimizer = optim.AdamW(predictor.parameters(), lr=0.0005, weight_decay=1e-4)

    predictor = train_model(predictor, dataloader, criterion, optimizer, num_epochs=10, accumulation_steps=4)

    evaluate_model(predictor, dataloader)

if __name__ == "__main__":
    main()


Epoch 1/10, Loss: 0.6532
Epoch 2/10, Loss: 0.8968
Epoch 3/10, Loss: 0.3560
Epoch 4/10, Loss: 0.3974
Epoch 5/10, Loss: 0.5471
Epoch 6/10, Loss: 0.8269
Epoch 7/10, Loss: 0.5731
Epoch 8/10, Loss: 0.7208
Epoch 9/10, Loss: 0.9176
Epoch 10/10, Loss: 0.5594
MSE: 0.4347, R2: -0.1566, MAE: 0.5430, Explained Variance Score: 0.0230, Max Error: 1.1175
