# Classification

In [47]:
import pandas as pd

data = pd.read_csv("train_data.tsv", delimiter='\t')


https://huggingface.co/docs/transformers/en/model_doc/esm

other models to check: https://huggingface.co/facebook/esm2_t36_3B_UR50D

(na razie tylko dla klasyfikacji)

In [49]:
import torch
from transformers import AutoTokenizer, EsmForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t6_8M_UR50D")
model = EsmForSequenceClassification.from_pretrained("facebook/esm2_t6_8M_UR50D")
model2 = EsmForSequenceClassification.from_pretrained("facebook/esm2_t6_8M_UR50D")


# changed, we have two modules (esm + classifier)
for name, module in model2.named_children():
    if name != "classifier": 
        for param in module.parameters():
            param.requires_grad = False

inputs = tokenizer(data["sequence"][0], return_tensors="pt")
print(inputs)
with torch.no_grad():
    logits = model(**inputs).logits

from sklearn.model_selection import train_test_split

device = 'cuda' if torch.cuda.is_available() else 'cpu'
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.0001)
optimizer2 = torch.optim.Adam(params=model2.parameters(), lr=0.0001)

X = list(data["sequence"])
y = torch.Tensor([[0, 1] if dataPoint == 1 else [1, 0] for dataPoint in list(data["is_active"])])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=44)
X_train = tokenizer(X_train, return_tensors="pt")
X_test = tokenizer(X_test, return_tensors="pt")

In [None]:
model = model.to(device)

# BCEWithLogitsLoss - sigmoid is already built-in!
loss_fn = torch.nn.BCEWithLogitsLoss()

X_train, y_train = X_train.to(device), y_train.to(device).float()
X_test, y_test = X_test.to(device), y_test.to(device).float()

train_losses, test_losses = [], []
acc_train, acc_test = [], []
epochs = 20
batch_size = 25

for epoch in range(epochs):
    
    ### TRAIN
    model.train()
    correct_train = 0
    for i in range(0, len(X_train["input_ids"]), batch_size):
        input_ids = X_train["input_ids"][i:i+batch_size]
        attention_mask = X_train["attention_mask"][i:i+batch_size]
        train_batch = {"input_ids": input_ids, "attention_mask": attention_mask}
        y_logits = model(**train_batch).logits

        y_pred = torch.Tensor([[1, 0] if pred[0] > pred[1] else [0, 1] for pred in y_logits]).to(device)

        loss = loss_fn(y_logits, y_train[i:i+batch_size])

        train_losses.append(loss.cpu().detach().numpy())

        correct_train += torch.eq(y_pred, y_train[i:i+batch_size]).sum().item()

        optimizer.zero_grad()

        loss.backward()

        optimizer.step()

    acc_train.append(correct_train / (2*len(X_train["input_ids"])))

    ### TEST
    model.eval()
    correct_test = 0
    with torch.inference_mode():
        for i in range(0, len(X_test["input_ids"]), batch_size):
            input_ids = X_test["input_ids"][i:i+batch_size]
            attention_mask = X_test["attention_mask"][i:i+batch_size]
            test_batch = {"input_ids": input_ids, "attention_mask": attention_mask}
            y_logits_test = model(**test_batch).logits

            #y_pred_test = torch.Tensor([[1, 0] if pred[0] > pred[1] else [0, 1] for pred in y_logits_test]).to(device)
            y_pred_test = torch.nn.functional.one_hot(torch.argmax(y_logits_test, dim=1), num_classes=2).float()

            test_loss = loss_fn(y_pred_test, y_test[i:i+batch_size])
            test_losses.append(test_loss.cpu().detach().numpy())

            correct_test += torch.eq(y_pred_test, y_test[i:i+batch_size]).sum().item()

    acc_test.append(correct_test / (2*len(X_test["input_ids"])))
    print(epoch)
    
    if epoch % 1 == 0:
        print(f"Epoch: {epoch} | Loss: {loss:.5f}, Accuracy train: {acc_train[-1]:.4f} | Accuracy test: {acc_test[-1]:.4f} | Test loss: {test_loss:.5f}")



In [45]:
model2 = model2.to(device)
print(model2.device)

X_train, y_train = X_train.to(device), y_train.to(device).float()
X_test, y_test = X_test.to(device), y_test.to(device).float()

# BCEWithLogitsLoss - sigmoid is already built-in!
loss_fn = torch.nn.BCEWithLogitsLoss()

train_losses, test_losses = [], []
acc_train, acc_test = [], []
epochs = 40
batch_size = 25


best_epoch = 0
best_accuracy = 0

for epoch in range(epochs):
    
    ### TRAIN
    model2.train()
    correct_train = 0
    for i in range(0, len(X_train["input_ids"]), batch_size):
        input_ids = X_train["input_ids"][i:i+batch_size]
        attention_mask = X_train["attention_mask"][i:i+batch_size]
        train_batch = {"input_ids": input_ids, "attention_mask": attention_mask}
        y_logits = model2(**train_batch).logits

        y_pred = torch.Tensor([[1, 0] if pred[0] > pred[1] else [0, 1] for pred in y_logits]).to(device)

        loss = loss_fn(y_logits, y_train[i:i+batch_size])

        train_losses.append(loss.cpu().detach().numpy())

        correct_train += torch.eq(y_pred, y_train[i:i+batch_size]).sum().item()

        optimizer2.zero_grad()

        loss.backward()

        optimizer2.step()
        
    ### TEST
    model2.eval()
    correct_test = 0
    with torch.inference_mode():
        for i in range(0, len(X_test["input_ids"]), batch_size):
            input_ids = X_test["input_ids"][i:i+batch_size]
            attention_mask = X_test["attention_mask"][i:i+batch_size]
            test_batch = {"input_ids": input_ids, "attention_mask": attention_mask}
            #test_batch["input_ids"] = test_batch["input_ids"].to(device)
            #test_batch["attention_mask"] = test_batch["attention_mask"].to(device)
            y_logits_test = model2(**test_batch).logits

            #y_pred_test = torch.Tensor([[1, 0] if pred[0] > pred[1] else [0, 1] for pred in y_logits_test]).to(device)
            y_pred_test = torch.nn.functional.one_hot(torch.argmax(y_logits_test, dim=1), num_classes=2).float()

            test_loss = loss_fn(y_pred_test, y_test[i:i+batch_size])
            test_losses.append(test_loss.cpu().detach().numpy())

            correct_test += torch.eq(y_pred_test, y_test[i:i+batch_size]).sum().item()

    acc_test.append(correct_test / (2*len(X_test["input_ids"])))
    
    if correct_test > best_accuracy:
        best_accuracy = correct_test
        best_epoch = epoch
        print(f"Best accuracy: {best_accuracy} at epoch {best_epoch}")
        

    acc_train.append(correct_train / (2*len(X_train["input_ids"])))
    if epoch % 1 == 0:
        print(f"Epoch: {epoch} | Loss: {loss:.5f}, Accuracy train: {acc_train[-1]:.4f} | Accuracy test: {acc_test[-1]:.4f} | Test loss: {test_loss:.5f}")


cuda:0
Best accuracy: 15214 at epoch 0
Epoch: 0 | Loss: 0.64594, Accuracy train: 0.5858 | Accuracy test: 0.6076 | Test loss: 0.65320
Best accuracy: 15450 at epoch 1
Epoch: 1 | Loss: 0.64500, Accuracy train: 0.6106 | Accuracy test: 0.6170 | Test loss: 0.65320
Best accuracy: 15692 at epoch 2
Epoch: 2 | Loss: 0.64233, Accuracy train: 0.6178 | Accuracy test: 0.6267 | Test loss: 0.62820
Best accuracy: 15778 at epoch 3
Epoch: 3 | Loss: 0.63724, Accuracy train: 0.6263 | Accuracy test: 0.6301 | Test loss: 0.62820
Best accuracy: 15802 at epoch 4
Epoch: 4 | Loss: 0.63016, Accuracy train: 0.6316 | Accuracy test: 0.6311 | Test loss: 0.62820
Best accuracy: 15882 at epoch 5
Epoch: 5 | Loss: 0.62286, Accuracy train: 0.6353 | Accuracy test: 0.6343 | Test loss: 0.62820
Best accuracy: 15908 at epoch 6
Epoch: 6 | Loss: 0.61647, Accuracy train: 0.6377 | Accuracy test: 0.6353 | Test loss: 0.62820
Best accuracy: 15966 at epoch 7
Epoch: 7 | Loss: 0.61121, Accuracy train: 0.6402 | Accuracy test: 0.6376 | Test

Zatrzymałem po około 2h na GPU Colabowym, 20 epok, celność ~0.69 na zbiorze testowym

In [None]:
# final eval
model.eval()
model2.eval()
correct_test = 0
correct_test2 = 0
with torch.inference_mode():
    for i in range(0, len(X_test["input_ids"]), batch_size):
        input_ids = X_test["input_ids"][i:i+batch_size]
        attention_mask = X_test["attention_mask"][i:i+batch_size]
        test_batch = {"input_ids": input_ids, "attention_mask": attention_mask}
        y_logits_test = model(**test_batch).logits
        y_logits_test2 = model2(**test_batch).logits

        #y_pred_test = torch.Tensor([[1, 0] if pred[0] > pred[1] else [0, 1] for pred in y_logits_test]).to(device)
        y_pred_test = torch.nn.functional.one_hot(torch.argmax(y_logits_test, dim=1), num_classes=2).float()
        y_pred_test2 = torch.nn.functional.one_hot(torch.argmax(y_logits_test2, dim=1), num_classes=2).float()

        test_loss = loss_fn(y_pred_test, y_test[i:i+batch_size])
        test_loss2 = loss_fn(y_pred_test2, y_test[i:i+batch_size])
        correct_test += torch.eq(y_pred_test, y_test[i:i+batch_size]).sum().item()
        correct_test2 += torch.eq(y_pred_test2, y_test[i:i+batch_size]).sum().item()

print("Model - Final accuracy:", correct_test / (2*len(X_test["input_ids"])))
print("Model (frozen) - Final accuracy:", correct_test2 / (2*len(X_test["input_ids"])))

In [None]:
#model.save_pretrained("./model_trained", from_pt=True)
model2.save_pretrained("./model_trained_frozen", from_pt=True)

# Regression

In [50]:
import pandas as pd

data = pd.read_csv("train_data.tsv", delimiter='\t')


In [35]:
import torch
from transformers import AutoTokenizer, EsmForSequenceClassification


from transformers import EsmModel
import torch.nn as nn

model_name = "facebook/esm2_t6_8M_UR50D"

class EsmForRegression(nn.Module):
    def __init__(self, model_name):
        super().__init__()
        self.backbone = EsmModel.from_pretrained(model_name)
        hidden_size = self.backbone.config.hidden_size
        self.regressor = nn.Linear(hidden_size, 1)

    def forward(self, input_ids, attention_mask=None):
        outputs = self.backbone(input_ids=input_ids, attention_mask=attention_mask)
        pooled = outputs.last_hidden_state[:, 0]  # Use [CLS] token embedding
        return self.regressor(pooled)

class EsmForRegressionFrozen(nn.Module):
    def __init__(self, model_name):
        super().__init__()
        self.backbone = EsmModel.from_pretrained(model_name)
        for param in self.backbone.parameters():
            param.requires_grad = False
        hidden_size = self.backbone.config.hidden_size
        self.regressor = nn.Linear(hidden_size, 1)

    def forward(self, input_ids, attention_mask=None):
        outputs = self.backbone(input_ids=input_ids, attention_mask=attention_mask)
        pooled = outputs.last_hidden_state[:, 0]  # Use [CLS] token embedding
        return self.regressor(pooled)


tokenizer = AutoTokenizer.from_pretrained(model_name)
#model = EsmForRegression(model_name)
model2 = EsmForRegressionFrozen(model_name)

inputs = tokenizer(data["sequence"][0], return_tensors="pt")
print(inputs)
with torch.no_grad():
    logits = model2(**inputs)
    print(logits)
    print(data["rna_dna_ratio"][0])



Some weights of EsmModel were not initialized from the model checkpoint at facebook/esm2_t6_8M_UR50D and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'input_ids': tensor([[ 0,  6, 11, 11, 23, 11, 23, 11,  5, 11,  6, 23, 23,  5, 11, 23, 11, 23,
          5, 23,  5, 11, 11, 11,  5, 11, 23, 23, 23, 23,  5,  5,  5,  6, 23, 23,
         23,  5,  6,  6,  6,  5,  6,  6, 23, 11,  6, 11,  6, 11,  5,  5, 11, 11,
         11,  6, 11, 11, 23,  5,  5,  6,  6, 11,  6,  5, 23,  5, 23,  5,  6, 23,
          5,  5,  6, 11,  5, 11,  6, 11,  6,  6, 23,  5,  6,  5,  6, 23,  5,  6,
          6,  6,  6, 23, 11, 23,  6,  5,  5, 11, 11, 23,  5,  6,  6, 23, 23, 11,
         23, 11,  6,  5, 11, 23, 11, 11, 11,  5,  5,  6,  6, 23, 23, 11,  6, 11,
          6, 11, 11, 11, 23, 23, 23, 23, 23, 11, 23, 23,  5, 23,  5, 11, 23,  5,
          6, 11,  6, 11, 11, 11, 23,  5,  6,  6,  5,  6,  6, 11,  6,  6,  5,  5,
          6,  5, 23, 11, 11,  6,  5,  5,  6, 23,  5, 23, 11,  6,  6,  6,  5,  5,
          6, 23, 11,  6, 11, 23, 11, 11,  6, 23,  5, 11, 11,  6, 23,  5, 11, 11,
          5,  5,  5, 23,  5,  5, 23,  5, 11, 11,  6, 23, 23,  5, 23,  5, 11,  5,
          6,  

In [36]:
X_train = {key: val.to(device) for key, val in X_train.items()}
X_test = {key: val.to(device) for key, val in X_test.items()}

In [39]:
from sklearn.model_selection import train_test_split

device = 'cuda' if torch.cuda.is_available() else 'cpu'
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.0001)
optimizer2 = torch.optim.Adam(params=model2.parameters(), lr=0.0001)

X = list(data["sequence"])
y = list(data["rna_dna_ratio"])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=44)

In [40]:
from torch.utils.data import Dataset, DataLoader

class SequenceRegressionDataset(Dataset):
    def __init__(self, sequences, targets, tokenizer):
        self.encodings = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")
        self.targets = torch.tensor(targets, dtype=torch.float).unsqueeze(1)

    def __len__(self):
        return len(self.targets)

    def __getitem__(self, idx):
        item = {key: val[idx].to(device) for key, val in self.encodings.items()}
        item["labels"] = self.targets[idx].to(device)
        return item

train_dataset = SequenceRegressionDataset(X_train, y_train, tokenizer)
test_dataset = SequenceRegressionDataset(X_test, y_test, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [None]:
model = model.to(device)
epochs = 20
loss_fn = nn.MSELoss()
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = loss_fn(outputs, labels)

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        total_loss += loss.item()

    model.eval()
    total_loss = 0
    for batch in train_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = loss_fn(outputs, labels)

        total_loss += loss.item()

    avg_loss2 = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}: Avg Train Loss = {avg_loss:.4f}")
    print(f"Avg Test Loss = {avg_loss2:.4f}")

In [None]:
torch.save(model.state_dict(), "esm_regression_model.pt")

In [42]:
model2 = model2.to(device)
epochs = 20
loss_fn = nn.MSELoss()
for epoch in range(epochs):
    model2.train()
    total_loss = 0
    for batch in train_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model2(input_ids=input_ids, attention_mask=attention_mask)
        loss = loss_fn(outputs, labels)

        loss.backward()
        optimizer2.step()
        optimizer2.zero_grad()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)

    model.eval()
    total_loss = 0
    for batch in train_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model2(input_ids=input_ids, attention_mask=attention_mask)
        loss = loss_fn(outputs, labels)

        total_loss += loss.item()

    avg_loss2 = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}: Avg Train Loss = {avg_loss:.4f}")
    print(f"Avg Test Loss = {avg_loss2:.4f}")

Epoch 1: Avg Train Loss = 0.1231
Avg Test Loss = 0.1219
Epoch 2: Avg Train Loss = 0.1215
Avg Test Loss = 0.1208
Epoch 3: Avg Train Loss = 0.1205
Avg Test Loss = 0.1199
Epoch 4: Avg Train Loss = 0.1196
Avg Test Loss = 0.1188
Epoch 5: Avg Train Loss = 0.1188
Avg Test Loss = 0.1185
Epoch 6: Avg Train Loss = 0.1181
Avg Test Loss = 0.1176
Epoch 7: Avg Train Loss = 0.1176
Avg Test Loss = 0.1171
Epoch 8: Avg Train Loss = 0.1172
Avg Test Loss = 0.1166
Epoch 9: Avg Train Loss = 0.1167
Avg Test Loss = 0.1162
Epoch 10: Avg Train Loss = 0.1163
Avg Test Loss = 0.1158
Epoch 11: Avg Train Loss = 0.1160
Avg Test Loss = 0.1155
Epoch 12: Avg Train Loss = 0.1157
Avg Test Loss = 0.1156
Epoch 13: Avg Train Loss = 0.1153
Avg Test Loss = 0.1149
Epoch 14: Avg Train Loss = 0.1151
Avg Test Loss = 0.1154
Epoch 15: Avg Train Loss = 0.1149
Avg Test Loss = 0.1145
Epoch 16: Avg Train Loss = 0.1146
Avg Test Loss = 0.1142
Epoch 17: Avg Train Loss = 0.1144
Avg Test Loss = 0.1140
Epoch 18: Avg Train Loss = 0.1142
Avg Te

In [43]:
torch.save(model2.state_dict(), "esm_regression_model_frozen.pt")


In [None]:
def eval_reg(model, dataloader):
  loss_fn = nn.MSELoss()
  model.eval()
  total_loss = 0
  for batch in dataloader:
      input_ids = batch["input_ids"]
      attention_mask = batch["attention_mask"]
      labels = batch["labels"]

      outputs = model(input_ids=input_ids, attention_mask=attention_mask)
      loss = loss_fn(outputs, labels)

      total_loss += loss.item()

  avg_loss = total_loss / len(dataloader)
  print(f"Avg MSE Loss = {avg_loss:.4f}")

In [None]:
#print("Model:")
#eval_reg(model, test_dataloader)
print("Model (frozen):")
eval_reg(model2, test_loader)