In [1]:
!pip install datasets evaluate transformers
!pip install -U datasets huggingface_hub fsspec
!pip install peft
!pip install bitsandbytes

Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.5-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.5
Collecting fsspec
  Downloading fsspec-2025.7.0-py3-none-any.whl.metadata (12 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl.metadata (11 kB)
Downloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl (61.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.47.0


In [2]:
# Import necessary libraries

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from datasets import Dataset
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from peft import LoraConfig, get_peft_model
import evaluate

In [3]:
# Read a FASTA file and return a list of sequences.

def read_fasta(file_path):
    """Read a FASTA file and return its sequences as a list of strings.

    Lines starting with ">" are treated as record headers: they close the
    record being accumulated and are themselves discarded. All other lines
    are stripped of surrounding whitespace and concatenated per record.

    Blank (whitespace-only) lines are skipped. Without this, a blank line
    would be stored as an empty fragment, making the accumulator non-empty
    and causing an empty string to be emitted as a "sequence".

    Args:
        file_path: Path of the FASTA file to read.

    Returns:
        A list with one concatenated sequence string per record that
        contains at least one non-blank sequence line, in file order.
    """
    sequences = []
    current_sequence = []
    with open(file_path, "r") as file:
        for line in file:
            if line.startswith(">"):
                # A new header ends the record collected so far (if any).
                if current_sequence:
                    sequences.append(''.join(current_sequence))
                    current_sequence = []
                continue
            fragment = line.strip()
            if fragment:
                current_sequence.append(fragment)
    # Flush the last record: no header follows it to trigger the append.
    if current_sequence:
        sequences.append(''.join(current_sequence))
    return sequences

In [4]:
# Load both clustered FASTA files; the "do" file gets label 0 and the "dd" file label 1

sequences_do = read_fasta("clustered_fasta_files/mobidb_do_cdhit.fasta")
sequences_dd = read_fasta("clustered_fasta_files/mobidb_dd_cdhit.fasta")

df_do = pd.DataFrame(sequences_do, columns=["sequence"]).assign(label=0)
df_dd = pd.DataFrame(sequences_dd, columns=["sequence"]).assign(label=1)

# Store each sequence's length next to it and summarise the length distributions

df_do = df_do.assign(length=df_do["sequence"].apply(len))
df_dd = df_dd.assign(length=df_dd["sequence"].apply(len))

print("mobidb_do:", df_do["length"].describe(), sep="\n")
print("mobidb_dd:", df_dd["length"].describe(), sep="\n")

# Stack the two labelled frames into a single one with a fresh index

df_all = pd.concat([df_do, df_dd], ignore_index=True)

# Report and drop every sequence longer than 1000 characters

is_too_long = df_all["length"] > 1000
print("Number of removed sequences:", df_all.loc[is_too_long, "sequence"].count())
df_all = df_all[~is_too_long]

# Keep only sequences that occur exactly once; every copy of a repeated sequence is dropped

sequence_counts = df_all["sequence"].value_counts()
unique_sequences = sequence_counts.loc[sequence_counts.eq(1)].index
is_unique = df_all["sequence"].isin(unique_sequences)
df_unique = df_all[is_unique].reset_index(drop=True)
print("Unique sequences count:", df_unique.shape[0])

mobidb_do:
count    11673.000000
mean        47.345327
std         94.411344
min         11.000000
25%         14.000000
50%         20.000000
75%         41.000000
max       3768.000000
Name: length, dtype: float64
mobidb_dd:
count    21059.000000
mean        51.388432
std        107.135028
min         11.000000
25%         16.000000
50%         24.000000
75%         47.000000
max       3421.000000
Name: length, dtype: float64
Number of removed sequences: 49
Unique sequences count: 32607


In [5]:
# Stratified 80/20 train/test split (fixed seed so the split is reproducible)

train_df, test_df = train_test_split(
    df_unique,
    test_size=0.20,
    shuffle=True,
    stratify=df_unique["label"],
    random_state=42,
)

# Summarise both splits: sizes first, then label balance, then sequence lengths
split_summary = [
    ("Train set size:", train_df.shape[0]),
    ("Test set size:", test_df.shape[0]),
    ("Train set label distribution:\n", train_df["label"].value_counts()),
    ("Test set label distribution:\n", test_df["label"].value_counts()),
    ("Train set length distribution:\n", train_df["length"].describe()),
    ("Test set length distribution:\n", test_df["length"].describe()),
]
for caption, summary_value in split_summary:
    print(caption, summary_value)
print("Columns in train set and test set:", train_df.columns.tolist(), test_df.columns.tolist())


Train set size: 26085
Test set size: 6522
Train set label distribution:
 label
1    16785
0     9300
Name: count, dtype: int64
Test set label distribution:
 label
1    4197
0    2325
Name: count, dtype: int64
Train set length distribution:
 count    26085.000000
mean        47.386429
std         74.522465
min         11.000000
25%         15.000000
50%         23.000000
75%         45.000000
max        999.000000
Name: length, dtype: float64
Test set length distribution:
 count    6522.000000
mean       47.956608
std        76.553700
min        11.000000
25%        14.250000
50%        23.000000
75%        44.000000
max       997.000000
Name: length, dtype: float64
Columns in train set and test set: ['sequence', 'label', 'length'] ['sequence', 'label', 'length']


In [6]:
# Load the tokenizer for the ESM-2 model

checkpoint = "facebook/esm2_t12_35M_UR50D"
tokenizer = AutoTokenizer.from_pretrained(checkpoint, clean_up_tokenization_spaces=True)

# Explicit cap on the tokenized length. Without a max_length the tokenizer
# warns "Default to no truncation" and `truncation=True` is silently ignored.
# Sequences are filtered to at most 1000 characters earlier in the notebook,
# so this cap leaves headroom and does not shorten any of the current data.
MAX_TOKEN_LENGTH = 1024

# Function to tokenize the sequences in the dataset

def tokenize_function(example):
    """Tokenize the "sequence" field of a dataset example or batch.

    Args:
        example: Mapping with a "sequence" entry (a single string or, when
            used with ``Dataset.map(..., batched=True)``, a list of strings).

    Returns:
        The tokenizer output for the sequence(s), truncated to at most
        ``MAX_TOKEN_LENGTH`` tokens.
    """
    return tokenizer(example["sequence"], truncation=True, max_length=MAX_TOKEN_LENGTH)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/95.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/93.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [7]:
# Turn each pandas split into a Hugging Face Dataset (without the pandas index)
# and tokenize it in batches

train_dataset = Dataset.from_pandas(train_df, preserve_index=False).map(
    tokenize_function, batched=True
)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False).map(
    tokenize_function, batched=True
)

print("Train datset:", train_dataset)
print("Test datset:", test_dataset)

Map:   0%|          | 0/26085 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/6522 [00:00<?, ? examples/s]

Train datset: Dataset({
    features: ['sequence', 'label', 'length', 'input_ids', 'attention_mask'],
    num_rows: 26085
})
Test datset: Dataset({
    features: ['sequence', 'label', 'length', 'input_ids', 'attention_mask'],
    num_rows: 6522
})


In [8]:
# Remove the columns that are no longer needed after tokenization.
# Only columns still present are dropped, so re-running this cell is a no-op
# instead of raising because "sequence"/"length" were already removed.

def _drop_columns_if_present(dataset, column_names):
    """Return `dataset` without those of `column_names` it currently contains."""
    present = [name for name in column_names if name in dataset.column_names]
    return dataset.remove_columns(present) if present else dataset

train_dataset = _drop_columns_if_present(train_dataset, ["sequence", "length"])
test_dataset = _drop_columns_if_present(test_dataset, ["sequence", "length"])

print("Train datset:", train_dataset)
print("Test datset:", test_dataset)

Train datset: Dataset({
    features: ['label', 'input_ids', 'attention_mask'],
    num_rows: 26085
})
Test datset: Dataset({
    features: ['label', 'input_ids', 'attention_mask'],
    num_rows: 6522
})


In [9]:
## Class weighting to handle class imbalance

# Count how many training examples carry each label
label_counts = train_dataset.to_pandas()["label"].value_counts()
print(f"Label counts: {label_counts}")

total_samples = len(train_dataset)
num_samples_class_0, num_samples_class_1 = label_counts[0], label_counts[1]

# Each weight is total / (2 * class count), so the rarer class gets the larger weight
weight_for_0, weight_for_1 = (
    total_samples / (2 * class_count)
    for class_count in (num_samples_class_0, num_samples_class_1)
)

# Build the weight tensor as float32 on the GPU when one is available, else on the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
class_weights = torch.tensor([weight_for_0, weight_for_1]).to(device=device, dtype=torch.float32)

print(f"Calculated class weights: {class_weights}")

batch_size = 4

# Custom Trainer that applies the class weights in its loss
class WeightedLossTrainer(Trainer):
    """Trainer whose loss is a class-weighted cross-entropy.

    The per-class weights are read from the module-level ``class_weights``
    tensor.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None, **kwargs):
        """Compute the class-weighted cross-entropy loss for one batch.

        Args:
            model: Model to run the forward pass with.
            inputs: Batch dictionary; its "labels" entry is removed (popped)
                before the remaining entries are passed to the model.
            return_outputs: When True, return ``(loss, outputs)`` instead of
                only the loss.
            num_items_in_batch: Accepted for compatibility with the Trainer
                API and not used here. It defaults to ``None`` rather than
                the notebook's ``batch_size``, which is an unrelated quantity.
            **kwargs: Ignored; accepted for forward compatibility.

        Returns:
            The scalar loss, or ``(loss, outputs)`` if ``return_outputs``.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # Use the weights on whatever device the logits live on, so the loss
        # does not depend on the device chosen when class_weights was built.
        loss_fct = nn.CrossEntropyLoss(weight=class_weights.to(logits.device))
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

Label counts: label
1    16785
0     9300
Name: count, dtype: int64
Calculated class weights: tensor([1.4024, 0.7770], device='cuda:0')


In [10]:
# LoRA adapter settings for the sequence-classification task
lora_config = LoraConfig(
    task_type="SEQ_CLS",
    # Attention projections that receive LoRA adapters
    target_modules=["query", "key", "value"],
    # Adapter rank and scaling numerator, with rank-stabilised scaling switched on
    r=4,
    lora_alpha=8,
    use_rslora=True,
    # Dropout on the adapter path; bias terms are left out of the adaptation
    lora_dropout=0.05,
    bias="none",
)

In [11]:
# Build the pre-trained sequence classifier (two labels) and wrap it with the LoRA adapters

model = get_peft_model(
    AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2),
    lora_config,
)
model.print_trainable_parameters()

# Training configuration; the output directory is named after the checkpoint

model_name = checkpoint.rsplit("/", 1)[-1]
batch_size = 4

args = TrainingArguments(
    output_dir=model_name,
    # Evaluate, checkpoint and log once per epoch
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    # Optimisation settings (fp16 mixed precision is not enabled)
    optim="paged_adamw_8bit",
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_steps=0,
    num_train_epochs=3,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # After training, reload the checkpoint selected by the evaluation F1
    load_best_model_at_end=True,
    metric_for_best_model="eval_f1",
    report_to="none",
)


# Load the F1, precision and recall metrics (in that order)
f1_metric, precision_metric, recall_metric = (
    evaluate.load(metric_name) for metric_name in ("f1", "precision", "recall")
)

# Metrics callback used by the Trainer during evaluation

def compute_metrics(eval_pred):
    """Compute weighted F1, precision and recall from evaluation outputs.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class is the
            argmax of the logits along the last axis.

    Returns:
        A dict merging the results of the F1, precision and recall metrics,
        each computed with ``average="weighted"`` to account for the class
        imbalance.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    results = {}
    for metric in (f1_metric, precision_metric, recall_metric):
        scores = metric.compute(predictions=predictions, references=labels, average="weighted")
        results.update(scores)
    return results

# Assemble the weighted-loss trainer from the model, arguments, datasets, tokenizer and metrics
# NOTE(review): the test split is used as the evaluation set, and TrainingArguments
# has load_best_model_at_end=True, so the best checkpoint is chosen on the test data.
# Consider a separate validation split if an unbiased test score is needed.

trainer = WeightedLossTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)


config.json:   0%|          | 0.00/778 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/136M [00:00<?, ?B/s]

Some weights of EsmForSequenceClassification were not initialized from the model checkpoint at facebook/esm2_t12_35M_UR50D and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 370,082 || all params: 33,871,445 || trainable%: 1.0926


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

In [12]:
# Fine-tune the model with the Trainer configured above; per the training
# arguments, evaluation, checkpointing and logging happen once per epoch
trainer.train()

Epoch,Training Loss,Validation Loss,F1,Precision,Recall
1,0.575,0.548006,0.741832,0.749946,0.73827
2,0.5527,0.534004,0.742293,0.760033,0.737351
3,0.5477,0.536137,0.744482,0.761454,0.73965


TrainOutput(global_step=19566, training_loss=0.5584686974934197, metrics={'train_runtime': 1801.3588, 'train_samples_per_second': 43.442, 'train_steps_per_second': 10.862, 'total_flos': 1808489770010280.0, 'train_loss': 0.5584686974934197, 'epoch': 3.0})

In [13]:
# Save the PEFT-wrapped model to disk
# NOTE(review): for a PEFT model this presumably writes only the adapter weights,
# not the full base model; confirm before using the directory for standalone inference
model.save_pretrained("PLM_Sequence_LORA_Outputs/my_model_dir")

In [14]:
# Save the tokenizer files next to the model outputs
tokenizer.save_pretrained("PLM_Sequence_LORA_Outputs/tokenizer_dir")

('PLM_Sequence_LORA_Outputs/tokenizer_dir/tokenizer_config.json',
 'PLM_Sequence_LORA_Outputs/tokenizer_dir/special_tokens_map.json',
 'PLM_Sequence_LORA_Outputs/tokenizer_dir/vocab.txt',
 'PLM_Sequence_LORA_Outputs/tokenizer_dir/added_tokens.json')

In [15]:
# Pull the per-step log records out of the trainer state and tabulate them
log_history = trainer.state.log_history
df_log = pd.DataFrame(data=log_history)

# Write the table to CSV without the index column
training_log_path = "PLM_Sequence_LORA_Outputs/training_logs.csv"
df_log.to_csv(training_log_path, index=False)