In [None]:
## For Google Colab

#!pip install datasets evaluate transformers
#!pip install -U datasets huggingface_hub fsspec
#!pip install peft
#!apt-get install cd-hit

In [None]:
## Combine the DIBS and MFIB sequences and cluster them with CD-HIT (-c 0.7); the FuzDB sequences are clustered separately with the same threshold

#!cat DIBS.fasta mfib.fasta > do_transition.fasta 
#!cd-hit -i do_transition.fasta -o do_transition_cdhit.fasta -c 0.7
#!cd-hit -i fuzdb.fasta -o dd_transition_cdhit.fasta -c 0.7

In [None]:
# Import necessary libraries

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from datasets import Dataset
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from peft import LoraConfig, get_peft_model
import evaluate

In [None]:
# Read a FASTA file and return a list of sequences.

def read_fasta(file_path):
    """Read a FASTA file and return its sequences as a list of strings.

    Header lines (starting with ">") delimit records; the header text itself
    is discarded. The sequence lines of a record are stripped of surrounding
    whitespace and concatenated into a single string.

    Blank lines are ignored, so a leading or trailing empty line (or an empty
    line under a header) never yields a spurious empty sequence. Records with
    no sequence data are skipped.

    Args:
        file_path: Path of the FASTA file to read.

    Returns:
        list[str]: One string per non-empty record, in file order.
    """
    sequences = []
    chunks = []  # sequence lines of the record currently being read
    with open(file_path, "r") as file:
        for raw_line in file:
            line = raw_line.strip()
            if not line:
                # Skip blank lines; appending "" would make `chunks` truthy
                # and emit an empty sequence at the next header.
                continue
            if line.startswith(">"):
                # A new header closes the previous record, if it had data.
                if chunks:
                    sequences.append("".join(chunks))
                    chunks = []
            else:
                chunks.append(line)
    # Flush the last record, which has no following header to close it.
    if chunks:
        sequences.append("".join(chunks))
    return sequences

In [None]:
# Read the clustered sequences from the FASTA files and build one labelled DataFrame per dataset

sequences_do = read_fasta("do_transition_cdhit.fasta") # clustered MFIB+DIBS sequences by CD-HIT by 0.7
sequences_dd = read_fasta("dd_transition_cdhit.fasta") # clustered fuzdb sequences by CD-HIT by 0.7

# Longest fuzdb sequence (in characters) that is kept; defined once so the
# reported count and the applied filter cannot drift apart.
MAX_SEQUENCE_LENGTH = 1000

df_do = pd.DataFrame(sequences_do, columns=["sequence"])
df_do["label"] = 0
df_dd = pd.DataFrame(sequences_dd, columns=["sequence"])
df_dd["label"] = 1


# Add the length of each sequence as a new column

df_do["length"] = df_do["sequence"].str.len()
df_dd["length"] = df_dd["sequence"].str.len()

print("MFIB+DIBS:", df_do["length"].describe(), sep="\n")
print("fuzdb:", df_dd["length"].describe(), sep="\n")

# Filter out fuzdb sequences longer than MAX_SEQUENCE_LENGTH characters.
# Only the fuzdb frame is filtered here; the MFIB+DIBS frame is left as is.

too_long = df_dd["length"] > MAX_SEQUENCE_LENGTH
print("Number of removed sequences:", too_long.sum())
df_dd = df_dd[~too_long]

# Concatenate the DataFrames

df_all = pd.concat([df_do, df_dd], ignore_index=True)

# Drop every sequence that occurs more than once. All copies are removed (not
# just the extras), so no sequence can remain with two different labels.

sequence_counts = df_all["sequence"].value_counts()
unique_sequences = sequence_counts[sequence_counts == 1].index
df_unique = df_all[df_all["sequence"].isin(unique_sequences)].reset_index(drop=True)
print("Unique sequences count:", df_unique.shape[0])

MFIB+DIBS:
count    614.000000
mean      49.609121
std       50.737725
min       11.000000
25%       16.000000
50%       32.000000
75%       70.000000
max      630.000000
Name: length, dtype: float64
fuzdb:
count     316.000000
mean      559.186709
std       594.045469
min        30.000000
25%       171.750000
50%       407.000000
75%       709.000000
max      4953.000000
Name: length, dtype: float64
Number of removed sequences: 42
Unique sequences count: 880


In [None]:
# Split the dataset into training and testing sets, stratifying by label
# (fixed random_state so the split is reproducible across runs)

train_df, test_df = train_test_split(
    df_unique,
    test_size=0.15,
    stratify=df_unique["label"],
    random_state=42,
    shuffle=True,
)

# Summarise both splits: sizes, label balance, length statistics and columns.
# The "\n" escapes keep each string literal on one source line; a raw line
# break inside a "..." literal is a SyntaxError.
print("Train set size:", train_df.shape[0])
print("Test set size:", test_df.shape[0])
print("Train set label distribution:\n", train_df["label"].value_counts())
print("Test set label distribution:\n", test_df["label"].value_counts())
print("Train set length distribution:\n", train_df["length"].describe())
print("Test set length distribution:\n", test_df["length"].describe())
print("Columns in train set and test set:", train_df.columns.tolist(), test_df.columns.tolist())

Train set size: 748
Test set size: 132
Train set label distribution:
 label
0    518
1    230
Name: count, dtype: int64
Test set label distribution:
 label
0    92
1    40
Name: count, dtype: int64
Train set length distribution:
 count    748.000000
mean     155.312834
std      213.010061
min       11.000000
25%       22.000000
50%       64.000000
75%      153.250000
max      979.000000
Name: length, dtype: float64
Test set length distribution:
 count    132.000000
mean     133.265152
std      196.219415
min       11.000000
25%       18.000000
50%       54.500000
75%      136.750000
max      917.000000
Name: length, dtype: float64
Columns in train set and test set: ['sequence', 'label', 'length'] ['sequence', 'label', 'length']


In [None]:
# Load the tokenizer that belongs to the ESM-2 checkpoint.
# `checkpoint` is reused further down when the model itself is loaded.

checkpoint = "facebook/esm2_t12_35M_UR50D"
tokenizer = AutoTokenizer.from_pretrained(
    checkpoint,
    clean_up_tokenization_spaces=True,
)

# Tokenization helper applied to the datasets via `Dataset.map`

def tokenize_function(example):
    """Tokenize the "sequence" field of `example` with the module-level tokenizer.

    Truncation is enabled; no padding is requested here.

    Args:
        example: Mapping with a "sequence" entry (a single string, or a list
            of strings when called through a batched `map`).

    Returns:
        The tokenizer's output for the given sequence(s).
    """
    sequences = example["sequence"]
    encoded = tokenizer(sequences, truncation=True)
    return encoded

In [None]:
# Turn each pandas split into a Hugging Face Dataset (dropping the pandas index)
# and tokenize it in batches in the same expression.

train_dataset = (
    Dataset.from_pandas(train_df, preserve_index=False)
    .map(tokenize_function, batched=True)
)

test_dataset = (
    Dataset.from_pandas(test_df, preserve_index=False)
    .map(tokenize_function, batched=True)
)

print("Train datset:", train_dataset)
print("Test datset:", test_dataset)

In [None]:
# Remove the columns that are not needed for training ("sequence", "length").
# Only columns that are still present are dropped, so re-running this cell is
# a no-op instead of failing on columns that were already removed.

def _drop_present_columns(dataset, columns):
    """Return `dataset` without whichever of `columns` it still contains."""
    present = [name for name in columns if name in dataset.column_names]
    return dataset.remove_columns(present) if present else dataset

train_dataset = _drop_present_columns(train_dataset, ["sequence", "length"])
test_dataset = _drop_present_columns(test_dataset, ["sequence", "length"])

print("Train datset:", train_dataset)
print("Test datset:", test_dataset)

In [None]:
## Class weighting to handle class imbalance

# Per-class sample counts in the training split

label_counts = train_dataset.to_pandas()['label'].value_counts()
print(f"Label counts: {label_counts}")
total_samples = len(train_dataset)
num_samples_class_0, num_samples_class_1 = label_counts[0], label_counts[1]

# Weight of a class = total / (2 * class count): the rarer class gets the larger weight

weight_for_0 = total_samples / (2 * num_samples_class_0)
weight_for_1 = total_samples / (2 * num_samples_class_1)

# Build the weight tensor, place it on the GPU when one is available, and store it as float32

device = 'cuda' if torch.cuda.is_available() else 'cpu'
class_weights = torch.tensor([weight_for_0, weight_for_1]).to(device).to(torch.float32)

print(f"Calculated class weights: {class_weights}")

batch_size = 4

# Custom Trainer that applies the class weights in the loss

class WeightedLossTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted by the module-level `class_weights`."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None, **kwargs):
        """Compute the class-weighted cross-entropy loss for one batch.

        Args:
            model: The model to run the forward pass with.
            inputs: Batch mapping; its "labels" entry is popped before the
                remaining entries are passed to the model.
            return_outputs: When true, return `(loss, outputs)` instead of
                only the loss.
            num_items_in_batch: Accepted for compatibility with the caller;
                not used by this loss. Defaults to None rather than to an
                unrelated global.
            **kwargs: Ignored; accepted so extra keyword arguments do not fail.

        Returns:
            The loss tensor, or `(loss, outputs)` if `return_outputs` is true.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # Move the weights to wherever the logits live, so the loss does not
        # depend on the device picked when `class_weights` was created.
        weights = class_weights.to(logits.device)
        loss_fct = nn.CrossEntropyLoss(weight=weights)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

In [None]:
# LoRA configuration for sequence classification; the adapters target the
# modules named "query", "key" and "value".
lora_config = LoraConfig(
    task_type="SEQ_CLS",
    target_modules=["query", "key", "value"],
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

In [None]:
# Load the pre-trained two-label classifier and wrap it with the LoRA adapters

model = get_peft_model(
    AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2),
    lora_config,
)
model.print_trainable_parameters()

# Training arguments; the output directory is named after the checkpoint

model_name = checkpoint.rsplit("/", 1)[-1]
batch_size = 4

args = TrainingArguments(
    output_dir=model_name,
    # optimisation
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_steps=0,
    num_train_epochs=5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # evaluate, checkpoint and log once per epoch
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    # reload the checkpoint with the best eval_f1 when training ends
    load_best_model_at_end=True,
    metric_for_best_model="eval_f1",
    report_to="none",
)


# Load the F1, precision and recall metrics (in that order)

f1_metric, precision_metric, recall_metric = (
    evaluate.load(metric_name) for metric_name in ("f1", "precision", "recall")
)

# Metric callback used by the Trainer during evaluation

def compute_metrics(eval_pred):
    """Compute F1, precision and recall for one evaluation pass.

    Args:
        eval_pred: Pair `(logits, labels)`; the predicted class is the argmax
            of the logits over the last axis.

    Returns:
        dict: The merged results of the three module-level metrics, each
        computed with `average="weighted"`.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # 'weighted' averaging is used to account for the class imbalance in the report
    results = {}
    for metric in (f1_metric, precision_metric, recall_metric):
        results.update(
            metric.compute(predictions=predictions, references=labels, average="weighted")
        )
    return results

# Create the Trainer with the model, training arguments, datasets, tokenizer and metrics function.
# The tokenizer is passed as `processing_class`, the replacement for the deprecated `tokenizer=` keyword.
# NOTE(review): eval_dataset is the held-out test split and the training arguments
# select the best checkpoint by eval_f1, so model selection sees the test set —
# consider carving out a separate validation split for this.

trainer = WeightedLossTrainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Run the fine-tuning loop of the Trainer instance built in the previous cell

trainer.train()

In [None]:
# Save the model and the tokenizer side by side in the same directory

trainer.save_model("my_model_dir")
tokenizer.save_pretrained("my_model_dir")

In [None]:
# Retrieve the log history recorded in the trainer state and load it into a DataFrame

log_history = trainer.state.log_history
df_log = pd.DataFrame(log_history)

# Save the log history to a CSV file (without the DataFrame index column)
df_log.to_csv("training_logs.csv", index=False)