# Protein Sequence Embedding using ESM-2 Transformer Models
## This script extracts high-dimensional embeddings for protein sequences using ESM-2 models. It reads sequences from a CSV file, processes each sequence through a pretrained ESM-2 model to generate per-residue and per-sequence embeddings, and saves the per-sequence embeddings (plus a small metadata table) to CSV files. Per-residue embeddings are returned in memory but are not written to disk.

In [None]:
!pip install biopython fair-esm

In [None]:
import numpy as np
import pandas as pd
import torch
import esm
from Bio import SeqIO
import os

def load_csv_file(csv_path):
    """Read a protein CSV and return a name -> sequence mapping.

    Args:
        csv_path: Path of a CSV file that has 'protein_name' and 'sequence'
            columns (extra columns are ignored).

    Returns:
        dict keyed by 'protein_name' with the matching 'sequence' as value,
        in file order. When a name appears more than once, the last row wins.

    Raises:
        ValueError: If either required column is missing.
    """
    table = pd.read_csv(csv_path)

    required_columns = ('protein_name', 'sequence')
    if not all(column in table.columns for column in required_columns):
        raise ValueError("CSV file must contain 'protein_name' and 'sequence' columns")

    # Pair the two columns row by row; later duplicates overwrite earlier ones.
    return dict(zip(table['protein_name'], table['sequence']))

def load_esm2_model(model_name="esm2_t6_8M_UR50D"):
    """Load a pretrained ESM checkpoint by name and put it in evaluation mode.

    Args:
        model_name: Checkpoint identifier passed to
            ``esm.pretrained.load_model_and_alphabet``.

    Returns:
        Tuple ``(model, alphabet)`` as produced by the ESM loader, with
        ``model.eval()`` already applied.
    """
    pretrained_model, token_alphabet = esm.pretrained.load_model_and_alphabet(model_name)
    # Inference only: switch the module to evaluation mode before returning it.
    pretrained_model.eval()
    return pretrained_model, token_alphabet

def embed_protein_sequence(sequence, model, alphabet, device=None):
    """Embed a single protein sequence with an ESM model.

    Args:
        sequence: Amino-acid sequence as a non-empty string.
        model: Loaded ESM model; it is moved to ``device`` before use.
        alphabet: Alphabet paired with ``model``; provides the batch converter
            that tokenizes ``sequence``.
        device: ``torch.device`` to run on. Defaults to CUDA when available,
            otherwise CPU.

    Returns:
        Tuple ``(per_residue_embedding, per_sequence_embedding)`` of CPU
        tensors: the final-layer representation of every token between the
        first and last token positions, and the mean of those rows.

    Raises:
        ValueError: If ``sequence`` is not a non-empty string. An empty
            sequence would otherwise yield a NaN mean embedding.
    """
    if not isinstance(sequence, str) or not sequence:
        raise ValueError("sequence must be a non-empty string")

    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = model.to(device)

    # Read the representation from the model's last layer. The index used to
    # be fixed at 6, which is only the final layer of the 6-layer checkpoint;
    # deeper checkpoints would silently return an intermediate layer.
    # Fall back to 6 for model objects that do not expose ``num_layers``.
    final_layer = getattr(model, "num_layers", 6)

    batch_converter = alphabet.get_batch_converter()
    _, _, batch_tokens = batch_converter([("protein1", sequence)])
    batch_tokens = batch_tokens.to(device)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        results = model(batch_tokens, repr_layers=[final_layer], return_contacts=False)

    token_embeddings = results["representations"][final_layer]

    # Drop the first and last token positions of the single batch entry.
    # NOTE(review): assumes the alphabet adds exactly one start and one end
    # token around the sequence (true for the ESM-2 alphabet) - confirm for
    # other model families.
    per_residue_embedding = token_embeddings[0, 1:-1, :].cpu()
    per_sequence_embedding = per_residue_embedding.mean(dim=0)

    return per_residue_embedding, per_sequence_embedding

def get_esm_embeddings_from_csv(csv_path, model_name="esm2_t6_8M_UR50D", batch_size=1):
    """Embed every sequence listed in a CSV file with an ESM model.

    Sequences are processed one at a time. A sequence that fails to embed is
    reported on stdout and skipped, so the result may hold fewer entries than
    the CSV has rows.

    Args:
        csv_path: CSV file with 'protein_name' and 'sequence' columns.
        model_name: Name of the pretrained ESM checkpoint to load.
        batch_size: Currently unused; kept so existing callers keep working.

    Returns:
        dict with parallel lists 'sequence_ids', 'sequences',
        'per_residue_embeddings' and 'per_sequence_embeddings' (NumPy arrays),
        plus 'embedding_dimension' (length of a per-sequence vector, or None
        when no sequence was embedded successfully).
    """
    sequences = load_csv_file(csv_path)

    model, alphabet = load_esm2_model(model_name)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    results = {
        'sequence_ids': [],
        'sequences': [],
        'per_residue_embeddings': [],
        'per_sequence_embeddings': [],
        'embedding_dimension': None,
    }

    total = len(sequences)
    for i, (seq_id, sequence) in enumerate(sequences.items(), start=1):
        print(f"Processing sequence {i}/{total}: {seq_id}")

        try:
            residue_embeds, seq_embed = embed_protein_sequence(sequence, model, alphabet, device)
            # Do every fallible conversion before touching `results`, so a
            # failure can never leave the parallel lists with unequal lengths.
            residue_array = residue_embeds.numpy()
            seq_array = seq_embed.numpy()
            seq_dim = seq_embed.shape[0]
        except Exception as e:
            # Best effort by design: report the bad sequence and move on.
            print(f"Error processing sequence {seq_id}: {e}")
            continue

        results['sequence_ids'].append(seq_id)
        results['sequences'].append(sequence)
        results['per_residue_embeddings'].append(residue_array)
        results['per_sequence_embeddings'].append(seq_array)

        # Record the vector length once, from the first successful sequence.
        if results['embedding_dimension'] is None:
            results['embedding_dimension'] = seq_dim

    return results

def save_embeddings_to_csv(results, output_dir="embeddings", csv_path=None):
    """Write per-sequence embeddings and a metadata table to CSV files.

    Args:
        results: dict as returned by ``get_esm_embeddings_from_csv``; the keys
            'sequence_ids', 'sequences' and 'per_sequence_embeddings' are
            read, and 'embedding_dimension' when present.
        output_dir: Directory for 'metadata.csv' (created if missing) and for
            the default embeddings file.
        csv_path: Destination of the embeddings CSV. When omitted, a
            module-level ``csv_path`` variable is used if one exists (the
            behaviour this function previously relied on); otherwise the file
            is written to ``<output_dir>/per_sequence_embeddings.csv``.

    Returns:
        DataFrame with 'sequence_id', 'sequence' and one 'feature_i' column
        per embedding dimension - the table that was written to ``csv_path``.

    Raises:
        ValueError: If ``results`` holds no per-sequence embeddings, or they
            do not form a 2-D array (vectors of unequal length).
    """
    os.makedirs(output_dir, exist_ok=True)

    if csv_path is None:
        # Legacy fallback: earlier versions read `csv_path` straight from
        # module scope, so keep honouring it for existing notebooks/scripts.
        csv_path = globals().get('csv_path')
    if csv_path is None:
        csv_path = os.path.join(output_dir, 'per_sequence_embeddings.csv')
    else:
        # Make sure the target directory exists, as is done for output_dir.
        parent_dir = os.path.dirname(csv_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    # Stack the per-sequence vectors into a (n_sequences, dim) array.
    seq_embeddings = np.array(results['per_sequence_embeddings'])
    if seq_embeddings.ndim != 2 or seq_embeddings.shape[0] == 0:
        raise ValueError(
            "results['per_sequence_embeddings'] must be a non-empty list of "
            "equal-length vectors; nothing to save"
        )

    embedding_dim = results.get('embedding_dimension')
    if embedding_dim is None:
        embedding_dim = seq_embeddings.shape[1]

    # Create column names for the embedding dimensions
    feature_columns = [f'feature_{i}' for i in range(embedding_dim)]

    # Create DataFrame with sequence IDs and embeddings
    df_embeddings = pd.DataFrame(seq_embeddings, columns=feature_columns)
    df_embeddings.insert(0, 'sequence_id', results['sequence_ids'])
    df_embeddings.insert(1, 'sequence', results['sequences'])

    # Save to CSV
    df_embeddings.to_csv(csv_path, index=False)

    # Also save a metadata file
    metadata_df = pd.DataFrame({
        'sequence_id': results['sequence_ids'],
        'sequence_length': [len(seq) for seq in results['sequences']],
        'embedding_dimension': embedding_dim
    })
    metadata_df.to_csv(os.path.join(output_dir, 'metadata.csv'), index=False)

    print(f"Saved {len(seq_embeddings)} embeddings to CSV files in {output_dir}/")
    print(f"Embedding dimension: {embedding_dim}")
    print(f"Shape of per-sequence embeddings: {seq_embeddings.shape}")
    print(f"CSV file saved: {csv_path}")

    return df_embeddings

# Script entry point: embed every sequence from the input CSV, write the
# per-sequence embeddings to disk, and print a short summary.
if __name__ == "__main__":

    # Input CSV; load_csv_file requires 'protein_name' and 'sequence' columns.
    csv_file_path = ".../Plant/Non_MP_final_clean.csv"
    # Destination of the per-sequence embeddings CSV. It is not passed as an
    # argument below: save_embeddings_to_csv picks up `csv_path` from module
    # scope, so this assignment must happen before that call.
    csv_path = ".../Plant/ESM/ESM_Non_MP_Final.csv"
    embeddings = get_esm_embeddings_from_csv(csv_file_path)

    # Also writes metadata.csv into the default "embeddings" output directory.
    df_embeddings = save_embeddings_to_csv(embeddings)

    print(f"\nFirst few rows of the CSV:")
    print(df_embeddings.head())
    # Counts only sequences that were embedded; failed ones are skipped upstream.
    print(f"\nTotal sequences processed: {len(embeddings['sequence_ids'])}")

In [None]:
# Write the per-sequence embedding table produced by the previous cell to an
# additional CSV file, omitting the DataFrame index column.
df_embeddings.to_csv(r'.../ESM_DnaBinding_Neg1.csv',index=False)