In [None]:
import pandas as pd

# Hand-written BLOSUM62 lookup table, keyed by (residue, residue) tuples.
# Only the identical-residue pairs are listed here, one per standard amino
# acid (20 entries); no off-diagonal substitution pairs are included.
blosum62 = {
    (residue, residue): score
    for residue, score in (
        ('A', 4), ('R', 5), ('N', 6), ('D', 6), ('C', 9),
        ('Q', 5), ('E', 5), ('G', 6), ('H', 8), ('I', 4),
        ('L', 4), ('K', 5), ('M', 5), ('F', 6), ('P', 7),
        ('S', 4), ('T', 5), ('W', 11), ('Y', 7), ('V', 4),
    )
}

# Map a sequence to a list of per-residue BLOSUM62 self-substitution scores
def calculate_blosum62_embedding(sequence):
    """Return one score per element of *sequence*, in order.

    Each element is looked up in ``blosum62`` under the key
    ``(element, element)``, i.e. its substitution score with itself.
    Elements that have no such entry contribute the default score 0.
    """
    return [blosum62.get((residue, residue), 0) for residue in sequence]

# Assumes `embeddings` already exists as a DataFrame whose 'sequence' column
# holds protein sequences (it is not defined in the code shown here).
# Store the per-residue score list of every row in a new column.
embeddings['blosum62_embedding'] = (
    embeddings['sequence']
    .apply(calculate_blosum62_embedding)
)

# Persist the augmented DataFrame as a pickle file
embeddings.to_pickle(path="updated_embeddings_with_blosum62.pkl")

# Preview the first five rows of the augmented DataFrame
embeddings.head(5)
