In [2]:
import os
import torch
from Bio import SeqIO
import pandas as pd
import random


def reduce_data(fasta_file, csv_file, reduced_amount, reduced_fasta_file, reduced_csv_file):
    # Read the FASTA file and select a random subset of sequences
    fasta_sequences = {}
    selected_identifiers = []
    with open(fasta_file, "r") as fasta:
        records = list(SeqIO.parse(fasta, "fasta"))
        random.shuffle(records)
        selected_records = records[:reduced_amount]
        for record in selected_records:
            fasta_sequences[record.id] = str(record.seq)
            selected_identifiers.append(record.id)

    # Read the CSV file and store conservation scores for matching identifiers
    csv_data = pd.read_csv(csv_file, delimiter='\t', index_col=0)
    reduced_csv_data = pd.DataFrame(
        columns=csv_data.columns)  # Create an empty DataFrame
    for identifier in selected_identifiers:
        # Search for partial matches in the index
        matching_index = csv_data.index[csv_data.index.str.contains(
            identifier)]
        if len(matching_index) > 0:
            reduced_csv_data = pd.concat(
                [reduced_csv_data, csv_data.loc[matching_index]])

    # Write reduced data to new files
    with open(reduced_fasta_file, "w") as fasta_reduced:
        with open(reduced_csv_file, "w") as csv_reduced:
            for identifier, sequence in fasta_sequences.items():
                fasta_reduced.write(f">{identifier}\n{sequence}\n")
            reduced_csv_data.to_csv(csv_reduced, sep='\t')


# reduce_data("curated_dataset/sequences.fasta", "curated_dataset/conservation_scores_formated.csv", 1000,"curated_dataset/reduced_input.fasta", "curated_dataset/reduced_input.csv")

# esm-extract esm2_t6_8M_UR50D curated_dataset/reduced_input.fasta curated_dataset/example_embeddings_esm2_reduced_input --repr_layers 0 5 6 --include mean per_tok

def get_embeddings_vectors_sample_data(folder_path):

    # Get the list of files in the folder
    files = os.listdir(folder_path)

    # Filter the files to only include .pt files
    pt_files = [file for file in files if file.endswith('.pt')]

    # Initialize a list to store the vectors
    vectors = []

    # Load the vectors from each .pt file
    for file in pt_files:
        file_path = os.path.join(folder_path, file)
        embeddings = torch.load(file_path)["representations"][6]
        #embeddings = torch.load(file_path)["representations"][6]
        float_vector = embeddings.numpy().astype(float)
        vectors.append(float_vector)
    return vectors


vectors = get_embeddings_vectors_sample_data('sample_data/example_embeddings_esm2')
print("vectors", len(vectors))


vectors 3


In [3]:
vectors[0].shape

(659, 320)

In [37]:
def get_embeddings_vectors_curated_data(folder_path):
    # Initialize a list to store the vectors
    vectors = []
    embeddings = []

    # Traverse through each folder in the specified directory
    for root, dirs, files in os.walk(folder_path):
        for file in files:
            # Check if the file is a .pt file
            if file.endswith('.pt'):
                file_path = os.path.join(root, file)
                embedding = torch.load(file_path)
                selected_embedding = {"label": embedding["label"], "representations": embedding["representations"][6]}
                #float_vector = embeddings.numpy().astype(float)
                #vectors.append(float_vector)
                embeddings.append(selected_embedding)

    return vectors, embeddings

vectors, embeddings = get_embeddings_vectors_curated_data('curated_dataset/example_embeddings_esm2_reduced_input')
#print("Vectors:", len(vectors[0]))
print("embeddings len", (embeddings[2]["representations"]).shape)
print("embeddings", embeddings[2])
len("STPIRIFANGRRRVEVLRDNRLIYATSVNAGSQEIDTSSFPQGSYQLTIRIFNGSTLEQ")

embeddings len torch.Size([59, 320])
embeddings {'label': 'A0A1X7AIY7.1/282-340', 'representations': tensor([[-0.6685, -0.0708, -0.5033,  ...,  0.8954,  0.4106, -0.5340],
        [-0.6899, -0.1052,  0.0013,  ...,  0.5112, -0.0517, -0.3438],
        [-0.5972, -0.1812,  0.3029,  ...,  0.4339,  0.3076, -0.0928],
        ...,
        [-0.2558, -0.0050,  0.2232,  ...,  0.4575, -0.0603, -0.1484],
        [-0.3865, -0.1068,  0.3723,  ...,  0.4398, -0.3131, -0.0656],
        [-0.1832,  0.3298,  0.3563,  ..., -0.0673, -0.4823, -0.1598]])}


59

In [10]:
len('QIGGEDKADIAPILKEGFVGPGMQINNLLQERGEIVATVICGDNYFNENLDEATDTILGMIGQFNPDIVIAGPSFNAGRYGMACGAVCKAVSEKFNIPTLTGMYIESPGVDGYRKYTYIVETANSAVGMRTALPAMVKLALKLVDGVELGDPKEEGYIARGVRRNYFHAVRGSKRAVDMLIAKINDQPFTTEYPMPTFDRVAPNPHIVDMSKATIALVTSGGIVPKGNPDHIESSSASKFGKYDIEGFTNLTEKTHETAHGGYDPVYANLDADRVLPVDVLRELEAEGVIGKLHRYFYTTVGNGTSVANAKKFAAAIGKELVEAKVDAVILTST')

334

In [5]:
len(vectors[5])

104000

In [6]:
import torch
import torch.nn as nn

class ConservationModel(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(ConservationModel, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, 1)  # Salida unidimensional para el score de conservación

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        return out

def calculate_conservation_scores(embeddings_vectors, hidden_size, learning_rate, num_epochs):
    # Convertimos los vectores de embeddings en un tensor de PyTorch
    x = torch.tensor(embeddings_vectors, dtype=torch.float32)

    # Normalizamos los datos de entrada
    x = (x - x.min()) / (x.max() - x.min())

    # Creamos el modelo
    model = ConservationModel(input_size=x.shape[1], hidden_size=hidden_size)

    # Definimos la función de pérdida y el optimizador
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Entrenamiento del modelo
    for epoch in range(num_epochs):
        # Forward pass
        outputs = model(x)

        # Calculamos la pérdida
        loss = criterion(outputs, x)  # Usamos los mismos datos como entrada y objetivo para el autoencoder

        # Backward y optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Mostramos la pérdida
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

    # Guardamos el modelo entrenado si es necesario
    torch.save(model.state_dict(), 'conservation_model.pth')

# Supongamos que tienes un array llamado "vectors" que contiene los embeddings vectors
# y quieres entrenar el modelo con un tamaño de capa oculta de 64, una tasa de aprendizaje de 0.001
# y durante 10 épocas.
calculate_conservation_scores(vectors, hidden_size=64, learning_rate=0.001, num_epochs=10)


  x = torch.tensor(embeddings_vectors, dtype=torch.float32)


ValueError: expected sequence of length 102720 at dim 1 (got 112960)

In [None]:
import numpy as np

# Iterar sobre cada lista de puntajes de conservación
for i in range(len(conservation_scores)):
    # Obtener la lista de puntajes de conservación actual
    current_score = conservation_scores[i]
    # Reemplazar los valores NaN por un valor específico, por ejemplo, 0
    current_score = [0 if np.isnan(score) else score for score in current_score]
    # Asignar la lista modificada de puntajes de conservación de nuevo al array original
    conservation_scores[i] = current_score

print("conservation_scores (después de tratar los NaN):", len(conservation_scores[3]))
print(conservation_scores)


In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.tensorboard import SummaryWriter
import numpy as np

# Define the model
class LinearRegressionModel(torch.nn.Module):
    def __init__(self, input_size):
        super(LinearRegressionModel, self).__init__()
        self.linear = torch.nn.Linear(input_size, 1)

    def forward(self, x):
        return self.linear(x)

# Modificar la función de entrenamiento para ajustar las etiquetas
def train_one_epoch(epoch_index, model, training_loader, optimizer, loss_fn, tb_writer):
    running_loss = 0.
    last_loss = 0.

    for i, data in enumerate(training_loader):
        inputs, labels = data

        optimizer.zero_grad()
        outputs = model(inputs)
        
        # Reducir las etiquetas a una sola dimensión (sumando)
        labels = torch.sum(labels, dim=1, keepdim=True)

        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        if i % 1000 == 999:
            last_loss = running_loss / 1000
            print('  batch {} loss: {}'.format(i + 1, last_loss))
            tb_x = epoch_index * len(training_loader) + i + 1
            tb_writer.add_scalar('Loss/train', last_loss, tb_x)
            running_loss = 0.

        # Imprimir las salidas del modelo
        print('Sample outputs:', outputs)

    return last_loss

# Prepare your data
# Rellenar las listas más cortas con ceros para que todas tengan la misma longitud
max_length = max(len(score) for score in conservation_scores)
conservation_scores = [score + [0] * (max_length - len(score)) for score in conservation_scores]

X_train = torch.tensor(vectors, dtype=torch.float32)
if isinstance(conservation_scores, np.ndarray):
    # If conservation_scores is a NumPy array, convert it to a list first
    conservation_scores = conservation_scores.tolist()
y_train = torch.tensor(conservation_scores, dtype=torch.float32)

# Create a DataLoader
batch_size = 32
training_data = TensorDataset(X_train, y_train)
training_loader = DataLoader(training_data, batch_size=batch_size, shuffle=True)

# Initialize model, optimizer, and loss function
input_size = X_train.shape[1]
model = LinearRegressionModel(input_size=input_size)
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)
loss_fn = torch.nn.MSELoss()

# Initialize TensorBoard writer
tb_writer = SummaryWriter()

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}:")
    last_loss = train_one_epoch(epoch, model, training_loader, optimizer, loss_fn, tb_writer)
    print(f"  Epoch loss: {last_loss}")

# Close TensorBoard writer
tb_writer.close()
