In [162]:
import os
import torch
from Bio import SeqIO
import pandas as pd
import random

In [None]:
def reduce_data(fasta_file, csv_file, reduced_amount, reduced_fasta_file, reduced_csv_file):
    # Read the FASTA file and select a random subset of sequences
    fasta_sequences = {}
    selected_identifiers = []
    with open(fasta_file, "r") as fasta:
        records = list(SeqIO.parse(fasta, "fasta"))
        random.shuffle(records)
        selected_records = records[:reduced_amount]
        for record in selected_records:
            fasta_sequences[record.id] = str(record.seq)
            selected_identifiers.append(record.id)

    # Read the CSV file and store conservation scores for matching identifiers
    csv_data = pd.read_csv(csv_file, delimiter='\t', index_col=0)
    reduced_csv_data = pd.DataFrame(
        columns=csv_data.columns)  # Create an empty DataFrame
    for identifier in selected_identifiers:
        # Search for partial matches in the index
        matching_index = csv_data.index[csv_data.index.str.contains(
            identifier)]
        if len(matching_index) > 0:
            reduced_csv_data = pd.concat(
                [reduced_csv_data, csv_data.loc[matching_index]])

    # Write reduced data to new files
    with open(reduced_fasta_file, "w") as fasta_reduced:
        with open(reduced_csv_file, "w") as csv_reduced:
            for identifier, sequence in fasta_sequences.items():
                fasta_reduced.write(f">{identifier}\n{sequence}\n")
            reduced_csv_data.to_csv(csv_reduced, sep='\t')


# reduce_data("curated_dataset/sequences.fasta", "curated_dataset/conservation_scores_formated.csv", 1000,"curated_dataset/reduced_input.fasta", "curated_dataset/reduced_input.csv")

# esm-extract esm2_t6_8M_UR50D curated_dataset/reduced_input.fasta curated_dataset/example_embeddings_esm2_reduced_input --repr_layers 0 5 6 --include mean per_tok

def get_embeddings_vectors_sample_data(folder_path):

    # Get the list of files in the folder
    files = os.listdir(folder_path)

    # Filter the files to only include .pt files
    pt_files = [file for file in files if file.endswith('.pt')]

    # Initialize a list to store the vectors
    vectors = []

    # Load the vectors from each .pt file
    for file in pt_files:
        file_path = os.path.join(folder_path, file)
        embeddings = torch.load(file_path)["representations"][6]
        #embeddings = torch.load(file_path)["representations"][6]
        float_vector = embeddings.numpy().astype(float)
        vectors.append(float_vector)
    return vectors


vectors = get_embeddings_vectors_sample_data('sample_data/example_embeddings_esm2')
print("vectors", len(vectors))


In [None]:
vectors[0].shape

In [119]:
def get_embeddings_vectors_curated_data(folder_path):
    # Initialize a list to store the vectors
    vectors = []
    embeddings = {}

    # Traverse through each folder in the specified directory
    for root, dirs, files in os.walk(folder_path):
        for file in files:
            # Check if the file is a .pt file
            if file.endswith('.pt'):
                file_path = os.path.join(root, file)
                embedding = torch.load(file_path)
                embeddings[embedding["label"]] = embedding["representations"][6]

    return embeddings

embeddings_dict = get_embeddings_vectors_curated_data('curated_dataset/example_embeddings_esm2_reduced_input')
print("embeddings_dict len", (embeddings_dict["A0A1X7AIY7.1/282-340"]).shape)
print("embeddings_dict", embeddings_dict["A0A1X7AIY7.1/282-340"])
len("STPIRIFANGRRRVEVLRDNRLIYATSVNAGSQEIDTSSFPQGSYQLTIRIFNGSTLEQ")

embeddings_dict len torch.Size([59, 320])
embeddings_dict tensor([[-0.6685, -0.0708, -0.5033,  ...,  0.8954,  0.4106, -0.5340],
        [-0.6899, -0.1052,  0.0013,  ...,  0.5112, -0.0517, -0.3438],
        [-0.5972, -0.1812,  0.3029,  ...,  0.4339,  0.3076, -0.0928],
        ...,
        [-0.2558, -0.0050,  0.2232,  ...,  0.4575, -0.0603, -0.1484],
        [-0.3865, -0.1068,  0.3723,  ...,  0.4398, -0.3131, -0.0656],
        [-0.1832,  0.3298,  0.3563,  ..., -0.0673, -0.4823, -0.1598]])


59

In [None]:
len('QIGGEDKADIAPILKEGFVGPGMQINNLLQERGEIVATVICGDNYFNENLDEATDTILGMIGQFNPDIVIAGPSFNAGRYGMACGAVCKAVSEKFNIPTLTGMYIESPGVDGYRKYTYIVETANSAVGMRTALPAMVKLALKLVDGVELGDPKEEGYIARGVRRNYFHAVRGSKRAVDMLIAKINDQPFTTEYPMPTFDRVAPNPHIVDMSKATIALVTSGGIVPKGNPDHIESSSASKFGKYDIEGFTNLTEKTHETAHGGYDPVYANLDADRVLPVDVLRELEAEGVIGKLHRYFYTTVGNGTSVANAKKFAAAIGKELVEAKVDAVILTST')

In [None]:
len(vectors[5])

In [164]:
# Fonction pour charger les données à partir du CSV et les convertir en tenseurs
def load_data(csv_file):
    # Charger le CSV
    df = pd.read_csv(csv_file, delimiter=',', names=[
                     'sequence id', 'conservation score'], header=0)

    sequences = df['sequence id'].values
    conservation_scores = df['conservation score'].apply(lambda x: np.array(
    [float(i) if i != 'nan' else 0.0 for i in x.split()], dtype=np.float32)).values
    print("conservation_scores", conservation_scores)

    return sequences, conservation_scores

sequences, conservation_scores_tensors = load_data('curated_dataset/reduced_input.csv')

conservation_scores [array([0.24585 , 0.386475, 0.670898, 0.519531, 0.53125 , 0.630371,
        0.795898, 0.51709 , 0.856445, 0.831055, 0.814453, 0.785156,
        0.616699, 0.647949, 0.480957, 0.765137, 0.683105, 0.820801,
        0.615723, 0.399414, 0.724609, 0.495117, 0.275146, 0.294434,
        0.54541 , 0.170288, 0.206787, 0.299561, 0.358154, 0.534668,
        0.347168, 0.425049, 0.727051, 0.59668 , 0.594727, 0.256104,
        0.30249 , 0.517578, 0.334961, 0.266357, 0.202637, 0.489502,
        0.365967, 0.314209, 0.26123 , 0.32251 , 0.426514, 0.616211,
        0.516602, 0.35376 , 0.765625, 0.540039, 0.492188, 0.781738,
        0.776367, 0.847656, 0.63916 , 0.671875, 0.527344, 0.424805,
        0.484619, 0.515625, 0.139771, 0.491699, 0.294678, 0.426514,
        0.1521  , 0.265625, 0.212036, 0.468994, 0.211792, 0.228516,
        0.24939 , 0.462402, 0.330811, 0.269287, 0.216553, 0.325195,
        0.638184, 0.662598, 0.439941, 0.562988, 0.528809, 0.669922,
        0.789062, 0.871094, 

In [133]:
conservation_scores_tensors[1].shape

torch.Size([141])

In [169]:
# Fonction pour obtenir les embeddings correspondant aux séquences
def get_embeddings(sequences, embeddings_dict):
    embeddings = []
    for sequence_id in sequences:
        embedding = embeddings_dict[sequence_id]
        print(embedding)
        embeddings.append(embedding)
    embeddings = torch.stack(embeddings)
    return embeddings

def get_embedding(sequence_id, embeddings_dict):
    return embeddings_dict[sequence_id]


# Fonction pour entraîner le modèle en utilisant la descente de gradient stochastique (SGD)
def train_model_stochastic(model, optimizer, loss_fn, sequences, conservation_scores):
    model.train()
    for i in range(len(sequences)):
        sequence_id = sequences[i]
        embedding = get_embedding(sequence_id, embeddings_dict)
        #embedding_tensor = torch.tensor(embedding, dtype=torch.float32)
        label = torch.tensor(conservation_scores[i], dtype=torch.float32)
        print("label", label.shape)
        print("label", label)
        print("embedding", embedding.shape)
        print("embedding", embedding)
        

        optimizer.zero_grad()
        output = model(embedding)
        print("output", output.shape)
        print("output", output)
        
        
        loss = loss_fn(output.squeeze(), label)
        loss.backward()
        optimizer.step()



# Définir le modèle de régression linéaire

class LinearRegression(nn.Module):
    def __init__(self, input_size):
        super(LinearRegression, self).__init__()
        self.linear = nn.Linear(input_size, 1)

    def forward(self, x):
        return self.linear(x)



# Évaluation du modèle sur l'ensemble de validation
def evaluate_model(model, loss_fn, data_loader):
    running_loss = 0.

    with torch.no_grad():
        for i, data in enumerate(data_loader):
            inputs, labels = data

            outputs = model(inputs)

            loss = loss_fn(outputs, labels)

            running_loss += loss.item()

    return running_loss / len(data_loader)

# Configuration des hyperparamètres
learning_rate = 0.001
num_epochs = 5
batch_size = 32


# Créer l'ensemble de données
dataset = [(embeddings_dict[sequence], conservation_scores) for sequence, conservation_scores in zip(sequences, conservation_scores_tensors)]
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

# Créer les data loaders
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=1, shuffle=False)



# Initialiser le modèle, la fonction de perte et l'optimiseur
model = LinearRegression(input_size=320)
loss_fn = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

# Entraînement du modèle
for epoch in range(num_epochs):
    print(f'Epoch {epoch + 1}/{num_epochs}:')
    model.train()
    train_model_stochastic(model, optimizer, loss_fn,
                           sequences[:10], conservation_scores_tensors[:10])

    # Validation du modèle
    model.eval()
    val_loss = evaluate_model(model, loss_fn, val_loader)

    print(f'Validation Loss: {val_loss}')


Epoch 1/5:
label torch.Size([347])
label tensor([0.2458, 0.3865, 0.6709, 0.5195, 0.5312, 0.6304, 0.7959, 0.5171, 0.8564,
        0.8311, 0.8145, 0.7852, 0.6167, 0.6479, 0.4810, 0.7651, 0.6831, 0.8208,
        0.6157, 0.3994, 0.7246, 0.4951, 0.2751, 0.2944, 0.5454, 0.1703, 0.2068,
        0.2996, 0.3582, 0.5347, 0.3472, 0.4250, 0.7271, 0.5967, 0.5947, 0.2561,
        0.3025, 0.5176, 0.3350, 0.2664, 0.2026, 0.4895, 0.3660, 0.3142, 0.2612,
        0.3225, 0.4265, 0.6162, 0.5166, 0.3538, 0.7656, 0.5400, 0.4922, 0.7817,
        0.7764, 0.8477, 0.6392, 0.6719, 0.5273, 0.4248, 0.4846, 0.5156, 0.1398,
        0.4917, 0.2947, 0.4265, 0.1521, 0.2656, 0.2120, 0.4690, 0.2118, 0.2285,
        0.2494, 0.4624, 0.3308, 0.2693, 0.2166, 0.3252, 0.6382, 0.6626, 0.4399,
        0.5630, 0.5288, 0.6699, 0.7891, 0.8711, 0.6699, 0.8403, 0.3967, 0.6938,
        0.7891, 0.8530, 0.8628, 0.7793, 0.3401, 0.6738, 0.8076, 0.5669, 0.1830,
        0.4829, 0.5679, 0.2678, 0.2668, 0.5259, 0.2422, 0.3416, 0.3459, 0.4407,

  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


label torch.Size([152])
label tensor([0.7188, 0.1293, 0.3752, 0.5693, 0.3086, 0.5938, 0.2340, 0.1895, 0.2461,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.1914, 0.0623, 0.0538, 0.0816, 0.0540, 0.0431, 0.1056,
        0.0307, 0.0578, 0.1093, 0.0815, 0.1298, 0.0240, 0.0750, 0.1160, 0.0898,
        0.0814, 0.1033, 0.1185, 0.0753, 0.1061, 0.1302, 0.1294, 0.0628, 0.0858,
        0.0732, 0.1250, 0.1417, 0.1609, 0.1882, 0.1801, 0.1560, 0.0618, 0.1763,
        0.1903, 0.2274, 0.3479, 0.3691, 0.2233, 0.2930, 0.4805, 0.3982, 0.5010,
        0.3159, 0.1116, 0.5205, 0.4585, 0.5884, 0.4607, 0.7729, 0.7886, 0.4397,
        0.4780, 0.7651, 0.4797, 0.3765, 0.1447, 0.1329, 0.2671, 0.1959, 0.2097,
        0.1270, 0.3616, 0.3271, 0.3992, 0.7085, 0.5405, 0.6484, 0.6914, 0.7798,
        0.5322, 0.3521, 0.6709, 0.6191, 0.8818, 0.5425, 0.8418, 0.3555, 0.7852,
        0.6665, 0.8042, 0.6880, 0.4749, 0.7944, 0.8105, 0.5142, 0.6450, 0.4468,
        0.